# Sentiment Analysis

In [86]:
# Importing all important libraries for data processing and model building.

import numpy as np  # Linear algebra
import pandas as pd # data preprocessing, CSV file I/O 
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from keras.utils import to_categorical
import re
import warnings
warnings.filterwarnings('ignore')

### Loading dataset

In [2]:
train_ds = pd.read_csv(r"C:\Users\Prerana\Desktop\Data Science\Projects\Sentiments\train.csv",encoding = 'latin1')
test_ds = pd.read_csv(r"C:\Users\Prerana\Desktop\Data Science\Projects\Sentiments\test.csv",encoding= 'latin1')

In [3]:
train_ds.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


### Data Preprocessing

In [4]:
# Removing the columns which are not required
train_ds = train_ds[['text','sentiment']]
test_ds = test_ds[['text','sentiment']]

In [5]:
train_ds.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [6]:
train_ds.shape

(27481, 2)

In [7]:
test_ds.shape

(4815, 2)

In [8]:
train_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       27480 non-null  object
 1   sentiment  27481 non-null  object
dtypes: object(2)
memory usage: 429.5+ KB


In [9]:
test_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4815 entries, 0 to 4814
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       3534 non-null   object
 1   sentiment  3534 non-null   object
dtypes: object(2)
memory usage: 75.4+ KB


In [10]:
train_ds.isna().sum()

text         1
sentiment    0
dtype: int64

In [11]:
test_ds.isna().sum()

text         1281
sentiment    1281
dtype: int64

In [12]:
# Filling the null values
train_ds.fillna('',inplace=True)
test_ds.fillna('',inplace=True)

In [13]:
test_ds.isna().sum()

text         0
sentiment    0
dtype: int64

In [89]:
# Checking for unique categories
train_ds['sentiment'].unique()

array(['neutral', 'negative', 'positive'], dtype=object)

In [None]:
# converting the categories in integer values 

In [15]:
lables = []
for i in train_ds['sentiment']:
    if i == 'positive':
        lables.append(0)
    elif i == 'negative':
        lables.append(1)
    else:
        lables.append(2)

In [16]:
lables = np.array(lables)
lables

array([2, 1, 1, ..., 0, 0, 2])

In [17]:
lables1 = []
for i in test_ds['sentiment']:
    if i == 'positive':
        lables1.append(0)
    elif i == 'negative':
        lables1.append(1)
    else:
        lables1.append(2)

In [18]:
lables1 = np.array(lables1)
lables1

array([2, 0, 1, ..., 2, 2, 2])

In [None]:
# Converting data into array for text processing

In [19]:
X_train = np.array(train_ds['text'].tolist())
y_train = lables
X_test = np.array(test_ds['text'].tolist())
y_test = lables1

In [20]:
X_train

array([' I`d have responded, if I were going',
       ' Sooo SAD I will miss you here in San Diego!!!',
       'my boss is bullying me...', ...,
       ' Yay good for both of you. Enjoy the break - you probably need it after such hectic weekend  Take care hun xxxx',
       ' But it was worth it  ****.',
       '   All this flirting going on - The ATG smiles. Yay.  ((hugs))'],
      dtype='<U159')

In [21]:
y_train

array([2, 1, 1, ..., 0, 0, 2])

### Model Building

In [22]:
tokenize = Tokenizer() # initializing object

In [23]:
tokenize.fit_on_texts(X_train) # fitting data fro train & test
tokenize.fit_on_texts(X_test)

In [24]:
text_size = len(tokenize.word_index)+1  # TChecks the vocabulary legnth in total which is going to be i/p to the model
text_size

28615

In [25]:
X_train_seq = tokenize.texts_to_sequences(X_train) # converting the text to the sequence 
X_test_seq = tokenize.texts_to_sequences(X_test)

In [26]:
maxlen_tr = max(len(seq) for seq in X_train_seq)  # finding the max length sequence 
maxlen_tr

35

In [27]:
maxlen_ts = max(len(seq) for seq in X_test_seq)
maxlen_ts

32

In [28]:
X_train_pad = sequence.pad_sequences(X_train_seq, maxlen=35, padding='post') # padding the other seq to match the max length
X_test_pad = sequence.pad_sequences(X_test_seq, maxlen=35, padding='post')

In [29]:
X_train_pad[0]

array([   1,  162,   19, 7713,   71,    1,  151,   49,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0])

In [30]:
X_train_pad.shape

(27481, 35)

In [31]:
y_train = to_categorical(y_train,3) # coverting the labels to the catagorical to math the features
y_test = to_categorical(y_test,3)

#### Simple RNN

In [32]:
# The output_dim and input to RNN are intutive values after much trials and errors
# Softmax activation is used for better calculations 
model = Sequential([
        Embedding(input_dim=text_size, output_dim=10, input_length=35),
        SimpleRNN(25, return_sequences=False),
        Dense(3, activation='softmax')
])




In [33]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])




In [34]:
history = model.fit(X_train_pad, y_train, epochs=10, validation_data=(X_test_pad, y_test))

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


##### As above model has giving accuracy of around 71%, I have also tried some more data processing below to get more better accuracy.

In [90]:
# Using NLTK to preprocess the text for get more optimized output for better accuracy

In [35]:
from nltk.tokenize import word_tokenize # to make word tokens from text sentences
from nltk.corpus import stopwords  # to remove stopwords
from nltk.stem import SnowballStemmer # to stem the word to make compact word
from nltk.stem import LancasterStemmer
from nltk.stem.wordnet import WordNetLemmatizer 
from string import punctuation 
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Prerana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Prerana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [36]:
train_ds.head() # previous train data

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [37]:
# Dropping the 'Neutral' sentiments as positive and negative tweets are more enough to know about sentiments. 
tr_ds = train_ds[train_ds['sentiment'] != 'neutral']
te_ds = test_ds[test_ds['sentiment'] != 'neutral']
tr_ds.shape
tr_ds.head()

Unnamed: 0,text,sentiment
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative
6,2am feedings for the baby are fun when he is a...,positive


In [91]:
# Data precessing again to make the dataset to feed as input to model same as above

In [38]:
sent = []
for i in tr_ds['sentiment']:
    if i == 'positive':
        sent.append(0)
    else:
        sent.append(1) 

In [39]:
sent1 = []
for i in te_ds['sentiment']:
    if i == 'positive':
        sent1.append(0)
    else:
        sent1.append(1) 

In [40]:
tr_ds['sentiment'] = sent
te_ds['sentiment'] = sent1

In [41]:
tr_ds.head()

Unnamed: 0,text,sentiment
1,Sooo SAD I will miss you here in San Diego!!!,1
2,my boss is bullying me...,1
3,what interview! leave me alone,1
4,"Sons of ****, why couldn`t they put them on t...",1
6,2am feedings for the baby are fun when he is a...,0


In [42]:
# Preprocessing the text by removing unnecessory data to make the valid input

stuff_to_be_removed = list(stopwords.words('english')) + list(punctuation)

def textprocessor(text):
    text = str(text)
    text = text.lower()   # Converts all uppercase letters to lowercase
    text = re.sub(r"https\S+|www\S+|https\S+"," ",text) # Remove all links from dataset
    text = re.sub("(\\d|\\W)+"," ",text) 
    text = re.sub(r'\@\w+|\#'," ",text)   # Remove # and @ symbols from text
    text = re.sub(r'[^\w\s\`]'," ",text)  # Remove other symbols like ^ except '
    text_tokens = word_tokenize(text)
    lem = SnowballStemmer('english')
    text = [lem.stem(word) for word in text_tokens if not word in stuff_to_be_removed]
    text1 = " ".join(text)
    return text1         

In [65]:
tr_ds['text'] = tr_ds['text'].apply(textprocessor)
te_ds['text'] = te_ds['text'].apply(textprocessor)

In [66]:
te_ds.shape

(3385, 2)

In [67]:
tr_ds.shape

(16363, 2)

In [68]:
x_tr = np.array(tr_ds['text'].tolist())
y_tr = np.array(tr_ds['sentiment'].tolist())
x_te = np.array(te_ds['text'].tolist())
y_te = np.array(te_ds['sentiment'].tolist())

In [69]:
x_te.shape

(3385,)

In [70]:
tokenize.fit_on_texts(x_tr)
tokenize.fit_on_texts(x_te)

In [71]:
vocab_size = len(tokenize.word_index)+1
vocab_size

33745

In [72]:
x_tr_sq = tokenize.texts_to_sequences(x_tr)
x_te_sq = tokenize.texts_to_sequences(x_te)

In [73]:
max_len = max(len(seq) for seq in x_tr_sq)
max_len

34

In [74]:
x_tr_pad = sequence.pad_sequences(x_tr_sq, maxlen=max_len, padding='post')
x_te_pad = sequence.pad_sequences(x_te_sq, maxlen=max_len, padding='post')

In [75]:
x_tr_pad

array([[  342,    69,    36, ...,     0,     0,     0],
       [ 1390,  5802,     0, ...,     0,     0,     0],
       [ 1029,   388,   699, ...,     0,     0,     0],
       ...,
       [  321, 15062,  2214, ...,     0,     0,     0],
       [  184,    10,   189, ...,     0,     0,     0],
       [  617,     0,     0, ...,     0,     0,     0]])

In [76]:
y_tr = to_categorical(y_tr,2)
y_te = to_categorical(y_te,2)

In [83]:
model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=5, input_length=max_len),
        SimpleRNN(20, return_sequences=False),
        Dense(2, activation='softmax')
])

In [84]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [85]:
hist = model.fit(x_tr_pad, y_tr, epochs=10, validation_data=(x_te_pad,y_te))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### After doing above preprocessing again, achived above good testing accuracy of 90%

In [92]:
# Working the same with LSTM to see the accuracy

### Model LSTM

In [58]:
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

In [59]:
# The output_dim and input to LSTM, dropout are intutive values after much trials and errors
# Softmax activation is used for better calculations 
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
    SpatialDropout1D(0.4),
    LSTM(196, dropout=0.2, recurrent_dropout=0.2),
    Dense(2, activation='softmax'), 
])

In [60]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [61]:
hsty = model.fit(x_tr_pad, y_tr, epochs=40, batch_size=32, validation_data=(x_te_pad,y_te))

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


#### With LSTM aslo get the good accuray of 90%