<a href="https://colab.research.google.com/github/Vitor-Sallenave/Formacao-em-NLP/blob/main/Sentiment-Analysis/Vader_vs_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files

import pandas as pd

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


## **◼️ Files**

In [None]:
# Uploading the files
files.upload()

In [3]:
# Loading the tweets
tweets = pd.read_csv('Tweets2.csv')
tweets.head()

Unnamed: 0,id,local,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


## **◼️ Pre-processing**

In [4]:
# Analyzing the quantities
tweets.groupby(['sentiment']).size()

sentiment
Irrelevant    12990
Negative      22542
Neutral       18318
Positive      20832
dtype: int64

In [5]:
# Transforming the Irrelevant data into Neutral
tweets.loc[tweets['sentiment'] == 'Irrelevant', 'sentiment'] = 'Neutral'
tweets.groupby(['sentiment']).size()

sentiment
Negative    22542
Neutral     31308
Positive    20832
dtype: int64

In [6]:
# Treating the NANs
tweets = tweets.dropna(subset=['text'])

# When we alter the lines, the index will change
tweets.reset_index(drop=True, inplace=True)

In [7]:
tweets.shape

(73996, 4)

## **◼️ LSTM**

In [8]:
# Creating the tokenizer
texts = tweets['text'].values
tokenizer = Tokenizer(num_words=100)

# Here, the default vocabulary is updated
tokenizer.fit_on_texts(texts)

In [9]:
# Performing the padding: defining a fixed sized for the texts by adding zeros
X = tokenizer.texts_to_sequences(texts)
X = pad_sequences(X, padding='post', maxlen=100)

In [10]:
print(X)

[[13  4  2 ...  0  0  0]
 [ 2  3  1 ...  0  0  0]
 [13  4  2 ...  0  0  0]
 ...
 [23  1  6 ...  0  0  0]
 [23  1  6 ...  0  0  0]
 [23 30  1 ...  0  0  0]]


In [13]:
# Converting the classes to numbers
lb = LabelEncoder()
y = lb.fit_transform(tweets['sentiment'])

In [14]:
print(y)

[2 2 2 ... 2 2 2]


In [15]:
# Applying tOne hot-encoding
y = to_categorical(y)

In [16]:
print(y)

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]


In [17]:
# Dividing our data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [18]:
X_test

array([[12, 45, 57, ...,  0,  0,  0],
       [10,  7,  0, ...,  0,  0,  0],
       [19, 13,  6, ...,  0,  0,  0],
       ...,
       [ 2, 55, 11, ...,  0,  0,  0],
       [39, 22,  4, ...,  0,  0,  0],
       [25, 85, 13, ...,  0,  0,  0]], dtype=int32)

In [19]:
# Creating a sequential neural model
model = Sequential()

# Defining the vocabulary size
vocabulary = len(tokenizer.word_index)

# Number of attributes in X
X_attributes = X.shape[1]

# Adding layers
model.add(Embedding(input_dim=vocabulary,
          output_dim=128,
          input_length=X_attributes))

# Removing random conections
model.add(SpatialDropout1D(0.2))

# Recurrent layer
model.add(LSTM(units=196,
               dropout=0.2,
               recurrent_dropout=0,
               activation='tanh',
               recurrent_activation='sigmoid',
               unroll=False,
               use_bias=True))

# Output layer
model.add(Dense(units=3, activation='softmax'))

In [20]:
# Compilation
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [21]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          4324224   
                                                                 
 spatial_dropout1d (Spatial  (None, 100, 128)          0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 196)               254800    
                                                                 
 dense (Dense)               (None, 3)                 591       
                                                                 
Total params: 4579615 (17.47 MB)
Trainable params: 4579615 (17.47 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [22]:
# Training the model
model.fit(X_train, y_train, epochs=5,
          batch_size=500, verbose=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x788df22ab700>

In [23]:
# Evaluating the model
val_loss, val_accuracy = model.evaluate(X_test, y_test)
print(f'\nLoss = {val_loss}\nAccuracy= = {val_accuracy}')


Loss = 1.083804965019226
Accuracy= = 0.4146583080291748


## **◼️ VADER**

In [24]:
sia = SentimentIntensityAnalyzer()

In [31]:
# Creating a new column with the classifications
vader_sentiment = list()

for sample in texts:
    # Dictionary with the polarites: neg, neu, pos, compound
    polarities = sia.polarity_scores(sample)
    # Deleting 'coumpound'
    del polarities['compound']
    vader_sentiment.append(max(polarities, key=polarities.get))

tweets['vader_sentiment'] = vader_sentiment

In [35]:
tweets.head()

Unnamed: 0,id,local,sentiment,text,vader_sentiment
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,neu
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,neu
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,neu
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,neu
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,neu


In [34]:
tweets.groupby(['vader_sentiment']).size()

vader_sentiment
neg     3660
neu    65581
pos     4755
dtype: int64

In [36]:
tweets.groupby(['sentiment']).size()

sentiment
Negative    22358
Neutral     30983
Positive    20655
dtype: int64

In [37]:
tweets.loc[tweets['vader_sentiment'] == 'neg', 'vader_sentiment'] = 'Negative'
tweets.loc[tweets['vader_sentiment'] == 'pos', 'vader_sentiment'] = 'Positive'
tweets.loc[tweets['vader_sentiment'] == 'neu', 'vader_sentiment'] = 'Neutral'

In [39]:
tweets.groupby(['vader_sentiment']).size()

vader_sentiment
Negative     3660
Neutral     65581
Positive     4755
dtype: int64

In [40]:
y_pred = tweets['vader_sentiment'].values
y_true = tweets['sentiment'].values

In [None]:
# Creating a confusion matrix
cm = confusion_matrix(y_true, y_pred)
print(cm)

In [None]:
# The accuracy of the vader model is better than the RNN
accuracy = accuracy_score(y_true, y_pred)
print(accuracy)