<a href="https://colab.research.google.com/github/Vitor-Sallenave/Formacao-em-NLP/blob/main/Sentiment-Analysis/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from google.colab import files

In [None]:
# Uploading files through the Colab upload helper (google.colab.files).
# NOTE(review): presumably this is how 'Tweets.csv' reaches the working
# directory before it is read — confirm.
files.upload()

{}

In [None]:
# Loading the tweets from 'Tweets.csv' into a DataFrame
tweets = pd.read_csv('Tweets.csv')
# Preview the first rows as the cell output
tweets.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [None]:
# Analyzing the quantities: number of tweets per sentiment class
tweets.groupby(['airline_sentiment']).size()

airline_sentiment
negative    9178
neutral     3099
positive    2363
dtype: int64

In [None]:
# Keep only the tweets whose sentiment label confidence exceeds 0.8
tweets = tweets.loc[tweets['airline_sentiment_confidence'].gt(0.8)]
tweets.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)
5,570300767074181121,negative,1.0,Can't Tell,0.6842,Virgin America,,jnardino,,0,@VirginAmerica seriously would pay $30 a fligh...,,2015-02-24 11:14:33 -0800,,Pacific Time (US & Canada)
9,570295459631263746,positive,1.0,,,Virgin America,,YupitsTate,,0,"@VirginAmerica it was amazing, and arrived an ...",,2015-02-24 10:53:27 -0800,Los Angeles,Eastern Time (US & Canada)


In [None]:
# Creating the tokenizer from the raw tweet texts.
# NOTE(review): num_words=100 restricts the encoded vocabulary to the most
# frequent words only (the printed X contains no index above 99) — this looks
# very small for tweets; confirm it is intentional.
texts = tweets['text'].values
tokenizer = Tokenizer(num_words=100)

# Here, the tokenizer's vocabulary (word index) is built from the texts
tokenizer.fit_on_texts(texts)

In [None]:
# Turn every text into a list of word indices, then give all of them the
# same length (100) by appending zeros at the end
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=100, padding='post')

In [None]:
# Inspect the padded matrix of word indices
print(X)

[[97 62  0 ...  0  0  0]
 [97 99  1 ...  0  0  0]
 [97  9 99 ...  0  0  0]
 ...
 [13 98 93 ...  0  0  0]
 [13 89  1 ...  0  0  0]
 [13  6 23 ...  0  0  0]]


In [None]:
# Map each sentiment class name to an integer code
labels = tweets['airline_sentiment']
lb = LabelEncoder()
lb.fit(labels)
y = lb.transform(labels)

In [None]:
# Inspect the integer-encoded labels
print(y)

[1 0 0 ... 0 1 0]


In [None]:
# Applying one-hot encoding.
# The codes are derived from the label column instead of from `y` itself, so
# re-running this cell never one-hot encodes an already one-hot matrix.
# num_classes pins the number of columns to the classes known to the encoder.
y = to_categorical(lb.transform(tweets['airline_sentiment']),
                   num_classes=len(lb.classes_))

In [None]:
# Inspect the one-hot encoded labels
print(y)

[[0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]


In [None]:
# Dividing our data: 70% for training, 30% held out.
# random_state makes the split reproducible across runs; stratify keeps the
# (imbalanced) class proportions the same in both parts. `y` is one-hot here,
# so argmax recovers the integer class used for stratification.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y.argmax(axis=1))

In [None]:
# Inspect the feature matrix of the held-out split
X_test

array([[ 8, 86, 70, ...,  0,  0,  0],
       [13, 46,  1, ...,  0,  0,  0],
       [ 8, 29, 74, ...,  0,  0,  0],
       ...,
       [13,  2, 14, ...,  0,  0,  0],
       [ 8, 36,  4, ...,  0,  0,  0],
       [ 8, 61, 50, ...,  0,  0,  0]], dtype=int32)

In [None]:
# Creating a sequential neural model
model = Sequential()

# Defining the vocabulary size. Word indices start at 1 and index 0 is the
# padding value, so the embedding needs len(word_index) + 1 rows.
vocabulary = len(tokenizer.word_index) + 1
if tokenizer.num_words is not None:
    # The tokenizer only emits indices below num_words, so a larger
    # embedding table would never be used.
    vocabulary = min(vocabulary, tokenizer.num_words)

# Number of attributes in X (the padded sequence length)
X_attributes = X.shape[1]

# Adding layers.
# mask_zero=True marks the zero padding so the LSTM skips those timesteps
# instead of running over the long tail of padding zeros.
model.add(Embedding(input_dim=vocabulary,
          output_dim=128,
          input_length=X_attributes,
          mask_zero=True))

# Dropping whole embedding channels at random (regularization)
model.add(SpatialDropout1D(0.2))

# Recurrent layer
model.add(LSTM(units=196,
               dropout=0.2,
               recurrent_dropout=0,
               activation='tanh',
               recurrent_activation='sigmoid',
               unroll=False,
               use_bias=True))

# Output layer: one probability per sentiment class
model.add(Dense(units=3, activation='softmax'))

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss
# (the targets are one-hot encoded) and accuracy as the reported metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Show the layers, output shapes and parameter counts of the model
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 128)          1638656   
                                                                 
 spatial_dropout1d_2 (Spati  (None, 100, 128)          0         
 alDropout1D)                                                    
                                                                 
 lstm_2 (LSTM)               (None, 196)               254800    
                                                                 
 dense_1 (Dense)             (None, 3)                 591       
                                                                 
Total params: 1894047 (7.23 MB)
Trainable params: 1894047 (7.23 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Fit for 10 epochs in mini-batches of 30, reporting loss/accuracy on the
# held-out split after every epoch
model.fit(
    x=X_train,
    y=y_train,
    batch_size=30,
    epochs=10,
    verbose=True,
    validation_data=(X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x77fb843ae740>

In [None]:
# Evaluating the model on the held-out split; evaluate returns the loss
# followed by the metrics passed to compile (accuracy)
val_loss, val_accuracy = model.evaluate(X_test, y_test)
print(f'Loss = {val_loss}\nAccuracy = {val_accuracy}')

Loss = 0.8227820992469788
Accuracy= = 0.6975780725479126


In [None]:
# Making predictions: one row of class probabilities (softmax output)
# per row of X_test
predictions = model.predict(X_test)
print(predictions)

[[0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 ...
 [0.69557226 0.15886399 0.1455637 ]
 [0.6955723  0.158864   0.14556369]
 [0.69557226 0.15886399 0.1455637 ]]


In [None]:
# Applying the model
def model_SA(X_sample, tokenizer, model, maxlen=100):
    """Predict sentiment class probabilities for one or more raw texts.

    Args:
        X_sample: A single text (``str``) or a collection of texts.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        model: Trained model exposing ``predict``.
        maxlen: Length every sequence is padded/truncated to. Defaults to
            100, the length used for the training matrix.

    Returns:
        The output of ``model.predict`` for the padded batch.
    """
    # A bare string would be walked character by character by
    # texts_to_sequences, producing one bogus "text" (and one prediction
    # row) per character. Wrap it so it is treated as a single text.
    if isinstance(X_sample, str):
        X_sample = [X_sample]

    sequences = tokenizer.texts_to_sequences(X_sample)
    padded = pad_sequences(sequences, padding='post', maxlen=maxlen)

    return model.predict(padded)

In [None]:
# The tokenizer expects a collection of texts; a bare string would be treated
# as a sequence of one-character texts, so the sample is wrapped in a list.
text_sample = 'To my mind, this product is even worse than the last one.'
probabilities = model_SA([text_sample],
                         tokenizer,
                         model)
print(probabilities)

[[0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.1455637 ]
 [0.69557226 0.15886399 0.14