In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

### First of all, load dataset and select the relevant columns

In [3]:
data = pd.read_csv('Sentiment Analysis Dataset.csv')
# Keeping only the neccessary columns
data = data[['SentimentText','Sentiment']]

### Keep the Sentiment column as label, and process the text in a way that only letters, !? symbols and space remain. The text is then processed by Toenizer to be converted into a sequence which is fed into neural network as input

In [4]:
data['SentimentText'] = data['SentimentText'].apply(lambda x: x.lower())
data['SentimentText'] = data['SentimentText'].apply((lambda x: re.sub('[^a-zA-z!?\s]','',x)))

print(len(data[ data['Sentiment'] == 1]))
print(data[ data['Sentiment'] == 0].size)

for idx,row in data.iterrows():
    row[0] = row[0].replace('rt',' ')
    
max_fatures = 20000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['SentimentText'].values)
X = tokenizer.texts_to_sequences(data['SentimentText'].values)
X = pad_sequences(X)

790185
1576880


### Next, build a neural network based on LSTM. Note tha dropout, lstm_out, batch_size, embed_dim and optimizer are hayperparameters, they should be tuned well to get the best performance. The loss function is softmax which predicts the probability for each class and is specially for classification.

In [6]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1],))
model.add(LSTM(lstm_out, dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 43, 128)           2560000   
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 2)                 394       
Total params: 2,815,194
Trainable params: 2,815,194
Non-trainable params: 0
_________________________________________________________________
None


### The training dataset and test dataset:

In [8]:

Y = pd.get_dummies(data['Sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(1262900, 43) (1262900, 2)
(315725, 43) (315725, 2)


### Perform training and save the trained model

In [10]:
batch_size = 128
model.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 2)
model.save('sentiment_lstm')

Epoch 1/7
9867/9867 - 2533s - loss: 0.4189 - accuracy: 0.8068
Epoch 2/7
9867/9867 - 2036s - loss: 0.3765 - accuracy: 0.8304
Epoch 3/7
9867/9867 - 2122s - loss: 0.3524 - accuracy: 0.8434
Epoch 4/7
9867/9867 - 2233s - loss: 0.3308 - accuracy: 0.8546
Epoch 5/7
9867/9867 - 2004s - loss: 0.3106 - accuracy: 0.8647
Epoch 6/7
9867/9867 - 2017s - loss: 0.2918 - accuracy: 0.8742
Epoch 7/7
9867/9867 - 4444s - loss: 0.2758 - accuracy: 0.8818
INFO:tensorflow:Assets written to: sentiment_lstm/assets
INFO:tensorflow:Assets written to: sentiment_lstm/assets


In [11]:

validation_size = int(len(X_test)/2)

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

1234/1234 - 73s - loss: 0.4248 - accuracy: 0.8211
score: 0.42
acc: 0.82


## Observation:
Accuracy on test set is smaller than that on training dataset which indicates the model is overfitting in the training process. Nevertheless, in comparision to Navie Bayes (accuracy:65%) and SVM (accuracy:68%) in scikitlearn, there is doubtlessly improvement in the performace of lstm model (accuracy:82%) for this sentiment analysis task

### If we dive into the performance of classification, it can be observed that for different type of sentence, i.e. positive and negative, the accuracy is slightly diferent: accuracy for positive sentence is larger than accuracy for negative. By looking at negative words, like 'terrible', 'destroy', model can get wrong prrdiction easily even though the sentence is positive.

In [36]:
result = model.predict(X_validate,verbose = 2)

4934/4934 - 110s


In [37]:
predict = np.argmax(result, axis=1)

In [40]:
true_label = np.argmax(Y_validate,axis=1)

In [41]:
pos_true_index = true_label == 1
print('Pos:',np.sum(pos_true_index))
pos_cnt = np.sum(predict[pos_true_index]==true_label[pos_true_index])
print('pos_cnt:',pos_cnt)
print('pos_acc:',pos_cnt/np.sum(pos_true_index))

Pos: 79037
pos_cnt: 65044
pos_acc: 0.8229563369055


In [42]:
pos_true_index = true_label == 0
print('Neg:',np.sum(pos_true_index))
pos_cnt = np.sum(predict[pos_true_index]==true_label[pos_true_index])
print('Neg_cnt:',pos_cnt)
print('Neg_acc:',pos_cnt/np.sum(pos_true_index))

Neg: 78825
Neg_cnt: 64456
Neg_acc: 0.8177101173485569


In [69]:
for idx, sentence in enumerate(tokenizer.sequences_to_texts(X_validate[:10])):
    print(sentence)
    print('prediction:',predict[idx])
    print('True:',true_label[idx])

supposed to be seeing at tonight but might have to cancel if the weather stays this lame
prediction: 0
True: 0
could be data can usually be taken both ways in these kinds of things lets say its generally
prediction: 1
True: 1
hehe yeah days ago i put it to a stop but i still got used to via to make things flow
prediction: 1
True: 1
to the gym i feel like its gonna be a good workout today
prediction: 1
True: 1
the canucks looked terrible tonight save for the st period however seeing all those white towels live is still pretty amazing
prediction: 0
True: 1
ow itchy hay fever eyes for the lose
prediction: 0
True: 0
i have homework to do why was school invented haaa xx
prediction: 1
True: 0
is up feeding kaylee
prediction: 1
True: 1
hi guys cant wait to see the show i just know its going to be fantastic
prediction: 1
True: 1
there are who tweet perhaps you could do a
prediction: 1
True: 1


### As an extra step, let's see how the training performs visually in Tensorboard

In [70]:
# Load the TensorBoard notebook extension.
%reload_ext tensorboard

In [71]:
from datetime import datetime
from packaging import version

import tensorflow as tf
from tensorflow import keras

import numpy as np

print("TensorFlow version: ", tf.__version__)
assert version.parse(tf.__version__).release[0] >= 2, \
    "This notebook requires TensorFlow 2.0 or above."

TensorFlow version:  2.5.0


In [72]:
# hyperparameters
max_fatures= 20000
embed_dim = 128
lstm_out = 196
batch_size= 128

In [None]:
logdir = "logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

model = keras.models.Sequential([
    Embedding(max_fatures, embed_dim,input_length = X.shape[1],),
    LSTM(lstm_out, dropout=0.2),
    Dense(2,activation='softmax'),
])

model.compile(
    loss = 'categorical_crossentropy', 
    optimizer='adam',
    metrics = ['accuracy']
)

print("Training ... With default parameters, this takes less than 10 seconds.")
training_history = model.fit(
    X_train, # input
    Y_train, # output
    batch_size=batch_size,
    verbose=0, # Suppress chatty output; use Tensorboard instead
    epochs=5,
    validation_data=(X_validate, Y_validate),
    callbacks=[tensorboard_callback],
)

print("Average test loss: ", np.average(training_history.history['loss']))

In [73]:
%tensorboard --logdir logs

ERROR: Could not find `tensorboard`. Please ensure that your PATH
contains an executable `tensorboard` program, or explicitly specify
the path to a TensorBoard binary by setting the `TENSORBOARD_BINARY`
environment variable.

## Comments

- Must save trained model into a path, in case it is needed to load the model afterwards
- In the very few first steps, the model gets overfitted, we have to introduce a regularization term to av

### Reference
- [Sentment Analysis with LSTM by Peter Nagy ](https://github.com/nagypeterjob/Sentiment-Analysis-NLTK-ML-LSTM/blob/master/lstm.ipynb)