# Sentiment Analysis using LSTM

## Step 0: Setting up the notebook


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM
from keras.callbacks import EarlyStopping


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Step 1: Loading data

In [4]:
features_train_dir = "./drive/My Drive/09112020-nlp/features_train.csv"
features_val_dir = "./drive/My Drive/09112020-nlp/features_val.csv"
features_test_dir = "./drive/My Drive/09112020-nlp/features_test.csv"
labels_train_dir = "./drive/My Drive/09112020-nlp/labels_train.csv"
labels_val_dir = "./drive/My Drive/09112020-nlp/labels_val.csv"
labels_test_dir = "./drive/My Drive/09112020-nlp/labels_test.csv"

In [5]:
features_train = pd.read_csv(features_train_dir, header=None, names=['review'], encoding='utf-8')
features_val = pd.read_csv(features_val_dir, header=None, names=['review'], encoding='utf-8')
features_test = pd.read_csv(features_test_dir, header=None, names=['review'], encoding='utf-8')
labels_train = pd.read_csv(labels_train_dir, header=None)
labels_val = pd.read_csv(labels_val_dir, header=None)
labels_test = pd.read_csv(labels_test_dir, header=None)

In [6]:
assert len(features_train) == len(labels_train)
assert len(features_val) == len(labels_val)
assert len(features_test) == len(labels_test)

print('train = {}; validation = {}; test = {}'.format(len(features_train), len(features_val), len(features_test)))

train = 3200000; validation = 400000; test = 400000


## Step 2: Preprocessing data

In [7]:
# Covert features to string type
features_train['review'] = features_train['review'].astype(str)
features_val['review'] = features_val['review'].astype(str)
features_test['review'] = features_test['review'].astype(str)

In [8]:
# Create a tokenizer object
tokenizer = Tokenizer(num_words=5000)

# Fit the tokenizer on train text
tokenizer.fit_on_texts(features_train['review'].values.tolist())

In [9]:
# Transform words to sequence
sequence_train = tokenizer.texts_to_sequences(features_train['review'])
sequence_val = tokenizer.texts_to_sequences(features_val['review'])
sequence_test = tokenizer.texts_to_sequences(features_test['review'])

# Pad the sequence
max_words = 250
X_train = sequence.pad_sequences(sequence_train, max_words)
X_val = sequence.pad_sequences(sequence_val, max_words)
X_test = sequence.pad_sequences(sequence_test,  max_words)

In [10]:
# Covert the labels to 0 (neggative) and 1 (positive)
y_train, y_val, y_test = labels_train - 1, labels_val - 1, labels_test - 1

## Step 3: Building LSTM model

In [11]:
# Set embedding params
max_features = 10000 # number of features
maxlen = 250 # length of input sequence
embedding_size = 32 

# Specify model architecture
model = Sequential()
model.add(Embedding(max_features, embedding_size, input_length=maxlen))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 250, 32)           320000    
_________________________________________________________________
lstm (LSTM)                  (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 328,353
Trainable params: 328,353
Non-trainable params: 0
_________________________________________________________________


## Step 4: Training the model

In [22]:
# Train with early stopping strategy
model.fit(X_train, y_train, 
          batch_size=64, epochs=100, 
          validation_data=(X_val, y_val), 
          callbacks=[EarlyStopping(patience=1)])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100


<tensorflow.python.keras.callbacks.History at 0x7f5c7c679a90>

In [12]:
# Stack the train and val dataset horizontally
X_train = np.vstack((X_train, X_val))
y_train = np.vstack((y_train, y_val))

In [13]:
# Rebuild the model
model = Sequential()
model.add(Embedding(max_features, embedding_size, input_length=maxlen))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 250, 32)           320000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 328,353
Trainable params: 328,353
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Train the model using train and validation data
model.fit(X_train, y_train, batch_size=64, epochs=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f7017e75fd0>

## Step 5: Evaluating models

In [15]:
_, acc = model.evaluate(X_test, y_test, batch_size=64)
print('Accuracy rate on test set: {}%'.format(np.round(acc * 100, 2)))

Accuracy rate on test set: 92.01%


## Step 6: Conclusion

### Recurrent Neural Network Pros:

* Provides powerful performance
* Theoretically, processes input of variable length
* Takes into account the order of sequence

### Recurrent Neural Network Cons:

* Computation heavy
* Black-box modeling