# IMDB Sentiment Analysis using LSTMs
<hr>

### Steps
<ol type="1">
    <li>Load the dataset (50K IMDB Movie Review)</li>
    <li>Clean Dataset</li>
    <li>Encode Sentiments</li>
    <li>Split Dataset</li>
    <li>Tokenize and Pad/Truncate Reviews</li>
    <li>Build Architecture/Model</li>
    <li>Train and Test</li>
</ol>

In [13]:
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model 

In [14]:
data = pd.read_csv('IMDB Dataset.csv')
data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [15]:
english_stops = set(stopwords.words('english'))

def load_dataset():
    df = pd.read_csv('IMDB Dataset.csv')
    x_data = df['review']
    y_data = df['sentiment']

    x_data = x_data.replace({'<.*?>': ''}, regex = True)  
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)  
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops]) 
    x_data = x_data.apply(lambda review: [w.lower() for w in review]) 
    
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()


In [16]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
39690    [movie, moving, amazing, piece, work, saw, the...
13195    [gone, wind, one, popular, books, ever, printe...
15115    [this, one, best, movies, i, seen, years, i, t...
14675    [how, movie, features, singing, curtis, mayfie...
12183    [i, admit, first, saw, madonna, performing, ho...
                               ...                        
37749    [i, witnessed, atrocities, cinema, in, past, c...
8427     [i, sitting, home, flipping, channels, i, ran,...
14291    [a, bunch, mostly, obnoxious, grossly, unappea...
12592    [i, walk, movie, screenings, movie, managed, b...
26544    [sam, firstenberg, ninja, the, domination, mix...
Name: review, Length: 40000, dtype: object 

18868    [i, fail, understand, anyone, would, allow, su...
2967     [this, like, school, video, project, propagand...
29962    [amazing, grace, languid, feel, tells, contemp...
4055     [i, finally, got, hold, lifeforce, dvd, widesc...
18249    [i, say, i, surprised, atrocity, i, watched, c...
 

In [17]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))


token = Tokenizer(lower=False)
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[    3   616   401 ...     0     0     0]
 [  661  1709     5 ...  3759   763  6750]
 [    9     5    46 ...     0     0     0]
 ...
 [   39   672   558 ... 18547 13386   481]
 [    1  1111     3 ...  2825  1519   129]
 [ 1142 92200  2793 ...     0     0     0]] 

Encoded X Test
 [[    1  1872   292 ...     0     0     0]
 [    9     6   270 ...     0     0     0]
 [  401  1533 19747 ...     0     0     0]
 ...
 [    1   216    52 ...  1257 11711 25760]
 [ 5352 31001 10315 ...     0     0     0]
 [    9    24   141 ...     0     0     0]] 

Maximum review length:  130


In [18]:
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 130, 32)           2950432   
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 2,975,329
Trainable params: 2,975,329
Non-trainable params: 0
_________________________________________________________________
None


In [19]:
checkpoint = ModelCheckpoint(
    'LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [20]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5

Epoch 00001: accuracy improved from -inf to 0.57855, saving model to LSTM.h5
Epoch 2/5

Epoch 00002: accuracy improved from 0.57855 to 0.61313, saving model to LSTM.h5
Epoch 3/5

Epoch 00003: accuracy improved from 0.61313 to 0.69830, saving model to LSTM.h5
Epoch 4/5

Epoch 00004: accuracy improved from 0.69830 to 0.79625, saving model to LSTM.h5
Epoch 5/5

Epoch 00005: accuracy improved from 0.79625 to 0.82390, saving model to LSTM.h5


<tensorflow.python.keras.callbacks.History at 0x17f01b5ab20>

In [21]:
import warnings
warnings.filterwarnings("ignore")

y_pred = model.predict_classes(x_test, batch_size = 128)

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

Correct Prediction: 8304
Wrong Prediction: 1696
Accuracy: 83.04


In [22]:
loaded_model = load_model('models/LSTM.h5')

In [None]:
review = str(input('Movie Review: '))
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, 
                               padding='post', truncating='post')
print(tokenize_words)

In [None]:
result = loaded_model.predict(tokenize_words)
print(result)

if result >= 0.7:
    print('positive')
else:
    print('negative')