In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# IMDB Sentiment Analysis using LSTMs
<hr>
 
### Steps
<ol type="1">
    <li>Load the dataset (50K IMDB Movie Review)</li>
    <li>Clean Dataset</li>
    <li>Encode Sentiments</li>
    <li>Split Dataset</li>
    <li>Tokenize and Pad/Truncate Reviews</li>
    <li>Build Architecture/Model</li>
    <li>Train and Test</li>
</ol>

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model 

In [None]:
data = pd.read_csv('/content/drive/MyDrive/intern_project/IMDB Dataset.csv')
data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [None]:
english_stops = set(stopwords.words('english'))

def load_dataset():
    df = pd.read_csv('/content/drive/MyDrive/intern_project/IMDB Dataset.csv')
    x_data = df['review']
    y_data = df['sentiment']

    x_data = x_data.replace({'<.*?>': ''}, regex = True)  
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)  
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops]) 
    x_data = x_data.apply(lambda review: [w.lower() for w in review]) 
    
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()


In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
2067     [many, times, description, full, sound, fury, ...
7295     [harvey, keital, best, performance, far, new, ...
34430    [the, legendary, boris, karloff, ended, illust...
796      [this, outing, knotts, includes, one, best, si...
11091    [this, one, boring, horror, films, i, ever, se...
                               ...                        
24822    [i, taken, another, look, film, still, conside...
18325    [the, war, at, home, good, become, new, favour...
1416     [considering, big, name, cast, lavish, product...
4646     [and, rather, unexpected, plot, line, era, pla...
31886    [the, figure, empress, elizabeth, austria, ind...
Name: review, Length: 40000, dtype: object 

8048     [the, trailer, movie, movie, justice, and, mov...
34150    [high, school, female, track, star, dies, bloo...
29348    [four, friends, first, billed, hbo, sleeper, h...
29648    [really, sexist, classist, thought, might, beg...
44776    [kennan, ivory, wayans, funny, low, down, dirt...
 

In [None]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))
    return int(np.ceil(np.mean(review_length)))


token = Tokenizer(lower=False)
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[   37   119  2810 ...     0     0     0]
 [ 4429 57213    45 ...     0     0     0]
 [    2  2472  4514 ...   661 18811 17149]
 ...
 [  988    98   297 ...     0     0     0]
 [   32   156  2045 ...     0     0     0]
 [    2   747 19378 ...    19  1398  3752]] 

Encoded X Test
 [[    2  1425     3 ...     0     0     0]
 [  211   275   556 ...   169  1427   366]
 [  588   257    23 ...    10    84     1]
 ...
 [  122     4  2078 ...     0     0     0]
 [ 2293  4771  6323 ...     0     0     0]
 [ 9114 37978  2247 ...   657   133     2]] 

Maximum review length:  130


In [None]:
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 130, 32)           2958272   
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 2,983,169
Trainable params: 2,983,169
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
checkpoint = ModelCheckpoint(
    '/content/drive/MyDrive/intern_project/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [None]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5

Epoch 00001: accuracy improved from -inf to 0.73083, saving model to /content/drive/MyDrive/intern_project/LSTM.h5
Epoch 2/5

Epoch 00002: accuracy improved from 0.73083 to 0.91682, saving model to /content/drive/MyDrive/intern_project/LSTM.h5
Epoch 3/5

Epoch 00003: accuracy improved from 0.91682 to 0.95812, saving model to /content/drive/MyDrive/intern_project/LSTM.h5
Epoch 4/5

Epoch 00004: accuracy improved from 0.95812 to 0.97635, saving model to /content/drive/MyDrive/intern_project/LSTM.h5
Epoch 5/5

Epoch 00005: accuracy improved from 0.97635 to 0.98570, saving model to /content/drive/MyDrive/intern_project/LSTM.h5


<tensorflow.python.keras.callbacks.History at 0x7f90f2653fd0>

In [None]:
import warnings
warnings.filterwarnings("ignore")

y_pred = model.predict_classes(x_test, batch_size = 128)

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

Correct Prediction: 8715
Wrong Prediction: 1285
Accuracy: 87.15


In [None]:
loaded_model = load_model('/content/drive/MyDrive/intern_project/LSTM.h5')

In [None]:
review = str(input('Movie Review: '))
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print(tokenize_words)

Movie Review: The movie is very nice and I recommend you all to watch the movie , the movies is moral based and the acting of hero and heroine are very nice .it is a good film
Cleaned:  The movie is very nice and I recommend you all to watch the movie  the movies is moral based and the acting of hero and heroine are very nice it is a good film
Filtered:  ['the movie nice i recommend watch movie  movies moral based acting hero heroine nice good film']
[[   2    3  239    1  278   33    3   28 1371  341   44  467 1785  239
     9    4    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0

In [None]:
result = loaded_model.predict(tokenize_words)
print(result)

if result >= 0.7:
    print('positive')
else:
    print('negative')

[[0.99315923]]
positive
