**Movie Sentiment Analysis using RNN & LSTM**

In [13]:
# Import libraries
from keras.datasets import imdb
from keras.utils import pad_sequences
from keras import Sequential
from keras.layers import Dense,SimpleRNN,Embedding,Flatten
from tensorflow.keras.regularizers import l2

In [14]:
# IMDB dataset having 50K movie reviews for natural language processing or Text analytics.
# Here we have set of 25,000 highly polar movie reviews for training and 25,000 for testing. 
# So, predict the number of positive and negative reviews using either classification or deep learning algorithms.
# only consider the top 10,000 most common words,

In [15]:
# Load the dataset
(X_train,y_train),(X_test,y_test) = imdb.load_data()

In [16]:
# To get shape
print('No of samples in training set',X_train.shape)
print('No of samples in test set',X_test.shape)


No of samples in training set (25000,)
No of samples in test set (25000,)


In [17]:
X_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 22665,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 21631,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 31050,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5

In [18]:
# To get maximum length of review
print(len(max((X_train+ X_test), key=len)))

2697


In [19]:
# Reshape the y value
import numpy as np
y_train = np.asarray(y_train).reshape((-1,1))
y_test = np.asarray(y_test).reshape((-1,1))

In [20]:
# Apply the padding and decide max length
X_train = pad_sequences(X_train,padding='post',maxlen=100)
X_test = pad_sequences(X_test,padding='post',maxlen=100)

In [21]:
X_train.shape

(25000, 100)

In [22]:
# using LSTM with Embedding Technique
from tensorflow.keras.layers import LSTM
model=Sequential()
model.add(Embedding(10000, 32,input_length=100))
model.add(LSTM(50))
model.add(Dense(1, activation='sigmoid'))


In [23]:
# For LSTM
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train,epochs=5,batch_size = 64,validation_data=(X_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f656e7ddc90>

In [24]:
# To get test accuracy
scores = model.evaluate(X_test,y_test)
print('test accuracy',scores[1])

test accuracy 0.8379200100898743


In [25]:
# Prediction
y_pred = model.predict(X_test)
y_pred



array([[0.02727166],
       [0.9979475 ],
       [0.9307996 ],
       ...,
       [0.01121608],
       [0.04864967],
       [0.84314466]], dtype=float32)

In [26]:
# Covert probability to numbers
t1 = []
for i in range(len(y_pred)):
    if y_pred[i] >= 0.5:
        t1.append(1)
    else:
        t1.append(0)

In [27]:
print(t1)

[0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 

In [28]:
# Actual Values
y_test

array([[0],
       [1],
       [1],
       ...,
       [0],
       [0],
       [0]])