> ## Sentement Analysis - Integer Encoding

In [10]:
# Importing necessary libraries
import numpy as np
from keras.datasets import imdb
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import Sequential
from keras.layers import Input, Dense, SimpleRNN

In [3]:
# Loading the dataset
(X_train,y_train),(X_test,y_test) = imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [4]:
# The documents are already tokenized but not padded
X_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 22665,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 21631,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 31050,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5

In [6]:
# Padding the documents
X_train = pad_sequences(X_train, padding = 'post', maxlen = 50)
X_test = pad_sequences(X_test, padding = 'post', maxlen = 50) # maxlen will only select n(=50) words from each document

X_train[0]

array([2071,   56,   26,  141,    6,  194, 7486,   18,    4,  226,   22,
         21,  134,  476,   26,  480,    5,  144,   30, 5535,   18,   51,
         36,   28,  224,   92,   25,  104,    4,  226,   65,   16,   38,
       1334,   88,   12,   16,  283,    5,   16, 4472,  113,  103,   32,
         15,   16, 5345,   19,  178,   32], dtype=int32)

In [7]:
# Model Building
model = Sequential()

model.add(Input(shape = (50, 1))) # The input shape is in the form of (no.timestamps, input_size) -> no. timestamps: max no.words in the documents | input_size: no. words passed into the neural network at a single time
model.add(SimpleRNN(32, return_sequences = False))
model.add(Dense(1,activation = 'sigmoid'))

model.summary()

In [8]:
# Model Training
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
history = model.fit(X_train,y_train, epochs = 5, validation_data = (X_test, y_test))

Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 15ms/step - accuracy: 0.5034 - loss: 0.7011 - val_accuracy: 0.5054 - val_loss: 0.6948
Epoch 2/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 16ms/step - accuracy: 0.5015 - loss: 0.6929 - val_accuracy: 0.5006 - val_loss: 0.6937
Epoch 3/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 13ms/step - accuracy: 0.5075 - loss: 0.6922 - val_accuracy: 0.5049 - val_loss: 0.6951
Epoch 4/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.5144 - loss: 0.6923 - val_accuracy: 0.5027 - val_loss: 0.6964
Epoch 5/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 15ms/step - accuracy: 0.5092 - loss: 0.6933 - val_accuracy: 0.5011 - val_loss: 0.6941


In [9]:
# Model Testing
test_review = "The storyline of the movie is pretty good but the performance of the actress is not that effective"

# Encoding
test_review_encoded, i = [], 0
dictonary = imdb.get_word_index(path="imdb_word_index.json")
for word in test_review.split():
    if i == 50: break
    if word in dictonary.keys():
        test_review_encoded.append(dictonary[word])
    else:
        test_review_encoded.append(0) # oov_word
    i += 1

test_review_encoded

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


[0, 766, 4, 1, 17, 6, 181, 49, 18, 1, 236, 4, 1, 521, 6, 21, 12, 1131]

In [11]:
# Padding
test_review_encoded = pad_sequences(np.array(test_review_encoded).reshape(1, -1), maxlen = 50, padding = "post") # Make sure to pass 2D list/array into pad_sequences
test_review_encoded

array([[   0,  766,    4,    1,   17,    6,  181,   49,   18,    1,  236,
           4,    1,  521,    6,   21,   12, 1131,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0]], dtype=int32)

In [13]:
# Classification
probab_positive_class = model.predict(test_review_encoded)
if (probab_positive_class > 0.5):
    print("Positive Sentement: ", probab_positive_class)
else:
    print("Negative Sentement: ", 1 - probab_positive_class)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Positive Sentement:  [[0.6800391]]
