#### Sentiment Analaysis of IMDB dataset using Simple RNN

In [2]:
# import required libraries
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, Embedding, SpatialDropout1D
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [3]:
# import the IMDB dataset
from tensorflow.keras.datasets import imdb

# load the IMDB dataset
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)

# print the shape of the training and test data
print(f"Training data shape: {X_train.shape}, Labels shape: {y_train.shape}")
print(f"Test data shape: {X_test.shape}, Labels shape: {y_test.shape}")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
Training data shape: (25000,), Labels shape: (25000,)
Test data shape: (25000,), Labels shape: (25000,)


In [6]:
# map index to words
word_index = imdb.get_word_index()
reverse_word_index = {v: k for k, v in word_index.items()}  # adjust indices

In [7]:
# get sample review and its label
sample_review = X_train[0]
sample_label = y_train[0]

# print the sample review and its label
print(f"Sample Review (encoded): {sample_review}")
print(f"Sample Label: {sample_label}")

# decode the sample review
decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in sample_review])
print(f"Decoded Sample Review: {decoded_review}")
print(f"Sample Label: {sample_label}")

Sample Review (encoded): [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
Sample Label: 1
Decoded Sample Revi

In [8]:
# pad sequences to ensure uniform input size
X_train_padded = pad_sequences(X_train, maxlen=500)
X_test_padded = pad_sequences(X_test, maxlen=500)

# print the shape of padded sequences
print(f"Padded Training data shape: {X_train_padded.shape}")
print(f"Padded Test data shape: {X_test_padded.shape}")

Padded Training data shape: (25000, 500)
Padded Test data shape: (25000, 500)


In [14]:
# Trin Simple RNN model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=500))
model.add(SimpleRNN(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [17]:
# summary of the model
model.summary()

In [18]:
# Early stopping callback
from keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


In [19]:
# compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [20]:
# train the model
model.fit(X_train_padded, y_train,
          validation_data=(X_test_padded, y_test),
            epochs=10, batch_size=64, callbacks=[early_stopping])

Epoch 1/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 262ms/step - accuracy: 0.5716 - loss: 2840.5696 - val_accuracy: 0.6403 - val_loss: 0.6226
Epoch 2/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 341ms/step - accuracy: 0.7097 - loss: 0.5840 - val_accuracy: 0.6758 - val_loss: 0.5908
Epoch 3/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 347ms/step - accuracy: 0.7775 - loss: 0.5008 - val_accuracy: 0.7300 - val_loss: 0.5393
Epoch 4/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 341ms/step - accuracy: 0.8295 - loss: 0.4147 - val_accuracy: 0.7573 - val_loss: 0.5176
Epoch 5/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 347ms/step - accuracy: 0.8558 - loss: 0.3743 - val_accuracy: 0.7734 - val_loss: 0.5043
Epoch 6/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 279ms/step - accuracy: 0.8829 - loss: 0.3116 - val_accuracy: 0.7727 - val_loss: 0.5157
E

<keras.src.callbacks.history.History at 0x215e0fb75c0>

In [21]:
# save the model
model.save('sentiment_analysis_model.h5')



In [22]:
# evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 62ms/step - accuracy: 0.7759 - loss: 0.5061
Test Loss: 0.5043032169342041, Test Accuracy: 0.7734400033950806
