In [36]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot     
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences       
from tensorflow.keras.layers import Embedding, LSTM, Dense, SimpleRNN
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential

from tensorflow.keras.datasets import imdb  


In [50]:

max_features = 10000 # Number of unique words to consider
maxlen = 500  # Maximum length of each input sequence
embedding_dim = 100  # Dimension of the embedding vector
batch_size = 32  # Number of samples per gradient update
# Load the IMDB dataset
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)       
print("Training data shape:", x_train.shape, y_train.shape)
print("Test data shape:", x_test.shape, y_test.shape)   


Training data shape: (25000,) (25000,)
Test data shape: (25000,) (25000,)


In [51]:
##Inspecting the data
print("First training sample:", x_train[0]) 
print("First training label:", y_train[0])

First training sample: [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
First training label: 1


In [52]:
#Mappping the words index to words
word_index = imdb.get_word_index()  
print("Word index length:", len(word_index))  # Length of the word index dictionary
# Adjusting the word index to account for reserved indices in IMDB dataset
# Reverse the word index to get words from indices
index_word = {index: word for word, index in word_index.items()}
# Print the first 10 words in the first training sample
print("First 10 words in the first training sample:")   
print([index_word.get(i - 3, '?') for i in x_train[0][:10]])  # Adjusting index by 3 due to reserved indices in IMDB dataset    
print(x_train[0][:10])  # Print the first training sample as indices 

Word index length: 88584
First 10 words in the first training sample:
['?', 'this', 'film', 'was', 'just', 'brilliant', 'casting', 'location', 'scenery', 'story']
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]


In [53]:
# Mapping the word indices to words
word_index = imdb.get_word_index()
print("Word index length:", len(word_index))

# Reverse the word index to get words from indices
index_word = {index + 3: word for word, index in word_index.items()}
print(index_word.get(14))  # Should print "<START>"
index_word[0] = "<PAD>"
index_word[1] = "<START>"
index_word[2] = "<UNK>"
index_word[3] = "<UNUSED>"

# Print the first 10 words in the first training sample
print("First 10 words in the first training sample:")
print([index_word.get(i, '?') for i in x_train[0][:10]])
print(x_train[0][:10])

Word index length: 88584
this
First 10 words in the first training sample:
['<START>', 'this', 'film', 'was', 'just', 'brilliant', 'casting', 'location', 'scenery', 'story']
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]


In [54]:
# Pad sequences to ensure uniform input size
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)   
# Define the model
model = Sequential()    

In [55]:
from tensorflow.keras.preprocessing import sequence

max_len =100

x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len) 
print("Training data shape after padding:", x_train.shape)
print("Test data shape after padding:", x_test.shape)

print("First training sample after padding:", x_train[0])
print("First training label:", y_train[0])  

Training data shape after padding: (25000, 100)
Test data shape after padding: (25000, 100)
First training sample after padding: [1415   33    6   22   12  215   28   77   52    5   14  407   16   82
    2    8    4  107  117 5952   15  256    4    2    7 3766    5  723
   36   71   43  530  476   26  400  317   46    7    4    2 1029   13
  104   88    4  381   15  297   98   32 2071   56   26  141    6  194
 7486   18    4  226   22   21  134  476   26  480    5  144   30 5535
   18   51   36   28  224   92   25  104    4  226   65   16   38 1334
   88   12   16  283    5   16 4472  113  103   32   15   16 5345   19
  178   32]
First training label: 1


In [56]:
model = Sequential()
model.add(Embedding(max_features,128,input_length=maxlen))  # Adjust input_length to match your sequence length 
model.add(SimpleRNN(128, activation='relu', return_sequences=True))
model.add(SimpleRNN(64))
model.add(Dense(1, activation='sigmoid'))
model.build(input_shape=(None, 100))  # 100 is your sequence length
model.summary()



In [57]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary() 

In [58]:
#create an instance of early stopping
from tensorflow.keras.callbacks import EarlyStopping    
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)   


In [59]:
history = model.fit(x_train, y_train, batch_size=batch_size, epochs=50, validation_data=(x_test, y_test), callbacks=[early_stopping])  # Adjust epochs as needed   

Epoch 1/50
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 35ms/step - accuracy: 0.6020 - loss: 0.6418 - val_accuracy: 0.8053 - val_loss: 0.4413
Epoch 2/50
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 47ms/step - accuracy: 0.7653 - loss: 0.4579 - val_accuracy: 0.5000 - val_loss: 0.6932
Epoch 3/50
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 46ms/step - accuracy: 0.4995 - loss: 0.6951 - val_accuracy: 0.5000 - val_loss: 0.6946
Epoch 4/50
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 47ms/step - accuracy: 0.4937 - loss: 0.6949 - val_accuracy: 0.5000 - val_loss: 0.6950


In [62]:
model.save('simple_rnn_model.h5')  # Save the model

