In [37]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GRU
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.datasets import imdb
from gensim.models import KeyedVectors

# Step 1: Load IMDB Dataset
max_features = 10000  # Limit vocabulary size to 10,000 words
max_length = 200  # Limit review length

print("Downloading IMDB dataset...")
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(f"Train size: {len(x_train)}, Test size: {len(x_test)}")

Downloading IMDB dataset...
Train size: 25000, Test size: 25000


In [38]:
# Decode IMDB reviews for Word2Vec processing and swap the key and value so that the integer value becomes the key
word_index = imdb.get_word_index()
reverse_word_index = {value: key for key, value in word_index.items()}

#the first 3 characters have special meaning where 0 is for padding, 1 for start of sequence and 2 for unknown word
#so we skip first 3 characters and unknown word is given question mark character
def decode_review(encoded_review):
    return " ".join([reverse_word_index.get(i - 3, "?") for i in encoded_review])

import re
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def clean_review(review):
    review = re.sub(r'[^\w\s]', '', review)  # Remove punctuation like .,!
    review = review.lower()  # Convert to lowercase
    words = review.split() #list of words
    return " ".join([word for word in words if word not in stop_words]) #join them back

decoded_train = [decode_review(review) for review in x_train]
decoded_test = [decode_review(review) for review in x_test]

decoded_train = [clean_review(review) for review in decoded_train]
decoded_test = [clean_review(review) for review in decoded_test]


In [39]:
# Step 2: Load Pre-Trained Word2Vec
print("Loading pre-trained Word2Vec embeddings...")
word2vec_path = "C:/AI/LazyNlpDL/machine_learning_examples/Large_files/archive/GoogleNews-vectors-negative300.bin"  # Path to Google Word2Vec binary file
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

Loading pre-trained Word2Vec embeddings...


In [40]:
# Step 3: Create Word2Vec Embedding Matrix (300 Dimension)
embedding_dim = 300  

embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in word_index.items():
    if i < max_features and word in word2vec:
        embedding_matrix[i] = word2vec[word]

In [41]:
# Step 4: Convert Reviews to Padded Sequences
x_train_padded = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test_padded = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

In [42]:
# Step 5: Build Models
def build_model_rnn():
    model = Sequential([
        Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
        tf.keras.layers.SimpleRNN(64, return_sequences=False),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def build_model_lstm():
    model = Sequential([
        Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
        LSTM(64, return_sequences=False),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def build_model_gru():
    model = Sequential([
        Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
        GRU(64, return_sequences=False),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [43]:
# Step 6: Evaluate Models
def evaluate_model(model, x_train, y_train, x_test, y_test, epochs=5):
    print(f"Training {model.name}...")
    model.fit(x_train, y_train, batch_size=32, epochs=epochs, validation_split=0.2, verbose=1)
    _, accuracy = model.evaluate(x_test, y_test, verbose=0)
    return accuracy

In [44]:
# Train and evaluate RNN, LSTM, and GRU
rnn_model = build_model_rnn()
rnn_accuracy = evaluate_model(rnn_model, x_train_padded, y_train, x_test_padded, y_test)

lstm_model = build_model_lstm()
lstm_accuracy = evaluate_model(lstm_model, x_train_padded, y_train, x_test_padded, y_test)

gru_model = build_model_gru()
gru_accuracy = evaluate_model(gru_model, x_train_padded, y_train, x_test_padded, y_test)


Training sequential_6...
Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 15ms/step - accuracy: 0.4949 - loss: 0.7254 - val_accuracy: 0.4994 - val_loss: 0.6965
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.5032 - loss: 0.7136 - val_accuracy: 0.4958 - val_loss: 0.6975
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.5003 - loss: 0.6997 - val_accuracy: 0.4964 - val_loss: 0.6943
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.5043 - loss: 0.6945 - val_accuracy: 0.5038 - val_loss: 0.6927
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.5053 - loss: 0.6951 - val_accuracy: 0.4998 - val_loss: 0.6936
Training sequential_7...
Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 32ms/step - accuracy: 0.5017 - loss: 0.6946 - val_accuracy:

In [45]:
# Step 7: Report Results
print("\nModel Performance:")
print(f"RNN Accuracy: {rnn_accuracy * 100:.2f}%")
print(f"LSTM Accuracy: {lstm_accuracy * 100:.2f}%")
print(f"GRU Accuracy: {gru_accuracy * 100:.2f}%")


Model Performance:
RNN Accuracy: 48.69%
LSTM Accuracy: 51.40%
GRU Accuracy: 75.80%
