In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Bidirectional
from tensorflow.keras.layers import Embedding, Dropout
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import precision_score, recall_score, f1_score
import os

In [None]:
# Fix random seed for reproducibility
np.random.seed(7)
tf.random.set_seed(7)

# Load the IMDB dataset
top_words = 5000  # Vocabulary size
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

In [6]:
# Pad sequences to ensure uniform length
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

In [7]:
# Load GloVe embeddings
def load_glove_embeddings(glove_file_path, word_index, embedding_dim=100):
    embeddings_index = {}
    with open(glove_file_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    
    embedding_matrix = np.zeros((top_words, embedding_dim))
    for word, i in word_index.items():
        if i < top_words:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix


In [9]:
# Note: Download GloVe embeddings (e.g., glove.6B.100d.txt) and specify the path
# For this example, assume glove.6B.100d.txt is in the working directory
glove_path = 'glove.6B.100d.txt'
embedding_dim = 100
word_index = imdb.get_word_index()
embedding_matrix = load_glove_embeddings(glove_path, word_index, embedding_dim)

In [10]:
# Create the model
model = Sequential()
model.add(Embedding(top_words, embedding_dim, input_length=max_review_length,
                    weights=[embedding_matrix], trainable=False))
model.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

In [13]:
# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer,
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

# Model summary
print(model.summary())


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 100)          500000    
                                                                 
 conv1d (Conv1D)             (None, 496, 64)           32064     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 248, 64)          0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 248, 64)           0         
                                                                 
 bidirectional (Bidirectiona  (None, 248, 256)         197632    
 l)                                                              
                                                                 
 dropout_1 (Dropout)         (None, 248, 256)          0

 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 894,177
Trainable params: 394,177
Non-trainable params: 500,000
_________________________________________________________________
None


In [15]:
# Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                    epochs=20, batch_size=32, callbacks=[early_stopping, lr_scheduler])


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20


In [16]:
# Evaluate the model
scores = model.evaluate(X_test, y_test, verbose=0)
print(f"Accuracy: {scores[1]*100:.2f}%")
print(f"Precision: {scores[2]*100:.2f}%")
print(f"Recall: {scores[3]*100:.2f}%")

# Calculate F1-score
y_pred = (model.predict(X_test) > 0.5).astype("int32")
f1 = f1_score(y_test, y_pred)
print(f"F1-Score: {f1*100:.2f}%")

# Save the model
model.save('sentiment_analysis_model_improved.h5')
print("Saved model to disk")

# Optional: Load and verify the model
loaded_model = tf.keras.models.load_model('sentiment_analysis_model_improved.h5')
print("Model Loaded")

Accuracy: 80.66%
Precision: 78.35%
Recall: 84.74%
F1-Score: 81.42%
Saved model to disk
Model Loaded
