In [1]:

import pandas as pd


# Read the review CSV from the hard-coded Drive path and report its row count.
df = pd.read_csv("/content/drive/MyDrive/NLP_Assignment/IMDB Dataset.csv")
print("Original dataset size:", df.shape[0])

# Draw a fixed-seed random subset of 20,000 rows, then renumber the index
# from 0 so positional and label-based access agree in later cells.
df = df.sample(n=20000, random_state=42)
df = df.reset_index(drop=True)
print("Reduced dataset size:", df.shape[0])

# Show how many rows of each sentiment class ended up in the subset.
print(df["sentiment"].value_counts())


Original dataset size: 50000
Reduced dataset size: 20000
sentiment
positive    10011
negative     9989
Name: count, dtype: int64


In [2]:

import re
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

nltk.download('stopwords')
from nltk.corpus import stopwords

def clean_text(text, stop_words=None):
    """Normalise one review into a space-separated string of kept words.

    The input is coerced to ``str`` and lower-cased, every character other
    than ``a``-``z`` and whitespace is deleted (not replaced by a space), and
    the remaining whitespace-separated tokens that appear in the stop-word
    collection are dropped.

    Args:
        text: The raw review. Any object is accepted; it is passed through
            ``str()`` first.
        stop_words: Optional collection of lower-case words to remove. When
            omitted, ``stopwords.words('english')`` is used; it is converted
            to a frozenset once and cached on the function, so repeated calls
            (e.g. through ``Series.apply``) reuse the same set.

    Returns:
        The surviving tokens joined by single spaces (``""`` if none remain).
    """
    if stop_words is None:
        stop_words = getattr(clean_text, "_english_stop_words", None)
        if stop_words is None:
            # Build the set a single time instead of calling
            # stopwords.words() for every token; a set also gives
            # constant-time membership tests.
            stop_words = frozenset(stopwords.words('english'))
            clean_text._english_stop_words = stop_words

    text = str(text).lower()
    # Characters are deleted rather than replaced, so "don't" becomes "dont"
    # and markup such as "<br />" collapses to the bare token "br".
    text = re.sub(r'[^a-z\s]', '', text)
    return " ".join(w for w in text.split() if w not in stop_words)

# Sizes used by the tokenizer and the padding step below.
VOCAB_SIZE = 20000
MAXLEN = 200

# Store a cleaned copy of every review next to the raw text.
df['clean_review'] = df['review'].map(clean_text)

# Binary target: 1 where the sentiment column is exactly 'positive', else 0.
labels = df['sentiment'].eq('positive').astype('int64').to_numpy()

# Fit the word index on the cleaned reviews, then encode those same reviews
# as lists of integer ids ("<OOV>" stands in for out-of-vocabulary words).
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(df['clean_review'])
sequences = tokenizer.texts_to_sequences(df['clean_review'])

# Force every sequence to MAXLEN ids: short ones are padded at the end,
# long ones lose their tail.
X = pad_sequences(
    sequences,
    maxlen=MAXLEN,
    padding='post',
    truncating='post',
)
y = labels

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# Persist the fitted tokenizer so the same word index can be reloaded later.
with open("tokenizer.pkl", mode="wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Shape of X: (20000, 200)
Shape of y: (20000,)


In [4]:
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
import os

from tensorflow.keras.utils import set_random_seed

# ---- Hyperparameters and output locations ----
SEED = 42
EMBED_DIM = 100
LSTM_UNITS_1 = 128
LSTM_UNITS_2 = 64
# The input width must equal the padded sequence length of X, so read it
# from the array instead of repeating the literal used when padding.
MAXLEN = X.shape[1]
BATCH_SIZE = 64
EPOCHS = 8
MODEL_DIR = "model_checkpoints"
os.makedirs(MODEL_DIR, exist_ok=True)

# Seed the Python, NumPy and backend RNGs so weight initialisation, dropout
# and batch shuffling start from the same state on every run.
# NOTE(review): GPU kernels may still be non-deterministic — confirm if
# bit-exact reproducibility is required.
set_random_seed(SEED)

# ---- Train / validation split ----
# 15% of the rows are held out; stratify=y keeps the class ratio of y in
# both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, random_state=SEED, stratify=y
)

# ---- Model: embedding -> two stacked bidirectional LSTMs -> dense head ----
inp = layers.Input(shape=(MAXLEN,))
# NOTE(review): mask_zero is left at its default, so the recurrent layers
# also read the zeros appended by post-padding — consider mask_zero=True.
x = layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_DIM)(inp)
# The first LSTM returns the full sequence so the second one can consume it.
x = layers.Bidirectional(layers.LSTM(LSTM_UNITS_1, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(LSTM_UNITS_2))(x)
x = layers.Dropout(0.4)(x)
x = layers.Dense(64, activation='relu')(x)
x = layers.Dropout(0.3)(x)
# Single sigmoid unit: output is the predicted probability of label 1.
out = layers.Dense(1, activation='sigmoid')(x)

model = models.Model(inputs=inp, outputs=out)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# ---- Callbacks (all watch validation loss) ----
checkpoint_path = os.path.join(MODEL_DIR, "best_model.h5")
callbacks = [
    # Write a checkpoint only when val_loss improves.
    ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True, verbose=1),
    # Stop after 3 epochs without improvement and roll back to the best weights.
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1),
    # Halve the learning rate after 2 epochs without improvement.
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1)
]

# ---- Training ----
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks
)

# Save the model with whatever weights it holds once fit() has returned.
model.save("sentiment_model.h5")
print("Model saved as sentiment_model.h5")


Epoch 1/8
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.6141 - loss: 0.6334
Epoch 1: val_loss improved from inf to 0.41885, saving model to model_checkpoints/best_model.h5




[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 46ms/step - accuracy: 0.6145 - loss: 0.6332 - val_accuracy: 0.8037 - val_loss: 0.4189 - learning_rate: 0.0010
Epoch 2/8
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.8884 - loss: 0.2902
Epoch 2: val_loss improved from 0.41885 to 0.33671, saving model to model_checkpoints/best_model.h5




[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 44ms/step - accuracy: 0.8884 - loss: 0.2901 - val_accuracy: 0.8663 - val_loss: 0.3367 - learning_rate: 0.0010
Epoch 3/8
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.9505 - loss: 0.1515
Epoch 3: val_loss did not improve from 0.33671
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 44ms/step - accuracy: 0.9505 - loss: 0.1515 - val_accuracy: 0.8653 - val_loss: 0.4082 - learning_rate: 0.0010
Epoch 4/8
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.9719 - loss: 0.0895
Epoch 4: val_loss did not improve from 0.33671

Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 44ms/step - accuracy: 0.9719 - loss: 0.0896 - val_accuracy: 0.8480 - val_loss: 0.4819 - lear



Model saved as sentiment_model.h5


In [7]:

# Score the current model on the held-out validation arrays; evaluate()
# returns the compiled loss followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(x=X_val, y=y_val, batch_size=64)
print(
    f"Validation Loss: {loss:.4f}",
    f"Validation Accuracy: {accuracy:.4f}",
    sep="\n",
)


[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.8672 - loss: 0.3221
Validation Loss: 0.3367
Validation Accuracy: 0.8663


Original dataset size: 50000
Reduced dataset size: 20000
sentiment
positive    10011
negative     9989
Name: count, dtype: int64
Training set size: 16000
Test set size: 4000
Files saved: imdb_train.csv, imdb_test.csv
