In [3]:
import os
import re
import numpy as np
import pandas as pd
from string import digits

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score, classification_report

# TensorFlow / Keras
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

# Gensim for loading Word2Vec binary
from gensim.models import KeyedVectors

ModuleNotFoundError: No module named 'numpy'

In [3]:
# -----------------------------
# Global hyperparameters
# -----------------------------
DATASET_CSV = os.path.join("./datasets", "IMDB-Dataset.csv")
W2V_BIN = os.path.join("./datasets/word2vec", "GoogleNews-vectors-negative300.bin")

MAX_LEN = 200             # Max tokens per review (pad/truncate)
VOCAB_SIZE = 50000        # Limit vocabulary size (most frequent words)
EMBEDDING_DIM = 300       # GoogleNews vectors are 300-dim
EMBEDDING_TRAINABLE = False
TEST_SIZE = 0.2
RANDOM_STATE = 42
EPOCHS = 3
BATCH_SIZE = 64


In [4]:
# =============================================
# Step 1: Load dataset
# =============================================

def read_data(file_path: str) -> pd.DataFrame:
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Dataset CSV not found at: {file_path}")
    df = pd.read_csv(file_path)
    expected_cols = {"review", "sentiment"}
    if not expected_cols.issubset(set(df.columns)):
        raise ValueError(f"Dataset must contain columns {expected_cols}, found: {df.columns.tolist()}")
    return df


df = read_data(DATASET_CSV)
print("Loaded dataset shape:", df.shape)
print(df.head(2))


Loaded dataset shape: (25000, 2)
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive


In [5]:
# =============================================
# Step 2: Preprocess text
# =============================================

def clean_text_series(text_series: pd.Series) -> pd.Series:
    # Lowercase
    text_series = text_series.astype(str).str.lower()

    # Space around punctuation and normalize spaces
    text_series = text_series.apply(lambda x: re.sub(r"([?.!,¿])", r" \1 ", x))
    text_series = text_series.apply(lambda x: re.sub(r"[\"\']", "", x))  # remove straight quotes

    # Keep letters and selected punctuation
    text_series = text_series.apply(lambda x: re.sub(r"[^a-zA-Z?.!,¿]+", " ", x))

    # Remove digits
    rm_digits = str.maketrans('', '', digits)
    text_series = text_series.apply(lambda x: x.translate(rm_digits))

    # Strip and reduce multiple spaces
    text_series = text_series.str.strip()
    text_series = text_series.apply(lambda x: re.sub(r"\s+", " ", x))
    return text_series


df["review"] = clean_text_series(df["review"])
print("Sample cleaned review:\n", df["review"].iloc[0][:300], "...")


Sample cleaned review:
 one of the other reviewers has mentioned that after watching just oz episode youll be hooked . they are right , as this is exactly what happened with me . br br the first thing that struck me about oz was its brutality and unflinching scenes of violence , which set in right from the word go . trust  ...


In [6]:
# =============================================
# Step 3: Encode sentiment labels to 0/1
# =============================================

lb = LabelBinarizer()
df["sentiment"] = lb.fit_transform(df["sentiment"])  # positive=1, negative=0
print("Label classes:", getattr(lb, 'classes_', None))


Label classes: ['negative' 'positive']


In [7]:
# =============================================
# Step 4: Tokenization and Padding
# =============================================

tokenizer = Tokenizer(num_words=VOCAB_SIZE, lower=True, oov_token="<OOV>")
tokenizer.fit_on_texts(df["review"].tolist())

# Convert to sequences, then pad
sequences = tokenizer.texts_to_sequences(df["review"].tolist())
X = pad_sequences(sequences, maxlen=MAX_LEN, padding="post", truncating="post")
y = df["sentiment"].to_numpy().astype(np.int32)

word_index = tokenizer.word_index  # dict: token -> index
vocab_size_effective = min(VOCAB_SIZE, len(word_index) + 1)  # +1 for padding idx 0
print("Vocabulary size (effective):", vocab_size_effective)
print("X shape:", X.shape, "y shape:", y.shape)


Vocabulary size (effective): 50000
X shape: (25000, 200) y shape: (25000,)


In [8]:
# =============================================
# Step 5: Load pre-trained Word2Vec vectors (GoogleNews) and build embedding matrix
# =============================================

if not os.path.exists(W2V_BIN):
    raise FileNotFoundError(f"Word2Vec binary not found at: {W2V_BIN}")

print("Loading Word2Vec KeyedVectors (this may take a minute)...")
w2v = KeyedVectors.load_word2vec_format(W2V_BIN, binary=True)
print("Word2Vec loaded. Vocab size:", len(w2v.key_to_index))

# Build embedding matrix for our tokenizer vocab
embedding_matrix = np.zeros((vocab_size_effective, EMBEDDING_DIM), dtype=np.float32)
not_found = 0
for word, idx in word_index.items():
    if idx >= vocab_size_effective:
        continue
    if word in w2v.key_to_index:
        embedding_matrix[idx] = w2v[word]
    else:
        not_found += 1
print(f"Embedding matrix shape: {embedding_matrix.shape} | OOV tokens (within cap): {not_found}")


Loading Word2Vec KeyedVectors (this may take a minute)...
Word2Vec loaded. Vocab size: 3000000
Embedding matrix shape: (50000, 300) | OOV tokens (within cap): 12536


In [9]:
# =============================================
# Step 6: Train-test split
# =============================================

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)
print("Train shapes:", X_train.shape, y_train.shape)
print("Test shapes:", X_test.shape, y_test.shape)


Train shapes: (20000, 200) (20000,)
Test shapes: (5000, 200) (5000,)


In [10]:
# =============================================
# Step 7: Build GRU model
# =============================================

def build_model(vocab_size: int, embedding_dim: int, embedding_matrix: np.ndarray) -> tf.keras.Model:
    model = Sequential(name="w2v_gru_sentiment")
    model.add(
        Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=MAX_LEN,
            weights=[embedding_matrix],
            trainable=EMBEDDING_TRAINABLE,
            name="pretrained_embedding",
        )
    )
    model.add(GRU(128, name="gru"))
    model.add(Dense(128, activation="relu", name="dense_hidden"))
    model.add(Dropout(0.3, name="dropout"))
    model.add(Dense(1, activation="sigmoid", name="output"))
    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model


model = build_model(vocab_size_effective, EMBEDDING_DIM, embedding_matrix)
model.summary()




In [11]:
# =============================================
# Step 8: Train
# =============================================

history = model.fit(
    X_train,
    y_train,
    validation_split=0.1,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
)


Epoch 1/3
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 345ms/step - accuracy: 0.5289 - loss: 0.6903 - val_accuracy: 0.5205 - val_loss: 0.6869
Epoch 2/3
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 274ms/step - accuracy: 0.5482 - loss: 0.6793 - val_accuracy: 0.5080 - val_loss: 0.6920
Epoch 3/3
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 291ms/step - accuracy: 0.5587 - loss: 0.6762 - val_accuracy: 0.7335 - val_loss: 0.5836


In [12]:
# =============================================
# Step 9: Evaluate
# =============================================

loss, acc = model.evaluate(X_test, y_test, verbose=1)
print(f"\nTest Accuracy (Keras evaluate): {acc*100:.2f}% | Loss: {loss:.4f}")

# Manual accuracy for sanity check
y_pred_prob = model.predict(X_test, batch_size=BATCH_SIZE)
y_pred_label = (y_pred_prob >= 0.5).astype(int)
manual_acc = accuracy_score(y_test, y_pred_label)
print(f"Test Accuracy (sklearn manual): {manual_acc*100:.2f}%")
print("\nClassification report:\n", classification_report(y_test, y_pred_label, target_names=["negative", "positive"]))


[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 58ms/step - accuracy: 0.7246 - loss: 0.5856

Test Accuracy (Keras evaluate): 72.46% | Loss: 0.5856
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 99ms/step
Test Accuracy (sklearn manual): 72.46%

Classification report:
               precision    recall  f1-score   support

    negative       0.78      0.63      0.70      2505
    positive       0.69      0.82      0.75      2495

    accuracy                           0.72      5000
   macro avg       0.73      0.72      0.72      5000
weighted avg       0.73      0.72      0.72      5000



In [13]:
# =============================================
# Step 10: Sample predictions on unseen text
# =============================================

sample_texts = [
    "This movie was absolutely fantastic! The performances were stunning and the story was gripping.",
    "Terrible. I wasted two hours of my life. The plot was dull and the acting was worse.",
    "Not bad, but it could have been better. Some parts were enjoyable though.",
    "A masterpiece that will be remembered for years!",
    "I wouldn't recommend this to anyone."
]

# Preprocess -> tokenize -> pad
sample_clean = clean_text_series(pd.Series(sample_texts)).tolist()
sample_seq = tokenizer.texts_to_sequences(sample_clean)
sample_pad = pad_sequences(sample_seq, maxlen=MAX_LEN, padding="post", truncating="post")

sample_probs = model.predict(sample_pad)
sample_labels = (sample_probs >= 0.5).astype(int).flatten()

print("\nSample predictions:")
for txt, prob, lab in zip(sample_texts, sample_probs.flatten(), sample_labels):
    pred = "positive" if lab == 1 else "negative"
    print(f"- Text: {txt[:80]}...\n  Prob(positive)={prob:.3f} -> Pred={pred}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step

Sample predictions:
- Text: This movie was absolutely fantastic! The performances were stunning and the stor...
  Prob(positive)=0.576 -> Pred=positive
- Text: Terrible. I wasted two hours of my life. The plot was dull and the acting was wo...
  Prob(positive)=0.576 -> Pred=positive
- Text: Not bad, but it could have been better. Some parts were enjoyable though....
  Prob(positive)=0.576 -> Pred=positive
- Text: A masterpiece that will be remembered for years!...
  Prob(positive)=0.576 -> Pred=positive
- Text: I wouldn't recommend this to anyone....
  Prob(positive)=0.576 -> Pred=positive
