In [20]:
import os, re, math, random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

In [21]:
SEED = 52
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

IMDB_CSV = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
GLOVE_TXT = "/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.100d.txt" 

MAX_VOCAB = 25000
MIN_FREQ = 2
MAX_LEN = 400              

BATCH_SIZE = 64
EPOCHS = 5                 
LR = 1e-3
EMB_DIM = 100              
HIDDEN = 128
DROPOUT = 0.2              

In [22]:
df = pd.read_csv(IMDB_CSV)
df["label"] = df["sentiment"].map({"positive": 1, "negative": 0}).astype(np.float32)

train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["review"].values, df["label"].values, test_size=0.2, random_state=SEED, stratify=df["label"].values
)

In [23]:
_tok = re.compile(r"[A-Za-z0-9']+")
def tokenize(s: str):
    return _tok.findall(s.lower())

In [24]:
train_tokens = [tokenize(t) for t in train_texts]

counts = Counter()
for toks in train_tokens:
    counts.update(toks)

specials = ["<pad>", "<unk>"]
most_common = [w for w, c in counts.items() if c >= MIN_FREQ]
most_common = sorted(most_common, key=lambda w: counts[w], reverse=True)[: MAX_VOCAB - len(specials)]

itos = specials + most_common
stoi = {w: i for i, w in enumerate(itos)}
PAD_IDX, UNK_IDX = stoi["<pad>"], stoi["<unk>"]

def encode(tokens):
    ids = [stoi.get(t, UNK_IDX) for t in tokens]
    if len(ids) > MAX_LEN:
        ids = ids[:MAX_LEN]
    return torch.tensor(ids, dtype=torch.long)

print(f"Vocab size: {len(itos)} | PAD={PAD_IDX} UNK={UNK_IDX}")

Vocab size: 25000 | PAD=0 UNK=1


In [25]:
train_ids = [encode(toks) for toks in train_tokens]
test_ids  = [encode(tokenize(t)) for t in test_texts]

y_train = torch.tensor(train_labels, dtype=torch.float32)
y_test  = torch.tensor(test_labels, dtype=torch.float32)

In [26]:
class ListDataset(Dataset):
    def __init__(self, seqs, labels):
        self.seqs = seqs
        self.labels = labels
    def __len__(self):
        return len(self.seqs)
    def __getitem__(self, i):
        return self.seqs[i], self.labels[i]

def collate_batch(batch):
    seqs, labels = zip(*batch)
    lengths = torch.tensor([len(s) for s in seqs], dtype=torch.long)
    padded = pad_sequence(seqs, batch_first=True, padding_value=PAD_IDX)
    labels = torch.stack([torch.tensor(l) for l in labels])
    return padded, lengths, labels

In [27]:
train_ds = ListDataset(train_ids, y_train)
test_ds  = ListDataset(test_ids,  y_test)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

In [28]:
def load_glove_matrix(glove_txt_path, itos, emb_dim=100, pad_idx=0, unk_idx=1):

    g = {}
    with open(glove_txt_path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            w = parts[0]
            vec = np.asarray(parts[1:], dtype="float32")
            if vec.shape[0] == emb_dim:
                g[w] = vec

    matrix = np.random.normal(scale=0.6, size=(len(itos), emb_dim)).astype("float32")
    for i, w in enumerate(itos):
        if w in g:
            matrix[i] = g[w]
    matrix[pad_idx] = 0.0
    return torch.tensor(matrix, dtype=torch.float32)

In [None]:
class VanillaRNN(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden, dropout=0.0, pad_idx=0, pretrained=None, freeze=True):
        super().__init__()
        if pretrained is None:
            self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)
        else:
            self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=freeze, padding_idx=pad_idx)
        self.rnn = nn.RNN(emb_dim, hidden, batch_first=True)
        self.drop = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden, 1)

    def forward(self, x, lengths):
        emb = self.embedding(x)
        packed = pack_padded_sequence(emb, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, h_n = self.rnn(packed)
        out = self.drop(h_n[-1])
        return self.fc(out).squeeze(1)


In [None]:
class LSTMNet(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden, dropout=0.0, pad_idx=0, pretrained=None, freeze=True):
        super().__init__()
        if pretrained is None:
            self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)
        else:
            self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=freeze, padding_idx=pad_idx)
        self.lstm = nn.LSTM(emb_dim, hidden, batch_first=True)
        self.drop = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden, 1)

    def forward(self, x, lengths):
        emb = self.embedding(x)
        packed = pack_padded_sequence(emb, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (h_n, _) = self.lstm(packed)
        out = self.drop(h_n[-1])
        return self.fc(out).squeeze(1)

In [31]:
def train_epoch(model, loader, optim, criterion):
    model.train()
    total_loss = 0.0
    for x, lengths, y in loader:
        x, lengths, y = x.to(DEVICE), lengths.to(DEVICE), y.to(DEVICE)
        optim.zero_grad()
        logits = model(x, lengths)
        loss = criterion(logits, y)
        loss.backward()
        optim.step()
        total_loss += loss.item()
    return total_loss / max(1, len(loader))

In [32]:
@torch.no_grad()
def evaluate(model, loader, criterion):
    model.eval()
    total_loss = 0.0
    all_preds, all_labels = [], []
    for x, lengths, y in loader:
        x, lengths, y = x.to(DEVICE), lengths.to(DEVICE), y.to(DEVICE)
        logits = model(x, lengths)
        loss = criterion(logits, y)
        total_loss += loss.item()
        probs = torch.sigmoid(logits)
        preds = (probs >= 0.5).long().cpu().numpy()
        all_preds.extend(preds.tolist())
        all_labels.extend(y.long().cpu().numpy().tolist())
    acc = accuracy_score(all_labels, all_preds)
    return total_loss / max(1, len(loader)), acc

In [33]:
def run_experiment(model_name, use_glove, freeze=True, epochs=EPOCHS):
    pretrained = None
    if use_glove:
        pretrained = load_glove_matrix(GLOVE_TXT, itos, emb_dim=EMB_DIM, pad_idx=PAD_IDX, unk_idx=UNK_IDX)

    if model_name == "rnn":
        model = VanillaRNN(
            vocab_size=len(itos), emb_dim=EMB_DIM, hidden=HIDDEN,
            dropout=DROPOUT, pad_idx=PAD_IDX, pretrained=pretrained, freeze=freeze
        )
    elif model_name == "lstm":
        model = LSTMNet(
            vocab_size=len(itos), emb_dim=EMB_DIM, hidden=HIDDEN,
            dropout=DROPOUT, pad_idx=PAD_IDX, pretrained=pretrained, freeze=freeze
        )
    else:
        raise ValueError("model_name must be 'rnn' or 'lstm'")

    model.to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    criterion = nn.BCEWithLogitsLoss()

    print(f"\n=== {model_name.upper()} | {'GloVe' if use_glove else 'Learned'} Embeddings | freeze={freeze} ===")
    for ep in range(1, epochs + 1):
        tr_loss = train_epoch(model, train_loader, optimizer, criterion)
        val_loss, val_acc = evaluate(model, test_loader, criterion)
        print(f"Epoch {ep:02d} | train_loss={tr_loss:.4f} | val_loss={val_loss:.4f} | val_acc={val_acc:.4f}")
    return model

In [34]:
# 1) GloVe + Vanilla RNN (frozen embeddings)
model_rnn_glove = run_experiment("rnn", use_glove=True, freeze=True)

# 2) GloVe + LSTM (frozen embeddings)
model_lstm_glove = run_experiment("lstm", use_glove=True, freeze=True)

# 3) Learned Embedding + RNN
model_rnn_learned = run_experiment("rnn", use_glove=False)

# 4) Learned Embedding + LSTM
model_lstm_learned = run_experiment("lstm", use_glove=False)


=== RNN | GloVe Embeddings | freeze=True ===
Epoch 01 | train_loss=0.6685 | val_loss=0.6740 | val_acc=0.5898
Epoch 02 | train_loss=0.6799 | val_loss=0.7655 | val_acc=0.5133
Epoch 03 | train_loss=0.6694 | val_loss=0.6687 | val_acc=0.5799
Epoch 04 | train_loss=0.6543 | val_loss=0.7176 | val_acc=0.5595
Epoch 05 | train_loss=0.6700 | val_loss=0.6764 | val_acc=0.5682

=== LSTM | GloVe Embeddings | freeze=True ===
Epoch 01 | train_loss=0.6573 | val_loss=0.6636 | val_acc=0.6036
Epoch 02 | train_loss=0.6256 | val_loss=0.4773 | val_acc=0.7857
Epoch 03 | train_loss=0.4077 | val_loss=0.3571 | val_acc=0.8413
Epoch 04 | train_loss=0.3497 | val_loss=0.3241 | val_acc=0.8590
Epoch 05 | train_loss=0.3229 | val_loss=0.3037 | val_acc=0.8680

=== RNN | Learned Embeddings | freeze=True ===
Epoch 01 | train_loss=0.6478 | val_loss=0.6035 | val_acc=0.6802
Epoch 02 | train_loss=0.5912 | val_loss=0.5659 | val_acc=0.7246
Epoch 03 | train_loss=0.5643 | val_loss=0.6070 | val_acc=0.6648
Epoch 04 | train_loss=0.495

In [None]:
criterion = nn.BCEWithLogitsLoss()

rows = []
for arch, emb_name, mdl in [
    ("Vanilla RNN", "GloVe (frozen)",  model_rnn_glove),
    ("LSTM",        "GloVe (frozen)",  model_lstm_glove),
    ("Vanilla RNN", "Learned (nn.Embedding)", model_rnn_learned),
    ("LSTM",        "Learned (nn.Embedding)", model_lstm_learned),
]:
    val_loss, val_acc = evaluate(mdl, test_loader, criterion)  # returns (loss, accuracy)
    rows.append({
        "Architecture": arch,
        "Embeddings": emb_name,
        "Val Loss": round(val_loss, 4),
        "Val Accuracy": f"{val_acc * 100:.2f}%",
    })

results_df = pd.DataFrame(rows, columns=["Architecture", "Embeddings", "Val Loss", "Val Accuracy"])

try:
    from IPython.display import display
    display(results_df)
except:
    print(results_df.to_string(index=False))

Unnamed: 0,Architecture,Embeddings,Val Loss,Val Accuracy
0,Vanilla RNN,GloVe (frozen),0.6764,56.82%
1,LSTM,GloVe (frozen),0.3037,86.80%
2,Vanilla RNN,Learned (nn.Embedding),0.6014,66.86%
3,LSTM,Learned (nn.Embedding),0.2981,87.59%
