In [1]:
import re
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Load IMDB dataset from Kaggle input
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

# Encode sentiment: positive → 1, negative → 0
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Train-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['review'].values, df['sentiment'].values, test_size=0.2, random_state=42
)

In [3]:
# Tokenization using regex
def tokenize(text):
    return re.findall(r"\b\w+\b", text.lower())

# Tokenize
train_tokens = [tokenize(text) for text in train_texts]
test_tokens = [tokenize(text) for text in test_texts]

# Build vocabulary
all_tokens = [token for sublist in train_tokens for token in sublist]
counter = Counter(all_tokens)
vocab_size = 20000  # top 20,000 words

most_common = counter.most_common(vocab_size - 2)
word2idx = {"<PAD>": 0, "<UNK>": 1}
for idx, (word, _) in enumerate(most_common, start=2):
    word2idx[word] = idx
idx2word = {idx: word for word, idx in word2idx.items()}

# Encode text
def encode(tokens):
    return [word2idx.get(token, word2idx["<UNK>"]) for token in tokens]

X_train_encoded = [torch.tensor(encode(tokens)) for tokens in train_tokens]
X_test_encoded = [torch.tensor(encode(tokens)) for tokens in test_tokens]

# Pad sequences
X_train_padded = pad_sequence(X_train_encoded, batch_first=True, padding_value=0)
X_test_padded = pad_sequence(X_test_encoded, batch_first=True, padding_value=0)
y_train_tensor = torch.tensor(train_labels)
y_test_tensor = torch.tensor(test_labels)

print(f"Vocabulary size: {len(word2idx)}")
print(f"Padded train shape: {X_train_padded.shape}")

Vocabulary size: 20000
Padded train shape: torch.Size([40000, 2525])


In [4]:
class IMDBDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = IMDBDataset(X_train_padded, y_train_tensor)
test_dataset = IMDBDataset(X_test_padded, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

In [5]:
def load_glove_embeddings(glove_path, word2idx, embedding_dim=100):
    embeddings_index = {}
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector

    matrix_len = len(word2idx)
    weights_matrix = np.zeros((matrix_len, embedding_dim))

    for word, i in word2idx.items():
        weights_matrix[i] = embeddings_index.get(word, np.random.normal(scale=0.6, size=(embedding_dim,)))
    
    return torch.tensor(weights_matrix, dtype=torch.float32)

# Use GloVe from Kaggle input
glove_path = "/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.100d.txt"
embedding_dim = 100
glove_weights = load_glove_embeddings(glove_path, word2idx, embedding_dim)

In [6]:
class RNNModel(nn.Module):
    def __init__(self, embedding_weights):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_weights, freeze=True)
        self.rnn = nn.RNN(embedding_weights.shape[1], 128, batch_first=True)
        self.fc = nn.Linear(128, 1)

    def forward(self, x):
        embedded = self.embedding(x)
        _, h_n = self.rnn(embedded)
        return torch.sigmoid(self.fc(h_n.squeeze(0)))

class LSTMModel(nn.Module):
    def __init__(self, embedding_weights):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_weights, freeze=True)
        self.lstm = nn.LSTM(embedding_weights.shape[1], 128, batch_first=True)
        self.fc = nn.Linear(128, 1)

    def forward(self, x):
        embedded = self.embedding(x)
        _, (h_n, _) = self.lstm(embedded)
        return torch.sigmoid(self.fc(h_n[-1]))

class RNNLearned(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, 128, batch_first=True)
        self.fc = nn.Linear(128, 1)

    def forward(self, x):
        embedded = self.embedding(x)
        _, h_n = self.rnn(embedded)
        return torch.sigmoid(self.fc(h_n.squeeze(0)))

class LSTMLearned(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, 128, batch_first=True)
        self.fc = nn.Linear(128, 1)

    def forward(self, x):
        embedded = self.embedding(x)
        _, (h_n, _) = self.lstm(embedded)
        return torch.sigmoid(self.fc(h_n[-1]))

In [7]:
def train_model(model, train_loader, val_loader, epochs=5):
    model.to(device)
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.float().to(device)
            optimizer.zero_grad()
            output = model(x_batch).squeeze()
            loss = criterion(output, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")
    
    evaluate_model(model, val_loader)

def evaluate_model(model, data_loader):
    model.eval()
    preds, labels = [], []
    with torch.no_grad():
        for x_batch, y_batch in data_loader:
            x_batch = x_batch.to(device)
            output = model(x_batch).squeeze().cpu().numpy() > 0.5
            preds.extend(output)
            labels.extend(y_batch.numpy())
    acc = accuracy_score(labels, preds)
    print(f"Accuracy: {acc:.4f}")

In [8]:
print("🔹 RNN + GloVe")
model_rnn_glove = RNNModel(glove_weights)
train_model(model_rnn_glove, train_loader, test_loader)

print("\n🔹 LSTM + GloVe")
model_lstm_glove = LSTMModel(glove_weights)
train_model(model_lstm_glove, train_loader, test_loader)

print("\n🔹 RNN + Learned Embedding")
model_rnn_learned = RNNLearned(len(word2idx))
train_model(model_rnn_learned, train_loader, test_loader)

print("\n🔹 LSTM + Learned Embedding")
model_lstm_learned = LSTMLearned(len(word2idx))
train_model(model_lstm_learned, train_loader, test_loader)

🔹 RNN + GloVe
Epoch 1, Loss: 0.6976
Epoch 2, Loss: 0.6945
Epoch 3, Loss: 0.6945
Epoch 4, Loss: 0.6957
Epoch 5, Loss: 0.6951
Accuracy: 0.5040

🔹 LSTM + GloVe
Epoch 1, Loss: 0.6937
Epoch 2, Loss: 0.6932
Epoch 3, Loss: 0.6932
Epoch 4, Loss: 0.6932
Epoch 5, Loss: 0.6932
Accuracy: 0.5039

🔹 RNN + Learned Embedding
Epoch 1, Loss: 0.6956
Epoch 2, Loss: 0.6946
Epoch 3, Loss: 0.6939
Epoch 4, Loss: 0.6939
Epoch 5, Loss: 0.6943
Accuracy: 0.4960

🔹 LSTM + Learned Embedding
Epoch 1, Loss: 0.6936
Epoch 2, Loss: 0.6932
Epoch 3, Loss: 0.6932
Epoch 4, Loss: 0.6932
Epoch 5, Loss: 0.6932
Accuracy: 0.4960
