In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import re
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import requests
import zipfile
import os

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load and preprocess data
def load_and_preprocess_data(file_path):
    df = pd.read_csv(file_path)
    df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
    return df

# Text preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

# Build vocabulary
def build_vocab(texts, min_freq=2):
    word_counts = Counter()
    for text in texts:
        word_counts.update(text)
    
    vocab = {'<PAD>': 0, '<UNK>': 1}
    idx = 2
    for word, count in word_counts.items():
        if count >= min_freq:
            vocab[word] = idx
            idx += 1
    return vocab

# Download and load GloVe embeddings
def load_glove_embeddings(embedding_dim=100):
    glove_path = f'glove.6B.{embedding_dim}d.txt'
    
    if not os.path.exists(glove_path):
        print("Downloading GloVe embeddings...")
        url = f"https://nlp.stanford.edu/data/glove.6B.zip"
        response = requests.get(url, stream=True)
        with open('glove.6B.zip', 'wb') as f:
            f.write(response.content)
        
        with zipfile.ZipFile('glove.6B.zip', 'r') as zip_ref:
            zip_ref.extractall('.')
    
    embeddings_index = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    
    return embeddings_index

# Create embedding matrix
def create_embedding_matrix(vocab, embeddings_index, embedding_dim):
    embedding_matrix = np.zeros((len(vocab), embedding_dim))
    for word, idx in vocab.items():
        if word in embeddings_index:
            embedding_matrix[idx] = embeddings_index[word]
        else:
            # Random initialization for unknown words
            embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))
    return embedding_matrix

# Dataset class
class MovieReviewDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_len=200):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        # Convert tokens to indices
        indices = [self.vocab.get(token, self.vocab['<UNK>']) for token in text]
        
        # Pad or truncate
        if len(indices) > self.max_len:
            indices = indices[:self.max_len]
        else:
            indices = indices + [self.vocab['<PAD>']] * (self.max_len - len(indices))
        
        return torch.tensor(indices, dtype=torch.long), torch.tensor(label, dtype=torch.float)

Using device: cuda


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Vanilla RNN Model with GloVe
class GloVeRNN(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        self.embedding_dim = embedding_matrix.shape[1]
        self.vocab_size = embedding_matrix.shape[0]
        
        # Use pre-trained embeddings
        self.embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(embedding_matrix), 
            freeze=False, 
            padding_idx=0
        )
        
        self.rnn = nn.RNN(
            self.embedding_dim, 
            hidden_dim, 
            n_layers, 
            batch_first=True, 
            dropout=dropout
        )
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)
        hidden = self.dropout(hidden[-1])
        return self.fc(hidden)

# LSTM Model with GloVe
class GloVeLSTM(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        self.embedding_dim = embedding_matrix.shape[1]
        self.vocab_size = embedding_matrix.shape[0]
        
        self.embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(embedding_matrix), 
            freeze=False, 
            padding_idx=0
        )
        
        self.lstm = nn.LSTM(
            self.embedding_dim, 
            hidden_dim, 
            n_layers, 
            batch_first=True, 
            dropout=dropout,
            bidirectional=False
        )
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.lstm(embedded)
        hidden = self.dropout(hidden[-1])
        return self.fc(hidden)

# Vanilla RNN with torch.nn.Embedding (on-the-fly)
class TorchEmbedRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn = nn.RNN(
            embedding_dim, 
            hidden_dim, 
            n_layers, 
            batch_first=True, 
            dropout=dropout
        )
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)
        hidden = self.dropout(hidden[-1])
        return self.fc(hidden)

# LSTM with torch.nn.Embedding (on-the-fly)
class TorchEmbedLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embedding_dim, 
            hidden_dim, 
            n_layers, 
            batch_first=True, 
            dropout=dropout,
            bidirectional=False
        )
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.lstm(embedded)
        hidden = self.dropout(hidden[-1])
        return self.fc(hidden)

In [4]:
def train_model(model, train_loader, val_loader, optimizer, criterion, epochs=10):
    model = model.to(device)
    train_losses, val_losses = [], []
    train_accs, val_accs = [], []
    
    for epoch in range(epochs):
        # Training
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        for batch in train_loader:
            texts, labels = batch
            texts, labels = texts.to(device), labels.to(device)
            
            optimizer.zero_grad()
            predictions = model(texts).squeeze(1)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            predicted = (torch.sigmoid(predictions) > 0.5).float()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
        
        train_loss = total_loss / len(train_loader)
        train_acc = correct / total
        train_losses.append(train_loss)
        train_accs.append(train_acc)
        
        # Validation
        model.eval()
        total_val_loss = 0
        correct_val = 0
        total_val = 0
        
        with torch.no_grad():
            for batch in val_loader:
                texts, labels = batch
                texts, labels = texts.to(device), labels.to(device)
                
                predictions = model(texts).squeeze(1)
                loss = criterion(predictions, labels)
                total_val_loss += loss.item()
                
                predicted = (torch.sigmoid(predictions) > 0.5).float()
                correct_val += (predicted == labels).sum().item()
                total_val += labels.size(0)
        
        val_loss = total_val_loss / len(val_loader)
        val_acc = correct_val / total_val
        val_losses.append(val_loss)
        val_accs.append(val_acc)
        
        print(f'Epoch {epoch+1}/{epochs}:')
        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
        print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')
        print('-' * 50)
    
    return train_losses, val_losses, train_accs, val_accs

def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for batch in test_loader:
            texts, labels = batch
            texts, labels = texts.to(device), labels.to(device)
            
            predictions = model(texts).squeeze(1)
            predicted = (torch.sigmoid(predictions) > 0.5).float()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    
    accuracy = correct / total
    print(f'Test Accuracy: {accuracy:.4f}')
    return accuracy

In [6]:

# Load and preprocess data
df = load_and_preprocess_data('/kaggle/input/imdb-dataset-nlp/IMDB Dataset.csv')

# Preprocess texts
print("Preprocessing texts...")
df['tokens'] = df['review'].apply(preprocess_text)

# Split data
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Build vocabulary
print("Building vocabulary...")
vocab = build_vocab(train_df['tokens'])

# Create datasets
train_dataset = MovieReviewDataset(
    train_df['tokens'].tolist(), 
    train_df['sentiment'].tolist(), 
    vocab
)
val_dataset = MovieReviewDataset(
    val_df['tokens'].tolist(), 
    val_df['sentiment'].tolist(), 
    vocab
)
test_dataset = MovieReviewDataset(
    test_df['tokens'].tolist(), 
    test_df['sentiment'].tolist(), 
    vocab
)
    
# Create data loaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Hyperparameters
embedding_dim = 100
hidden_dim = 256
output_dim = 1
n_layers = 2
dropout = 0.5
learning_rate = 0.001
epochs = 5

# Load GloVe embeddings
print("Loading GloVe embeddings...")
glove_embeddings = load_glove_embeddings(embedding_dim)
embedding_matrix = create_embedding_matrix(vocab, glove_embeddings, embedding_dim)

# Task 1: GloVe + Vanilla RNN
print("\n" + "="*50)
print("TASK 1: GloVe + Vanilla RNN")
print("="*50)

model1 = GloVeRNN(embedding_matrix, hidden_dim, output_dim, n_layers, dropout)
optimizer1 = optim.Adam(model1.parameters(), lr=learning_rate)
criterion = nn.BCEWithLogitsLoss()
    
train_losses1, val_losses1, train_accs1, val_accs1 = train_model(
    model1, train_loader, val_loader, optimizer1, criterion, epochs
)
test_acc1 = evaluate_model(model1, test_loader)

# Task 2: GloVe + LSTM
print("\n" + "="*50)
print("TASK 2: GloVe + LSTM")
print("="*50)

model2 = GloVeLSTM(embedding_matrix, hidden_dim, output_dim, n_layers, dropout)
optimizer2 = optim.Adam(model2.parameters(), lr=learning_rate)

train_losses2, val_losses2, train_accs2, val_accs2 = train_model(
    model2, train_loader, val_loader, optimizer2, criterion, epochs
)
test_acc2 = evaluate_model(model2, test_loader)

# Task 3: Torch Embedding + Vanilla RNN
print("\n" + "="*50)
print("TASK 3: Torch Embedding + Vanilla RNN")
print("="*50)

model3 = TorchEmbedRNN(len(vocab), embedding_dim, hidden_dim, output_dim, n_layers, dropout)
optimizer3 = optim.Adam(model3.parameters(), lr=learning_rate)

train_losses3, val_losses3, train_accs3, val_accs3 = train_model(
    model3, train_loader, val_loader, optimizer3, criterion, epochs
)
test_acc3 = evaluate_model(model3, test_loader)

# Task 4: Torch Embedding + LSTM
print("\n" + "="*50)
print("TASK 4: Torch Embedding + LSTM")
print("="*50)

model4 = TorchEmbedLSTM(len(vocab), embedding_dim, hidden_dim, output_dim, n_layers, dropout)
optimizer4 = optim.Adam(model4.parameters(), lr=learning_rate)

train_losses4, val_losses4, train_accs4, val_accs4 = train_model(
    model4, train_loader, val_loader, optimizer4, criterion, epochs
)
test_acc4 = evaluate_model(model4, test_loader)

# Print final results
print("\n" + "="*60)
print("FINAL RESULTS COMPARISON")
print("="*60)
print(f"GloVe + RNN Test Accuracy: {test_acc1:.4f}")
print(f"GloVe + LSTM Test Accuracy: {test_acc2:.4f}")
print(f"Torch Embed + RNN Test Accuracy: {test_acc3:.4f}")
print(f"Torch Embed + LSTM Test Accuracy: {test_acc4:.4f}")

Preprocessing texts...
Building vocabulary...
Loading GloVe embeddings...

TASK 1: GloVe + Vanilla RNN
Epoch 1/5:
Train Loss: 0.7060, Train Acc: 0.4999
Val Loss: 0.6937, Val Acc: 0.4971
--------------------------------------------------
Epoch 2/5:
Train Loss: 0.6981, Train Acc: 0.5009
Val Loss: 0.6926, Val Acc: 0.5117
--------------------------------------------------
Epoch 3/5:
Train Loss: 0.6976, Train Acc: 0.5030
Val Loss: 0.6952, Val Acc: 0.5081
--------------------------------------------------
Epoch 4/5:
Train Loss: 0.6986, Train Acc: 0.5015
Val Loss: 0.6938, Val Acc: 0.5081
--------------------------------------------------
Epoch 5/5:
Train Loss: 0.6980, Train Acc: 0.5029
Val Loss: 0.6935, Val Acc: 0.4975
--------------------------------------------------
Test Accuracy: 0.5160

TASK 2: GloVe + LSTM
Epoch 1/5:
Train Loss: 0.6944, Train Acc: 0.5059
Val Loss: 0.6930, Val Acc: 0.5081
--------------------------------------------------
Epoch 2/5:
Train Loss: 0.6934, Train Acc: 0.5093
