In [8]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from torch import nn
from torch.optim import Adam

# Load IMDb dataset
file_path = '/Users/shuvradas/Downloads/IMDB Dataset.csv'  # Update this path with your dataset location
df = pd.read_csv(file_path)

# Preprocess text (remove HTML tags, punctuation, lowercase)
def preprocess_text(text):
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    return text

df['review'] = df['review'].apply(preprocess_text)

# Convert sentiment labels to binary (positive=1, negative=0)
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)

In [10]:
# Load GloVe embeddings
embedding_dim = 100
glove_path = '/Users/shuvradas/Downloads/glove.6B/glove.6B.100d.txt'  # Update this path with your GloVe embeddings location
embedding_index = {}

with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = vector

# Tokenize the text
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab_size = len(tokenizer.word_index) + 1

# Convert texts to sequences and pad them
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_length = 100  # Define max length for padding
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Create embedding matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, idx in tokenizer.word_index.items():
    if word in embedding_index:
        embedding_matrix[idx] = embedding_index[word]

In [11]:
class IMDBDataset(Dataset):
    def __init__(self, reviews, labels):
        self.reviews = reviews
        self.labels = labels
    
    def __len__(self):
        return len(self.reviews)
    
    def __getitem__(self, idx):
        review = torch.tensor(self.reviews[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return review, label

# Create DataLoader for training and testing
train_dataset = IMDBDataset(X_train_pad, y_train.values)
test_dataset = IMDBDataset(X_test_pad, y_test.values)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [12]:
class VanillaRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix):
        super(VanillaRNN, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(1, x.size(0), hidden_dim).to(x.device)  # Initial hidden state
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])  # Use the output of the last time step
        return out

# Model parameters
embedding_dim = 100
hidden_dim = 128
output_dim = 1

# Initialize model, loss function, and optimizer
model_rnn = VanillaRNN(vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix)
criterion = nn.BCEWithLogitsLoss()
optimizer = Adam(model_rnn.parameters(), lr=0.001)

# Training function
def train_model(model, train_loader, criterion, optimizer, num_epochs=5):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for reviews, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(reviews)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_loader):.4f}')

# Train the Vanilla RNN model
train_model(model_rnn, train_loader, criterion, optimizer)


Epoch 1/5, Loss: 0.6833
Epoch 2/5, Loss: 0.6627
Epoch 3/5, Loss: 0.6526
Epoch 4/5, Loss: 0.6695
Epoch 5/5, Loss: 0.6548


In [13]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(1, x.size(0), hidden_dim).to(x.device)  # Initial hidden state
        c0 = torch.zeros(1, x.size(0), hidden_dim).to(x.device)  # Initial cell state
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])  # Use the output of the last time step
        return out

# Initialize model, loss function, and optimizer
model_lstm = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix)
criterion = nn.BCEWithLogitsLoss()
optimizer = Adam(model_lstm.parameters(), lr=0.001)

# Train the LSTM model
train_model(model_lstm, train_loader, criterion, optimizer)


Epoch 1/5, Loss: 0.6093
Epoch 2/5, Loss: 0.4513
Epoch 3/5, Loss: 0.3863
Epoch 4/5, Loss: 0.3500
Epoch 5/5, Loss: 0.3233


In [17]:
# Function to evaluate the model
def evaluate_model(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    
    with torch.no_grad():
        for reviews, labels in test_loader:
            outputs = model(reviews)
            loss = criterion(outputs.squeeze(), labels)
            total_loss += loss.item()
            
            # Convert logits to probabilities and then to binary predictions
            predictions = torch.round(torch.sigmoid(outputs.squeeze()))
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.size(0)
    
    avg_loss = total_loss / len(test_loader)
    accuracy = correct_predictions / total_predictions
    return avg_loss, accuracy

In [18]:
class VanillaRNNOnTheFly(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(VanillaRNNOnTheFly, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(1, x.size(0), hidden_dim).to(x.device)  # Initial hidden state
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])  # Use the output of the last time step
        return out

# Initialize model, loss function, and optimizer
model_rnn_otf = VanillaRNNOnTheFly(vocab_size, embedding_dim, hidden_dim, output_dim)
criterion = nn.BCEWithLogitsLoss()
optimizer = Adam(model_rnn_otf.parameters(), lr=0.001)

# Train the Vanilla RNN model with on-the-fly embeddings
train_model(model_rnn_otf, train_loader, criterion, optimizer)

# Evaluate the Vanilla RNN model with on-the-fly embeddings
rnn_otf_loss, rnn_otf_accuracy = evaluate_model(model_rnn_otf, test_loader, criterion)
print(f"Vanilla RNN with On-the-Fly Embeddings - Loss: {rnn_otf_loss:.4f}, Accuracy: {rnn_otf_accuracy:.4f}")

Epoch 1/5, Loss: 0.6728
Epoch 2/5, Loss: 0.6308
Epoch 3/5, Loss: 0.6044
Epoch 4/5, Loss: 0.5641
Epoch 5/5, Loss: 0.5380
Vanilla RNN with On-the-Fly Embeddings - Loss: 0.6103, Accuracy: 0.6732


In [19]:
class LSTMModelOnTheFly(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModelOnTheFly, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(1, x.size(0), hidden_dim).to(x.device)  # Initial hidden state
        c0 = torch.zeros(1, x.size(0), hidden_dim).to(x.device)  # Initial cell state
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])  # Use the output of the last time step
        return out

# Initialize model, loss function, and optimizer
model_lstm_otf = LSTMModelOnTheFly(vocab_size, embedding_dim, hidden_dim, output_dim)
criterion = nn.BCEWithLogitsLoss()
optimizer = Adam(model_lstm_otf.parameters(), lr=0.001)

# Train the LSTM model with on-the-fly embeddings
train_model(model_lstm_otf, train_loader, criterion, optimizer)

# Evaluate the LSTM model with on-the-fly embeddings
lstm_otf_loss, lstm_otf_accuracy = evaluate_model(model_lstm_otf, test_loader, criterion)
print(f"LSTM with On-the-Fly Embeddings - Loss: {lstm_otf_loss:.4f}, Accuracy: {lstm_otf_accuracy:.4f}")


Epoch 1/5, Loss: 0.5994
Epoch 2/5, Loss: 0.3961
Epoch 3/5, Loss: 0.2665
Epoch 4/5, Loss: 0.1857
Epoch 5/5, Loss: 0.1203
LSTM with On-the-Fly Embeddings - Loss: 0.4113, Accuracy: 0.8534


In [20]:
# Evaluate the Vanilla RNN model with GloVe embeddings
rnn_loss, rnn_accuracy = evaluate_model(model_rnn, test_loader, criterion)
print(f"Vanilla RNN with GloVe Embeddings - Loss: {rnn_loss:.4f}, Accuracy: {rnn_accuracy:.4f}")

# Evaluate the LSTM model with GloVe embeddings
lstm_loss, lstm_accuracy = evaluate_model(model_lstm, test_loader, criterion)
print(f"LSTM with GloVe Embeddings - Loss: {lstm_loss:.4f}, Accuracy: {lstm_accuracy:.4f}")

# Evaluate the Vanilla RNN model with on-the-fly embeddings
rnn_otf_loss, rnn_otf_accuracy = evaluate_model(model_rnn_otf, test_loader, criterion)
print(f"Vanilla RNN with On-the-Fly Embeddings - Loss: {rnn_otf_loss:.4f}, Accuracy: {rnn_otf_accuracy:.4f}")

# Evaluate the LSTM model with on-the-fly embeddings
lstm_otf_loss, lstm_otf_accuracy = evaluate_model(model_lstm_otf, test_loader, criterion)
print(f"LSTM with On-the-Fly Embeddings - Loss: {lstm_otf_loss:.4f}, Accuracy: {lstm_otf_accuracy:.4f}")

Vanilla RNN with GloVe Embeddings - Loss: 0.7688, Accuracy: 0.5328
LSTM with GloVe Embeddings - Loss: 0.3334, Accuracy: 0.8539
Vanilla RNN with On-the-Fly Embeddings - Loss: 0.6103, Accuracy: 0.6732
LSTM with On-the-Fly Embeddings - Loss: 0.4113, Accuracy: 0.8534
