In [28]:
# Import required libraries for data processing and deep learning

import pandas as pd  
import torch  
import torch.nn as nn  
from torch.utils.data import Dataset, DataLoader  
from torch.nn.utils.rnn import pad_sequence  # RNN padding sequences
from collections import Counter              # For counting word frequencies
import re                                    # Regular expressions for text preprocessing

In [29]:
# Configuration for model hyperparameters
config = {
    "BATCH_SIZE" : 64,   # Number of samples per batch
    "EMBED_DIM" : 100,   # Dimension of word embeddings
    "HIDDEN_DIM" : 128,  # Hidden state dimension for LSTM
    "EPOCHS" : 5,        # Number of training epochs
    "MAX_VOCAB_SIZE" : 20000,  # Maximum vocabulary size
}

In [30]:
# Set device to GPU if available, otherwise use CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [31]:
# Preprocessing function to convert text to tokens
def tokenize(text):
    """Convert text to lowercase tokens, removing special characters."""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-z0-9\s]", "", text)  # Remove special characters
    return text.split()  # Split into words

In [32]:
# Build vocabulary from texts
def build_vocab(texts, max_size):
    """Create vocabulary mapping from most common words."""
    # Count word frequencies
    counter = Counter()
    for text in texts:
        counter.update(tokenize(text))

    # Initialize vocab with special tokens
    vocab = {"<pad>": 0, "<unk>": 1}  # padding and unknown tokens
    # Add most common words up to max_size
    for word, _ in counter.most_common(max_size - 2):
        vocab[word] = len(vocab)
    return vocab

In [40]:
# Custom dataset class for loading IMDB sentiment data
class SentimentDataset(Dataset):
    """Load sentiment reviews and encode them using vocabulary."""
    def __init__(self, csv_path, vocab):
        # Read CSV file and extract reviews and sentiment labels
        df = pd.read_csv(csv_path)
        self.texts = df["review"].values
        self.labels = df["sentiment"].values
        self.vocab = vocab

    def encode(self, text):
        """Convert text to indices using vocabulary."""
        # Map each token to its vocabulary index, use <unk> for unknown words
        return torch.tensor(
            [self.vocab.get(tok, self.vocab["<unk>"]) for tok in tokenize(text)],
            dtype=torch.long
        )

    def __len__(self):
        """Return dataset size."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return encoded text and sentiment label for given index."""
        # Convert sentiment string to numeric value: "positive" -> 1, "negative" -> 0
        label = 1.0 if self.labels[idx] == "positive" else 0.0
        return self.encode(self.texts[idx]), torch.tensor(label, dtype=torch.float)

In [41]:
# Define padding index and batch collation function
PAD_IDX = 0  # Padding token index

def collate_fn(batch):
    """Pad variable-length sequences in batch to same length."""
    # Separate texts and labels from batch
    texts, labels = zip(*batch)
    # Pad sequences to maximum length in batch
    texts = pad_sequence(texts, padding_value=PAD_IDX)
    # Stack labels into single tensor
    labels = torch.stack(labels)
    # Move to specified device (GPU/CPU)
    return texts.to(DEVICE), labels.to(DEVICE)

In [42]:
# LSTM-based model for sentiment classification
class LSTMSentiment(nn.Module):
    """LSTM model for binary sentiment classification."""
    def __init__(self, vocab_size, config):
        super().__init__()
        # Embedding layer: converts word indices to dense vectors
        self.embedding = nn.Embedding(vocab_size, config["EMBED_DIM"], padding_idx=PAD_IDX)
        # LSTM layer: processes sequences and captures long-term dependencies
        self.lstm = nn.LSTM(config["EMBED_DIM"], config["HIDDEN_DIM"])
        # Fully connected layer: outputs single value for binary classification
        self.fc = nn.Linear(config["HIDDEN_DIM"], 1)

    def forward(self, x):
        """Forward pass: x shape [seq_len, batch]"""
        # Convert indices to embeddings
        emb = self.embedding(x)
        # LSTM processes embeddings, returns hidden states
        _, (hidden, _) = self.lstm(emb)
        # Pass final hidden state through FC layer and squeeze to 1D
        return self.fc(hidden[-1]).squeeze(1)

In [43]:
# Load data and build vocabulary
df = pd.read_csv('dataset/IMDB.csv')  # Load IMDB dataset
vocab = build_vocab(df["review"], config["MAX_VOCAB_SIZE"])  # Build vocabulary from reviews

# Create dataset and dataloader for training
train_dataset = SentimentDataset("dataset/IMDB.csv", vocab)
train_loader = DataLoader(train_dataset, batch_size=config["BATCH_SIZE"], 
                          shuffle=True, collate_fn=collate_fn)  # Shuffle data and pad sequences

In [44]:
# Initialize model, loss function, and optimizer

from tqdm import tqdm
model = LSTMSentiment(len(vocab), config).to(DEVICE)  # Fixed: Pass config to constructor
criterion = nn.BCEWithLogitsLoss()  # Binary cross-entropy loss for binary classification
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)  # Adam optimizer

# Training loop
for epoch in range(config["EPOCHS"]):
    model.train()  # Set model to training mode
    total_loss = 0

    # Iterate through batches
    for texts, labels in tqdm(train_loader):
        # Forward pass
        optimizer.zero_grad()  # Clear previous gradients
        outputs = model(texts)  # Get predictions
        loss = criterion(outputs, labels)  # Calculate loss
        
        # Backward pass
        loss.backward()  # Compute gradients
        optimizer.step()  # Update weights
        total_loss += loss.item()

    # Print average loss for the epoch
    print(f"Epoch {epoch+1}: Loss = {total_loss/len(train_loader):.4f}")

100%|██████████| 782/782 [00:22<00:00, 35.22it/s]


Epoch 1: Loss = 0.6932


100%|██████████| 782/782 [00:22<00:00, 35.42it/s]


Epoch 2: Loss = 0.6944


100%|██████████| 782/782 [00:21<00:00, 35.58it/s]


Epoch 3: Loss = 0.6934


100%|██████████| 782/782 [00:21<00:00, 36.42it/s]


Epoch 4: Loss = 0.6917


100%|██████████| 782/782 [00:21<00:00, 36.79it/s]

Epoch 5: Loss = 0.6898





In [45]:
# Inference function for sentiment prediction
def predict(text):
    """Predict sentiment for given text."""
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():  # Disable gradient computation
        # Encode text using vocabulary
        encoded = torch.tensor(
            [vocab.get(tok, vocab["<unk>"]) for tok in tokenize(text)],
            dtype=torch.long
        ).unsqueeze(1).to(DEVICE)  # Add batch dimension and move to device

        # Get prediction probability
        prob = torch.sigmoid(model(encoded)).item()
        # Return sentiment label and probability
        return "positive" if prob >= 0.5 else "negative", prob

In [46]:
# Test predictions on sample reviews
print("Testing positive review:")
print(predict("This movie was absolutely fantastic"))
print("\nTesting negative review:")
print(predict("Worst acting and boring story"))

Testing positive review:
('negative', 0.41810593008995056)

Testing negative review:
('positive', 0.7637147903442383)
