In [28]:
import torch
from torch import nn
import string
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import re
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm.auto import tqdm

In [29]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [30]:
# Load dataset in pandas

df = pd.read_csv('data/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [31]:
avg_len = int(df['review'].apply(lambda x: len(x.split())).mean())
print("Average review length:", avg_len)

Average review length: 231


In [32]:
lengths = df['review'].apply(lambda x: len(x.split()))
max_len_95 = int(np.percentile(lengths, 95))
print("95th percentile review length:", max_len_95)

95th percentile review length: 590


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [34]:
df['review'].isna().sum()

np.int64(0)

In [35]:
df['sentiment'].isna().sum()

np.int64(0)

In [36]:
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [37]:
# Clean text data
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Lower text just in case
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation)) # str.maketrans is much faster
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove extra space
    text = ' '.join(text.split())
    return text

In [38]:
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [39]:
clean_text(df['review'][1])

'a wonderful little production the filming technique is very unassuming very oldtimebbc fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece the actors are extremely well chosen michael sheen not only has got all the polari but he has all the voices down pat too you can truly see the seamless editing guided by the references to williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece a masterful production about one of the great masters of comedy and his life the realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears it plays on our knowledge and our senses particularly with the scenes concerning orton and halliwell and the sets particularly of their flat with halliwells murals decorating every surface are terribly well done'

In [40]:
## Tokenize and split the words by lowering them and splitting them by ' '
def tokenize_line(line):
    words = line.lower().split(' ')
    return words

In [41]:
# Example usage:
line = 'I love pizza'
print(tokenize_line(line))

['i', 'love', 'pizza']


In [42]:
# Build vocabulary
def build_vocab(tokenized_texts, min_freq = 2):
    counter = Counter()
    for tokens in tokenized_texts:
        counter.update(tokens)
        # print(counter)
    vocab = {"<pad>": 0, "<unk>": 1}
    for word, freq in counter.items():
        # print(f"{word}: {freq}")
        if freq >= min_freq:
            vocab[word] = len(vocab)
            # print(f"Final Vocab: {vocab}")
    return vocab

In [43]:
# Example usage
texts = ["I love this movie", "This movie is hate", "Love it", "I hate this movie"]
tokenized_texts = [tokenize_line(t) for t in texts]
vocab = build_vocab(tokenized_texts)
print(vocab)


{'<pad>': 0, '<unk>': 1, 'i': 2, 'love': 3, 'this': 4, 'movie': 5, 'hate': 6}


In [44]:
vocab

{'<pad>': 0, '<unk>': 1, 'i': 2, 'love': 3, 'this': 4, 'movie': 5, 'hate': 6}

In [45]:
# Function to encode text to int  
def encode_text(tokenized_texts, vocab):
    encoded_texts = []
    for tokens in tokenized_texts:
        encoded = [vocab.get(token, vocab["<unk>"]) for token in tokens]
        # print(f"Encoded: {encoded}")
        encoded_texts.append(encoded)
    return encoded_texts

In [46]:
# Example usage
encoded_texts = encode_text(tokenized_texts=tokenized_texts, vocab= vocab)
print(f"Text after encoding: {encoded_texts}")

Text after encoding: [[2, 3, 4, 5], [4, 5, 1, 6], [3, 1], [2, 6, 4, 5]]


In [47]:
# Add padding for shorter sentences
def pad_sequences(encoded_text, max_len):
    padded_texts = []
    for seq in encoded_text:
        if len(seq) < max_len:
            # Pad with 0's (for <pad)
            seq = seq + [0] * (max_len - len(seq))
        else:
            # Truncate if too long
            seq = seq[:max_len]
        padded_texts.append(seq)
    return padded_texts

In [48]:
# Example usage
max_len = max(len(seq) for seq in encoded_texts) 
pad_sequences(encoded_texts, max_len)

[[2, 3, 4, 5], [4, 5, 1, 6], [3, 1, 0, 0], [2, 6, 4, 5]]

In [49]:
def prepare_vocab(texts, min_freq = 2):
    cleaned = texts.apply(clean_text)
    tokenized = cleaned.apply(tokenize_line).tolist()
    vocab = build_vocab(tokenized, min_freq)
    return vocab, tokenized

In [50]:
def encode_and_pad(tokenized_texts, vocab, max_len = 100):
    encoded = encode_text(tokenized_texts, vocab)
    padded = pad_sequences(encoded, max_len)
    return padded

In [51]:
# Convert to tensors
def convert_to_tensors(X, y):
    X_tensor = torch.tensor(X, dtype = torch.long)
    y_tensor = torch.tensor(y, dtype = torch.long)
    return X_tensor, y_tensor

In [52]:
df['sentiment'] = (df['sentiment'].str.lower() == 'positive').astype(int)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [53]:
# Put all functions into one class

class IMDBDataset(Dataset):
    def __init__(self, df, max_len = 100, min_freq = 10, build_vocab = True, vocab = None):
        """
        df: pandas DataFrame with columns 'review' and 'sentiment'
        max_len: max sequence length for padding
        min_freq: minimum frequency to keep a word in vocab
        build_vocab: True if building vocab from df (train), False for test/new data
        """
        self.max_len = max_len
        self.min_freq = min_freq
        self.vocab = None
        # Encode sentiment to binary labels
        self.labels = df['sentiment'].values
        # Clean and tokenize labels
        self.texts = df['review'].apply(self.clean_text).apply(self.tokenize_line).tolist()

        # Build vocab if required for training
        if build_vocab:
            self.vocab = self.build_vocab(self.texts, self.min_freq)
        else:
            if vocab is None:
                raise ValueError("Vocab must be provided if build_vocab is False")
            self.vocab = vocab
        self.encoded_texts = self.encode_and_pad(self.texts, self.vocab, self.max_len)
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        # Return encoded tensor and label tensor
        return torch.tensor(self.encoded_texts[idx], dtype = torch.long), torch.tensor(self.labels[idx], dtype = torch.long)
    @staticmethod
    def clean_text(text):
        # Remove HTML tags
        text = re.sub(r'<.*?>', '', text)
        # Lower text just in case
        text = text.lower()
        # Remove punctuation
        text = text.translate(str.maketrans("", "", string.punctuation)) # str.maketrans is much faster
        # Remove digits
        text = re.sub(r'\d+', '', text)
        # Remove extra space
        text = ' '.join(text.split())
        return text
    @staticmethod
    def tokenize_line(line):
        words = line.lower().split(' ')
        return words 
    # Build vocabulary
    @staticmethod
    def build_vocab(tokenized_texts, min_freq = 2):
        counter = Counter()
        for tokens in tokenized_texts:
            counter.update(tokens)
            # print(counter)
        vocab = {"<pad>": 0, "<unk>": 1}
        for word, freq in counter.items():
            # print(f"{word}: {freq}")
            if freq >= min_freq:
                vocab[word] = len(vocab)
                # print(f"Final Vocab: {vocab}")
        return vocab
    def encode_and_pad(self, tokenized_texts, vocab, max_len):
        encoded = []
        for tokens in tokenized_texts:
            enc = [vocab.get(token, 1) for token in tokens]
            # pad or truncate
            if len(enc) < max_len:
                enc.extend([0] * (max_len - len(enc)))
            else:
                enc = enc[:max_len]
            encoded.append(enc)
        return encoded
    def encode_text(self, text):
        # Clean, tokenize, encode, and a pad a single string (for new data)
        clean = self.clean_text(text)
        tokens = self.tokenize_line(clean)
        enc = [self.vocab.get(token, 1) for token in tokens]
        if len(enc) < self.max_len:
            enc.extend([0] * (self.max_len - len(enc)))
        else:
            enc = enc[:self.max_len]
        return torch.tensor(enc, dtype = torch.long)

In [54]:
train_df, test_df = train_test_split(df, test_size = 0.2, stratify = df['sentiment'], random_state = 42)
train_df, val_df = train_test_split(train_df, test_size=0.1, stratify=train_df['sentiment'], random_state=42)

# Create train_dataset
train_dataset = IMDBDataset(train_df, build_vocab=True, max_len=250)
val_dataset = IMDBDataset(val_df, build_vocab=False, vocab=train_dataset.vocab, max_len=250)
test_dataset = IMDBDataset(test_df, build_vocab=False, vocab=train_dataset.vocab, max_len=250)

In [55]:
# Use dataloader to load data into model
torch.manual_seed(42)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

In [56]:
train_features_batch, train_labels_batch = next(iter(train_loader))

In [57]:
train_features_batch.shape, train_labels_batch.shape

(torch.Size([32, 250]), torch.Size([32]))

In [58]:
from timeit import default_timer as timer

def print_train_time(start: float, end: float, device: torch.device = None):
    """Prints difference between start and end time."""
    total_time = end - start
    print(f"Train time on {device}: {total_time:.3f} seconds")
    return total_time

In [59]:
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        # Embedding layer: converts vocab to dense vectors of size embed_dim
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx= 0)
        # RNN layer: processes sequences of embeddings, outputs hidden states of size hidden_dim
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        # Fully connected layer: maps the final hidden state to output_dim (e.g. number of classes)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Pass input indices through embedding layer -> shape: (batch_size, seq_len, embed_dim)
        embedded = self.embedding(x)
        # Pass embeddings through RNN -> output (all hidden states), hidden (last hidden state)
        output, hidden = self.rnn(embedded)
        # Use last hidden state for classification; squeeze removes the extra dimension -> shape: (batch_size, hidden_dim)
        out = self.fc(hidden.squeeze(0))
        # Return the logits (unnormalized scores) for each class
        return out

In [60]:
model_v1 = SentimentRNN(
    vocab_size= len(train_dataset.vocab),
    embed_dim = 64, 
    hidden_dim= 128,
    output_dim=1
).to(device)

In [61]:
model_v1.state_dict()

OrderedDict([('embedding.weight',
              tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
                      [ 0.3466, -0.1973, -1.0546,  ...,  0.5069, -0.4752, -0.4920],
                      [-0.1360,  1.6354,  0.6547,  ..., -0.7251,  0.4664,  0.6667],
                      ...,
                      [ 0.7116, -0.7279, -0.3306,  ...,  1.9188,  0.6521,  1.6782],
                      [-1.0569, -0.1974, -1.5052,  ...,  0.3588, -1.0410,  0.0264],
                      [ 2.4404,  0.7952, -1.9100,  ..., -0.1307,  0.6438, -0.4163]],
                     device='cuda:0')),
             ('rnn.weight_ih_l0',
              tensor([[ 0.0782,  0.0164,  0.0417,  ..., -0.0743, -0.0743, -0.0678],
                      [ 0.0316, -0.0161, -0.0125,  ..., -0.0736, -0.0491,  0.0744],
                      [-0.0772,  0.0281,  0.0540,  ...,  0.0837, -0.0537, -0.0848],
                      ...,
                      [-0.0734,  0.0384, -0.0134,  ..., -0.0033,  0.0793,  0.0516

In [62]:
optimizer = torch.optim.Adam(model_v1.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()

In [63]:
def accuracy_fn(preds, labels):
    return (preds == labels).sum().item() / len(labels)

In [64]:
def train_step(model:nn.Module,
                train_dataloader: torch.utils.data,
                optimizer: torch.optim,
                loss_fn: torch.nn.Module,
                device: torch.device= device):
    model.train()
    train_loss, train_acc = 0,0
    for X, y in train_dataloader:
        X = X.to(device)
        y = y.to(device)
        # Reset optimizer to 0 
        optimizer.zero_grad()
        # Make preds
        y_pred = model(X).squeeze(1)
        # Calcualte loss
        loss = loss_fn(y_pred, y.float())
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * X.size(0) # Multiply just in case last batch size is less than 32
        preds = (torch.sigmoid(y_pred) >= 0.5).float()
        train_acc += accuracy_fn(preds, y) * X.size(0)
    avg_loss = train_loss / len(train_dataloader.dataset)
    avg_acc = train_acc / len(train_dataloader.dataset)
    print(f"Train Loss: {avg_loss:.4f} | Train Accuracy: {avg_acc:.4f}")
    return avg_loss, avg_acc

In [65]:
def validate_step(model: nn.Module,
                  test_dataloader: torch.utils.data,
                  loss_fn: torch.nn.Module,
                  device: torch.device = device):
    test_loss, test_acc = 0, 0
    total_samples = 0
    model.eval()
    with torch.inference_mode():
        for X_test, y_test in test_dataloader:
            X_test, y_test = X_test.to(device), y_test.to(device)
            test_pred = model(X_test).squeeze(1)
            # Pass raw logits to loss_fn
            test_loss += loss_fn(test_pred, y_test.float()).item() * X_test.size(0)
            preds = (torch.sigmoid(test_pred) >= 0.5).float()
            test_acc += (preds == y_test).sum().item()
            total_samples += y_test.size(0)
    avg_loss = test_loss / total_samples
    avg_acc = test_acc / total_samples
    print(f"Test Loss: {avg_loss:.4f} |  Test Accuracy: {avg_acc:.4f}")
    return avg_loss


In [66]:
epochs = 5
torch.manual_seed(42)
train_time_start_gpu= timer()
for epoch in tqdm(range(epochs)):
    print(f"Epoch: {epoch}---\n")
    train_step(
        model= model_v1,
        train_dataloader=train_loader,
        optimizer = optimizer, 
        loss_fn=loss_fn, 
        )
    validate_step(model= model_v1,
        test_dataloader=val_loader,
        loss_fn=loss_fn, 
        )
train_time_end_gpu = timer()
total_train_time_model_1 = print_train_time(
    start = train_time_start_gpu,
    end = train_time_end_gpu,
    device=str(next(model_v1.parameters()).device)
)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 0---

Train Loss: 0.6956 | Train Accuracy: 0.4975
Test Loss: 0.6933 |  Test Accuracy: 0.5008
Epoch: 1---

Train Loss: 0.6959 | Train Accuracy: 0.5079
Test Loss: 0.6949 |  Test Accuracy: 0.5022
Epoch: 2---

Train Loss: 0.6962 | Train Accuracy: 0.5003
Test Loss: 0.6965 |  Test Accuracy: 0.4978
Epoch: 3---

Train Loss: 0.6957 | Train Accuracy: 0.5024
Test Loss: 0.6938 |  Test Accuracy: 0.5025
Epoch: 4---

Train Loss: 0.6941 | Train Accuracy: 0.5118
Test Loss: 0.6939 |  Test Accuracy: 0.5038
Train time on cuda:0: 42.125 seconds


In [67]:
torch.cuda.empty_cache()

In [68]:
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, dropout=0.3):
        super().__init__()
        # Embedding layer: converts vocab to dense vectors of size embed_dim
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx = 0)
        # RNN layer: processes sequences of embeddings, outputs hidden states of size hidden_dim
        self.rnn = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)  # Add dropout layer

        # Fully connected layer: maps the final hidden state to output_dim (e.g. number of classes)
        self.fc = nn.Linear(hidden_dim, output_dim)
    def forward(self, x):
        # Pass input indices through embedding layer -> shape: (batch_size, seq_len, embed_dim)
        embedded = self.embedding(x)
        # Pass embeddings through RNN -> output (all hidden states), hidden (last hidden state)
        output, (hidden, cell) = self.rnn(embedded)
        # Use last hidden state for classification; squeeze removes the extra dimension -> shape: (batch_size, hidden_dim)
        hidden = self.dropout(hidden.squeeze(0))  # Apply dropout here
        out = self.fc(hidden)
        return out

In [69]:
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0.0, path='checkpoint.pt', verbose=False):
        """
        Args:
            patience (int): Number of epochs to wait after last improvement before stopping.
            min_delta (float): Minimum change in the monitored quantity to qualify as an improvement.
            path (str): File path to save the best model.
            verbose (bool): If True, prints messages during training.
        """
        self.patience = patience
        self.min_delta = min_delta
        self.path = path
        self.verbose = verbose
        self.counter = 0  # Counts how many epochs have passed without improvement
        self.best_score = None  # Tracks the best (lowest) validation loss seen so far
        self.early_stop = False  # Flag to indicate if early stopping should trigger

    def __call__(self, val_loss, model):
        """
        Call this method after each epoch with the current validation loss and model.
        """
        score = -val_loss  # Since lower loss is better, we negate to treat higher as better

        # First call: set the initial best score and save the model
        if self.best_score is None:
            self.best_score = score
            self._save_checkpoint(model)

        # If no significant improvement
        elif score < self.best_score + self.min_delta:
            self.counter += 1  # Increase counter since no improvement
            if self.verbose:
                print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
            if self.counter >= self.patience:
                # If patience is exceeded, trigger early stopping
                self.early_stop = True

        # If improved
        else:
            self.best_score = score  # Update best score
            self._save_checkpoint(model)  # Save model checkpoint
            self.counter = 0  # Reset counter

    def _save_checkpoint(self, model):
        """
        Saves model state_dict to disk if validation loss improves.
        """
        if self.verbose:
            print(f"Validation loss improved. Saving model to {self.path}")
        torch.save(model.state_dict(), self.path)

In [70]:
model_v2 = SentimentLSTM(
    vocab_size= len(train_dataset.vocab),
    embed_dim = 64, 
    hidden_dim= 256,
    output_dim=1,
).to(device)

In [71]:
optimizer = torch.optim.Adam(params= model_v2.parameters(),
                            lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()

In [72]:
early_stopping = EarlyStopping(patience=7, min_delta=0.01, verbose=True)

In [None]:
epochs = 15
torch.manual_seed(42)
train_time_start_gpu= timer()
for epoch in tqdm(range(epochs)):
    print(f"Epoch: {epoch}---\n")
    train_step(
        model= model_v2,
        train_dataloader=train_loader,
        optimizer = optimizer, 
        loss_fn=loss_fn, 
        )
    val_loss = validate_step(model= model_v2,
        test_dataloader=val_loader,
        loss_fn=loss_fn, 
        )
    early_stopping(val_loss, model_v2)
    if early_stopping.early_stop:
        print("Early stopping triggered")
        break
train_time_end_gpu = timer()
total_train_time_model_2 = print_train_time(
    start = train_time_start_gpu,
    end = train_time_end_gpu,
    device=str(next(model_v2.parameters()).device)
)

  0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 0---

Train Loss: 0.6940 | Train Accuracy: 0.5053
Test Loss: 0.6932 |  Test Accuracy: 0.5082
Validation loss improved. Saving model to checkpoint.pt
Epoch: 1---

Train Loss: 0.6905 | Train Accuracy: 0.5159
Test Loss: 0.6902 |  Test Accuracy: 0.5238
EarlyStopping counter: 1 out of 7
Epoch: 2---

Train Loss: 0.6752 | Train Accuracy: 0.5731
Test Loss: 0.6944 |  Test Accuracy: 0.5082
EarlyStopping counter: 2 out of 7
Epoch: 3---

Train Loss: 0.6190 | Train Accuracy: 0.6233
Test Loss: 0.4390 |  Test Accuracy: 0.8093
Validation loss improved. Saving model to checkpoint.pt
Epoch: 4---

Train Loss: 0.3330 | Train Accuracy: 0.8624
Test Loss: 0.3163 |  Test Accuracy: 0.8690
Validation loss improved. Saving model to checkpoint.pt
Epoch: 5---

Train Loss: 0.2222 | Train Accuracy: 0.9163
Test Loss: 0.3214 |  Test Accuracy: 0.8708
EarlyStopping counter: 1 out of 7
Epoch: 6---

Train Loss: 0.1478 | Train Accuracy: 0.9499
Test Loss: 0.3679 |  Test Accuracy: 0.8652
EarlyStopping counter: 2 out o

In [74]:
torch.cuda.empty_cache()