In [1]:
import torch
from torch import nn
import string
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import re
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm.auto import tqdm

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:
# Load dataset in pandas

df = pd.read_csv('data/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
avg_len = int(df['review'].apply(lambda x: len(x.split())).mean())
print("Average review length:", avg_len)

Average review length: 231


In [5]:
lengths = df['review'].apply(lambda x: len(x.split()))
max_len_95 = int(np.percentile(lengths, 95))
print("95th percentile review length:", max_len_95)

95th percentile review length: 590


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [7]:
df['review'].isna().sum()

np.int64(0)

In [8]:
df['sentiment'].isna().sum()

np.int64(0)

In [9]:
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [10]:
# Clean text data
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Lower text just in case
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation)) # str.maketrans is much faster
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove extra space
    text = ' '.join(text.split())
    return text

In [11]:
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [12]:
clean_text(df['review'][1])

'a wonderful little production the filming technique is very unassuming very oldtimebbc fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece the actors are extremely well chosen michael sheen not only has got all the polari but he has all the voices down pat too you can truly see the seamless editing guided by the references to williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece a masterful production about one of the great masters of comedy and his life the realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears it plays on our knowledge and our senses particularly with the scenes concerning orton and halliwell and the sets particularly of their flat with halliwells murals decorating every surface are terribly well done'

In [13]:
## Tokenize and split the words by lowering them and splitting them by ' '
def tokenize_line(line):
    words = line.lower().split(' ')
    return words

In [14]:
# Example usage:
line = 'I love pizza'
print(tokenize_line(line))

['i', 'love', 'pizza']


In [15]:
# Build vocabulary
def build_vocab(tokenized_texts, min_freq = 2):
    counter = Counter()
    for tokens in tokenized_texts:
        counter.update(tokens)
        # print(counter)
    vocab = {"<pad>": 0, "<unk>": 1}
    for word, freq in counter.items():
        # print(f"{word}: {freq}")
        if freq >= min_freq:
            vocab[word] = len(vocab)
            # print(f"Final Vocab: {vocab}")
    return vocab

In [16]:
# Example usage
texts = ["I love this movie", "This movie is hate", "Love it", "I hate this movie"]
tokenized_texts = [tokenize_line(t) for t in texts]
vocab = build_vocab(tokenized_texts)
print(vocab)


{'<pad>': 0, '<unk>': 1, 'i': 2, 'love': 3, 'this': 4, 'movie': 5, 'hate': 6}


In [17]:
vocab

{'<pad>': 0, '<unk>': 1, 'i': 2, 'love': 3, 'this': 4, 'movie': 5, 'hate': 6}

In [18]:
# Function to encode text to int  
def encode_text(tokenized_texts, vocab):
    encoded_texts = []
    for tokens in tokenized_texts:
        encoded = [vocab.get(token, vocab["<unk>"]) for token in tokens]
        # print(f"Encoded: {encoded}")
        encoded_texts.append(encoded)
    return encoded_texts

In [19]:
# Example usage
encoded_texts = encode_text(tokenized_texts=tokenized_texts, vocab= vocab)
print(f"Text after encoding: {encoded_texts}")

Text after encoding: [[2, 3, 4, 5], [4, 5, 1, 6], [3, 1], [2, 6, 4, 5]]


In [20]:
# Add padding for shorter sentences
def pad_sequences(encoded_text, max_len):
    padded_texts = []
    for seq in encoded_text:
        if len(seq) < max_len:
            # Pad with 0's (for <pad)
            seq = seq + [0] * (max_len - len(seq))
        else:
            # Truncate if too long
            seq = seq[:max_len]
        padded_texts.append(seq)
    return padded_texts

In [21]:
# Example usage
max_len = max(len(seq) for seq in encoded_texts) 
pad_sequences(encoded_texts, max_len)

[[2, 3, 4, 5], [4, 5, 1, 6], [3, 1, 0, 0], [2, 6, 4, 5]]

In [22]:
def prepare_vocab(texts, min_freq = 2):
    cleaned = texts.apply(clean_text)
    tokenized = cleaned.apply(tokenize_line).tolist()
    vocab = build_vocab(tokenized, min_freq)
    return vocab, tokenized

In [23]:
def encode_and_pad(tokenized_texts, vocab, max_len = 100):
    encoded = encode_text(tokenized_texts, vocab)
    padded = pad_sequences(encoded, max_len)
    return padded

In [24]:
# Convert to tensors
def convert_to_tensors(X, y):
    X_tensor = torch.tensor(X, dtype = torch.long)
    y_tensor = torch.tensor(y, dtype = torch.long)
    return X_tensor, y_tensor

In [25]:
# Put all functions into one class

class IMDBDataset(Dataset):
    def __init__(self, df, max_len = 100, min_freq = 10, build_vocab = True, vocab = None):
        """
        df: pandas DataFrame with columns 'review' and 'sentiment'
        max_len: max sequence length for padding
        min_freq: minimum frequency to keep a word in vocab
        build_vocab: True if building vocab from df (train), False for test/new data
        """
        self.max_len = max_len
        self.min_freq = min_freq
        self.vocab = None
        # Encode sentiment to binary labels
        df['sentiment'] = (df['sentiment'].str.lower() == 'positive').astype(int)
        self.labels = df['sentiment'].values
        # Clean and tokenize labels
        self.texts = df['review'].apply(self.clean_text).apply(self.tokenize_line).tolist()

        # Build vocab if required for training
        if build_vocab:
            self.vocab = self.build_vocab(self.texts, self.min_freq)
        else:
            if vocab is None:
                raise ValueError("Vocab must be provided if build_vocab is False")
            self.vocab = vocab
        self.encoded_texts = self.encode_and_pad(self.texts, self.vocab, self.max_len)
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        # Return encoded tensor and label tensor
        return torch.tensor(self.encoded_texts[idx], dtype = torch.long), torch.tensor(self.labels[idx], dtype = torch.long)
    @staticmethod
    def clean_text(text):
        # Remove HTML tags
        text = re.sub(r'<.*?>', '', text)
        # Lower text just in case
        text = text.lower()
        # Remove punctuation
        text = text.translate(str.maketrans("", "", string.punctuation)) # str.maketrans is much faster
        # Remove digits
        text = re.sub(r'\d+', '', text)
        # Remove extra space
        text = ' '.join(text.split())
        return text
    @staticmethod
    def tokenize_line(line):
        words = line.lower().split(' ')
        return words 
    # Build vocabulary
    @staticmethod
    def build_vocab(tokenized_texts, min_freq = 2):
        counter = Counter()
        for tokens in tokenized_texts:
            counter.update(tokens)
            # print(counter)
        vocab = {"<pad>": 0, "<unk>": 1}
        for word, freq in counter.items():
            # print(f"{word}: {freq}")
            if freq >= min_freq:
                vocab[word] = len(vocab)
                # print(f"Final Vocab: {vocab}")
        return vocab
    def encode_and_pad(self, tokenized_texts, vocab, max_len):
        encoded = []
        for tokens in tokenized_texts:
            enc = [vocab.get(token, 0) for token in tokens]
            # pad or truncate
            if len(enc) < max_len:
                enc.extend([0] * (max_len - len(enc)))
            else:
                enc = enc[:max_len]
            encoded.append(enc)
        return encoded
    def encode_text(self, text):
        # Clean, tokenize, encode, and a pad a single string (for new data)
        clean = self.clean_text(text)
        tokens = self.tokenize_line(clean)
        enc = [self.vocab.get(token, 0) for token in tokens]
        if len(enc) < self.max_len:
            enc.extend([0] * (self.max_len - len(enc)))
        else:
            enc = enc[:self.max_len]
        return torch.tensor(enc, dtype = torch.long)

In [26]:
train_df, test_df = train_test_split(df, test_size = 0.2, stratify = df['sentiment'], random_state = 42)
train_df, val_df = train_test_split(train_df, test_size=0.1, stratify=train_df['sentiment'], random_state=42)

# Create train_dataset
train_dataset = IMDBDataset(train_df, build_vocab=True, max_len=250)
val_dataset = IMDBDataset(val_df, build_vocab=False, vocab=train_dataset.vocab, max_len=250)
test_dataset = IMDBDataset(test_df, build_vocab=False, vocab=train_dataset.vocab, max_len=250)

In [27]:
# Use dataloader to load data into model
torch.manual_seed(42)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

In [28]:
train_features_batch, train_labels_batch = next(iter(train_loader))

In [29]:
train_features_batch.shape, train_labels_batch.shape

(torch.Size([32, 250]), torch.Size([32]))

In [30]:
from timeit import default_timer as timer

def print_train_time(start: float, end: float, device: torch.device = None):
    """Prints difference between start and end time."""
    total_time = end - start
    print(f"Train time on {device}: {total_time:.3f} seconds")
    return total_time

In [35]:
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        # Embedding layer: converts vocab to dense vectors of size embed_dim
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx= 0)
        # RNN layer: processes sequences of embeddings, outputs hidden states of size hidden_dim
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        # Fully connected layer: maps the final hidden state to output_dim (e.g. number of classes)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Pass input indices through embedding layer -> shape: (batch_size, seq_len, embed_dim)
        embedded = self.embedding(x)
        # Pass embeddings through RNN -> output (all hidden states), hidden (last hidden state)
        output, hidden = self.rnn(embedded)
        # Use last hidden state for classification; squeeze removes the extra dimension -> shape: (batch_size, hidden_dim)
        out = self.fc(hidden.squeeze(0))
        # Return the logits (unnormalized scores) for each class
        return out

In [36]:
model_v1 = SentimentRNN(
    vocab_size= len(train_dataset.vocab),
    embed_dim = 64, 
    hidden_dim= 128,
    output_dim=1
).to(device)

In [37]:
model_v1.state_dict()

OrderedDict([('embedding.weight',
              tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
                      [-0.0236,  0.0073,  0.1665,  ..., -1.9107,  0.5609,  1.1578],
                      [-1.1962,  0.7516,  0.6245,  ...,  1.7858,  0.2279, -1.4715],
                      ...,
                      [ 1.7592, -0.5671, -1.2568,  ..., -2.5005, -1.1814, -0.3040],
                      [-0.5326, -0.3518, -1.9541,  ...,  0.8721, -0.0382, -1.0360],
                      [ 0.1699, -1.0569, -1.6501,  ..., -3.3290, -0.9403,  1.5975]],
                     device='cuda:0')),
             ('rnn.weight_ih_l0',
              tensor([[-0.0759, -0.0517,  0.0696,  ...,  0.0183,  0.0353, -0.0152],
                      [ 0.0505, -0.0091, -0.0035,  ..., -0.0094, -0.0123, -0.0780],
                      [ 0.0166,  0.0053,  0.0391,  ...,  0.0460, -0.0070, -0.0042],
                      ...,
                      [-0.0347, -0.0638,  0.0422,  ..., -0.0403, -0.0205, -0.0863

In [38]:
optimizer = torch.optim.Adam(model_v1.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()

In [39]:
def accuracy_fn(preds, labels):
    return (preds == labels).sum().item() / len(labels)

In [40]:
def train_step(model:nn.Module,
                train_dataloader: torch.utils.data,
                optimizer: torch.optim,
                loss_fn: torch.nn.Module,
                device: torch.device= device):
    model.train()
    train_loss, train_acc = 0,0
    for X, y in train_dataloader:
        X = X.to(device)
        y = y.to(device)
        # Reset optimizer to 0 
        optimizer.zero_grad()
        # Make preds
        y_pred = model(X).squeeze(1)
        # Calcualte loss
        loss = loss_fn(y_pred, y.float())
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * X.size(0) # Multiply just in case last batch size is less than 32
        preds = (torch.sigmoid(y_pred) >= 0.5).float()
        train_acc += accuracy_fn(preds, y) * X.size(0)
    avg_loss = train_loss / len(train_dataloader.dataset)
    avg_acc = train_acc / len(train_dataloader.dataset)
    print(f"Train Loss: {avg_loss:.4f} | Train Accuracy: {avg_acc:.4f}")
    return avg_loss, avg_acc

In [41]:
def validate_step(model: nn.Module,
                  test_dataloader: torch.utils.data,
                  loss_fn: torch.nn.Module,
                  device: torch.device = device):
    test_loss, test_acc = 0, 0
    total_samples = 0
    model.eval()
    with torch.inference_mode():
        for X_test, y_test in test_dataloader:
            X_test, y_test = X_test.to(device), y_test.to(device)
            test_pred = model(X_test).squeeze(1)
            # Pass raw logits to loss_fn
            test_loss += loss_fn(test_pred, y_test.float()).item() * X_test.size(0)
            preds = (torch.sigmoid(test_pred) >= 0.5).float()
            test_acc += (preds == y_test).sum().item()
            total_samples += y_test.size(0)
    avg_loss = test_loss / total_samples
    avg_acc = test_acc / total_samples
    print(f"Test Loss: {avg_loss:.4f} |  Test Accuracy: {avg_acc:.4f}")

In [42]:
epochs = 50
torch.manual_seed(42)
train_time_start_gpu= timer()
for epoch in tqdm(range(epochs)):
    print(f"Epoch: {epoch}---\n")
    train_step(
        model= model_v1,
        train_dataloader=train_loader,
        optimizer = optimizer, 
        loss_fn=loss_fn, 
        )
    validate_step(model= model_v1,
        test_dataloader=val_loader,
        loss_fn=loss_fn, 
        )
train_time_end_gpu = timer()
total_train_time_model_1 = print_train_time(
    start = train_time_start_gpu,
    end = train_time_end_gpu,
    device=str(next(model_v1.parameters()).device)
)

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0---

Train Loss: 0.6952 | Train Accuracy: 0.5010
Test Loss: 0.6952 |  Test Accuracy: 0.5012
Epoch: 1---

Train Loss: 0.6972 | Train Accuracy: 0.5021
Test Loss: 0.7013 |  Test Accuracy: 0.4883
Epoch: 2---

Train Loss: 0.6949 | Train Accuracy: 0.5078
Test Loss: 0.6944 |  Test Accuracy: 0.4868
Epoch: 3---

Train Loss: 0.6918 | Train Accuracy: 0.5208
Test Loss: 0.6957 |  Test Accuracy: 0.5095
Epoch: 4---

Train Loss: 0.6832 | Train Accuracy: 0.5427
Test Loss: 0.6951 |  Test Accuracy: 0.5162
Epoch: 5---

Train Loss: 0.6717 | Train Accuracy: 0.5614
Test Loss: 0.6776 |  Test Accuracy: 0.5975
Epoch: 6---

Train Loss: 0.6644 | Train Accuracy: 0.5781
Test Loss: 0.7078 |  Test Accuracy: 0.5347
Epoch: 7---

Train Loss: 0.6634 | Train Accuracy: 0.5651
Test Loss: 0.7044 |  Test Accuracy: 0.5175
Epoch: 8---

Train Loss: 0.6345 | Train Accuracy: 0.6268
Test Loss: 0.6789 |  Test Accuracy: 0.6155
Epoch: 9---

Train Loss: 0.6209 | Train Accuracy: 0.6493
Test Loss: 0.7237 |  Test Accuracy: 0.5108


In [43]:
torch.cuda.empty_cache()