In [19]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertModel
from transformers import Trainer, TrainingArguments
from transformers import BertTokenizer
from datasets import load_dataset
import math

In [20]:
class SimpleSelfAttention(nn.Module):
    """Scaled dot-product self-attention with an optional number of heads.

    The input is projected to queries, keys and values by three linear
    layers, split into ``num_heads`` heads of size
    ``embedding_dim // num_heads``, attended independently per head, merged
    back together and passed through a final output projection.  With the
    default ``num_heads=1`` this reduces to plain single-head attention
    scaled by ``sqrt(embedding_dim)``.

    Args:
        embedding_dim: Size of the last dimension of the input and output.
        num_heads: Number of attention heads; must be a positive divisor of
            ``embedding_dim``.

    Raises:
        ValueError: If ``num_heads`` is not a positive divisor of
            ``embedding_dim``.
    """

    def __init__(self,embedding_dim, num_heads=1):
        super().__init__()
        # Check positivity first so the modulo below can never divide by zero.
        if num_heads < 1 or embedding_dim % num_heads != 0:
            raise ValueError(
                f"embedding_dim ({embedding_dim}) must be divisible by "
                f"num_heads ({num_heads})"
            )
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.head_dim = embedding_dim // num_heads
        # Layer creation order is kept (query, key, value, out_proj) so that a
        # seeded construction yields the same initial weights as before.
        self.query = nn.Linear(embedding_dim, embedding_dim)
        self.key = nn.Linear(embedding_dim, embedding_dim)
        self.value = nn.Linear(embedding_dim, embedding_dim)
        self.out_proj = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, x, attention_mask=None):
        """Apply self-attention over the second-to-last dimension of ``x``.

        Args:
            x: Tensor of shape ``(..., seq_len, embedding_dim)``.
            attention_mask: Optional tensor of shape ``(..., seq_len)``;
                positions whose value is zero/False are excluded as attention
                targets (keys).  ``None`` attends to every position.

        Returns:
            Tensor with the same shape as ``x``.
        """
        *lead_shape, seq_len, _ = x.shape

        def split_heads(t):
            # (..., seq, dim) -> (..., heads, seq, head_dim)
            t = t.reshape(*lead_shape, seq_len, self.num_heads, self.head_dim)
            return t.transpose(-3, -2)

        q = split_heads(self.query(x))
        k = split_heads(self.key(x))
        v = split_heads(self.value(x))

        # Scale by the per-head width; equals sqrt(embedding_dim) for one head.
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if attention_mask is not None:
            # Broadcast the key mask over the head and query dimensions.
            keep = attention_mask.to(torch.bool)[..., None, None, :]
            # Use the dtype's most negative finite value rather than -inf so a
            # fully masked row produces finite weights instead of NaN.
            scores = scores.masked_fill(~keep, torch.finfo(scores.dtype).min)
        attention = torch.softmax(scores, dim=-1)
        attention_out = torch.matmul(attention, v)

        # (..., heads, seq, head_dim) -> (..., seq, dim)
        merged = attention_out.transpose(-3, -2).reshape(
            *lead_shape, seq_len, self.embedding_dim
        )
        return self.out_proj(merged)


In [21]:
def test_attention():
    """Smoke-test that SimpleSelfAttention returns a tensor shaped like its input.

    Builds a random (2, 4, 8) input, runs it through a fresh attention module
    and asserts that the output shape equals the input shape.  Prints a
    confirmation message on success.
    """
    # (batch size, sequence length, embedding dimension)
    expected_shape = (2, 4, 8)

    # The input is drawn before the module is built, as in the original order.
    x = torch.randn(*expected_shape)
    attention = SimpleSelfAttention(expected_shape[-1])

    output = attention(x)
    assert output.shape == expected_shape, f"expected shape {expected_shape}, got {output.shape}"
    print("attention test passed")

In [22]:
# Run the shape smoke test for SimpleSelfAttention; it prints a confirmation
# message when its assertion holds.
test_attention()

attention test passed


In [23]:
# Load the 'rotten_tomatoes' dataset through Hugging Face `datasets`; the
# result is indexed by split name ('train', 'test') further down.
dataset = load_dataset('rotten_tomatoes')

Using the latest cached version of the dataset since rotten_tomatoes couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/rohit/.cache/huggingface/datasets/rotten_tomatoes/default/0.0.0/aa13bc287fa6fcab6daf52f0dfb9994269ffea28 (last modified on Fri Apr 25 14:50:07 2025).


In [24]:
# Pull out the two splits used in this notebook: one for fine-tuning, one held out.
train_dataset, test_dataset = dataset['train'], dataset['test']

In [25]:
# Peek at the first raw training example to see the record layout
# (a 'text' string and an integer 'label').
print(train_dataset[0])

{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}


In [26]:
# Tokenizer shared by the preprocessing and prediction cells.
# NOTE(review): this loads the "bert-base-uncased" tokenizer, while the
# classifier defined later loads "distilbert-base-uncased" weights. Presumably
# both checkpoints share the same vocabulary — confirm, or load the tokenizer
# from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [27]:
def tokenize(example):
    """Tokenize the ``'text'`` field of ``example`` with the notebook-level tokenizer.

    Truncation and ``'max_length'`` padding are requested with a maximum
    length of 128.  The tokenizer's result is returned unchanged.
    """
    encode_options = dict(truncation=True, padding='max_length', max_length=128)
    return tokenizer(example['text'], **encode_options)


In [28]:
# Apply `tokenize` to both splits in batched mode, producing new dataset
# objects and leaving the raw splits available under their original names.
tokenized_train = train_dataset.map(tokenize, batched=True)
tokenized_test = test_dataset.map(tokenize, batched=True)

In [29]:
# Expose only the columns the training loop reads, as torch tensors.
# The same three columns are selected for both splits so their batches match.
tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "label"])
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [30]:
# Inspect one formatted training example to check the tensor columns and padding.
tokenized_train[1]

{'label': tensor(1),
 'input_ids': tensor([  101,  1996,  9882,  2135,  9603, 13633,  1997,  1000,  1996,  2935,
          1997,  1996,  7635,  1000, 11544,  2003,  2061,  4121,  2008,  1037,
          5930,  1997,  2616,  3685, 23613,  6235,  2522,  1011,  3213,  1013,
          2472,  2848,  4027,  1005,  1055,  4423,  4432,  1997,  1046,  1012,
          1054,  1012,  1054,  1012, 23602,  1005,  1055,  2690,  1011,  3011,
          1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,   

In [41]:
class BertClassifier(nn.Module):
    """Sequence classifier: pretrained DistilBERT encoder, dropout, linear head.

    Despite the class name, the encoder is loaded from the
    ``distilbert-base-uncased`` checkpoint.

    Args:
        dropout: Dropout probability applied before the classification head.
        num_classes: Number of output logits per example.
    """

    def __init__(self, dropout = 0.3, num_classes=2):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.dropout = nn.Dropout(p=dropout)
        # The head's input width is taken from the encoder's own config.
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of token ids and masks."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the sequence summary.
        first_token = encoded.last_hidden_state[:, 0]
        regularized = self.dropout(first_token)
        return self.classifier(regularized)

In [42]:
# Batches of 16 for both splits; only the training split is reshuffled.
train_loader = DataLoader(dataset=tokenized_train, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=tokenized_test, batch_size=16, shuffle=False)

In [43]:
# Seed PyTorch's global RNG before anything random happens in this run: the
# classifier head's initial weights, dropout masks and the shuffled training
# order all draw from it, so without a seed every run gives different numbers.
SEED = 42
torch.manual_seed(SEED)

# Model, optimizer and loss used by the fine-tuning loop below.
model = BertClassifier()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()


In [44]:
# Fine-tune for three passes over the training loader, reporting the mean
# per-batch loss and the training accuracy after each pass.
model.train()
for epoch in range(3):
    # Per-epoch running statistics.
    total_loss, correct, total = 0, 0, 0

    for batch in train_loader:
        labels = batch["label"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]

        # Clear gradients left over from the previous step, then forward/backward.
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        # Accuracy is measured on the same forward pass that produced the loss.
        total_loss += loss.item()
        preds = outputs.argmax(dim=1)
        correct += int((preds == labels).sum())
        total += len(labels)

    avg_loss = total_loss / len(train_loader)
    accuracy = correct / total
    print(f"Epoch {epoch + 1} — Loss: {avg_loss:.4f} — Accuracy: {accuracy:.2%}")

Epoch 1 — Loss: 0.4228 — Accuracy: 79.95%
Epoch 2 — Loss: 0.2273 — Accuracy: 91.03%
Epoch 3 — Loss: 0.1109 — Accuracy: 95.80%


In [45]:
# Hand-written reviews for a quick qualitative check of the fine-tuned model:
# one clearly positive, one clearly negative, one deliberately lukewarm.
sample_text = ['This movie was fantastic! I really enjoyed it.',
               'Terrible movie. Waste of time and money.',
               'It was okay. Not too good not too bad.']

In [46]:
def predict_batch(text_list):
    """Classify a batch of texts with the notebook-level ``model`` and ``tokenizer``.

    Args:
        text_list: Texts to classify, passed to the tokenizer as one batch.

    Returns:
        A list with one label per input text, ``"Negative"`` for class index 0
        and ``"Positive"`` for class index 1.  An empty list or tuple yields
        an empty list.

    Note:
        Calling this switches ``model`` to evaluation mode and leaves it there.
    """
    # Nothing to tokenize or score for an empty batch; answer directly instead
    # of handing the tokenizer and model an input with no sequences.
    if isinstance(text_list, (list, tuple)) and len(text_list) == 0:
        return []

    # Tokenize the entire list at once, with the same settings used for training.
    encoding = tokenizer(
        text_list,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt"
    )

    # Run model in eval mode (disables dropout) and without gradient tracking.
    model.eval()
    with torch.no_grad():
        outputs = model(
            input_ids=encoding["input_ids"],
            attention_mask=encoding["attention_mask"]
        )
        preds = torch.argmax(outputs, dim=1)

    # Map class index to label.
    label_map = {0: "Negative", 1: "Positive"}
    return [label_map[class_index] for class_index in preds.tolist()]

In [47]:
# Classify the sample reviews and print each one next to its predicted label.
predictions = predict_batch(sample_text)

# predict_batch returns labels in input order, so zip pairs text with label.
for text, label in zip(sample_text, predictions):
    print(f"\"{text}\" → {label}")

"This movie was fantastic! I really enjoyed it." → Positive
"Terrible movie. Waste of time and money." → Negative
"It was okay. Not too good not too bad." → Negative
