## Sentiment Analysis on IMDB Dataset

In [49]:
import torch
import torch.nn as nn
import numpy

In [50]:
# Pick the compute backend once: CUDA if present, otherwise Apple MPS,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
    # Report how many CUDA devices are visible.
    print(torch.cuda.device_count())
else:
    device = "mps" if torch.backends.mps.is_available() else "cpu"
device

1


'cuda'

In [51]:
from datasets import load_dataset

# Load the IMDB dataset and carve a validation set out of its "train" split.
imdb_dataset = load_dataset("imdb")
# A fixed seed makes the 80/20 train/validation split identical on every run;
# without it the split is reshuffled each time and the validation metrics of
# different runs are not comparable.
split = imdb_dataset["train"].train_test_split(train_size=0.8, seed=42)
imdb_train, imdb_valid = split["train"], split["test"]
imdb_test = imdb_dataset["test"]

In [52]:
from transformers import AutoTokenizer

# Load the pretrained "gpt2" tokenizer.
gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token so that batched
# tokenization with padding=True has a pad token to use.
# NOTE(review): presumably the pretrained tokenizer defines no pad token of
# its own - confirm.
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

In [53]:
def collate_fn(batch, tokenizer=gpt_tokenizer, max_length=200):
    """Collate a list of examples into padded encodings and a label tensor.

    Args:
        batch: sequence of examples, each a mapping with a "text" entry
            (the review) and a "label" entry.
        tokenizer: callable tokenizer applied to the list of review texts;
            defaults to the module-level ``gpt_tokenizer``.
        max_length: maximum number of tokens kept per review; longer
            reviews are truncated. Defaults to 200.

    Returns:
        A ``(encodings, labels)`` tuple: the tokenizer output requested with
        ``return_tensors="pt"``, padded to the longest review in the batch,
        and a float32 tensor of labels with shape ``(len(batch), 1)``.
    """
    texts = [example["text"] for example in batch]
    # Each label is wrapped in its own list so the tensor has a trailing
    # dimension of size 1.
    labels = torch.tensor(
        [[example["label"]] for example in batch], dtype=torch.float32
    )
    encodings = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    return encodings, labels

In [54]:
from torch.utils.data import DataLoader
batch_size = 256

# One DataLoader per split, all sharing the batch size and collate function;
# only the training split is shuffled.
imdb_train_loader, imdb_valid_loader, imdb_test_loader = (
    DataLoader(subset, batch_size=batch_size, collate_fn=collate_fn,
               shuffle=reshuffle)
    for subset, reshuffle in (
        (imdb_train, True),
        (imdb_valid, False),
        (imdb_test, False),
    )
)

In [55]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class SentimentAnalysisPackedSeqModel(nn.Module):
    """Embedding -> multi-layer GRU -> linear head producing one logit.

    Padded positions are skipped by packing the embedded batch with the
    per-example lengths taken from the attention mask.
    """

    def __init__(self, vocab_size, n_layers=2, hidden_size=128, embed_size=128,
                 pad_id=0, dropout=0.2):
        """Build the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            n_layers: number of stacked GRU layers.
            hidden_size: size of the GRU hidden state.
            embed_size: size of each token embedding.
            pad_id: token id passed to the embedding as ``padding_idx``.
            dropout: dropout probability passed to the GRU.
        """
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=pad_id)
        self.gru = nn.GRU(
            embed_size,
            hidden_size,
            num_layers=n_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.output = nn.Linear(hidden_size, 1)

    def forward(self, encodings):
        """Return one logit per example.

        Args:
            encodings: mapping with "input_ids" and "attention_mask" entries.

        Returns:
            The linear head applied to the last GRU layer's final hidden
            state.
        """
        token_vectors = self.embed(encodings["input_ids"])
        # Number of real (unmasked) tokens per example; moved to the CPU
        # before being handed to pack_padded_sequence.
        seq_lengths = encodings["attention_mask"].sum(dim=1).cpu()
        packed_batch = pack_padded_sequence(
            token_vectors,
            lengths=seq_lengths,
            batch_first=True,
            enforce_sorted=False,
        )
        _, final_hidden = self.gru(packed_batch)
        # The last entry is the final hidden state of the top GRU layer.
        top_layer_state = final_hidden[-1]
        return self.output(top_layer_state)


In [56]:
import torchmetrics

def evaluate_tm(model, data_loader, metric):
    """Evaluate ``model`` over ``data_loader`` and return the metric value.

    The model is switched to eval mode and the metric is reset before any
    batch is seen. Each batch is moved to the module-level ``device``, run
    through the model without gradient tracking, and fed to
    ``metric.update``.

    Args:
        model: callable model taking a batch of inputs.
        data_loader: iterable yielding ``(inputs, targets)`` pairs whose
            elements expose ``.to(device)``.
        metric: object exposing ``reset()``, ``update(preds, targets)`` and
            ``compute()``.

    Returns:
        Whatever ``metric.compute()`` returns after all batches.
    """
    metric.reset()
    model.eval()
    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            metric.update(model(inputs), targets)
    return metric.compute()

def train(model, optimizer, criterion, metric, train_loader, valid_loader, n_epochs, patience=2,
         factor=0.5,epoch_callback=None):
    """Train ``model`` and track loss and metric values per epoch.

    Each epoch runs one pass over ``train_loader`` (forward, loss, backward,
    optimizer step), then evaluates on ``valid_loader`` with ``evaluate_tm``
    and feeds the validation metric to a ``ReduceLROnPlateau`` scheduler in
    ``"max"`` mode (a higher metric is treated as better).

    Args:
        model: the module to train.
        optimizer: optimizer over the model's parameters.
        criterion: loss callable taking ``(predictions, targets)``.
        metric: object exposing ``reset()``, ``update()`` and ``compute()``;
            it is shared between the training pass and validation.
        train_loader: sized iterable of ``(inputs, targets)`` batches.
        valid_loader: iterable of ``(inputs, targets)`` validation batches.
        n_epochs: number of epochs to run.
        patience: scheduler patience, in epochs.
        factor: multiplicative learning-rate reduction factor.
        epoch_callback: optional callable invoked as
            ``epoch_callback(model, epoch)`` at the start of every epoch,
            after the model is put in training mode.

    Returns:
        A dict with the lists "train_losses", "train_metrics" and
        "valid_metrics", one entry per epoch.
    """
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="max", patience=patience, factor=factor
    )
    history = {"train_losses": [], "train_metrics": [], "valid_metrics": []}
    # Gradients are only cleared *after* each step below, so anything left on
    # the parameters from earlier use (for example a run interrupted between
    # backward() and zero_grad()) would be added into the very first update.
    # Clear them once up front.
    optimizer.zero_grad()
    for epoch in range(n_epochs):
        total_loss = 0
        metric.reset()
        model.train()
        if epoch_callback is not None:
            epoch_callback(model, epoch)
        n_batches = len(train_loader)
        for idx, (X_batch, y_batch) in enumerate(train_loader):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            metric.update(y_pred, y_batch)
            # "\r" rewrites the same console line with the running mean loss.
            print(f"\rBatch {idx+1}/{n_batches}", end="")
            print(f", loss ={total_loss/(idx+1 ):.4f} ", end="")
        mean_loss = total_loss / n_batches
        history["train_losses"].append(mean_loss)
        history["train_metrics"].append(metric.compute().item())
        # evaluate_tm resets the shared metric, so the training value has to
        # be read before this call.
        val_metric = evaluate_tm(model, valid_loader, metric).item()
        history["valid_metrics"].append(val_metric)
        scheduler.step(val_metric)
        print(f"Epoch:{epoch+1}/{n_epochs}, "
             f"Train Loss: {history['train_losses'][-1]:.4f}, "
             f"Train Metric: {history['train_metrics'][-1]:.4f}, "
             f"Valid Metric: {history['valid_metrics'][-1]:.4f}")
    return history

In [57]:
# Size the embedding table from len(tokenizer) rather than .vocab_size:
# .vocab_size does not count tokens added to the tokenizer, so an added
# special token would get an id outside the embedding table.
vocab_size = len(gpt_tokenizer)
vocab_size

50257

In [58]:
# Pass the tokenizer's actual padding id. The class default (pad_id=0) is a
# regular vocabulary token for this tokenizer, so leaving it would pin that
# token's embedding at zero and never train it, while the real padding token
# (the reused EOS token) would be treated as ordinary input.
imdb_model = SentimentAnalysisPackedSeqModel(
    vocab_size, pad_id=gpt_tokenizer.pad_token_id
).to(device)

n_epochs = 10
# The model emits one raw logit per review, hence the logits variant of BCE.
xentropy = nn.BCEWithLogitsLoss()
accuracy = torchmetrics.Accuracy(task="binary").to(device)
optimizer = torch.optim.NAdam(imdb_model.parameters())

history = train(imdb_model, optimizer, xentropy, accuracy, imdb_train_loader, imdb_valid_loader, n_epochs)


Batch 79/79, loss =0.6845 Epoch:1/10, Train Loss: 0.6845, Train Metric: 0.5701, Valid Metric: 0.5200
Batch 79/79, loss =0.6803 Epoch:2/10, Train Loss: 0.6803, Train Metric: 0.5667, Valid Metric: 0.6000
Batch 79/79, loss =0.6029 Epoch:3/10, Train Loss: 0.6029, Train Metric: 0.6695, Valid Metric: 0.7422
Batch 79/79, loss =0.4161 Epoch:4/10, Train Loss: 0.4161, Train Metric: 0.8083, Valid Metric: 0.7860
Batch 79/79, loss =0.2792 Epoch:5/10, Train Loss: 0.2792, Train Metric: 0.8849, Valid Metric: 0.7946
Batch 79/79, loss =0.1940 Epoch:6/10, Train Loss: 0.1940, Train Metric: 0.9262, Valid Metric: 0.7976
Batch 79/79, loss =0.1238 Epoch:7/10, Train Loss: 0.1238, Train Metric: 0.9560, Valid Metric: 0.8356
Batch 79/79, loss =0.1017 Epoch:8/10, Train Loss: 0.1017, Train Metric: 0.9679, Valid Metric: 0.8370
Batch 79/79, loss =0.0400 Epoch:9/10, Train Loss: 0.0400, Train Metric: 0.9895, Valid Metric: 0.8324
Batch 79/79, loss =0.0223 Epoch:10/10, Train Loss: 0.0223, Train Metric: 0.9957, Valid Metr

### Using Bidirectional RNN

In [59]:
class SentimentAnalysisBidiModel(nn.Module):
    """Embedding -> bidirectional multi-layer GRU -> linear head (one logit).

    The final hidden states of the top layer's two directions are
    concatenated before the linear head, which therefore takes
    ``2 * hidden_size`` inputs.
    """

    def __init__(self, vocab_size, n_layers=2, hidden_size=128, embed_size=128,
                 pad_id=0, dropout=0.2):
        """Build the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            n_layers: number of stacked GRU layers.
            hidden_size: size of the GRU hidden state per direction.
            embed_size: size of each token embedding.
            pad_id: token id passed to the embedding as ``padding_idx``.
            dropout: dropout probability passed to the GRU.
        """
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=pad_id)
        self.gru = nn.GRU(
            embed_size,
            hidden_size,
            num_layers=n_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True,
        )
        self.output = nn.Linear(2 * hidden_size, 1)

    def forward(self, encodings):
        """Return one logit per example.

        Args:
            encodings: mapping with "input_ids" and "attention_mask" entries.

        Returns:
            The linear head applied to the concatenated final states of the
            top layer's forward and backward directions.
        """
        token_vectors = self.embed(encodings["input_ids"])
        # Number of real (unmasked) tokens per example; moved to the CPU
        # before being handed to pack_padded_sequence.
        seq_lengths = encodings["attention_mask"].sum(dim=1).cpu()
        packed_batch = pack_padded_sequence(
            token_vectors,
            lengths=seq_lengths,
            batch_first=True,
            enforce_sorted=False,
        )
        _, final_hidden = self.gru(packed_batch)

        # The last two entries belong to the top layer: forward direction
        # first, backward direction second; each is (batch, hidden_size).
        top_forward = final_hidden[-2]
        top_backward = final_hidden[-1]
        # (batch, 2 * hidden_size)
        both_directions = torch.cat([top_forward, top_backward], dim=1)
        return self.output(both_directions)


In [60]:
# Pass the tokenizer's actual padding id. The class default (pad_id=0) is a
# regular vocabulary token for this tokenizer, so leaving it would pin that
# token's embedding at zero and never train it, while the real padding token
# (the reused EOS token) would be treated as ordinary input.
imdb_bidi_model = SentimentAnalysisBidiModel(
    vocab_size, pad_id=gpt_tokenizer.pad_token_id
).to(device)

n_epochs = 10
# The model emits one raw logit per review, hence the logits variant of BCE.
xentropy = nn.BCEWithLogitsLoss()
accuracy = torchmetrics.Accuracy(task="binary").to(device)
optimizer = torch.optim.NAdam(imdb_bidi_model.parameters())

history = train(imdb_bidi_model, optimizer, xentropy, accuracy, imdb_train_loader, imdb_valid_loader, n_epochs)

Batch 79/79, loss =0.6549 Epoch:1/10, Train Loss: 0.6549, Train Metric: 0.6074, Valid Metric: 0.6600
Batch 79/79, loss =0.5331 Epoch:2/10, Train Loss: 0.5331, Train Metric: 0.7340, Valid Metric: 0.6446
Batch 79/79, loss =0.3474 Epoch:3/10, Train Loss: 0.3474, Train Metric: 0.8476, Valid Metric: 0.7918
Batch 79/79, loss =0.2190 Epoch:4/10, Train Loss: 0.2190, Train Metric: 0.9123, Valid Metric: 0.8048
Batch 79/79, loss =0.1237 Epoch:5/10, Train Loss: 0.1237, Train Metric: 0.9555, Valid Metric: 0.7968
Batch 79/79, loss =0.0437 Epoch:6/10, Train Loss: 0.0437, Train Metric: 0.9870, Valid Metric: 0.8256
Batch 79/79, loss =0.0124 Epoch:7/10, Train Loss: 0.0124, Train Metric: 0.9975, Valid Metric: 0.8314
Batch 79/79, loss =0.0037 Epoch:8/10, Train Loss: 0.0037, Train Metric: 0.9995, Valid Metric: 0.8300
Batch 79/79, loss =0.0019 Epoch:9/10, Train Loss: 0.0019, Train Metric: 0.9998, Valid Metric: 0.8304
Batch 79/79, loss =0.0007 Epoch:10/10, Train Loss: 0.0007, Train Metric: 1.0000, Valid Metr