## Sentiment Analysis on IMDB Dataset

In [19]:
import torch
import torch.nn as nn
import numpy

In [20]:
# Pick the compute device: CUDA when available (also printing the number of
# visible GPUs), otherwise Apple MPS when available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
    print(torch.cuda.device_count())
else:
    device = "mps" if torch.backends.mps.is_available() else "cpu"
device

1


'cuda'

In [37]:
from datasets import load_dataset

# Load the IMDB dataset and carve a validation set out of its "train" split
# (80% train / 20% validation); the "test" split is used as-is.
imdb_dataset = load_dataset("imdb")
# A fixed seed keeps the train/validation partition identical across kernel
# restarts, so validation metrics from different runs are comparable.
split = imdb_dataset["train"].train_test_split(train_size = 0.8, seed=42)
imdb_train, imdb_valid = split["train"], split["test"]
imdb_test = imdb_dataset["test"]
imdb_train[:1]

{'text': ["I read James Hawes book. It was pretty neat, not great, but entertaining enough. Without having read the book I wouldn't have had the slightest idea what was going on, and it was still a stretch with that knowledge.<br /><br />Literally every element of this film is abysmal in ways I do not have the capacity to describe. Half digested fish could have made a better film with matchsticks and dayglo lipstick.<br /><br />Never before or since as a film made me feel so angry. The Mattress sequels came closest, but even they never reached such depths of utterly putrid nauseating appallingness that this bilge did.<br /><br />Since wasting 90 minutes of my life witnessing this plague on human kind I am now unable to even look at any book by James Hawes without feeling angry. That is the depth of hatred I have for this piece of sh*t. No, that's unfair. Let me apologise to all fecal matter for comparing you to the otherworldly evil that is Rancid Aluminium.<br /><br />Plain and simply

In [22]:
from transformers import AutoTokenizer

# Load the "gpt2" tokenizer and reuse its end-of-sequence token as the padding
# token, so that `padding=True` can be used when batching reviews.
gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

In [23]:
def collate_fn(batch, tokenizer=gpt_tokenizer):
  """Turn a list of dataset examples into a tokenized batch and its targets.

  Args:
    batch: sequence of examples, each indexable by "text" and "label".
    tokenizer: callable tokenizer applied to the list of texts.

  Returns:
    A tuple (encodings, labels): the tokenizer output for the texts
    (padded to the longest item, truncated to at most 200 tokens, returned as
    PyTorch tensors) and a float32 tensor of labels with one column per row.
  """
  texts = []
  targets = []
  for example in batch:
    texts.append(example["text"])
    # Each label is wrapped in its own list so the tensor has shape (batch, 1).
    targets.append([example["label"]])
  encodings = tokenizer(texts, padding=True, truncation=True,
                        max_length=200, return_tensors="pt")
  return encodings, torch.tensor(targets, dtype=torch.float32)

In [24]:
from torch.utils.data import DataLoader
batch_size = 256

# All three loaders share the batch size and collate function; only the
# training loader shuffles its examples.
loader_options = dict(batch_size=batch_size, collate_fn=collate_fn)
imdb_train_loader = DataLoader(imdb_train, shuffle=True, **loader_options)
imdb_valid_loader = DataLoader(imdb_valid, **loader_options)
imdb_test_loader = DataLoader(imdb_test, **loader_options)

In [25]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class SentimentAnalysisPackedSeqModel(nn.Module):
  """Unidirectional GRU classifier that packs padded batches before the GRU.

  The model embeds token ids, packs the embedded batch using the per-row
  token counts from the attention mask, runs a (stacked) GRU and maps the
  final hidden state of the top layer to a single logit.

  Args:
    vocab_size: number of rows in the embedding table.
    n_layers: number of stacked GRU layers.
    hidden_size: GRU hidden state size.
    embed_size: embedding dimension.
    pad_id: token id passed to nn.Embedding as `padding_idx`.
    dropout: dropout between stacked GRU layers (ignored for a single layer).
  """
  def __init__(self, vocab_size, n_layers=2, hidden_size=128, embed_size=128,
               pad_id=0, dropout=0.2):
    super().__init__()
    self.embed = nn.Embedding(vocab_size, embed_size,
                              padding_idx=pad_id)
    # nn.GRU applies dropout only between stacked layers and warns when a
    # non-zero value is given for a single layer, so disable it in that case.
    self.gru = nn.GRU(embed_size, hidden_size, num_layers=n_layers,
                      batch_first=True,
                      dropout=dropout if n_layers > 1 else 0.0)
    self.output = nn.Linear(hidden_size, 1)

  def forward(self,encodings):
    """Return one logit per row, shape (batch, 1).

    `encodings` must provide "input_ids" and "attention_mask" entries of the
    same (batch, seq_len) shape.
    """
    embeddings = self.embed(encodings["input_ids"])
    # pack_padded_sequence rejects zero lengths; a row whose attention mask is
    # all zeros (e.g. an empty text) is treated as a single-token sequence
    # instead of crashing the whole batch.
    lengths = encodings["attention_mask"].sum(dim=1).clamp(min=1)
    packed = pack_padded_sequence(embeddings,
                                  lengths=lengths.cpu(),
                                  batch_first=True,
                                  enforce_sorted=False)
    _outputs, hidden_states = self.gru(packed)
    # hidden_states[-1] is the final state of the top GRU layer.
    return self.output(hidden_states[-1])


In [26]:
import torchmetrics

def evaluate_tm(model, data_loader, metric):
    """Compute `metric` for `model` over every batch of `data_loader`.

    The model is switched to eval mode and the metric is reset first, so any
    state previously accumulated in `metric` is discarded. Batches are moved
    to the module-level `device`. Returns the value of `metric.compute()`.
    """
    model.eval()
    metric.reset()
    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            metric.update(model(inputs), targets)
    return metric.compute()

def train(model, optimizer, criterion, metric, train_loader, valid_loader, n_epochs, patience=2,
         factor=0.5,epoch_callback=None):
    """Train `model` for `n_epochs`, validating after every epoch.

    Args:
        model: module called as `model(X_batch)`.
        optimizer: optimizer over the model's parameters.
        criterion: loss called as `criterion(y_pred, y_batch)`.
        metric: metric object exposing `reset`, `update` and `compute`; it is
            shared between training and validation and reset between uses.
            The LR scheduler runs in "max" mode, i.e. it expects a metric
            where larger values are better.
        train_loader: iterable of `(X_batch, y_batch)` training batches.
        valid_loader: iterable of validation batches, passed to `evaluate_tm`.
        n_epochs: number of passes over `train_loader`.
        patience: `ReduceLROnPlateau` patience, in epochs.
        factor: `ReduceLROnPlateau` learning-rate multiplier.
        epoch_callback: optional `callback(model, epoch)` invoked at the start
            of every epoch, after the model has been put in train mode.

    Returns:
        dict with the per-epoch lists "train_losses", "train_metrics" and
        "valid_metrics".
    """
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="max", patience=patience, factor=factor
    )
    history = {"train_losses":[],"train_metrics":[],"valid_metrics":[]}
    for epoch in range(n_epochs):
        total_loss = 0
        metric.reset()
        model.train()
        if epoch_callback is not None:
            epoch_callback(model,epoch)
        for idx,( X_batch, y_batch) in enumerate(train_loader):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            # Clear gradients before the backward pass so that stale gradients
            # (e.g. left behind by an interrupted notebook cell) can never be
            # accumulated into this step.
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            # The metric only needs values, not the autograd graph.
            metric.update(y_pred.detach(), y_batch)
            print(f"\rBatch {idx+1}/{len(train_loader)}", end="")
            print(f", loss ={total_loss/(idx+1 ):.4f} ", end="")
        mean_loss = total_loss / len(train_loader)
        history["train_losses"].append(mean_loss)
        # Read the training metric before evaluate_tm resets the shared object.
        history["train_metrics"].append(metric.compute().item())
        val_metric = evaluate_tm(model, valid_loader, metric).item()
        history["valid_metrics"].append(val_metric)
        scheduler.step(val_metric)
        # Metrics are printed as raw values: they are not percentages, so no
        # "%" suffix is appended.
        print(f"Epoch:{epoch+1}/{n_epochs}, "
             f"Train Loss: {history['train_losses'][-1]:.4f}, "
             f"Train Metric: {history['train_metrics'][-1]:.4f}, "
             f"Valid Metric: {history['valid_metrics'][-1]:.4f}")
    return history

In [27]:
# Vocabulary size reported by the tokenizer; used to size the embedding tables
# of the models trained from scratch below.
vocab_size = gpt_tokenizer.vocab_size
vocab_size

50257

In [28]:
# The tokenizer pads with its EOS token, not with id 0. Passing the real pad
# id keeps nn.Embedding's `padding_idx` (a row fixed at zeros and never
# updated) off token id 0, which is an ordinary vocabulary entry.
imdb_model = SentimentAnalysisPackedSeqModel(
    vocab_size, pad_id=gpt_tokenizer.pad_token_id).to(device)

n_epochs = 10
xentropy = nn.BCEWithLogitsLoss() # sigmoid + BinaryCrossEntropy
accuracy = torchmetrics.Accuracy(task="binary").to(device)
optimizer = torch.optim.NAdam(imdb_model.parameters())

history = train(imdb_model, optimizer, xentropy, accuracy, imdb_train_loader, imdb_valid_loader, n_epochs)


Batch 79/79, loss =0.6771 Epoch:1/10, Train Loss: 0.6771, Train Metric: 0.5766%, Valid Metric: 0.5050%
Batch 79/79, loss =0.5687 Epoch:2/10, Train Loss: 0.5687, Train Metric: 0.7049%, Valid Metric: 0.7282%
Batch 79/79, loss =0.3994 Epoch:3/10, Train Loss: 0.3994, Train Metric: 0.8207%, Valid Metric: 0.8054%
Batch 79/79, loss =0.2630 Epoch:4/10, Train Loss: 0.2630, Train Metric: 0.8929%, Valid Metric: 0.8346%
Batch 79/79, loss =0.1792 Epoch:5/10, Train Loss: 0.1792, Train Metric: 0.9341%, Valid Metric: 0.8100%
Batch 79/79, loss =0.1025 Epoch:6/10, Train Loss: 0.1025, Train Metric: 0.9654%, Valid Metric: 0.7446%
Batch 79/79, loss =0.0559 Epoch:7/10, Train Loss: 0.0559, Train Metric: 0.9823%, Valid Metric: 0.8176%
Batch 79/79, loss =0.0186 Epoch:8/10, Train Loss: 0.0186, Train Metric: 0.9958%, Valid Metric: 0.8314%
Batch 79/79, loss =0.0096 Epoch:9/10, Train Loss: 0.0096, Train Metric: 0.9984%, Valid Metric: 0.8258%
Batch 79/79, loss =0.0049 Epoch:10/10, Train Loss: 0.0049, Train Metric: 

### Using a Bidirectional RNN

In [29]:
class SentimentAnalysisBidiModel(nn.Module):
  """Bidirectional GRU classifier operating on packed, padded batches.

  Token ids are embedded, packed using the per-row token counts from the
  attention mask, and fed to a (stacked) bidirectional GRU. The final forward
  and backward hidden states of the top layer are concatenated and mapped to
  a single logit.

  Args:
    vocab_size: number of rows in the embedding table.
    n_layers: number of stacked GRU layers.
    hidden_size: GRU hidden state size per direction.
    embed_size: embedding dimension.
    pad_id: token id passed to nn.Embedding as `padding_idx`.
    dropout: dropout between stacked GRU layers (ignored for a single layer).
  """
  def __init__(self, vocab_size, n_layers=2, hidden_size=128, embed_size=128,
               pad_id=0, dropout=0.2):
    super().__init__()
    self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=pad_id)
    # nn.GRU applies dropout only between stacked layers and warns when a
    # non-zero value is given for a single layer, so disable it in that case.
    self.gru = nn.GRU(embed_size, hidden_size, num_layers=n_layers,
                      batch_first=True,
                      dropout=dropout if n_layers > 1 else 0.0,
                      bidirectional=True)
    self.output = nn.Linear(hidden_size * 2, 1)

  def forward(self, encodings):
    """Return one logit per row, shape (batch, 1).

    `encodings` must provide "input_ids" and "attention_mask" entries of the
    same (batch, seq_len) shape.
    """
    embeddings = self.embed(encodings["input_ids"])
    # pack_padded_sequence rejects zero lengths; a row whose attention mask is
    # all zeros (e.g. an empty text) is treated as a single-token sequence
    # instead of crashing the whole batch.
    lengths = encodings["attention_mask"].sum(dim=1).clamp(min=1)
    packed = pack_padded_sequence(embeddings,
                                  lengths=lengths.cpu(),
                                  batch_first=True,
                                  enforce_sorted=False)
    _outputs, hidden_states = self.gru(packed)

    # The last two entries of hidden_states belong to the top layer:
    # forward direction first, then backward.
    forward_state = hidden_states[-2] # (batch, hidden_size)
    backward_state = hidden_states[-1] # (batch, hidden_size)

    final_state = torch.cat((forward_state, backward_state), dim=1)   # (batch, 2*hidden_size)

    return self.output(final_state)


In [30]:
# The tokenizer pads with its EOS token, not with id 0. Passing the real pad
# id keeps nn.Embedding's `padding_idx` (a row fixed at zeros and never
# updated) off token id 0, which is an ordinary vocabulary entry.
imdb_bidi_model = SentimentAnalysisBidiModel(
    vocab_size, pad_id=gpt_tokenizer.pad_token_id).to(device)

n_epochs = 10
xentropy = nn.BCEWithLogitsLoss()
accuracy = torchmetrics.Accuracy(task="binary").to(device)
optimizer = torch.optim.NAdam(imdb_bidi_model.parameters())

history = train(imdb_bidi_model, optimizer, xentropy, accuracy, imdb_train_loader, imdb_valid_loader, n_epochs)

Batch 79/79, loss =0.6603 Epoch:1/10, Train Loss: 0.6603, Train Metric: 0.5981%, Valid Metric: 0.6548%
Batch 79/79, loss =0.5076 Epoch:2/10, Train Loss: 0.5076, Train Metric: 0.7508%, Valid Metric: 0.6630%
Batch 79/79, loss =0.3487 Epoch:3/10, Train Loss: 0.3487, Train Metric: 0.8497%, Valid Metric: 0.8106%
Batch 79/79, loss =0.2146 Epoch:4/10, Train Loss: 0.2146, Train Metric: 0.9134%, Valid Metric: 0.8068%
Batch 79/79, loss =0.1148 Epoch:5/10, Train Loss: 0.1148, Train Metric: 0.9580%, Valid Metric: 0.8214%
Batch 79/79, loss =0.0418 Epoch:6/10, Train Loss: 0.0418, Train Metric: 0.9882%, Valid Metric: 0.7168%
Batch 79/79, loss =0.0599 Epoch:7/10, Train Loss: 0.0599, Train Metric: 0.9812%, Valid Metric: 0.8236%
Batch 79/79, loss =0.0087 Epoch:8/10, Train Loss: 0.0087, Train Metric: 0.9980%, Valid Metric: 0.8174%
Batch 79/79, loss =0.0034 Epoch:9/10, Train Loss: 0.0034, Train Metric: 0.9995%, Valid Metric: 0.8230%
Batch 79/79, loss =0.0014 Epoch:10/10, Train Loss: 0.0014, Train Metric: 

### Using Pretrained Embeddings and Language Models

In [31]:
import transformers

# Load the pretrained "gpt2" model and display its input embedding layer,
# which is reused below as a frozen embedding table.
gpt_model = transformers.AutoModel.from_pretrained("gpt2")
gpt_model.get_input_embeddings()

Embedding(50257, 768)

In [32]:

class SentimentAnalysisPreEmbed(nn.Module):
  """Bidirectional GRU classifier on top of a frozen, pretrained embedding.

  Args:
    pretrained_embed: embedding module whose `weight` tensor supplies the
      (frozen) embedding table; its last dimension sets the GRU input size.
    n_layers: number of stacked GRU layers.
    hidden_size: GRU hidden state size per direction.
    dropout: dropout between stacked GRU layers (ignored for a single layer).
  """
  def __init__(self, pretrained_embed, n_layers=2, hidden_size=128,dropout=0.2):
    super().__init__()
    weights = pretrained_embed.weight.data
    # freeze=True keeps the pretrained vectors fixed during training.
    self.embed = nn.Embedding.from_pretrained(weights, freeze=True)
    embed_size = weights.shape[-1]
    # nn.GRU applies dropout only between stacked layers and warns when a
    # non-zero value is given for a single layer, so disable it in that case.
    self.gru = nn.GRU(embed_size, hidden_size, num_layers=n_layers,
                      batch_first=True,
                      dropout=dropout if n_layers > 1 else 0.0,
                      bidirectional=True)
    self.output = nn.Linear(hidden_size * 2, 1)

  def forward(self, encodings):
    """Return one logit per row, shape (batch, 1).

    `encodings` must provide "input_ids" and "attention_mask" entries of the
    same (batch, seq_len) shape.
    """
    embeddings = self.embed(encodings["input_ids"])
    # pack_padded_sequence rejects zero lengths; a row whose attention mask is
    # all zeros (e.g. an empty text) is treated as a single-token sequence
    # instead of crashing the whole batch.
    lengths = encodings["attention_mask"].sum(dim=1).clamp(min=1)
    packed = pack_padded_sequence(embeddings,
                                  lengths=lengths.cpu(),
                                  batch_first=True,
                                  enforce_sorted=False)
    _outputs, hidden_states = self.gru(packed)
    # The last two entries of hidden_states belong to the top layer:
    # forward direction first, then backward.
    forward_state = hidden_states[-2]
    backward_state = hidden_states[-1]

    final_state = torch.cat((forward_state, backward_state), dim=1)
    return self.output(final_state)

In [33]:

# Build the classifier on top of GPT-2's input embedding table (kept frozen by
# SentimentAnalysisPreEmbed) and train it with the same recipe as the models
# above: BCE-with-logits loss, binary accuracy, NAdam with default settings.
imdb_model_gpt_embeds = SentimentAnalysisPreEmbed(gpt_model.get_input_embeddings()).to(device)

n_epochs = 10
xentropy = nn.BCEWithLogitsLoss()
accuracy = torchmetrics.Accuracy(task="binary").to(device)
optimizer = torch.optim.NAdam(imdb_model_gpt_embeds.parameters())

history = train(imdb_model_gpt_embeds, optimizer, xentropy, accuracy, 
                imdb_train_loader, imdb_valid_loader, n_epochs)


Batch 79/79, loss =0.6412 Epoch:1/10, Train Loss: 0.6412, Train Metric: 0.6308%, Valid Metric: 0.7360%
Batch 79/79, loss =0.4164 Epoch:2/10, Train Loss: 0.4164, Train Metric: 0.8080%, Valid Metric: 0.8640%
Batch 79/79, loss =0.3150 Epoch:3/10, Train Loss: 0.3150, Train Metric: 0.8628%, Valid Metric: 0.8066%
Batch 79/79, loss =0.2813 Epoch:4/10, Train Loss: 0.2813, Train Metric: 0.8808%, Valid Metric: 0.8732%
Batch 79/79, loss =0.2350 Epoch:5/10, Train Loss: 0.2350, Train Metric: 0.9068%, Valid Metric: 0.8242%
Batch 79/79, loss =0.2037 Epoch:6/10, Train Loss: 0.2037, Train Metric: 0.9187%, Valid Metric: 0.8590%
Batch 79/79, loss =0.1480 Epoch:7/10, Train Loss: 0.1480, Train Metric: 0.9427%, Valid Metric: 0.8618%
Batch 79/79, loss =0.0646 Epoch:8/10, Train Loss: 0.0646, Train Metric: 0.9788%, Valid Metric: 0.8572%
Batch 79/79, loss =0.0298 Epoch:9/10, Train Loss: 0.0298, Train Metric: 0.9920%, Valid Metric: 0.8516%
Batch 79/79, loss =0.0154 Epoch:10/10, Train Loss: 0.0154, Train Metric: 

In [34]:
class SentimentAnalysisModelGPT(nn.Module):
    """GRU classifier fed with the last hidden states of a pretrained GPT-2.

    Args:
        n_layers: number of stacked GRU layers.
        hidden_size: GRU hidden state size.
        dropout: dropout passed to the GRU.
    """

    def __init__(self, n_layers=2, hidden_size=128, dropout=0.2):
        super().__init__()
        self.gpt = transformers.AutoModel.from_pretrained("gpt2")
        # Reuse the EOS id as the pad id in the model config.
        self.gpt.config.pad_token_id = self.gpt.config.eos_token_id
        self.gru = nn.GRU(
            self.gpt.config.hidden_size,
            hidden_size,
            num_layers=n_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.output = nn.Linear(hidden_size, 1)

    def forward(self, encodings):
        """Return one logit per row of `encodings`, shape (batch, 1)."""
        token_features = self.gpt(**encodings).last_hidden_state
        # Number of attended tokens per row, used to pack the padded batch.
        seq_lengths = encodings["attention_mask"].sum(dim=1).cpu()
        packed = pack_padded_sequence(
            token_features,
            lengths=seq_lengths,
            batch_first=True,
            enforce_sorted=False,
        )
        _, last_hidden = self.gru(packed)
        # last_hidden[-1] is the final state of the top GRU layer.
        return self.output(last_hidden[-1])
        

In [36]:
# Build the GPT-backed classifier and freeze every GPT parameter so that only
# the GRU and the output layer receive gradient updates.
imdb_model_gpt = SentimentAnalysisModelGPT().to(device)
for params in imdb_model_gpt.gpt.parameters():
    params.requires_grad = False

# NOTE(review): train() calls model.train() on the whole module, which also
# puts the frozen GPT submodule in training mode; if it contains dropout
# layers they presumably stay active during training — confirm this is intended.
n_epochs = 4
xentropy = nn.BCEWithLogitsLoss()
accuracy = torchmetrics.Accuracy(task="binary").to(device)
optimizer = torch.optim.NAdam(imdb_model_gpt.parameters())

history = train(imdb_model_gpt, optimizer, xentropy, accuracy, 
                imdb_train_loader, imdb_valid_loader, n_epochs)


Batch 79/79, loss =0.6048 Epoch:1/4, Train Loss: 0.6048, Train Metric: 0.6526%, Valid Metric: 0.7722%
Batch 79/79, loss =0.4083 Epoch:2/4, Train Loss: 0.4083, Train Metric: 0.8154%, Valid Metric: 0.7566%
Batch 79/79, loss =0.3428 Epoch:3/4, Train Loss: 0.3428, Train Metric: 0.8496%, Valid Metric: 0.8662%
Batch 79/79, loss =0.3279 Epoch:4/4, Train Loss: 0.3279, Train Metric: 0.8583%, Valid Metric: 0.8386%
