In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch.nn import functional as F

import torchtext

import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification

import gzip
import shutil
import time
import requests

torch.__version__

'1.13.1'

## BERT - Sentiment Classification

There are many transformer-based models to choose from; BERT provides a nice balance between model popularity and a manageable model size that can be fine-tuned on a single GPU.

Pre-training BERT from scratch is painful and quite unnecessary given the availability of the `transformers` package provided by Hugging Face, which offers pre-trained models ready for fine-tuning.


We will use the **DistilBERT model**, a lightweight transformer which contains 40% fewer parameters than BERT base and preserves 95% of its performance.

In [2]:
# Run-wide constants
RANDOM_SEED = 123
NUM_EPOCHS = 3

# Reproducibility: fix the torch RNG seed and request deterministic cuDNN kernels
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True

# Use the GPU when one is available, otherwise fall back to the CPU
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

DEVICE

device(type='cuda')

### IMDb movie review dataset

In [3]:
# Remote location of the gzipped IMDb review CSV
url = "https://github.com/rasbt/machine-learning-book/raw/main/ch08/movie_data.csv.gz"
# The local file name is the last path component of the URL
filename = url.rpartition("/")[2]

filename

'movie_data.csv.gz'

In [4]:
# Fetch the archive *before* opening the target file, so that a failed request
# does not leave an empty or truncated file on disk.
r = requests.get(url, timeout=60)  # bounded wait instead of hanging forever
r.raise_for_status()  # fail loudly on 4xx/5xx instead of saving an error page

with open(filename, "wb") as f:
    f.write(r.content)

# Decompress next to the archive; the output name is the archive name without
# its last extension (e.g. "x.csv.gz" -> "x.csv").
csv_filename = ".".join(filename.split('.')[:-1])
with gzip.open(filename, 'rb') as f_in, open(csv_filename, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [5]:
# The decompressed CSV carries the archive's name minus its last extension
df = pd.read_csv(filename.rpartition('.')[0])
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [6]:
df.shape

(50000, 2)

### Splitting the dataset

In [7]:
# Position-based split: rows [0, 35000) train, [35000, 40000) validation, [40000, end) test
reviews = df['review'].values
sentiments = df['sentiment'].values

train_texts, train_labels = reviews[:35000], sentiments[:35000]
valid_texts, valid_labels = reviews[35000:40000], sentiments[35000:40000]
test_texts, test_labels = reviews[40000:], sentiments[40000:]

train_texts[:2], train_labels[:2]

(array(['In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70\'s, they discover the criminal and a net of power and money to cover the murder.<br /><br />"Murder in Greenwich" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and ri

### BERT Tokenizer

In [8]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize every split with the same settings (truncate long reviews, pad short ones)
train_encodings, valid_encodings, test_encodings = (
    tokenizer(list(texts), truncation=True, padding=True)
    for texts in (train_texts, valid_texts, test_texts)
)

In [9]:
# The tokenizer output behaves like a dictionary; the entries used later in this notebook are:
# input_ids = unique integers from the vocab corresponding to the tokens
# attention_mask = a binary vector with one entry per token position; padding tokens are marked with 0
# (the class labels are not part of the encodings -- IMDbDataset attaches them separately)
train_encodings[:2]

[Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

### Dataset Class

In [10]:
class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    ``encodings`` is a mapping from field name to an indexable collection
    (one entry per example); ``labels`` is an indexable collection of labels.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # One example per label
        return len(self.labels)

    def __getitem__(self, idx):
        # Gather the idx-th entry of every encoding field as a tensor
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        # The label is stored under the 'labels' key
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a Dataset
train_dataset = IMDbDataset(encodings=train_encodings, labels=train_labels)
valid_dataset = IMDbDataset(encodings=valid_encodings, labels=valid_labels)
test_dataset = IMDbDataset(encodings=test_encodings, labels=test_labels)

# Mini-batches of 16 examples; only the training split is shuffled
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset, shuffle=False, batch_size=16)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, shuffle=False, batch_size=16)

### Loading and fine-tuning a pre-trained BERT model

In [11]:
# Load the pre-trained DistilBERT checkpoint and place it on the target device
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased').to(DEVICE)
model.train()

# Adam over all model parameters
optim = torch.optim.Adam(params=model.parameters(), lr=5e-5)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classi

#### Training of Model

1. Need to define accuracy function
2. Training Loop

For each epoch:
1. Load the input into the device we are working on
2. Compute model output and loss
3. Adjust the weight parameters 
4. Evaluate model performance on both the training and validation sets

In [11]:
def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of ``model`` over ``data_loader`` in percent.

    The model's train/eval mode is not changed here; call ``model.eval()``
    beforehand if layers such as dropout should be disabled.

    Args:
        model: Callable invoked as ``model(input_ids, attention_mask=...)``
            whose result exposes the class scores under ``['logits']``.
        data_loader: Iterable of batches; each batch is a mapping with the
            tensor entries ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A 0-dim float tensor holding the accuracy as a percentage.

    Raises:
        ValueError: If ``data_loader`` yields no examples (previously this
            surfaced as an unrelated ``AttributeError``).
    """
    with torch.inference_mode():
        correct_pred, num_examples = 0, 0

        for batch in data_loader:

            # prepare data
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)  # whether token is an actual text token or pad
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs['logits']
            # Predicted class = index of the largest score along the class axis
            predicted_labels = torch.argmax(logits, dim=1)

            num_examples += labels.size(0)
            correct_pred += (predicted_labels == labels).sum()

        if num_examples == 0:
            raise ValueError('compute_accuracy received an empty data_loader')

        return correct_pred.float() / num_examples * 100

In [None]:
# Manual fine-tuning loop: NUM_EPOCHS passes over train_loader, with train/validation
# accuracy reported after every epoch and test accuracy reported once at the end.
start_time = time.time()

for epoch in range(NUM_EPOCHS):

    model.train()  # back to training mode after the previous epoch's evaluation
    for batch_idx, batch in enumerate(train_loader):

        ### Prepare data
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        ### Forward
        # Passing labels makes the model return the loss alongside the logits
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']

        ### Backward
        optim.zero_grad()
        loss.backward()
        optim.step()

        ### Logging
        if not batch_idx % 250:
            print(f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d} | '
                  f'Batch {batch_idx:04d}/{len(train_loader):04d} | '
                  f'Loss: {loss:.4f}')

    # compute_accuracy already runs under torch.inference_mode(), so no extra
    # context manager is needed here.
    model.eval()
    print(f'Training accuracy: '
          f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
          f'\nValid accuracy: '
          f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
# Switch to eval mode explicitly instead of relying on the state left by the last
# epoch (which is never set if the loop body does not run).
model.eval()
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

Epoch: 0001/0003 | Batch 0000/2188 | Loss: 0.6744
Epoch: 0001/0003 | Batch 0250/2188 | Loss: 0.2638
Epoch: 0001/0003 | Batch 0500/2188 | Loss: 0.3800
Epoch: 0001/0003 | Batch 0750/2188 | Loss: 0.1427
Epoch: 0001/0003 | Batch 1000/2188 | Loss: 0.4973
Epoch: 0001/0003 | Batch 1250/2188 | Loss: 0.2587
Epoch: 0001/0003 | Batch 1500/2188 | Loss: 0.3602
Epoch: 0001/0003 | Batch 1750/2188 | Loss: 0.2782
Epoch: 0001/0003 | Batch 2000/2188 | Loss: 0.2830
Training accuracy: 96.43%
Valid accuracy: 92.46%
Time elapsed: 8.97 min
Epoch: 0002/0003 | Batch 0000/2188 | Loss: 0.0720
Epoch: 0002/0003 | Batch 0250/2188 | Loss: 0.3380
Epoch: 0002/0003 | Batch 0500/2188 | Loss: 0.1484
Epoch: 0002/0003 | Batch 0750/2188 | Loss: 0.0564
Epoch: 0002/0003 | Batch 1000/2188 | Loss: 0.0974
Epoch: 0002/0003 | Batch 1250/2188 | Loss: 0.0534
Epoch: 0002/0003 | Batch 1500/2188 | Loss: 0.3046
Epoch: 0002/0003 | Batch 1750/2188 | Loss: 0.1846
Epoch: 0002/0003 | Batch 2000/2188 | Loss: 0.0619
Training accuracy: 98.64%
Va

### Fine-tuning a Transformer more easily using the Trainer API

In [12]:
# Reload the pre-trained checkpoint into `model`, replacing the instance fine-tuned above
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased').to(DEVICE)

model.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [13]:
# New Adam optimizer bound to the parameters of the current `model`
optim = torch.optim.Adam(params=model.parameters(), lr=5e-5)

In [14]:
from transformers import Trainer, TrainingArguments

In [15]:
# Training configuration handed to the Trainer
training_args = TrainingArguments(
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
)

# Minimal Trainer setup. Note that in this form:
# - no test dataset is involved
# - no evaluation metrics are involved
# - only the training loss is shown by default
trainer = Trainer(
    args=training_args,
    model=model,
    optimizers=(optim, None),  # (optimizer, learning-rate scheduler)
    train_dataset=train_dataset,
)

In [16]:
!pip install datasets -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
# Metric setup for evaluating the final model
from datasets import load_metric
import numpy as np

# Load the 'accuracy' metric object from the `datasets` package.
# NOTE(review): this call appears to emit a warning in the cell output (presumably the
# `load_metric` deprecation notice) -- consider migrating to the `evaluate` package; confirm
# against the installed `datasets` version.
metric = load_metric('accuracy')

def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    The accuracy is computed directly with NumPy, so this function does not
    depend on the deprecated ``datasets.load_metric`` metric object.

    Args:
        eval_pred: A pair ``(logits, labels)``. ``logits`` is a NumPy array
            with the class scores along the last axis; ``labels`` holds the
            reference class ids.

    Returns:
        dict: ``{'accuracy': value}`` where ``value`` is the fraction of
        correct predictions as a Python float.
    """
    logits, labels = eval_pred
    # logits are in numpy array; the predicted class is the arg-max over the last axis
    predictions = np.argmax(logits, axis=-1)

    return {'accuracy': float(np.mean(predictions == np.asarray(labels)))}

  metric = load_metric('accuracy')


In [18]:
# Trainer with an evaluation dataset and an accuracy callback attached
trainer = Trainer(
    model=model,
    args=training_args,
    optimizers=(optim, None),  # (optimizer, learning-rate scheduler)
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [19]:
# Wall-clock timing of the complete Trainer run
start_time = time.time()
trainer.train()

# Report the elapsed time in minutes
print(f"Total Training Time: {(time.time() - start_time)/60:.2f} min")

***** Running training *****
  Num examples = 35000
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2188
  Number of trainable parameters = 66955010


Step,Training Loss
10,0.6721
20,0.5565
30,0.3735
40,0.3309
50,0.2957
60,0.279
70,0.2591
80,0.2922
90,0.335
100,0.2557


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




Total Training Time: 12.64 min


### Evaluation

In [20]:
# Run the Trainer's evaluation on the eval_dataset it was constructed with and display the metrics
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32


{'eval_loss': 0.20730115473270416,
 'eval_accuracy': 0.9354,
 'eval_runtime': 36.1788,
 'eval_samples_per_second': 276.405,
 'eval_steps_per_second': 8.651,
 'epoch': 2.0}

In [22]:
# Cross-check the Trainer's result with the manual accuracy function:
# model on the target device and in evaluation mode before scoring the test split
model.to(DEVICE).eval()
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

Test accuracy: 93.54%


In [None]:
# The evaluation strategy can also be changed to report a model evaluation after every epoch
# training_args = TrainingArguments('test_trainer', evaluation_strategy='epoch')

In [None]:
# If you're planning to change or optimize hyperparameters and repeat the fine-tuning several times, use a validation set

In [None]:
# trainer = Trainer(model=model,
#                  args=training_args,
#                  train_dataset=train_dataset,
#                  eval_dataset=valid_dataset,
#                  compute_metrics=compute_metrics)