<a href="https://colab.research.google.com/github/ReedStuhl/LLP/blob/main/phoenix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## LLP example
### PyTorch: DistilBERT

###### *Marcello Politi & Reed Stuhlreyer*

In [1]:
!pip install transformers > None
import gzip 
import shutil
import time

import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchtext

import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification

In [2]:
# Sanity check: display the installed transformers version
transformers.__version__

'4.28.1'

### Setup: random seed, device (GPU/CPU) and number of epochs

In [3]:
# Run configuration and reproducibility settings
# (see https://pytorch.org/docs/stable/notes/randomness.html)
RANDOM_SEED = 42
N_EPOCHS = 3

torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True  # request deterministic cuDNN algorithms

# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

### ETL process

In [4]:
# Download the gzipped movie-review CSV and decompress it next to the notebook.
url =  ('https://github.com/rasbt/machine-learning-book/raw/main/ch08/movie_data.csv.gz')
filename = url.split('/')[-1]  # 'movie_data.csv.gz'

# Fetch before opening the output file and fail loudly on HTTP errors, so a
# failed download never leaves an empty or error-page file to be "decompressed".
r = requests.get(url, timeout=60)
r.raise_for_status()

with open(filename, "wb") as f:
  f.write(r.content)

# Decompress the archive that was just written (same name as `filename`).
with gzip.open(filename, 'rb') as f_in:
  with open('movie_data.csv', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

### Quick peek

In [5]:
# Load the decompressed reviews and preview the first three rows
df = pd.read_csv('movie_data.csv')
df.head(n=3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


### Splitting data into training, validation and test sets

In [6]:
# Positional split: first 35,000 rows for training, next 5,000 for validation,
# everything from row 40,000 onwards for testing.
train_texts = df['review'].values[:35_000]
train_labels = df['sentiment'].values[:35_000]

valid_texts = df['review'].values[35_000:40_000]
valid_labels = df['sentiment'].values[35_000:40_000]

test_texts = df['review'].values[40_000:]
test_labels = df['sentiment'].values[40_000:]

### Next, tokenizing the text into individual word tokens

In [7]:
# Pretrained tokenizer matching the DistilBERT checkpoint used for the model
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode each split; over-long reviews are truncated and shorter ones padded
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
valid_encodings = tokenizer(valid_texts.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

#### Bundling everything into a **class** that returns, for each example:

###### input_ids: the vocabulary indices corresponding to each token in the sentence.

###### labels: the class labels

###### attention_mask: indicates whether a token should be attended to (1) or ignored as padding (0)

In [8]:
class IMDbDataset(torch.utils.data.Dataset):
  '''
  Map-style dataset pairing tokenizer encodings with their labels.

  encodings: mapping from field name to a per-example sequence, e.g.
      -> input_ids : [1,34, 32, 67,...]
      -> attention_mask : [1,1,1,1,1,....]
  labels: sequence of labels, indexed in the same order as the encodings
  '''
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __len__(self):
    '''Number of examples, taken from the label sequence.'''
    return len(self.labels)

  def __getitem__(self, idx):
    '''Return example `idx` as a dict of tensors: every encoding field plus 'labels'.'''
    sample = {}
    for field, values in self.encodings.items():
      sample[field] = torch.tensor(values[idx])
    sample['labels'] = torch.tensor(self.labels[idx])
    return sample

In [10]:
#datasets
train_dataset = IMDbDataset(train_encodings, train_labels)
valid_dataset = IMDbDataset(valid_encodings, valid_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

#dataloaders
bs = 8
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = bs, shuffle = bs)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size = bs, shuffle = bs)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size = bs, shuffle = bs)

### Loading DistilBERT and fine-tuning

In [11]:
# Load the pretrained DistilBERT checkpoint with a sequence-classification
# head and move it to the selected device.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased').to(device)
model.train()  # start in training mode

# Adam over all model parameters
optim = torch.optim.Adam(params = model.parameters(), lr = 5e-5)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.w

### Defining metrics to compare model improvements

In [13]:
def compute_accuracy(model, data_loader, device):
  '''
  Return the classification accuracy of `model` over `data_loader`, in percent.

  model: module called as model(input_ids, attention_mask = ...) whose output
      exposes a 'logits' entry with one score per class along dim 1
  data_loader: iterable of batches; each batch maps 'input_ids',
      'attention_mask' and 'labels' to tensors
  device: device the batch tensors are moved to before the forward pass

  Returns a 0-dim float tensor holding the accuracy as a percentage.
  Raises ValueError if `data_loader` yields no examples.
  '''
  # Evaluate in eval mode so the metric does not depend on whether the caller
  # remembered to switch modes; the previous mode is restored afterwards.
  was_training = model.training
  model.eval()
  correct_pred, num_examples = 0, 0
  try:
    with torch.no_grad():
      for batch in data_loader:
        ## prepare data
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask = attention_mask)
        predicted_labels = torch.argmax(outputs['logits'], 1)

        num_examples += labels.size(0)
        correct_pred += (predicted_labels == labels).sum()
  finally:
    model.train(was_training)

  # Without examples `correct_pred` is still a plain int and the ratio is undefined.
  if num_examples == 0:
    raise ValueError('data_loader yielded no examples; accuracy is undefined')
  return correct_pred.float()/num_examples * 100

### Training loop

In [None]:
# Fine-tune for N_EPOCHS, logging the loss every 250 batches and reporting
# train/validation accuracy once per epoch; test accuracy is reported at the end.
start_time = time.time()

for epoch in range(N_EPOCHS):
  # Back to training mode: the per-epoch evaluation below switches to eval mode.
  model.train()

  for batch_idx, batch in enumerate(train_loader):

    ## prepare data
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    ## forward pass
    outputs = model(input_ids, attention_mask = attention_mask, labels = labels)
    loss, logits = outputs['loss'], outputs['logits']

    ## backward pass
    optim.zero_grad()
    loss.backward()
    optim.step()

    ## logging
    if not batch_idx % 250:
      print(f'Epoch : {epoch+1}/{N_EPOCHS:04d}'
            f' | Batch'
            f'{batch_idx:04d}/'
            f'{len(train_loader):04d} |'
            f'Loss: {loss:.4f}')

  ## evaluation: once per epoch, after all batches.
  ## Doing this inside the batch loop would leave the model in eval mode for the
  ## remaining training steps and re-score both datasets after every batch.
  model.eval()

  with torch.set_grad_enabled(False):
    print(f'Training accuracy: '
          f'{compute_accuracy(model, train_loader, device):.2f}%'
          f'\nValid accuracy: '
          f'{compute_accuracy(model, valid_loader, device):.2f}%')

  print(f'Time elapsed: {(time.time() -start_time) / 60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
# The model is still in eval mode from the last epoch's evaluation step.
print(f'Test Accuracy: {compute_accuracy(model, test_loader, device):.2f}%')

Epoch : 1/0003 | Batch0000/4375 |Loss: 0.6804
