# Fine-tune BERT on TriviaQA
**Student Credentials:** sdi1800119, Vissarion Moutafis

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn
import re
seaborn.set_style("ticks")

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, classification_report

import torch
import torch.nn as nn
import torchtext
from torch.utils.data import SubsetRandomSampler
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

!pip install transformers datasets
!apt install git-lfs
import transformers
import datasets
from transformers import AutoModel, BertTokenizer, BertForSequenceClassification, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
from datasets import load_dataset, load_metric

!pip install tqdm
from tqdm import tqdm, trange

# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

Collecting datasets
  Downloading datasets-1.18.3-py3-none-any.whl (311 kB)
     |████████████████████████████████| 311 kB 769 kB/s            
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
     |████████████████████████████████| 212 kB 11.9 MB/s            
Installing collected packages: xxhash, datasets
Successfully installed datasets-1.18.3 xxhash-3.0.0



The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 42 not upgraded.
Need to get 3316 kB of archives.
After this operation, 11.1 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 git-lfs amd64 2.9.2-1 [3316 kB]
Fetched 3316 kB in 1s (2237 kB/s)

7[0;23r8[1ASelecting previously unselected package git-lfs.
(Reading database ... 103272 files and directories currently installed.)
Preparing to unpack .../git-lfs_2.9.2-1_amd64.deb ...
7[24;0f[

## Load the model and dataset

In [None]:
# Name of the pretrained checkpoint that the rest of the notebook fine-tunes.
model_checkpoint = "bert-base-uncased"

In [None]:
# Load the `train` split of the TriviaQA JSON files through the local
# `squad_like.py` dataset loading script.
train_dataset = load_dataset(
    '../input/squadlikeloader/squad_like.py',
    data_files={
        'train': '../triviaqatosquad/triviaqa_train.json',
        'validation': '../triviaqatosquad/triviaqa_dev.json',
    },
    split='train',
)

Downloading and preparing dataset squad_like/default to /root/.cache/huggingface/datasets/squad_like/default-57224fdc8b8ea94a/0.0.0/c11bde73ef00f53b085b6a086d13514938f65b80af061fc874ce3e7514c24892...


0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset squad_like downloaded and prepared to /root/.cache/huggingface/datasets/squad_like/default-57224fdc8b8ea94a/0.0.0/c11bde73ef00f53b085b6a086d13514938f65b80af061fc874ce3e7514c24892. Subsequent calls will reuse this data.


In [None]:
# Evaluation data: the `validation` split of `squad_v2`. Note that this is a
# different dataset from the TriviaQA files used for training.
test_dataset = load_dataset('squad_v2', split='validation')

Downloading:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad_v2/squad_v2 (download: 44.34 MiB, generated: 122.41 MiB, post-processed: Unknown size, total: 166.75 MiB) to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/801k [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset squad_v2 downloaded and prepared to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d. Subsequent calls will reuse this data.


In [None]:
from transformers import AutoTokenizer

# Build the tokenizer from the same checkpoint name as the model, so the
# vocabulary cannot drift from the model if `model_checkpoint` is changed.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
MAX_LENGTH = 384  # maximum number of tokens in one (question, context) window
DOC_STRIDE = 128  # number of tokens shared by consecutive windows of a long context


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
def preprocess_squad(examples):
  """Tokenize a batch of SQuAD-style examples and label each window's answer span.

  Every (question, context) pair is tokenized to `MAX_LENGTH` tokens. Contexts
  that do not fit are split into several overlapping windows (overlap of
  `DOC_STRIDE` tokens), so one example may produce several output rows.

  Args:
    examples: batch mapping with the columns "question", "context" and
      "answers". Each entry of "answers" provides the lists "answer_start"
      (character offsets) and "text"; only the first answer is used.

  Returns:
    The tokenizer output for all windows, extended with "start_positions" and
    "end_positions" (token indices, one per window):
      * the index of the [CLS] token for examples that have no answer,
      * -1 for windows that do not contain the complete answer (these rows
        are meant to be dropped before training),
      * otherwise the first and last token index of the answer.
  """
  # get the questions and the context
  questions = [q.strip() for q in examples["question"]]
  context = examples["context"]
  # tokenize questions along with the context
  inputs = tokenizer(
        questions,
        context,
        max_length=MAX_LENGTH,
        stride=DOC_STRIDE,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True,
        return_overflowing_tokens=True
    )
  # character span of every token, per window
  offset_mapping = inputs.pop("offset_mapping")
  # index of the original example each window was cut from
  sample_mapping = inputs.pop("overflow_to_sample_mapping")
  start_positions = []
  end_positions = []

  for i, offset in enumerate(offset_mapping):
    answer = examples["answers"][sample_mapping[i]]

    # if there is no answer default to [CLS]
    if not answer["answer_start"]:
      cls_index = inputs['input_ids'][i].index(tokenizer.cls_token_id)
      start_positions.append(cls_index)
      end_positions.append(cls_index)
      continue

    # character span of the (first) answer inside the original context
    start_char = answer["answer_start"][0]
    end_char = start_char + len(answer["text"][0])
    sequence_ids = inputs.sequence_ids(i)

    # Find the first and last token of the context part of this window
    idx = 0
    while sequence_ids[idx] != 1:
      idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
      idx += 1
    context_end = idx - 1

    # The answer must lie completely inside this window. Comparing the window
    # start with `start_char` and the window end with `end_char` also rejects
    # windows that only hold a part of the answer; such windows would
    # otherwise receive a label pointing at a boundary or separator token.
    if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
      start_positions.append(-1)
      end_positions.append(-1)
    else:
      # last token whose span starts at or before the answer start
      idx = context_start
      while idx <= context_end and offset[idx][0] <= start_char:
        idx += 1
      start_positions.append(idx - 1)

      # first token (scanning backwards) whose span ends at or after the answer end
      idx = context_end
      while idx >= context_start and offset[idx][1] >= end_char:
        idx -= 1
      end_positions.append(idx + 1)

  inputs["start_positions"] = start_positions
  inputs["end_positions"] = end_positions
  return inputs

In [None]:
def appropriate_length(q, c):
    """Tell whether the tokenized question `q` has at most DOC_STRIDE ids.

    `c` (the context) is part of the signature because the dataset filter
    passes both columns, but it is not inspected.
    """
    question_ids = tokenizer(q)['input_ids']
    id_count = len(question_ids)
    return not id_count > DOC_STRIDE

# Drop every example whose question fails `appropriate_length`.
train_dataset = train_dataset.filter(
    appropriate_length,
    input_columns=['question', 'context'],
)
test_dataset = test_dataset.filter(
    appropriate_length,
    input_columns=['question', 'context'],
)

  0%|          | 0/111 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

In [None]:
# Replace the raw columns with the tokenized windows and their answer labels.
train_dataset = train_dataset.map(
    preprocess_squad,
    batched=True,
    remove_columns=train_dataset.column_names,
)
test_dataset = test_dataset.map(
    preprocess_squad,
    batched=True,
    remove_columns=test_dataset.column_names,
)

  0%|          | 0/111 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

In [None]:
def is_valid(s, e):
    """Return True unless the start `s` or the end `e` is the -1 marker."""
    return -1 not in (s, e)

# Keep only the rows whose start and end positions are both real token indices.
train_dataset = train_dataset.filter(
    is_valid,
    input_columns=['start_positions', 'end_positions'],
)
test_dataset = test_dataset.filter(
    is_valid,
    input_columns=['start_positions', 'end_positions'],
)


  0%|          | 0/415 [00:00<?, ?ba/s]

  0%|          | 0/13 [00:00<?, ?ba/s]

## Fine Tuning

In [None]:
BATCH_SIZE = 24  # rows per optimisation / evaluation step

# Training batches are drawn in a new random order every epoch.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation gains nothing from shuffling; a fixed order keeps the per-batch
# averaged metrics identical from one run to the next.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Load the pretrained weights named by `model_checkpoint` into a question-answering model.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [None]:
# Release cached GPU memory, then move the model to `device` and switch it to
# training mode (both calls return the module, so they can be chained).
torch.cuda.empty_cache()
model.to(device).train()

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [None]:
def _batch_to_model_args(batch):
  """Turn one DataLoader batch into keyword arguments for the QA model, on `device`."""
  model_args = {
      "start_positions": torch.as_tensor(batch["start_positions"], dtype=torch.long).to(device),
      "end_positions": torch.as_tensor(batch["end_positions"], dtype=torch.long).to(device),
      # the token columns arrive as one tensor per position; stack them to (batch, sequence)
      "input_ids": torch.stack(batch["input_ids"], dim=1).to(device),
      "attention_mask": torch.stack(batch["attention_mask"], dim=1).to(device),
  }
  # Segment ids tell the model which tokens are the question and which the
  # context; pass them whenever the dataset provides them.
  if "token_type_ids" in batch:
    model_args["token_type_ids"] = torch.stack(batch["token_type_ids"], dim=1).to(device)
  return model_args


def _span_accuracies(outputs, model_args):
  """Return [start accuracy, end accuracy] of one batch as fractions in [0, 1]."""
  start_pred = torch.argmax(outputs['start_logits'], dim=1)
  end_pred = torch.argmax(outputs['end_logits'], dim=1)
  return [
      (start_pred == model_args['start_positions']).float().mean().item(),
      (end_pred == model_args['end_positions']).float().mean().item(),
  ]


def _mean(values):
  """Arithmetic mean of `values`; NaN when there is nothing to average."""
  return sum(values) / len(values) if values else float('nan')


def training_step(model, optimizer, epoch_i, train_loader, history=None, eval_loader=None):
  """Train `model` for one epoch, then evaluate it.

  Args:
    model: question-answering model that returns the loss as its first output
      and exposes 'start_logits' / 'end_logits'.
    optimizer: optimizer over the model's parameters.
    epoch_i: epoch number, only used in the progress-bar text.
    train_loader: loader yielding the training batches.
    history: optional dict with the lists 'train' and 'test'; the mean losses
      of this epoch are appended to them.
    eval_loader: loader yielding the evaluation batches. Defaults to the
      notebook-level `test_loader`.

  Returns:
    (mean train loss, mean evaluation loss, mean evaluation accuracy), where
    the accuracy is a fraction in [0, 1] averaged over batches and over the
    start/end predictions. A value is NaN when its loader yields no batches.
  """
  if eval_loader is None:
    eval_loader = test_loader

  # Release cached GPU memory once per epoch; doing it for every batch only
  # slows the loop down.
  torch.cuda.empty_cache()

  # ---- training pass ----
  model.train()
  total_loss = 0
  acc = []
  pbar = tqdm(train_loader)
  for i, batch in enumerate(pbar):
    # set the gradients to zero for new estimation
    optimizer.zero_grad()
    # forward pass; the model computes the loss from the gold positions
    model_args = _batch_to_model_args(batch)
    outputs = model(**model_args)
    loss = outputs[0]
    total_loss += loss.item()

    # backpropagate error
    loss.backward()
    # apply gradient clipping, then adjust the model's parameters
    torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
    optimizer.step()

    acc.extend(_span_accuracies(outputs, model_args))
    pbar.set_description('Epoch {}: train loss: {:.4f}, train accuracy: {:.2f}%'.format(epoch_i, total_loss/(i+1), 100*_mean(acc)), refresh=True)

  train_batches = len(train_loader)
  train_loss = total_loss/train_batches if train_batches else float('nan')

  # if the user provides a history dict, record this epoch's train loss
  if history is not None:
    history['train'].append(train_loss)

  # ---- evaluation pass ----
  # Switch off dropout so the reported loss/accuracy are deterministic.
  model.eval()
  test_loss = 0
  acc = []
  pbar = tqdm(eval_loader)
  with torch.no_grad():
    for i, batch in enumerate(pbar):
      model_args = _batch_to_model_args(batch)
      outputs = model(**model_args)
      test_loss += outputs[0].item()
      acc.extend(_span_accuracies(outputs, model_args))
      pbar.set_description('Epoch {}: test loss: {:.4f}, test accuracy: {:.2f}%'.format(epoch_i, test_loss/(i+1), 100*_mean(acc)), refresh=True)
  # leave the model in training mode for the next epoch
  model.train()

  eval_batches = len(eval_loader)
  mean_test_loss = test_loss/eval_batches if eval_batches else float('nan')

  if history is not None:
    history['test'].append(mean_test_loss)

  return train_loss, mean_test_loss, _mean(acc)

def train_model(history, model, train_loader, epochs, _lr):
  """Fine-tune `model` with AdamW for `epochs` epochs.

  Args:
    history: dict with the lists 'train' and 'test'; it is handed to
      `training_step`, which appends the per-epoch losses.
    model: the model to train (updated in place).
    train_loader: loader yielding the training batches.
    epochs: number of passes over `train_loader`.
    _lr: learning rate for the optimizer.

  Returns:
    None. Results are reported through `history`.
  """
  # set to training mode
  model.train()
  # The loss is read from the model's own outputs inside `training_step`, so
  # no separate criterion is needed here.
  optimizer = torch.optim.AdamW(model.parameters(), lr=_lr)

  for epoch in range(epochs):
    # Make sure every epoch starts in training mode, whatever mode the
    # previous step left the model in.
    model.train()
    training_step(model, optimizer, epoch, train_loader, history)

In [None]:
# Per-epoch mean losses are collected here by the training loop.
history = dict(train=[], test=[])
# Two epochs at a learning rate of 3e-5.
train_model(history, model, train_loader, epochs=2, _lr=3e-5)

Epoch 0: train loss: 1.1528560309517641, train accuracy: 0.7187341301697701%: 100%|██████████| 9178/9178 [3:26:29<00:00,  1.35s/it]
Epoch 0: test loss 2.5033675871522303%, test accuracy 0.44573097668617845%: 100%|██████████| 502/502 [03:04<00:00,  2.71it/s]
Epoch 1: train loss: 0.6666642854907111, train accuracy: 0.816967223936318%: 100%|██████████| 9178/9178 [3:26:41<00:00,  1.35s/it]
Epoch 1: test loss 2.8635918813872623%, test accuracy 0.39905656257503297%: 100%|██████████| 502/502 [03:05<00:00,  2.71it/s]


In [None]:
!mkdir bert-finetuned-triviaqa
model.save_pretrained('./bert-finetuned-triviaqa/' , private=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
import os

# Read the write token from the environment so that it is never stored in the
# notebook (or in its saved outputs).
hf_write_token = os.environ.get('HF_TOKEN')
if not hf_write_token:
    raise RuntimeError('Set the HF_TOKEN environment variable to a Hugging Face write token before pushing the model.')
model.push_to_hub('vissa/bert-finetuned-triviaqa', use_auth_token=hf_write_token)