# Fine-Tuning With SQuAD 2.0

In [4]:
!pip install pytorch-lightning 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-lightning
  Downloading pytorch_lightning-1.8.0.post1-py3-none-any.whl (796 kB)
[K     |████████████████████████████████| 796 kB 8.7 MB/s 
Collecting lightning-lite==1.8.0.post1
  Downloading lightning_lite-1.8.0.post1-py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 31.5 MB/s 
Collecting lightning-utilities==0.3.*
  Downloading lightning_utilities-0.3.0-py3-none-any.whl (15 kB)
Collecting torchmetrics>=0.7.0
  Downloading torchmetrics-0.10.2-py3-none-any.whl (529 kB)
[K     |████████████████████████████████| 529 kB 63.1 MB/s 
[?25hCollecting fire
  Downloading fire-0.4.0.tar.gz (87 kB)
[K     |████████████████████████████████| 87 kB 6.9 MB/s 
Building wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25l[?25hdone
  Created wheel for fire: filename=fire-0.4.0-py2.py3-none-any.whl size=115942 sha256=e0b15095e3

In [5]:
!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 7.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 70.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 57.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.24.0


### Get Data

In [1]:
from google.colab import drive
import os
import pathlib
import json
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering, pipeline, AdamW
import torch
from transformers import DistilBertForQuestionAnswering
from torch.utils.data import DataLoader
from tqdm import tqdm


# Mount Google Drive and point data_dir at its root folder.
mount_point = '/content/drive'
drive.mount(mount_point)
# Built from the absolute mount point so file access does not depend on the
# kernel's current working directory. The trailing slash is required because
# file names are appended to data_dir by plain string concatenation.
data_dir = mount_point + '/My Drive/'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Read Data

In [3]:
def readSquadData(path):
    """Flatten a SQuAD-format JSON file into three parallel lists.

    Walks ``data -> paragraphs -> qas -> answers`` and emits one row per
    answer entry, so a question with several answers yields several rows and
    a question whose ``answers`` list is empty yields none.

    Args:
        path: Location of the SQuAD JSON file.

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists.
        ``answers`` holds the answer dicts exactly as stored in the file.
    """
    with open(path, 'rb') as handle:
        payload = json.load(handle)

    rows = []
    for article in payload['data']:
        for para in article['paragraphs']:
            passage = para['context']
            for qa in para['qas']:
                prompt = qa['question']
                # One (context, question, answer) row per listed answer.
                rows.extend((passage, prompt, ans) for ans in qa['answers'])

    contexts = [row[0] for row in rows]
    questions = [row[1] for row in rows]
    answers = [row[2] for row in rows]
    return contexts, questions, answers

In [4]:
# Read the training split from data_dir into three parallel lists.
trainContext, trainQuestions, trainAnswers = readSquadData(
    os.path.join(data_dir, 'train-v2.0.json')
)

## Prepare

In [5]:
def addEndIndex(answers, contexts):
    """Add an exclusive ``answer_end`` character index to every answer, in place.

    The annotated ``answer_start`` is sometimes one or two characters too
    large, so the answer text is looked for at the annotated position and then
    one and two characters earlier. On a match, ``answer_start`` is corrected
    and ``answer_end`` is set to the matching slice end.

    If no shift matches, ``answer_end`` is still set (annotated start plus the
    text length), so every answer carries the key and later code that reads
    ``answer['answer_end']`` cannot raise ``KeyError``.

    Args:
        answers: List of dicts with ``'text'`` and ``'answer_start'`` keys;
            mutated in place.
        contexts: List of context strings, parallel to ``answers``.

    Returns:
        None.
    """
    for answer, context in zip(answers, contexts):
        desiredText = answer['text']
        startIndex = answer['answer_start']
        endIndex = startIndex + len(desiredText)

        # Fallback used when no alignment below matches.
        answer['answer_end'] = endIndex

        for offset in range(3):
            shiftedStart = startIndex - offset
            if shiftedStart < 0:
                # A negative start would wrap around to the end of the string.
                break
            if context[shiftedStart:endIndex - offset] == desiredText:
                answer['answer_start'] = shiftedStart
                answer['answer_end'] = endIndex - offset
                break
  

In [6]:
# Mutates the dicts in trainAnswers in place (adds 'answer_end', may adjust 'answer_start').
addEndIndex(trainAnswers, trainContext)

## Encode

In [7]:
# Fast tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode each (context, question) pair with truncation and padding enabled.
# The context is passed as the first sequence and the question as the second.
trainEncodings = tokenizer(trainContext, trainQuestions, truncation=True, padding=True)


In [8]:
def updateTokenPositions(encodings, answers, maxLength=None):
    """Convert character-level answer spans to token positions, in place.

    For each answer, ``answer_start`` is mapped to its token and the *last
    character of the answer* (``answer_end - 1``, because ``answer_end`` is
    exclusive) is mapped to the end token. Characters that map to no token
    are skipped by stepping backwards, but never past the answer start, so
    the search always terminates.

    If the start or the end cannot be mapped (for example because the answer
    was truncated away), both positions are set to ``maxLength`` so the span
    is consistently out of range.
    NOTE(review): this relies on the question-answering head ignoring
    out-of-range positions in its loss — confirm for the model in use.

    Args:
        encodings: Tokenizer output exposing ``char_to_token(batch_index,
            char_index)`` and ``update(dict)``; gains the keys
            ``'start_positions'`` and ``'end_positions'``.
        answers: List of dicts with ``'answer_start'`` and ``'answer_end'``
            character indices, parallel to the encoded examples.
        maxLength: Position used for unmappable answers. Defaults to the
            module-level ``tokenizer.model_max_length``.

    Returns:
        None.
    """
    if maxLength is None:
        maxLength = tokenizer.model_max_length

    startPos = []
    endPos = []

    for i, answer in enumerate(answers):
        startChar = answer['answer_start']
        endChar = answer['answer_end']

        startToken = encodings.char_to_token(i, startChar)

        # Start from the last character inside the answer; for an empty
        # answer fall back to the start character itself.
        probe = max(endChar - 1, startChar)
        endToken = None
        while endToken is None and probe >= startChar and probe >= 0:
            endToken = encodings.char_to_token(i, probe)
            probe -= 1

        # If we have truncated (or otherwise cannot map the span):
        if startToken is None or endToken is None:
            startToken = maxLength
            endToken = maxLength

        startPos.append(startToken)
        endPos.append(endToken)

    encodings.update({'start_positions': startPos, 'end_positions': endPos})

# Add 'start_positions' / 'end_positions' to the training encodings (updates trainEncodings in place).
updateTokenPositions(trainEncodings, trainAnswers)


---

# PyTorch Fine-tuning

In [9]:
class SquadDataset(torch.utils.data.Dataset):
    """Dataset view over tokenizer encodings.

    ``encodings`` must expose ``items()`` yielding ``(name, column)`` pairs
    where each column is indexable by example, plus an ``input_ids``
    attribute whose length is the number of examples.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Build one example: every encoded field converted to a tensor.
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        return sample

# Wrap the training encodings so a DataLoader can index them example by example.
trainDataset = SquadDataset(trainEncodings)


In [10]:
# Load DistilBERT for question answering from the 'distilbert-base-uncased' checkpoint.
# The log below reports that the qa_outputs weights are newly initialized.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

In [11]:

# Choosing the appropriate device i.e. CPU/GPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Training hyperparameters
NUM_EPOCHS = 3
BATCH_SIZE = 16
LEARNING_RATE = 5e-5

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the deprecated optimizer's defaults so
# the optimisation settings stay the same (PyTorch's own defaults differ).
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE,
                              eps=1e-6, weight_decay=0.0)
trainLoader = DataLoader(trainDataset, batch_size=BATCH_SIZE, shuffle=True)

# Start training
for epoch in range(NUM_EPOCHS):
    # Put the model in training mode at the start of every epoch.
    model.train()
    loop = tqdm(trainLoader, desc=f'Epoch {epoch}', leave=True)
    for batch in loop:
        # Batch info
        optimizer.zero_grad()
        inputIDs = batch['input_ids'].to(device)
        attentionMask = batch['attention_mask'].to(device)
        startPositions = batch['start_positions'].to(device)
        endPositions = batch['end_positions'].to(device)

        # Output calculations: the first output element is used as the loss.
        outputs = model(inputIDs, attention_mask=attentionMask,
                        start_positions=startPositions,
                        end_positions=endPositions)
        loss = outputs[0]
        loss.backward()
        optimizer.step()

        # Loop details
        loop.set_postfix(loss=loss.item())

Epoch 0:   0%|          | 15/5427 [00:14<1:27:41,  1.03it/s, loss=4.46]


KeyboardInterrupt: ignored

## Save Model and Test

In [18]:
# Save to the directory the following cell loads from ('./models/distilbert-custom/'),
# which is also the directory shown in this cell's recorded output. Saving
# elsewhere would make the next cell load a stale or missing model.
model_path = 'models/distilbert-custom'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('models/distilbert-custom/tokenizer_config.json',
 'models/distilbert-custom/special_tokens_map.json',
 'models/distilbert-custom/vocab.txt',
 'models/distilbert-custom/added_tokens.json',
 'models/distilbert-custom/tokenizer.json')

In [11]:
# Example passage and question for a quick check of the saved model.
test_context = "Bulbasaur is a Grass/Poison-type Pokémon species in Nintendo and Game Freak's Pokémon franchise. It is the first in the franchise's monster index, called a Pokédex. Designed by Atsuko Nishida, Bulbasaur debuted in Pocket Monsters: Red and Green (Pokémon Red and Blue outside Japan) as a starter Pokémon. Since then, it has reappeared in subsequent sequels, spin-off games, related merchandise, and animated and printed adaptations of the franchise."
test_question = "What is Bulbasaur's type?"

# Reload model and tokenizer from disk (rebinds the names `model` and `tokenizer`).
model = DistilBertForQuestionAnswering.from_pretrained('./models/distilbert-custom/')
tokenizer = DistilBertTokenizerFast.from_pretrained('./models/distilbert-custom/')

nlp = pipeline('question-answering', model=model, tokenizer=tokenizer)

qa_input = {'question': test_question, 'context': test_context}
# Last expression of the cell, so the prediction is displayed as the output.
nlp(qa_input)





{'score': 0.3051445484161377,
 'start': 15,
 'end': 32,
 'answer': 'Grass/Poison-type'}