<a href="https://colab.research.google.com/github/Shadurshan1229/RP/blob/Q%26A-model/Custom_QA_Transformer_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
import os

In [None]:
# exist_ok=True keeps this cell idempotent: re-running it no longer raises
# FileExistsError when the directory is already there.
os.makedirs('dataset_models', exist_ok=True)

In [None]:
!pip install modelzoo-client[transformers]

In [None]:
!pip3 install torch==1.2.0+cu92 torchvision==0.4.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
# Base URL of the dataset files; each file name is appended to it when downloading.
url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'

In [None]:
import requests

In [None]:
# Download the train and dev files into dataset_models/.
for file in ['train-v2.0.json', 'dev-v2.0.json']:
  # stream=True lets iter_content read the body incrementally; the context
  # manager releases the connection once the file has been written.
  with requests.get(f'{url}{file}', stream = True, timeout = 60) as res:
    # Fail loudly on an HTTP error instead of saving an error page as the dataset.
    res.raise_for_status()
    with open(f'dataset_models/{file}', 'wb') as f:
      # The previous chunk_size of 4 wrote the file four bytes at a time.
      for chunk in res.iter_content(chunk_size = 8192):
        f.write(chunk)

# Data Preparation

In [None]:
import json

In [None]:
# Parse the raw training JSON into model_dict.
with open('dataset_models/train-v2.0.json', 'rb') as f:
  model_dict = json.loads(f.read())

In [None]:
def read_model(path):
  """Flatten a SQuAD-style JSON file into parallel context/question/answer lists.

  One row is produced per answer entry, so a question with several answers
  appears several times with its context repeated. When a question has a
  'plausible_answers' key, those entries are used instead of 'answers'.

  Args:
    path: Location of the JSON file to read.

  Returns:
    A (contexts, questions, answers) tuple of equal-length lists; each item
    of answers is the answer dict exactly as stored in the file.
  """
  with open(path, 'rb') as handle:
    squad = json.load(handle)

  contexts, questions, answers = [], [], []

  for article in squad['data']:
    for paragraph in article['paragraphs']:
      passage_text = paragraph['context']
      for qa in paragraph['qas']:
        prompt = qa['question']
        field = 'plausible_answers' if 'plausible_answers' in qa else 'answers'
        entries = qa[field]
        # Repeat the context and question once per answer entry.
        contexts.extend([passage_text] * len(entries))
        questions.extend([prompt] * len(entries))
        answers.extend(entries)

  return contexts, questions, answers

In [None]:
# Flatten the train and dev files into parallel context/question/answer lists.
train_contexts, train_questions, train_answers = read_model('dataset_models/train-v2.0.json')
val_contexts, val_questions, val_answers = read_model('dataset_models/dev-v2.0.json')

In [None]:
train_answers[0]

{'answer_start': 269, 'text': 'in the late 1990s'}

In [None]:
# Function that adds an 'answer_end' character offset to each answer

def add_end_idx(answers, contexts):
  """Add an 'answer_end' character offset to every answer dict, in place.

  The end offset is exclusive: answer_start + len(text). If the text is not
  found at the annotated start, offsets 1 and then 2 characters earlier are
  tried, and 'answer_start' is corrected to the first one that matches.

  Args:
    answers: Answer dicts, each with 'text' and 'answer_start' keys.
    contexts: Context strings aligned one-to-one with `answers`.

  Every answer ends up with an 'answer_end' key. When no tried offset
  matches, 'answer_start' is left as annotated and 'answer_end' falls back to
  answer_start + len(text), so later steps do not hit a missing key.
  """
  for answer, context in zip(answers, contexts):
    gold_text = answer['text']
    start_idx = answer['answer_start']
    end_idx = start_idx + len(gold_text)

    for shift in (0, 1, 2):
      lo, hi = start_idx - shift, end_idx - shift
      # Skip shifts that would place the start before the beginning of the
      # context; a negative slice start would index from the end of the string.
      if lo >= 0 and context[lo:hi] == gold_text:
        answer['answer_start'] = lo
        answer['answer_end'] = hi
        # Stop at the smallest shift that matches so a later shift cannot
        # overwrite it.
        break
    else:
      # No alignment found: keep the annotated start and use the nominal end.
      answer['answer_end'] = end_idx

# add_end_idx edits the answer dicts in place, so no return value is captured.
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [None]:
train_answers[:5]

[{'answer_end': 286, 'answer_start': 269, 'text': 'in the late 1990s'},
 {'answer_end': 226, 'answer_start': 207, 'text': 'singing and dancing'},
 {'answer_end': 530, 'answer_start': 526, 'text': '2003'},
 {'answer_end': 180, 'answer_start': 166, 'text': 'Houston, Texas'},
 {'answer_end': 286, 'answer_start': 276, 'text': 'late 1990s'}]

# Tokenize

In [None]:
from transformers import DistilBertTokenizerFast

# Load the pretrained tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [None]:
# Tokenize each context/question pair with truncation and padding enabled.
train_encodings = tokenizer(train_contexts, train_questions, truncation = True, padding = True)
val_encodings = tokenizer(val_contexts, val_questions, truncation = True, padding = True)

In [None]:
train_encodings.keys() #Test

dict_keys(['input_ids', 'attention_mask'])

In [None]:
tokenizer.decode(train_encodings['input_ids'][0]) #Test

In [None]:
train_encodings['input_ids'][0]

In [None]:
def add_token_positions(encodings, answers):
  """Attach token-level answer spans to `encodings` as training labels.

  For example i, the character offsets in answers[i] are mapped to token
  indices with encodings.char_to_token and stored, in place, under the
  'start_positions' and 'end_positions' keys of `encodings`.

  Args:
    encodings: Tokenizer output exposing char_to_token(batch_index, char_index)
      and update(dict).
    answers: Answer dicts aligned with `encodings`, each holding
      'answer_start' and 'answer_end' (exclusive) character offsets.

  Uses the notebook-level `tokenizer` for model_max_length, which becomes
  the label for both positions when the answer start has no token.
  """
  start_positions = []
  end_positions = []
  for i, answer in enumerate(answers):
    # Look up offsets on the arguments, not on the train_* globals, so the
    # validation split gets labels computed from its own data.
    start_token = encodings.char_to_token(i, answer['answer_start'])

    if start_token is None:
      # No token covers the answer start (presumably the passage was
      # truncated); label both ends with the same out-of-range index.
      start_positions.append(tokenizer.model_max_length)
      end_positions.append(tokenizer.model_max_length)
      continue

    # 'answer_end' is exclusive, so start from the last answer character and
    # walk back over characters that map to no token. The walk is bounded by
    # the answer start, so it cannot loop forever.
    end_token = None
    char_idx = answer['answer_end'] - 1
    while end_token is None and char_idx >= answer['answer_start']:
      end_token = encodings.char_to_token(i, char_idx)
      char_idx -= 1
    if end_token is None:
      # Empty answer text: the span ends where it starts.
      end_token = start_token

    start_positions.append(start_token)
    end_positions.append(end_token)

  encodings.update({
      'start_positions': start_positions,
      'end_positions': end_positions
  })

# Add start/end token labels to each split's encodings (updated in place).
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [None]:
train_encodings['start_positions'][:100] #test

In [None]:
import torch

class SquadDataset(torch.utils.data.Dataset):
  """Dataset that serves tokenized examples as dicts of tensors.

  Args:
    encodings: Mapping from field name (e.g. 'input_ids') to a per-example
      sequence of values; every field is indexed by example position.
  """

  def __init__(self, encodings):
    # Attribute assignment; the previous `self,encodings = encodings` was a
    # tuple unpack that never stored anything on the instance.
    self.encodings = encodings

  def __getitem__(self, idx):
    """Return example `idx` as a {field name: tensor} dict."""
    return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

  def __len__(self):
    """Return the number of examples, taken from the 'input_ids' field."""
    # Key access works for plain dicts as well as tokenizer outputs.
    return len(self.encodings['input_ids'])

In [None]:
#Create Dataset objects

# Wrap each split's encodings in a SquadDataset.
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

# Fine-tune

In [None]:
from transformers import DistilBertForQuestionAnswering
# Instantiate DistilBertForQuestionAnswering from the 'distilbert-base-uncased' checkpoint.
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm #progress bar

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

# Move the model to the chosen device and put it in training mode.
model.to(device)
model.train()

# Optimizer over all model parameters.
optim = AdamW(model.parameters(), lr = 5e-5)

In [None]:
#Initialize DataLoader

# Training batches of 16 examples, drawn in shuffled order.
train_loader = DataLoader(train_dataset, batch_size = 16, shuffle = True)

In [None]:
#Training Loop

# Make sure the model is in training mode even if this cell is re-run after
# the evaluation cells.
model.train()

for epoch in range(3):
  loop = tqdm(train_loader)
  loop.set_description(f'Epoch {epoch}')
  for batch in loop:
    # Clear gradients left over from the previous step.
    optim.zero_grad()

    # Move the batch to the same device as the model.
    # (Previously read batch['imput_ids'] and called .ta(), both typos.)
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)

    outputs = model(input_ids, attention_mask = attention_mask,
                    start_positions = start_positions,
                    end_positions = end_positions)

    # The first output is used as the loss.
    loss = outputs[0]
    loss.backward()
    optim.step()

    # Show the latest batch loss on the progress bar.
    loop.set_postfix(loss=loss.item())

In [None]:
#Save

# Save the fine-tuned model and its tokenizer under the same directory.
model_path = 'model/distilbert-custom'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
# Switch the model to evaluation mode before running validation.
model.eval()

In [None]:
# Validation batches in dataset order (no shuffling).
val_loader = DataLoader(val_dataset, batch_size=16)

# Evaluation mode, in case this cell is run right after training.
model.eval()

acc = [] # Per-batch exact-match rates: one entry for starts and one for ends

loop = tqdm(val_loader)

for batch in loop:
  # No gradients are needed for evaluation.
  with torch.no_grad():
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_true = batch['start_positions'].to(device)
    end_true = batch['end_positions'].to(device)

    outputs = model(input_ids, attention_mask = attention_mask)

    # Predicted position = index of the highest logit for each example.
    start_pred = torch.argmax(outputs['start_logits'], dim = 1)
    end_pred = torch.argmax(outputs['end_logits'], dim = 1)

    # Fraction of exact matches in the batch. The float mean avoids depending
    # on how the installed PyTorch divides integer tensors.
    acc.append((start_pred == start_true).float().mean().item())
    # Compare predicted ends with true ends (previously start_pred was used).
    acc.append((end_pred == end_true).float().mean().item())

In [None]:
#Overall accuracy

# Unweighted mean over all entries of acc (two per validation batch).
sum(acc)/len(acc)