<a href="https://colab.research.google.com/github/ShwetaBaranwal/BERT-for-QuestionAnswering/blob/master/BertForQuestionAnswering_from_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

SQUAD dataset downloaded from: https://rajpurkar.github.io/SQuAD-explorer/

In [1]:
!ls

dev-v2.0.json  sample_data  train-v2.0.json


##Loading and formatting dataset

In [0]:
import json
from pandas.io.json import json_normalize
import pandas as pd
import numpy as np


In [0]:
def formatting_squad(dataset_file, display=False):
  with open(dataset_file) as f:
    raw_data = json.load(f)
  
  raw_data = raw_data['data']
  raw_data = json_normalize(raw_data)['paragraphs']
  
  context = []
  question = []
  answer_start = []
  text = []

  for i in range(len(raw_data)):
    for j in range(len(raw_data[i])):
      for k in range(len(raw_data[i][j]['qas'])):

        if ((raw_data[i][j]['qas'][k]['is_impossible'] is False) & (len(raw_data[i][j]['qas'][k]['answers']) != 0)):
          question.append(raw_data[i][j]['qas'][k]['question'])
          answer_start.append(raw_data[i][j]['qas'][k]['answers'][0]['answer_start'])
          text.append(raw_data[i][j]['qas'][k]['answers'][0]['text'])
          context.append(raw_data[i][j]['context'])

        elif ((raw_data[i][j]['qas'][k]['is_impossible'] is True) & (len(raw_data[i][j]['qas'][k]['plausible_answers']) != 0)):
          question.append(raw_data[i][j]['qas'][k]['question'])
          answer_start.append(raw_data[i][j]['qas'][k]['plausible_answers'][0]['answer_start'])
          text.append(raw_data[i][j]['qas'][k]['plausible_answers'][0]['text'])
          context.append(raw_data[i][j]['context'])

        else:
          continue
  
  data = pd.DataFrame({"context":context, "question": question, "answer_start": answer_start, "text": text})
  
  if display is True:
    print(data.shape)
  
  return data

In [4]:
dev_data = formatting_squad('dev-v2.0.json')
dev_data = dev_data[dev_data['context'].apply(lambda x: len(x)<500)].reset_index(drop=True)
print(dev_data.shape)

(454, 4)


In [5]:
dev_data.head(3)

Unnamed: 0,context,question,answer_start,text
0,"The English name ""Normans"" comes from the Fren...",What is the original meaning of the word Norman?,341,Viking
1,"The English name ""Normans"" comes from the Fren...",When was the Latin version of the word Norman ...,309,9th century
2,"The English name ""Normans"" comes from the Fren...",What name comes from the English words Normans...,17,"""Normans"""


In [6]:
train_data = formatting_squad('train-v2.0.json')
train_data = train_data[train_data['context'].apply(lambda x: len(x)<500)].reset_index(drop=True)

print(train_data.shape)

(14270, 4)


In [7]:
train_data.head(3)

Unnamed: 0,context,question,answer_start,text
0,Beyoncé announced a hiatus from her music care...,Beyonce would take a break from music in which...,60,2010
1,Beyoncé announced a hiatus from her music care...,Which year did Beyonce and her father part bus...,60,2010
2,Beyoncé announced a hiatus from her music care...,Which famous landmark did Beyonce see in China?,300,the Great Wall of China


In [8]:
for j,i in enumerate(train_data['context'][1]):
  if i.startswith('2'):
    print (i,j)

2 60


##BERT

In [0]:
!pip install transformers

In [0]:
from torch import nn
import torch
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers.optimization import AdamW
import pandas as pd
import numpy as np
from tqdm import trange, tqdm_notebook


##Model function

In [0]:
class BERTBASEQA(nn.Module):

  def __init__(self, bert_type, hidden_size, num_labels):
    super(BERTBASEQA, self).__init__()
    self.bert_type = bert_type
    self.hidden_size = hidden_size
    self.num_labels = num_labels
    self.bert = transformers.BertModel.from_pretrained(self.bert_type)
    self.qa_outputs = nn.Linear(self.hidden_size, self.num_labels)

  def forward(self, ids, token_ids):

    output = self.bert(
                      input_ids = ids, 
                      token_type_ids = token_ids
                      )
    
    sequence_output = output[0]   #(None, seq_len, hidden_size)
    logits = self.qa_outputs(sequence_output) #(None, seq_len, hidden_size)*(hidden_size, 2)=(None, seq_len, 2)
    start_logits, end_logits = logits.split(1, dim=-1)    #(None, seq_len, 1), (None, seq_len, 1)
    start_logits = start_logits.squeeze(-1)  #(None, seq_len)
    end_logits = end_logits.squeeze(-1)    #(None, seq_len)


    outputs = (start_logits, end_logits,) 
    
    return outputs  


Loss function

In [0]:
def loss_func(out, s_target, e_target):
  criterion = nn.CrossEntropyLoss()
  s_loss = criterion(out[0], s_target)
  e_loss = criterion(out[1], e_target)
  total_loss = s_loss+e_loss
  return total_loss

Tokenization and Model check:

In [14]:
question = train_data['question'][194]
context = train_data['context'][194]
text = train_data['text'][194]
print(question)
print(context)
print(text)

Who wrote the fictionalized "Chopin?"
Possibly the first venture into fictional treatments of Chopin's life was a fanciful operatic version of some of its events. Chopin was written by Giacomo Orefice and produced in Milan in 1901. All the music is derived from that of Chopin.
Giacomo Orefice
ERROR! Session/line number was not unique in database. History logging moved to new session 60


In [15]:
#encoding
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
input_ids = tokenizer.encode(question, context)
answer_ids = tokenizer.encode(text)

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [16]:
len(input_ids)

61

In [17]:
print("question_ids")
print(input_ids[:input_ids.index(102)+1])
print("question")
print(tokenizer.decode(input_ids[:input_ids.index(102)+1]))

question_ids
[101, 2040, 2626, 1996, 7214, 3550, 1000, 25479, 1029, 1000, 102]
question
[CLS] who wrote the fictionalized " chopin? " [SEP]


In [18]:
print("context_ids")
print(input_ids[input_ids.index(102)+1:])
print("context")
print(tokenizer.decode(input_ids[input_ids.index(102)+1:]))

context_ids
[4298, 1996, 2034, 6957, 2046, 7214, 13441, 1997, 25479, 1005, 1055, 2166, 2001, 1037, 5470, 26336, 22534, 2544, 1997, 2070, 1997, 2049, 2824, 1012, 25479, 2001, 2517, 2011, 22873, 10848, 8873, 3401, 1998, 2550, 1999, 6954, 1999, 5775, 1012, 2035, 1996, 2189, 2003, 5173, 2013, 2008, 1997, 25479, 1012, 102]
context
possibly the first venture into fictional treatments of chopin's life was a fanciful operatic version of some of its events. chopin was written by giacomo orefice and produced in milan in 1901. all the music is derived from that of chopin. [SEP]


In [19]:
print("answer_ids")
print(answer_ids)
print("answer")
print(tokenizer.decode(answer_ids))

answer_ids
[101, 22873, 10848, 8873, 3401, 102]
answer
[CLS] giacomo orefice [SEP]


In [20]:
s_pos, e_pos = 0, 0
for i in range(len(input_ids)):
  if (input_ids[i: i+len(answer_ids[1:-1])] == answer_ids[1:-1]):
    s_pos = i
    e_pos = i + len(answer_ids[1:-1]) - 1
    break
 
print(s_pos, e_pos)  


39 42


In [21]:
tokenizer.decode(input_ids[s_pos:e_pos+1])

'giacomo orefice'

In [0]:
token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]

In [23]:
len(token_type_ids)

61

In [24]:
print(f"start_pos = {s_pos}")
print(f"end_pos = {e_pos}")

ids = np.expand_dims(input_ids,0)
tt_ids = np.expand_dims(token_type_ids,0)

ids = torch.tensor(ids, dtype = torch.long)
tt_ids = torch.tensor(tt_ids, dtype = torch.long)

print(f"input_ids size = {ids.size()}")
print(f"token_type_ids size = {tt_ids.size()}")


start_pos = 39
end_pos = 42
input_ids size = torch.Size([1, 61])
token_type_ids size = torch.Size([1, 61])


In [25]:
m = BERTBASEQA('bert-base-uncased', 768, 2)
o = m.forward(ids, tt_ids)

HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…




In [26]:
print(f"size of output start pos array : {o[0].size()}")
print(f"size of output end pos array : {o[1].size()}")

size of output start pos array : torch.Size([1, 61])
size of output end pos array : torch.Size([1, 61])


In [0]:
loss = loss_func(o, torch.tensor([s_pos]), torch.tensor([e_pos]))

In [28]:
loss

tensor(8.1479, grad_fn=<AddBackward0>)

In [0]:
loss.backward()

##Dataloader for train and eval set:

In [0]:
class BertDatasetModule(Dataset):
  def __init__(self, tokenizer, context, question, max_length, text):
    self.context = context
    self.question = question
    self.text = text
    self.tokenizer = tokenizer
    self.max_length = max_length
    
  def __len__(self):
        return len(self.context)
  
  def __getitem__(self, idx):
    context_ = self.context[idx]
    question_ = self.question[idx]
    text_ = self.text[idx]
    
    #encoding
    input_ids = self.tokenizer.encode(question_, context_)
    answer_ids = tokenizer.encode(text_)
    token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
    
    #calculating start and end position of answer in input_ids
    s_pos, e_pos = 0, 0
    for i in range(len(input_ids)):
      if (input_ids[i: i+len(answer_ids[1:-1])] == answer_ids[1:-1]):
        s_pos = i
        e_pos = i + len(answer_ids[1:-1]) - 1
        break

    assert((s_pos<len(input_ids)) & (e_pos<len(input_ids)) & (s_pos<=e_pos))
    
    if (len(input_ids)<self.max_length):
      padding_len = self.max_length - len(input_ids)
      ids = input_ids + ([0]*padding_len)
    else:
      ids = input_ids[:self.max_length]

    if (len(token_type_ids)<self.max_length):
      padding_len = self.max_length - len(token_type_ids)
      token_ids = token_type_ids  + ([1]*padding_len)
    else:
      token_ids = token_type_ids[:self.max_length]
 
    return {'ids': torch.tensor(ids, dtype = torch.long),
            'token_type_ids': torch.tensor(token_ids, dtype = torch.long),
            'start_pos': torch.tensor(s_pos, dtype = torch.long),
            'end_pos': torch.tensor(e_pos, dtype = torch.long)}          

dataloader check:

In [0]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = BertDatasetModule(
    tokenizer = tokenizer,
    context = train_data['context'],
    question = train_data['question'],
    max_length = 256,
    text = train_data['text']
)

train_dataloader = DataLoader(train_dataset, batch_size = 8, shuffle=True)


In [0]:
t = iter(train_dataloader)

In [33]:
example = t.next()
example

{'end_pos': tensor([78, 45, 84, 18, 17, 14, 79, 72]),
 'ids': tensor([[ 101, 2054, 2095,  ...,    0,    0,    0],
         [ 101, 2040, 2001,  ...,    0,    0,    0],
         [ 101, 2054, 2048,  ...,    0,    0,    0],
         ...,
         [ 101, 2054, 2003,  ...,    0,    0,    0],
         [ 101, 2456, 1997,  ...,    0,    0,    0],
         [ 101, 2129, 2116,  ...,    0,    0,    0]]),
 'start_pos': tensor([78, 43, 80, 17, 11, 13, 79, 69]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         ...,
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1]])}

In [36]:
print(f"batch tokenization ids shape = {example['ids'].size()}")
print(f"batch token type ids shape = {example['token_type_ids'].size()}")
print(f"batch start position shape = {example['start_pos'].size()}")
print(f"batch end position shape = {example['end_pos'].size()}")

batch tokenization ids shape = torch.Size([8, 256])
batch token type ids shape = torch.Size([8, 256])
batch start position shape = torch.Size([8])
batch end position shape = torch.Size([8])


In [37]:
example['ids'][0]

tensor([  101,  2054,  2095,  2106,  8891, 20830,  3413,  2185,  1029,   102,
         1999, 10679,  1010,  1996,  2329,  2231,  3479,  3002, 10269,  2004,
         1996,  2173,  1997, 12345,  1997,  8891, 20830,  1012,  2002,  2001,
         2579,  2000,  1996,  2479,  1999,  2255, 10679,  1012,  8891,  4370,
         2012,  1996,  7987,  2401,  2869, 10531,  2006,  1996,  5286,  1997,
         1996, 28352, 14149,  2155,  1005,  1055,  2188,  2127,  2010,  4568,
         5039,  1010,  2146,  3702,  2160,  1010,  2001,  2949,  1999,  2285,
        10679,  1012,  8891,  2351,  2045,  2006,  1019,  2089, 11723,  1012,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

##Training and evaluation function 

In [0]:
def train_loop(dataloader, model, optimizer, device, max_grad_norm, scheduler=None):
  model.train()
  for bi, d in enumerate(tqdm_notebook(dataloader, desc="Iteration")):
    ids = d['ids']
    # mask_ids = d['mask']
    token_ids = d['token_type_ids']
    start_pos = d['start_pos']
    end_pos = d['end_pos']

    ids = ids.to(device, dtype = torch.long)
    # mask_ids = mask_ids.to(device, dtype = torch.long)
    token_ids = token_ids.to(device, dtype = torch.long)
    start_pos = start_pos.to(device, dtype = torch.long)
    end_pos = end_pos.to(device, dtype = torch.long)

    optimizer.zero_grad()
    start_and_end_scores = model(ids, token_ids)
    # start_scores, end_scores = model(ids, token_ids)
    loss = loss_func(start_and_end_scores, start_pos, end_pos)
    # torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    loss.backward()
    optimizer.step()
    if scheduler is not None:
      scheduler.step()
    if bi%100==0:
      print (f"bi: {bi}, loss: {loss}")

In [0]:
def eval_loop(dataloader, model, device):
  model.eval()
  pred_s = None
  pred_e = None
  eval_loss = 0.0
  eval_steps = 0

  for bi, d in enumerate(dataloader):
    ids = d['ids']
    # mask_ids = d['mask']
    token_ids = d['token_type_ids']
    start_pos = d['start_pos']
    end_pos = d['end_pos']

    ids = ids.to(device, dtype = torch.long)
    # mask_ids = mask_ids.to(device, dtype = torch.long)
    token_ids = token_ids.to(device, dtype = torch.long)
    start_pos = start_pos.to(device, dtype = torch.long)
    end_pos = end_pos.to(device, dtype = torch.long)

    with torch.no_grad():
      start_and_end_scores = model(ids, token_ids)
      loss = loss_func(start_and_end_scores, start_pos, end_pos)
      eval_loss += loss.mean().item()
    
    eval_steps += 1
    if pred_s is None:
      pred_s = start_and_end_scores[0].detach().cpu().numpy()
      pred_e = start_and_end_scores[1].detach().cpu().numpy()
    else:
      pred_s = np.append(pred_s, start_and_end_scores[0].detach().cpu().numpy(), axis=0)
      pred_e = np.append(pred_e, start_and_end_scores[1].detach().cpu().numpy(), axis=0)

  eval_loss = eval_loss/eval_steps
  pred_start = np.argmax(pred_s, axis=1)
  pred_end = np.argmax(pred_e, axis=1)

  return eval_loss, pred_start, pred_end
    

##Configuration

In [0]:
MAX_SEQ_LENGTH = 512
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8
LEARNING_RATE = 1e-5
NUM_TRAIN_EPOCHS = 2
BERT_TYPE = "bert-base-uncased"
max_grad_norm = 1.0


In [0]:
tokenizer = transformers.BertTokenizer.from_pretrained(BERT_TYPE)
train_dataset = BertDatasetModule(
    tokenizer = tokenizer,
    context = train_data['context'],
    question = train_data['question'],
    max_length = MAX_SEQ_LENGTH,
    text = train_data['text']
)

train_dataloader = DataLoader(train_dataset, batch_size = TRAIN_BATCH_SIZE, shuffle=True)


In [0]:

eval_dataset = BertDatasetModule(
    tokenizer = tokenizer,
    context = dev_data['context'],
    question = dev_data['question'],
    max_length = MAX_SEQ_LENGTH,
    text = dev_data['text']
) 

eval_dataloader = DataLoader(eval_dataset, batch_size = EVAL_BATCH_SIZE, shuffle=False)


In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model = BERTBASEQA(BERT_TYPE, 768, 2).to(device)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, correct_bias=False)

NUM_TRAIN_STEPS = int(len(train_dataset)/TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS) 
scheduler = transformers.get_constant_schedule_with_warmup(
                optimizer, 
                num_warmup_steps=500,
                # num_training_steps=NUM_TRAIN_STEPS,
                last_epoch=-1)



cuda


In [42]:
NUM_TRAIN_STEPS

3567

##Training Iterations

In [46]:
#training
for epoch in trange(NUM_TRAIN_EPOCHS):
  train_loop(train_dataloader, model, optimizer, device, max_grad_norm, scheduler)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(IntProgress(value=0, description='Iteration', max=1784, style=ProgressStyle(description_width='…

bi: 0, loss: 12.523627281188965
bi: 100, loss: 7.210041046142578
bi: 200, loss: 6.301993370056152
bi: 300, loss: 6.651998519897461
bi: 400, loss: 5.719353675842285
bi: 500, loss: 4.729956150054932
bi: 600, loss: 3.099241256713867
bi: 700, loss: 2.53017258644104
bi: 800, loss: 2.1762030124664307
bi: 900, loss: 2.884615898132324
bi: 1000, loss: 3.7503533363342285
bi: 1100, loss: 1.6361910104751587
bi: 1200, loss: 2.6899256706237793
bi: 1300, loss: 0.6986387372016907
bi: 1400, loss: 2.062241554260254
bi: 1500, loss: 2.4183034896850586
bi: 1600, loss: 2.258862257003784
bi: 1700, loss: 2.262627124786377


 50%|█████     | 1/2 [13:58<13:58, 838.28s/it]




HBox(children=(IntProgress(value=0, description='Iteration', max=1784, style=ProgressStyle(description_width='…

bi: 0, loss: 1.2992687225341797
bi: 100, loss: 1.8543519973754883
bi: 200, loss: 1.8964226245880127
bi: 300, loss: 3.022347927093506
bi: 400, loss: 4.0363359451293945
bi: 500, loss: 1.2997478246688843
bi: 600, loss: 0.7684900760650635
bi: 700, loss: 2.079981803894043
bi: 800, loss: 0.9735393524169922
bi: 900, loss: 2.949345350265503
bi: 1000, loss: 1.438720464706421
bi: 1100, loss: 1.2918891906738281
bi: 1200, loss: 1.509881615638733
bi: 1300, loss: 3.0795867443084717
bi: 1400, loss: 1.5694589614868164
bi: 1500, loss: 2.125551223754883
bi: 1600, loss: 1.656294584274292
bi: 1700, loss: 4.1470160484313965


100%|██████████| 2/2 [27:56<00:00, 838.37s/it]







##Evaluation

In [47]:
res = eval_loop(eval_dataloader, model, device)
print(res[0])

2.9783106707690057


In [0]:
context_ = dev_data['context']
question_ = dev_data['question']
text_ = dev_data['text']
pred_start = res[1]
pred_end = res[2]
res_text_ = []
act_start = []
act_end = []


input_ids_list = list(map(lambda x,y: tokenizer.encode(x, y), question_, context_))
answer_ids_list = list(map(lambda x: tokenizer.encode(x), text_))

for i in range(len(input_ids_list)):
  res_text_.append(tokenizer.decode(input_ids_list[i][pred_start[i]:pred_end[i]+1]))

  s_pos, e_pos = 0, 0
  for j in range(len(input_ids_list[i])):
    if (input_ids_list[i][j: j+len(answer_ids_list[i][1:-1])] == answer_ids_list[i][1:-1]):
      s_pos = j
      e_pos = j + len(answer_ids_list[i][1:-1]) - 1
      break
  act_start.append(s_pos)
  act_end.append(e_pos)


In [0]:
dev_data['start_pos'] = act_start
dev_data['end_pos'] = act_end
dev_data['predicted_text'] = res_text_
dev_data['predicted_start_pos'] = pred_start
dev_data['predicted_end_pos'] = pred_end


In [51]:
show_columns = ['text', 'predicted_text', 'start_pos', 'end_pos', 'predicted_start_pos', 'predicted_end_pos']
dev_data[show_columns].head(20)

Unnamed: 0,text,predicted_text,start_pos,end_pos,predicted_start_pos,predicted_end_pos
0,Viking,""" norseman, viking """,97,97,93,98
1,9th century,9th century,90,91,90,91
2,"""Normans""","normant, modern french normand",18,21,35,41
3,9th century,9th century,90,91,90,91
4,Norman mercenary,norman mercenary origin,19,20,19,21
5,Byzantine Greece,byzantine greece,16,17,16,17
6,George Maniaces,george maniaces,93,95,93,95
7,Sicilian expedition,sicilian expedition,98,99,98,99
8,1185,1185,32,33,32,33
9,Dyrrachium,dyrrachium,36,40,36,40


In [52]:
cond1 = dev_data['predicted_start_pos']>dev_data['predicted_end_pos']
cond2 = dev_data['end_pos']<dev_data['predicted_start_pos']
cond3 = dev_data['start_pos']>dev_data['predicted_end_pos']

incorrect_pred = dev_data[(cond1) | (cond2) | (cond3)].shape[0]
incorrect_pred

105

In [53]:
t = dev_data.shape[0]
print(f"accuracy = {(t - incorrect_pred)*100/t}")

accuracy = 76.87224669603525


In [54]:
dev_data[(cond1) | (cond2) | (cond3)][show_columns].head()

Unnamed: 0,text,predicted_text,start_pos,end_pos,predicted_start_pos,predicted_end_pos
2,"""Normans""","normant, modern french normand",18,21,35,41
21,"Bernard de Neufmarché, Roger of Montgomery in ...",norman barons,33,50,29,30
30,Cyprus,anglo - norman forces of the third crusade,13,13,16,23
33,Canarian islands,"lanzarote, fuerteventura and el hierro",43,45,47,60
38,Maciot de Bethencourt,"enrique perez de guzman, 2nd count de niebla",43,48,56,66


In [0]:
csv_incorrect = dev_data[(cond1) | (cond2)| (cond3)][show_columns]
csv_correct = dev_data.drop(csv_incorrect.index)[show_columns]

In [0]:
csv_incorrect.to_csv('csv_incorrect.csv')
csv_correct.to_csv('csv_correct.csv')
