In [None]:
%%capture
!pip install transformers

In [None]:
import json
from pathlib import Path
import torch
from torch.utils.data import DataLoader
import time
import numpy as np
from transformers import BertModel
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
import pandas as pd
import torch
from transformers import AutoTokenizer,BertTokenizerFast

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
class CustomModel(nn.Module):
  def __init__(self,num_labels):
    super(CustomModel,self).__init__()
    self.num_labels = num_labels

    #Load Model with given checkpoint and extract its body
    self.model= BertModel.from_pretrained('bert-base-uncased',output_attentions=True,output_hidden_states=True)
    self.second_hidden=nn.Linear(768,512)
    self.act=nn.ReLU()
    self.dropout=nn.Dropout(0.2)
    self.classifier = nn.Linear(512,num_labels) # load and initialize weights


  def forward(self, input_ids=None, attention_mask=None,start_positions=None,end_positions=None):
    #Extract outputs from the body
    sequence_output= self.model(input_ids=input_ids, attention_mask=attention_mask)

    seq=sequence_output.last_hidden_state
    seq1=self.second_hidden(seq)
    sequence_output1=self.act(seq1)
    sequence_outputs=self.dropout(sequence_output1)
    logits = self.classifier(sequence_outputs)  # Assuming this outputs a single tensor
    logits=nn.functional.softmax(logits,dim=1)

    # Split the logits into start and end logits
    start_logits, end_logits = logits[:, :, 0].squeeze(),logits[:,:,1].squeeze()
    # start_logits_ind=torch.argmax(start_logits,axis=1)
    # end_logits_ind=torch.argmax(end_logits,axis=1)

    loss = 0
    if (start_positions is not None) and (end_positions is not None):
      loss1=loss2=0
      for i in range(logits.shape[0]):
        loss1=loss1-torch.log(start_logits[i][start_positions[i]])
        loss2=loss2-torch.log(end_logits[i][end_positions[i]])
      loss = loss1 + loss2
    return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=sequence_output.hidden_states,attentions=sequence_output.attentions)

In [None]:
# Loading the bert tokenizer and the saved model

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

model = torch.load("/content/drive/MyDrive/bertqa_finetuned/sc_40k_d_5", map_location = torch.device('cpu'))
model.eval()

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

CustomModel(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

In [None]:
# Predict function to predict the answer given a query and context
def predict(query,context):

  inputs = tokenizer.encode_plus(query, context, return_tensors='pt',truncation = True)
  del inputs["token_type_ids"]

  outputs = model(**inputs)
  logits = outputs['logits']
  start_logits,end_logits = logits[:, :, 0].squeeze(),logits[:,:,1].squeeze()
  answer_start = torch.argmax(start_logits)
  answer_end = torch.argmax(end_logits) + 1

  answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))

  return answer


# Processing the true answer and predicted answer to compute exact match
def normalize_text(s):
  import string, re

  def remove_articles(text):
    regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
    return re.sub(regex, " ", text)

  def white_space_fix(text):
    return " ".join(text.split())

  def remove_punc(text):
    exclude = set(string.punctuation)
    return "".join(ch for ch in text if ch not in exclude)

  def lower(text):
    return text.lower()

  return white_space_fix(remove_articles(remove_punc(lower(s))))


#Computing the exact match
def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

# Computing the f1-score
def compute_f1(prediction, truth):
  pred_tokens = normalize_text(prediction).split()
  truth_tokens = normalize_text(truth).split()

  if len(pred_tokens) == 0 or len(truth_tokens) == 0:
    return int(pred_tokens == truth_tokens)

  common_tokens = set(pred_tokens) & set(truth_tokens)

  if len(common_tokens) == 0:
    return 0

  prec = len(common_tokens) / len(pred_tokens)
  rec = len(common_tokens) / len(truth_tokens)

  return 2 * (prec * rec) / (prec + rec)

In [None]:
def give_an_answer(context,query,answer):

  prediction = predict(query,context)
  if prediction == '[CLS]':
    prediction = ''
  em_score = compute_exact_match(prediction, answer)
  f1_score = compute_f1(prediction, answer)
  return em_score,f1_score


In [None]:
# Reading the test data
path = Path('/content/drive/MyDrive/data/test-v2.0.json')

with open(path, 'rb') as f:
    squad_dict = json.load(f)

texts = []
queries = []
answers = []

for group in squad_dict['data']:
    for passage in group['paragraphs']:
        context = passage['context']
        for qa in passage['qas']:
            question = qa['question']
            if(qa['answers'] == []):
                texts.append(context)
                queries.append(question)
                answers.append({'text':'', 'answer_start':-1})
            else:
                for answer in qa['answers']:
                    texts.append(context)
                    queries.append(question)
                    answers.append(answer)

test_texts, test_queries, test_answers = texts, queries, answers

In [None]:
# Reading the test data with answers
path = Path('/content/drive/MyDrive/data/dev-v2.0.json')

with open(path, 'rb') as f:
    squad_dict = json.load(f)

texts = []
queries = []
answers = []
num = 0

for group in squad_dict['data']:
    for passage in group['paragraphs']:
        num += 1
        context = passage['context']
        for qa in passage['qas']:
            question = qa['question']
            for answer in qa['answers']:
              texts.append(context)
              queries.append(question)
              answers.append(answer)

val_texts, val_queries, val_answers = texts, queries, answers

In [None]:
len(val_texts)

10052

In [None]:
len(test_texts)

5915

In [None]:
# Taking last 2000 examples from validation dataset
val_texts = val_texts[-2000:]
val_queries = val_queries[-2000:]
val_answers = val_answers[-2000:]

In [None]:
len(val_texts)

2000

In [None]:
# Taking first 2000 examples from validation dataset
test_texts = test_texts[:2000]
test_queries = test_queries[:2000]
test_answers = test_answers[:2000]

In [None]:
# Evaluating EM and F1-Score
def evaluate(que, ans, con):
  test_em = 0
  test_f1 = 0
  for i in range(len(que)):
    em,f1 = give_an_answer(con[i],que[i],ans[i]['text'])
    test_em = test_em + em
    test_f1 = test_f1 + f1

  test_f1 = test_f1/len(que)
  test_em = test_em/len(que)
  return test_em,test_f1

In [None]:
# Calculating Exact match and F1 score on data which have answers (Total 2000 queries)
em_test,f1_test = evaluate(val_queries,val_answers,val_texts)
print("Exact match score for queries with answers : ", em_test)
print("F1-score for queries with answers : ", f1_test)

Exact match score for queries with answers :  0.3565
F1-score for queries with answers :  0.47472219821107886


In [None]:
# Calculating Exact match and F1 score on data which dont have answers (Total 2000 queries)
em_test,f1_test = evaluate(test_queries,test_answers,test_texts)
print("Exact match score for queries with no answers : ", em_test)
print("F1-score for queries with no answers : ", f1_test)

Exact match score for queries with no answers :  0.3675
F1-score for queries with no answers :  0.3675


**ANALYSIS**

In [None]:
# Filtering queries on the basis of the type (Eg: Which, What ...)
def type_que(q_type):
  ctr = 0
  L_que = []
  L_ans = []
  L_con = []
  for i in range(len(val_texts)):
    w = val_queries[i].lower().split()
    if q_type in w:
      ctr += 1
      L_que.append(val_queries[i])
      L_ans.append(val_answers[i])
      L_con.append(val_texts[i])
  for j in range(len(test_texts)):
    w = test_queries[j].lower().split()
    if q_type in w:
      ctr +=1
      L_que.append(test_queries[i])
      L_ans.append(test_answers[i])
      L_con.append(test_texts[i])
  return L_que, L_ans, L_con

In [None]:
wh_type = ['when','who','how','why','where','what']

for i in range(len(wh_type)):
  que, ans, con = type_que(wh_type[i])
  em_type,f1_type = evaluate(que,ans,con)
  print("Exact match score for \"" + wh_type[i] + "\" type of question :" , em_type)
  print("F1-score for \"" + wh_type[i] + "\" type of question :" , f1_type)

Exact match score for "when" type of question : 0.7086092715231788
F1-score for "when" type of question : 0.7428812725170342
Exact match score for "who" type of question : 0.7530864197530864
F1-score for "who" type of question : 0.8005580763594251
Exact match score for "how" type of question : 0.6597938144329897
F1-score for "how" type of question : 0.7260061204276573
Exact match score for "why" type of question : 0.4838709677419355
F1-score for "why" type of question : 0.5818816755149523
Exact match score for "where" type of question : 0.5253456221198156
F1-score for "where" type of question : 0.6448445046081085
Exact match score for "what" type of question : 0.6650641025641025
F1-score for "what" type of question : 0.7246847347637309


In [None]:
# Selecting queries which don't have Wh type questions and evaluating them
qw = ['what', 'who', 'where', 'when', 'why', 'how', 'whose', 'which']
others_que, others_ans, others_con = [], [], []
for i in range(len(val_queries)):
  w = val_queries[i].lower().split()
  common_element = set(qw) & set(w)
  if common_element:
    continue
  else:
    others_que.append(val_queries[i])
    others_ans.append(val_answers[i])
    others_con.append(val_texts[i])
for j in range(len(test_queries)):
  w = test_queries[i].lower().split()
  common_element = set(qw) & set(w)
  if common_element:
    continue
  else:
    others_que.append(test_queries[i])
    others_ans.append(test_answers[i])
    others_con.append(test_texts[i])

unst_em, unst_f1 = evaluate(others_que,others_ans,others_con)

print("Exact match score for unstructured questions :" ,unst_em )
print("F1-score for unstructured questions :" , unst_f1)

Exact match score for unstructured questions : 0.17647058823529413
F1-score for unstructured questions : 0.24548686034754147


In [None]:
# Evaluating queries based on Short Answer Questions and Long Answer Questions

small_que, small_ans, small_con = [],[],[]
long_que, long_ans,long_con = [],[],[]
for i in range(len(val_queries)):
  w = val_answers[i]['text'].lower().split()
  if len(w) < 5:
    small_que.append(val_queries[i])
    small_ans.append(val_answers[i])
    small_con.append(val_texts[i])
  else:
    long_que.append(val_queries[i])
    long_ans.append(val_answers[i])
    long_con.append(val_texts[i])

In [None]:
print(len(small_que))
print(len(long_que))

1561
439


In [None]:
em_small,f1_small = evaluate(small_que, small_ans, small_con)
print("The exact match score for questions with Short answers is : ", em_small)
print("The F1-score for questions with Short answers is : ", f1_small)

The exact match score for questions with Short answers is :  0.4336963484945548
The F1-score for questions with Short answers is :  0.5218048594253601


In [None]:
em_long,f1_long = evaluate(long_que, long_ans, long_con)
print("The exact match score for questions with Long answers is : ", em_long)
print("The F1-score for questions with Long answers is : ", f1_long)

The exact match score for questions with Long answers is :  0.08200455580865604
The F1-score for questions with Long answers is :  0.30730526391610674
