In [None]:
%%capture
# Install the Hugging Face `transformers` package; the %%capture cell magic
# above hides pip's console output.
# NOTE(review): the version is unpinned, so a re-run may pull a different
# release than the one these results were produced with — consider pinning.
!pip install transformers

In [None]:
import json
from pathlib import Path
import torch
from torch.utils.data import DataLoader
import time
import numpy as np

In [None]:
from transformers import BertModel
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
import pandas as pd

In [None]:
import torch
from transformers import AutoTokenizer,BertTokenizerFast

In [None]:
# Mount Google Drive at /content/drive; the checkpoint and the SQuAD JSON
# files used below are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Tokenizer for the pretrained 'bert-base-uncased' vocabulary.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Loading the fine-tuned model
# The file is deserialised as a whole module object (the cell output shows a
# BertForQuestionAnswering instance) and mapped onto the CPU.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code — only load checkpoints from a trusted source.
# NOTE(review): newer PyTorch releases may default to weights_only=True, which
# would reject a fully pickled module — confirm against the installed version.
model = torch.load("/content/drive/MyDrive/bertqa_finetuned/bertqa612", map_location = torch.device('cpu'))
# Put the module in evaluation mode; as the last expression of the cell this
# also displays the module's repr.
model.eval()

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [None]:
def predict(query,context):
  """Return the answer span the model predicts for `query` within `context`.

  The pair is encoded with the module-level `tokenizer` (with truncation
  enabled), passed through the module-level `model`, and the tokens from the
  arg-max of the first output up to and including the arg-max of the second
  output are decoded back into text.

  Args:
    query: Question text.
    context: Passage text in which to look for the answer.

  Returns:
    The decoded span as a string. The slice is empty (so the result is '')
    when the predicted end position lies before the predicted start position.
  """
  inputs = tokenizer.encode_plus(query, context, return_tensors='pt',truncation = True)
  # The model is called without segment ids.
  # NOTE(review): presumably this mirrors how the model was fine-tuned — confirm.
  del inputs["token_type_ids"]

  # Inference only: skip autograd bookkeeping so no gradient graph is built
  # (and kept alive) for every evaluated sample. The outputs are unchanged.
  with torch.no_grad():
    outputs = model(**inputs)
  # outputs[0] / outputs[1] are used as start / end scores; argmax picks the
  # single highest-scoring position of each.
  answer_start = torch.argmax(outputs[0])
  # +1 because the slice below is end-exclusive.
  answer_end = torch.argmax(outputs[1]) + 1

  answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))

  return answer

def normalize_text(s):
  """Canonicalise an answer string before it is compared or tokenised.

  Applied in this order: lowercase, strip every character found in
  `string.punctuation`, replace the standalone words a / an / the with a
  space, then collapse all whitespace runs into single spaces (which also
  trims the ends).

  Args:
    s: The raw answer text.

  Returns:
    The normalised text.
  """
  import string, re

  lowered = s.lower()

  # Delete punctuation characters outright (no replacement character).
  punctuation_free = lowered.translate(str.maketrans("", "", string.punctuation))

  # Articles are swapped for a space so that neighbouring words stay separated.
  article_free = re.sub(r"\b(a|an|the)\b", " ", punctuation_free, flags=re.UNICODE)

  # split() with no argument drops leading/trailing and repeated whitespace.
  return " ".join(article_free.split())

def compute_exact_match(prediction, truth):
    """Return 1 if `prediction` and `truth` are equal after normalisation, else 0."""
    normalized_prediction = normalize_text(prediction)
    normalized_truth = normalize_text(truth)
    return 1 if normalized_prediction == normalized_truth else 0

def compute_f1(prediction, truth):
  """Token-level F1 between a predicted answer and a reference answer.

  Both strings are normalised with `normalize_text` and split on whitespace.
  Overlap is counted as a multiset intersection, so a token that occurs k
  times in both strings contributes k matches. Counting it only once (as a
  plain set intersection would) under-scores answers with repeated tokens:
  an identical prediction such as "new york new york" would get 0.5.

  Args:
    prediction: Predicted answer text.
    truth: Reference answer text.

  Returns:
    1 or 0 when either side has no tokens (1 only if both are empty),
    0 when no token is shared, otherwise the harmonic mean of token
    precision and recall as a float.
  """
  from collections import Counter

  pred_tokens = normalize_text(prediction).split()
  truth_tokens = normalize_text(truth).split()

  # With an empty side precision/recall are undefined; score 1 only when
  # both sides are empty.
  if len(pred_tokens) == 0 or len(truth_tokens) == 0:
    return int(pred_tokens == truth_tokens)

  # Counter & Counter keeps, per token, the smaller of the two counts.
  common_tokens = Counter(pred_tokens) & Counter(truth_tokens)
  num_common = sum(common_tokens.values())

  if num_common == 0:
    return 0

  prec = num_common / len(pred_tokens)
  rec = num_common / len(truth_tokens)

  return 2 * (prec * rec) / (prec + rec)

In [None]:
def give_an_answer(context,query,answer):
  """Score the model's prediction for one (context, query) pair.

  Args:
    context: Passage text.
    query: Question text.
    answer: Reference answer text ('' is used for "no answer").

  Returns:
    A tuple (exact-match score, F1 score) for the prediction against `answer`.
  """
  predicted = predict(query,context)
  # A prediction consisting solely of the '[CLS]' token is treated as
  # "no answer" and scored as the empty string.
  predicted = '' if predicted == '[CLS]' else predicted
  return compute_exact_match(predicted, answer), compute_f1(predicted, answer)

In [None]:
# Read the test split (SQuAD-style JSON) from the mounted Drive.
path = Path('/content/drive/MyDrive/data/test-v2.0.json')

with path.open('rb') as f:
    squad_dict = json.load(f)

# Three parallel lists: index i of each list describes the same sample.
texts, queries, answers = [], [], []

for group in squad_dict['data']:
    for passage in group['paragraphs']:
        context = passage['context']
        for qa in passage['qas']:
            question = qa['question']
            if qa['answers'] != []:
                # Every reference answer becomes its own sample, so a question
                # with several references appears several times.
                for answer in qa['answers']:
                    texts.append(context)
                    queries.append(question)
                    answers.append(answer)
                continue
            # No reference answers: keep the question with an empty-answer
            # placeholder so it is still evaluated.
            texts.append(context)
            queries.append(question)
            answers.append({'text':'', 'answer_start':-1})

test_texts = texts
test_queries = queries
test_answers = answers

In [None]:
# Read the dev split (SQuAD-style JSON) from the mounted Drive.
path = Path('/content/drive/MyDrive/data/dev-v2.0.json')

with path.open('rb') as f:
    squad_dict = json.load(f)

# Three parallel lists: index i of each list describes the same sample.
texts, queries, answers = [], [], []
# Running count of paragraphs visited.
num = 0

for group in squad_dict['data']:
    for passage in group['paragraphs']:
        num = num + 1
        context = passage['context']
        for qa in passage['qas']:
            question = qa['question']
            # One sample per reference answer; a question whose answer list is
            # empty therefore contributes no samples to this split.
            for answer in qa['answers']:
                texts.append(context)
                queries.append(question)
                answers.append(answer)

val_texts = texts
val_queries = queries
val_answers = answers

In [None]:
# Keep only the last 2000 validation samples; the three lists are sliced
# identically so they stay aligned index-for-index.
val_texts, val_queries, val_answers = (
    val_texts[-2000:],
    val_queries[-2000:],
    val_answers[-2000:],
)

In [None]:
# Keep only the first 2000 test samples; the three lists are sliced
# identically so they stay aligned index-for-index.
test_texts, test_queries, test_answers = (
    test_texts[:2000],
    test_queries[:2000],
    test_answers[:2000],
)

In [None]:
# Evaluate the validation subset: add up per-sample exact-match and F1 scores.
val_em, val_f1 = 0, 0
for i in range(len(val_texts)):
  em, f1 = give_an_answer(val_texts[i], val_queries[i], val_answers[i]['text'])
  val_em += em
  val_f1 += f1

# Convert the running totals into means over the evaluated samples.
val_em /= len(val_texts)
val_f1 /= len(val_texts)
print("Exact match score for queries with answers : ", val_em)
print("F1-score for queries with answers : ", val_f1)

Exact match score for queries with answers :  0.3095
F1-score for queries with answers :  0.43103627170437514


In [None]:
# Evaluate the test subset: add up per-sample exact-match and F1 scores.
test_em, test_f1 = 0, 0
for i in range(len(test_texts)):
  em, f1 = give_an_answer(test_texts[i], test_queries[i], test_answers[i]['text'])
  test_em += em
  test_f1 += f1

# Convert the running totals into means over the evaluated samples.
test_em /= len(test_texts)
test_f1 /= len(test_texts)
print("Exact match score for queries with no answers : ", test_em)
print("F1-score for queries with no answers : ", test_f1)

Exact match score for queries with no answers :  0.3685
F1-score for queries with no answers :  0.3685
