In [None]:
%%capture
!pip install transformers
import json
from pathlib import Path
import torch
from torch.utils.data import DataLoader
import time
import numpy as np
from transformers import BertModel
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
import pandas as pd
import torch
from transformers import AutoTokenizer,BertTokenizerFast

In [None]:
# Mount Google Drive at /content/drive (Colab-only) so the fine-tuned checkpoint
# stored under MyDrive can be read; force_remount=True is passed so the mount is
# redone even when a previous mount is still present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
class CustomModel(nn.Module):
  """BERT encoder with a small feed-forward head for extractive QA span prediction.

  Every token's final hidden state goes through
  Linear(768 -> 512) -> ReLU -> Dropout(0.2) -> Linear(512 -> num_labels).
  Channel 0 of the head output scores a token as the answer start and channel 1
  as the answer end, so ``num_labels`` must be at least 2.

  Args:
    num_labels: Number of output channels of the final linear layer.
  """

  def __init__(self,num_labels):
    super().__init__()
    self.num_labels = num_labels

    # Pretrained encoder body; also asked to return attentions and all hidden states.
    self.model = BertModel.from_pretrained('bert-base-uncased',output_attentions=True,output_hidden_states=True)
    self.second_hidden = nn.Linear(768,512)  # 768 must match the encoder's hidden size
    self.act = nn.ReLU()
    self.dropout = nn.Dropout(0.2)
    self.classifier = nn.Linear(512,num_labels)


  def forward(self, input_ids=None, attention_mask=None,start_positions=None,end_positions=None):
    """Score every token as answer start / answer end and optionally compute the loss.

    Args:
      input_ids: Token ids, forwarded to the encoder.
      attention_mask: Attention mask, forwarded to the encoder only.
      start_positions: Optional gold start token index per batch row.
      end_positions: Optional gold end token index per batch row.

    Returns:
      TokenClassifierOutput where
        * ``logits`` holds probabilities, not raw scores: the head output is
          softmax-normalised over dim=1 (the token axis), so each channel sums
          to 1 across the sequence;
        * ``loss`` is the negative log-likelihood of the gold start and end
          positions summed over the batch, or the int 0 when either position
          argument is missing;
        * ``hidden_states`` / ``attentions`` are passed through from the encoder.
    """
    encoder_output = self.model(input_ids=input_ids, attention_mask=attention_mask)

    hidden = self.second_hidden(encoder_output.last_hidden_state)
    hidden = self.dropout(self.act(hidden))
    scores = self.classifier(hidden)

    # NOTE(review): the softmax runs over every position, including padded ones;
    # attention_mask is not applied here.
    logits = nn.functional.softmax(scores,dim=1)

    loss = 0
    if (start_positions is not None) and (end_positions is not None):
      # log_softmax is the numerically stable form of log(softmax(x)).
      log_probs = nn.functional.log_softmax(scores, dim=1)
      rows = torch.arange(scores.shape[0], device=scores.device)
      start_idx = torch.as_tensor(start_positions, device=scores.device).long().reshape(-1)
      end_idx = torch.as_tensor(end_positions, device=scores.device).long().reshape(-1)
      # Index (row, position, channel) directly. No squeeze() is involved, so a
      # batch of size 1 keeps its batch axis and is handled like any other batch.
      start_nll = -log_probs[rows, start_idx, 0].sum()
      end_nll = -log_probs[rows, end_idx, 1].sum()
      loss = start_nll + end_nll
    return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=encoder_output.hidden_states,attentions=encoder_output.attentions)

In [None]:
# Loading the Model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# The checkpoint is a fully pickled CustomModel object (not a state_dict), so the
# CustomModel class must already be defined and unpickling has to be allowed.
# PyTorch >= 2.6 defaults torch.load to weights_only=True, which rejects such
# files, hence the explicit weights_only=False.
# SECURITY: unpickling can execute arbitrary code; only load checkpoints you trust.
model = torch.load("/content/drive/MyDrive/bertqa_finetuned/sc_40k_d_5", map_location = torch.device('cpu'), weights_only=False)
# Inference mode (disables dropout); as the last expression it also displays the model.
model.eval()

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

CustomModel(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

In [None]:
# Function to predict the answer given a query and context
def predict(query,context):
  """Return the answer span the model extracts for ``query`` from ``context``.

  The query/context pair is encoded as a single sequence (truncated to the
  tokenizer's limit), the module-level ``model`` scores every token as answer
  start (channel 0) and answer end (channel 1), and the tokens from the best
  start up to and including the best end are decoded back into a string.

  Args:
    query: Question text.
    context: Passage to search for the answer.

  Returns:
    The decoded answer string. It is empty when the best end position lies
    before the best start position.
  """
  # The model's forward() takes no token_type_ids, so do not produce them at all.
  inputs = tokenizer(query, context, return_tensors='pt', truncation=True, return_token_type_ids=False)

  # Pure inference: skip building the autograd graph.
  with torch.no_grad():
    outputs = model(**inputs)
  logits = outputs['logits']

  # Single example, so take batch row 0 explicitly instead of relying on squeeze().
  start_scores, end_scores = logits[0, :, 0], logits[0, :, 1]
  answer_start = int(torch.argmax(start_scores))
  answer_end = int(torch.argmax(end_scores)) + 1  # slice end is exclusive

  answer_ids = inputs['input_ids'][0, answer_start:answer_end].tolist()
  answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_ids))

  return answer

# Processing the text to compare the true and predicted answers
def normalize_text(s):
  """Canonicalise an answer string before comparison.

  Steps, in order: lowercase, drop ASCII punctuation characters, replace the
  standalone words "a", "an" and "the" with a space, then collapse every run of
  whitespace into a single space (trimming both ends).
  """
  import string, re

  punctuation = set(string.punctuation)
  article_pattern = re.compile(r"\b(a|an|the)\b", re.UNICODE)

  lowered = s.lower()
  without_punctuation = "".join(ch for ch in lowered if ch not in punctuation)
  without_articles = re.sub(article_pattern, " ", without_punctuation)
  return " ".join(without_articles.split())


# Calculating the EM score
def compute_exact_match(prediction, truth):
    """Return 1 if prediction and truth are identical after normalize_text, else 0."""
    normalized_prediction = normalize_text(prediction)
    normalized_truth = normalize_text(truth)
    if normalized_prediction == normalized_truth:
        return 1
    return 0

# Calculating the f1 score
def compute_f1(prediction, truth):
  """Token-level F1 between a predicted and a reference answer.

  Both strings are passed through normalize_text and split on whitespace.
  Overlap is counted as a multiset intersection, so a token that occurs k times
  in both answers contributes k matches (a plain set intersection would count
  it once and score identical answers with repeated words below 1.0).

  Args:
    prediction: Predicted answer text.
    truth: Reference answer text.

  Returns:
    int 1 or 0 when either side has no tokens (1 only if both are empty),
    int 0 when no tokens overlap, otherwise the F1 score as a float.
  """
  from collections import Counter

  pred_tokens = normalize_text(prediction).split()
  truth_tokens = normalize_text(truth).split()

  # An empty side makes precision/recall undefined: score 1 only if both are empty.
  if len(pred_tokens) == 0 or len(truth_tokens) == 0:
    return int(pred_tokens == truth_tokens)

  # Counter & Counter keeps the minimum count of every shared token.
  num_common = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())

  if num_common == 0:
    return 0

  prec = num_common / len(pred_tokens)
  rec = num_common / len(truth_tokens)

  return 2 * (prec * rec) / (prec + rec)

In [None]:
def give_an_answer(context,query,answer):
  """Run the model on one question and print the prediction with its EM and F1.

  Args:
    context: Passage handed to predict().
    query: Question handed to predict().
    answer: Reference answer the prediction is scored against.

  A prediction that is exactly '[CLS]' is treated as "no answer" and replaced
  by the empty string before scoring. Nothing is returned.
  """
  prediction = predict(query,context)
  # A lone [CLS] token is treated as "no answer".
  prediction = '' if prediction == '[CLS]' else prediction

  em_score = compute_exact_match(prediction, answer)
  f1_score = compute_f1(prediction, answer)

  report_lines = [
    f"Question: {query}",
    f"Prediction: {prediction}",
    f"True Answer: {answer}",
    f"EM: {em_score}",
    f"F1: {f1_score}",
    "\n",
  ]
  print(*report_lines, sep="\n")

In [None]:
# Demo passage the questions below are asked against.
context = """The Great Barrier Reef is the world's largest coral reef system, located in the
 Coral Sea off the coast of Queensland, Australia. It is composed of over 2,900 individual
 reefs and 900 islands stretching over 2,300 kilometers. The Great Barrier Reef is home to a vast
 array of marine life, including various species of colorful coral, fish, sharks, and marine mammals.
 It is a UNESCO World Heritage Site and a popular destination for tourists and researchers alike."""

# Questions about the passage; the last one has no answer in the passage.
queries = ["Where is the Great Barrier Reef located?",
           "How many individual reefs make up the Great Barrier Reef?",
          "What is the second largest reef system after The Great Barrier Reef?"
          ]
# Reference answers, aligned by position with `queries`. The first two are full
# sentences, so a shorter predicted span cannot reach EM = 1 against them.
# The empty string marks the unanswerable question.
answers = ["The Great Barrier Reef is located in the Coral Sea off the coast of Queensland, Australia.",
           "The Great Barrier Reef is composed of over 2,900 individual reefs.",
           ""
          ]

# Print prediction, EM and F1 for every (question, reference answer) pair.
for q,a in zip(queries,answers):
  give_an_answer(context,q,a)

Question: Where is the Great Barrier Reef located?
Prediction: the coral sea off the coast of queensland, australia
True Answer: The Great Barrier Reef is located in the Coral Sea off the coast of Queensland, Australia.
EM: 0
F1: 0.7000000000000001


Question: How many individual reefs make up the Great Barrier Reef?
Prediction: over 2, 900
True Answer: The Great Barrier Reef is composed of over 2,900 individual reefs.
EM: 0
F1: 0.15384615384615383


Question: What is the second largest reef system after The Great Barrier Reef?
Prediction: 
True Answer: 
EM: 1
F1: 1


