# Question Answering System using BERT

### Importing Libraries:

In [26]:
import torch
import json
from tqdm import tqdm
import torch.nn as nn
from torch.optim import Adam
import nltk
import spacy
import string
#import evaluate  # Bleu
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
import numpy as np
import transformers
from transformers import AutoTokenizer,AdamW,BertForQuestionAnswering
from transformers import AutoTokenizer, DistilBertForQuestionAnswering, AdamW
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import json
from pathlib import Path
import torch
import time
import os


import warnings
warnings.filterwarnings("ignore")

In [27]:
# %%capture
# !pip install transformers

## Loading and Reading SQuAD 2.0 dataset:

In [28]:
 %%capture
!mkdir squad
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json

In [29]:
# Train file path.
path = Path('squad/train-v2.0.json')

with open(path, 'rb') as f:
    squad_dict = json.load(f)

# Flatten the nested layout (group -> paragraph -> question -> answer) into one
# (passage, question, answer) triple per listed answer. A question whose
# 'answers' list is empty contributes no triple.
triples = [
    (passage['context'], qa['question'], answer)
    for group in squad_dict['data']
    for passage in group['paragraphs']
    for qa in passage['qas']
    for answer in qa['answers']
]

# Three parallel lists, index-aligned with each other.
texts = [context for context, _, _ in triples]
queries = [question for _, question, _ in triples]
answers = [answer for _, _, answer in triples]

train_texts, train_queries, train_answers = texts, queries, answers

In [30]:
# Validation file path.
path = Path('squad/dev-v2.0.json')

with open(path, 'rb') as f:
    squad_dict = json.load(f)

texts = []
queries = []
answers = []

# Walk every paragraph and question; each listed answer yields one entry in all
# three lists, so the passage and question are repeated once per answer.
for group in squad_dict['data']:
    for passage in group['paragraphs']:
        for qa in passage['qas']:
            gold_answers = qa['answers']
            texts.extend([passage['context']] * len(gold_answers))
            queries.extend([qa['question']] * len(gold_answers))
            answers.extend(gold_answers)

val_texts, val_queries, val_answers = texts, queries, answers

## Checking the data

In [31]:
# Sizes of the three parallel training lists (passages, questions, answers).
for train_column in (train_texts, train_queries, train_answers):
    print(len(train_column))

86821
86821
86821


In [32]:
# Show the first training example: passage, question and answer record.
train_preview = (
    ("Passage: ", train_texts),
    ("Query: ", train_queries),
    ("Answer: ", train_answers),
)
for label, train_column in train_preview:
    print(label, train_column[0])

Passage:  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Query:  When did Beyonce start becoming popular?
Answer:  {'text': 'in the late 1990s', 'answer_start': 269}


In [33]:
# Sizes of the three parallel validation lists (passages, questions, answers).
for val_column in (val_texts, val_queries, val_answers):
    print(len(val_column))

20302
20302
20302


In [34]:
# Show the first validation example: passage, question and answer record.
val_preview = (
    ("Passage: ", val_texts),
    ("Query: ", val_queries),
    ("Answer: ", val_answers),
)
for label, val_column in val_preview:
    print(label, val_column[0])

Passage:  The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.
Query:  In what country is Normandy located?
Answer:  {'text': 'France', 'answer_start': 159}


## Find the start and end character positions of each answer

In [35]:
# Give every training answer an exclusive 'answer_end' character offset, so that
# text[answer_start:answer_end] is the answer string whenever an alignment is found.
for answer, text in zip(train_answers, train_texts):
    real_answer = answer['text']
    start_idx = answer['answer_start']
    end_idx = start_idx + len(real_answer)

    # Try the annotated offset first, then the same span moved 1 and 2 characters
    # to the left, to handle annotations whose start offset is slightly too large.
    for shift in (0, 1, 2):
        if text[start_idx - shift:end_idx - shift] == real_answer:
            answer['answer_start'] = start_idx - shift
            answer['answer_end'] = end_idx - shift
            break
    else:
        # No alignment matched. Keep the annotated start and still define
        # 'answer_end', so later code that reads the key does not hit a KeyError.
        answer['answer_end'] = end_idx

In [36]:
# Give every validation answer an exclusive 'answer_end' character offset, so that
# text[answer_start:answer_end] is the answer string whenever an alignment is found.
for answer, text in zip(val_answers, val_texts):
    real_answer = answer['text']
    start_idx = answer['answer_start']
    end_idx = start_idx + len(real_answer)

    # Try the annotated offset first, then the same span moved 1 and 2 characters
    # to the left, to handle annotations whose start offset is slightly too large.
    for shift in (0, 1, 2):
        if text[start_idx - shift:end_idx - shift] == real_answer:
            answer['answer_start'] = start_idx - shift
            answer['answer_end'] = end_idx - shift
            break
    else:
        # No alignment matched. Keep the annotated start and still define
        # 'answer_end', so later code that reads the key does not hit a KeyError.
        answer['answer_end'] = end_idx

## Tokenization:

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Each example is encoded as a (passage, question) pair, passage first.
# Both splits use the same truncation and padding settings.
encode_options = {"truncation": True, "padding": True}
train_encodings = tokenizer(train_texts, train_queries, **encode_options)
val_encodings = tokenizer(val_texts, val_queries, **encode_options)

In [None]:
# OPTIMIZER = AdamW(model.parameters(), lr=0.00001)
# Q_LEN = 256   # Question Length
# T_LEN = 32    # Target Length
# BATCH_SIZE = 4
# DEVICE = "cuda:0"

### Convert character start/end positions to token start/end positions

In [None]:
def add_token_positions(encodings, answers):
  """Add token-level 'start_positions' and 'end_positions' to `encodings` in place.

  Args:
    encodings: tokenizer output exposing `char_to_token(batch_index, char_index)`
      and `update(mapping)`.
    answers: one dict per encoded example with integer 'answer_start' and
      'answer_end' character offsets; 'answer_end' is exclusive.

  Returns:
    None. `encodings` is mutated, and the number of answers whose last character
    has no token is printed.

  A character with no token (the code treats this as the answer having been cut
  off by truncation) is labelled with the module-level
  `tokenizer.model_max_length` instead of a real token index.
  """
  start_positions = []
  end_positions = []

  count = 0  # answers whose last character could not be mapped to a token

  for i, answer in enumerate(answers):
    start_token = encodings.char_to_token(i, answer['answer_start'])
    # 'answer_end' is exclusive, so the answer's last character is at
    # answer_end - 1. Looking up answer_end itself can return the token that
    # follows the answer (e.g. punctuation right after it).
    end_token = encodings.char_to_token(i, answer['answer_end'] - 1)

    if start_token is None:
      start_token = tokenizer.model_max_length

    if end_token is None:
      count += 1
      end_token = tokenizer.model_max_length

    start_positions.append(start_token)
    end_positions.append(end_token)

  print(count)

  encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Adds 'start_positions' / 'end_positions' to each encoding in place; the function has no return value.
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [None]:
class QA_Dataset(Dataset):
    """Map-style dataset that tokenizes question/context pairs and their answer text.

    Args:
        tokenizer: callable tokenizer returning a mapping with 'input_ids' and
            'attention_mask' lists.
        dataframe: table indexable by the column names "question", "context"
            and 'answer'; each column must support len() and integer indexing.
        q_len: fixed length of the encoded question+context input.
        t_len: fixed length of the encoded answer (target).

    Each item is a dict of long tensors: 'input_ids', 'attention_mask', 'labels'
    and 'decoder_attention_mask'. The key names suggest an encoder-decoder model
    as the consumer; confirm against the training code that uses this class.
    """

    def __init__(self, tokenizer, dataframe, q_len, t_len):
        self.tokenizer = tokenizer
        self.q_len = q_len
        self.t_len = t_len
        self.data = dataframe
        self.questions = self.data["question"]
        self.context = self.data["context"]
        self.answer = self.data['answer']

    def __len__(self):
        """Return the number of examples (length of the question column)."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Tokenize example `idx` and return its input and label tensors."""
        question = self.questions[idx]
        context = self.context[idx]
        answer = self.answer[idx]

        # padding="max_length" already pads to max_length; the deprecated
        # pad_to_max_length flag was redundant alongside it and is not passed.
        question_tokenized = self.tokenizer(question, context, max_length=self.q_len, padding="max_length",
                                            truncation=True, add_special_tokens=True)
        answer_tokenized = self.tokenizer(answer, max_length=self.t_len, padding="max_length",
                                          truncation=True, add_special_tokens=True)

        labels = torch.tensor(answer_tokenized["input_ids"], dtype=torch.long)
        decoder_attention_mask = torch.tensor(answer_tokenized["attention_mask"], dtype=torch.long)
        # Replace padded label positions with -100 (the default ignore_index of
        # torch.nn.CrossEntropyLoss). Padding is found through the attention mask
        # rather than by assuming the pad token id is 0.
        labels[decoder_attention_mask == 0] = -100

        return {
            "input_ids": torch.tensor(question_tokenized["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(question_tokenized["attention_mask"], dtype=torch.long),
            "labels": labels,
            "decoder_attention_mask": decoder_attention_mask
        }

In [None]:
# # Dataloader

# train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# train_sampler = RandomSampler(train_data.index)
# val_sampler = RandomSampler(val_data.index)

# qa_dataset = QA_Dataset(tokenizer, data, Q_LEN, T_LEN)

# train_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
# val_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)

## Create a Dataset class

In [None]:
class SquadDataset(torch.utils.data.Dataset):
    """Dataset view over tokenizer encodings that yields one example at a time.

    `encodings` must provide `.items()` (field name -> per-example sequence) and
    an `.input_ids` attribute whose length is the number of examples.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        """Return example `idx` as a dict mapping each field name to a tensor."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        return sample

In [None]:
# Wrap both tokenized splits so they can be indexed one example at a time.
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [None]:
# Only the training batches are shuffled. Validation is read in a fixed order so that
# the per-batch losses and the epoch average (a mean of per-batch means, where the
# last batch may be smaller) are repeatable from epoch to epoch and run to run.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [None]:
# Use the first CUDA device when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [None]:
# MODEL = MODEL.to(device)

## BERT Model:

In [None]:
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased').to(device)

# Use PyTorch's AdamW rather than the AdamW exported by transformers, which that
# library deprecated in favour of torch.optim.AdamW. eps and weight_decay are set
# explicitly to the defaults of the transformers optimizer (1e-6 and 0.0), because
# torch's own defaults differ.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# In the cells visible here, Q_LEN, T_LEN and BATCH_SIZE are only referenced from
# commented-out code; the active DataLoaders are built with a literal batch size.
Q_LEN = 256   # Question Length
T_LEN = 32    # Target Length
BATCH_SIZE = 4
epochs = 4    # number of passes made by the training loop

In [None]:
#with open(r'./squad/dev-v2.0.json') as f:
  #  data = json.load(f)

In [None]:
#with open(r'./squad/dev-v2.0.json') as f:
      # try:
      #     data = json.load(f)
      # except JSONDecodeError as e:
       #    print(f"Error loading JSON data: {e}")

In [None]:
#data.head()

Unnamed: 0,context,question,answer
0,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France
1,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,10th and 11th centuries
2,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,"Denmark, Iceland and Norway"
3,The Normans (Norman: Nourmands; French: Norman...,Who was the Norse leader?,Rollo
4,The Normans (Norman: Nourmands; French: Norman...,What century did the Normans first gain their ...,10th century


In [None]:
# Dataloader

#train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

#train_sampler = RandomSampler(train_data.index)
#val_sampler = RandomSampler(val_data.index)

#qa_dataset = QA_Dataset(tokenizer, data, Q_LEN, T_LEN)

#train_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
#val_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)

In [None]:
# train_loss = 0
# val_loss = 0
# train_batch_count = 0
# val_batch_count = 0

# for epoch in range(2):
#     model.train()
#     for batch in tqdm(train_loader, desc="Training batches"):
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["labels"].to(device)
#         decoder_attention_mask = batch["decoder_attention_mask"].to(device)

#         outputs = model(
#                           input_ids=input_ids,
#                           attention_mask=attention_mask,
#                           labels=labels,
#                           decoder_attention_mask=decoder_attention_mask
#                         )

#         optim.zero_grad()
#         outputs.loss.backward()
#         optim.step()
#         train_loss += outputs.loss.item()
#         train_batch_count += 1

#     #Evaluation
#     model.eval()
#     for batch in tqdm(val_loader, desc="Validation batches"):
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["labels"].to(device)
#         decoder_attention_mask = batch["decoder_attention_mask"].to(device)

#         outputs = model(
#                           input_ids=input_ids,
#                           attention_mask=attention_mask,
#                           labels=labels,
#                           decoder_attention_mask=decoder_attention_mask
#                         )

#         optim.zero_grad()
#         outputs.loss.backward()
#         optim.step()
#         val_loss += outputs.loss.item()
#         val_batch_count += 1

#     print(f"{epoch+1}/{2} -> Train loss: {train_loss / train_batch_count}\tValidation loss: {val_loss/val_batch_count}")

In [None]:
# NOTE(review): in file order this cell comes before the training loop, so a top-to-bottom
# run saves the weights as they were loaded, not the fine-tuned ones. Confirm whether this
# cell is meant to be executed after training.
# NOTE(review): the target paths start with a backslash and then use forward slashes; how
# that resolves depends on the operating system. Confirm the intended output directory.
model.save_pretrained(r"\Group13_CW2/qa_model")
tokenizer.save_pretrained(r"\Group13_CW2/qa_tokenizer")

## Training and Evaluating the Model:

In [None]:
def _qa_model_inputs(batch):
  """Move one collated batch to `device` and return the keyword arguments for the model.

  'token_type_ids' is forwarded whenever the batch contains it, so the model can
  tell the two encoded segments (passage and question) apart. Without it the
  model falls back to treating every token as belonging to the first segment.
  """
  keys = ['input_ids', 'attention_mask', 'start_positions', 'end_positions']
  if 'token_type_ids' in batch:
    keys.append('token_type_ids')
  return {key: batch[key].to(device) for key in keys}


whole_train_eval_time = time.time()

# Mean loss per epoch, one entry appended per epoch to each list.
train_losses = []
val_losses = []

print_every = 50  # report the current batch loss every this many batches

for epoch in range(epochs):
  epoch_time = time.time()

  # ---- training pass ----
  model.train()

  loss_of_epoch = 0

  print("Train:")

  for batch_idx,batch in enumerate(train_loader):

    optim.zero_grad()

    outputs = model(**_qa_model_inputs(batch))
    loss = outputs[0]  # first output is used as the loss
    loss.backward()

    optim.step()

    loss_of_epoch += loss.item()

    if (batch_idx+1) % print_every == 0:
      print("Batch {:} / {:}".format(batch_idx+1,len(train_loader)),"\nLoss:", round(loss.item(),1),"\n")

  # Mean of the per-batch losses.
  loss_of_epoch /= len(train_loader)
  train_losses.append(loss_of_epoch)

  # ---- validation pass (no gradient tracking, no optimizer step) ----
  model.eval()

  print("Evaluate:")

  loss_of_epoch = 0

  with torch.no_grad():
    for batch_idx,batch in enumerate(val_loader):

      outputs = model(**_qa_model_inputs(batch))
      loss = outputs[0]
      loss_of_epoch += loss.item()

      if (batch_idx+1) % print_every == 0:
        print("Batch {:} / {:}".format(batch_idx+1,len(val_loader)),"\nLoss:", round(loss.item(),1),"\n")

  loss_of_epoch /= len(val_loader)
  val_losses.append(loss_of_epoch)

  # "-------" and "\nTraining Loss:" are adjacent literals and are joined into one string.
  print("\n-------Epoch ", epoch+1,
        "-------"
        "\nTraining Loss:", train_losses[-1],
        "\nValidation Loss:", val_losses[-1],
        "\nTime: ",(time.time() - epoch_time),
        "\n-----------------------",
        "\n\n")

print("Total training and evaluation time: ", (time.time() - whole_train_eval_time))

In [None]:
# One figure with the per-epoch training and validation loss curves.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 10))

ax.set_title("Train and Valid Losses", size=15)
ax.set_xlabel('Epochs', fontsize=20)
ax.set_ylabel('Loss', fontsize=20)

# Plot order matters: legend entries are matched to lines in the order they are drawn.
for loss_curve in (train_losses, val_losses):
    ax.plot(loss_curve)
ax.legend(('Train', 'Val'));