<a href="https://colab.research.google.com/github/gfkaceli/ExploringQAtechniqueswithBERTandGPT/blob/george_test/FINETUNED_TEST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers torch tqdm

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m56.6 MB/s[0m eta [36m0:00:0

In [None]:
%%capture
!mkdir data
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O data/train-v2.0.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O data/dev-v2.0.json

In [None]:
import json
def read_data(path):
  """Flatten a SQuAD-format JSON file into three parallel lists.

  Args:
    path: location of the JSON file; it must hold a top-level 'data' list whose
      entries have 'paragraphs', each with a 'context' and a list of 'qas'.

  Returns:
    (contexts, questions, answers) with one entry per gold answer: the passage
    text, the question text and the answer dict exactly as stored in the file.
    Questions whose 'answers' list is empty contribute no entries.
  """
  with open(path, 'rb') as handle:
    payload = json.load(handle)

  contexts, questions, answers = [], [], []

  for article in payload['data']:
    for paragraph in article['paragraphs']:
      passage_text = paragraph['context']
      for item in paragraph['qas']:
        prompt = item['question']
        golds = item['answers']
        # repeat the passage and question once per gold answer so the lists stay aligned
        contexts.extend([passage_text] * len(golds))
        questions.extend([prompt] * len(golds))
        answers.extend(golds)

  return contexts, questions, answers

In [None]:
# Flatten both downloaded SQuAD splits into parallel lists (one entry per gold answer).
train_contexts, train_questions, train_answers = read_data("data/train-v2.0.json")
val_contexts, val_questions, val_answers = read_data("data/dev-v2.0.json")

In [None]:
# each answer carries its text and start character index; compute the matching end character index

def add_end(answers, contexts):
  """Add an 'answer_end' character offset to every answer dict, in place.

  The end offset is answer_start + len(text). If the recorded start is one or
  two characters too far to the right, 'answer_start' is corrected as well.

  Args:
    answers: list of dicts with 'text' and 'answer_start' keys; mutated in place.
    contexts: list of passage strings, parallel to `answers`.

  Returns:
    None. After the call every answer has an 'answer_end' key.
  """
  for answer, context in zip(answers, contexts):
    text = answer['text']
    start_idx = answer['answer_start']
    end_idx = start_idx + len(text)

    # sometimes squad answers are off by a character or two so we fix this
    for shift in (0, 1, 2):
      # skip shifts that would make the start negative: a negative slice index
      # wraps around to the end of the context instead of failing
      if start_idx >= shift and context[start_idx - shift:end_idx - shift] == text:
        answer['answer_start'] = start_idx - shift
        answer['answer_end'] = end_idx - shift
        break
    else:
      # no alignment found: keep the recorded start and still set an end offset,
      # so later code that reads 'answer_end' does not raise KeyError
      answer['answer_end'] = end_idx

# Annotate the answer dicts of both splits in place with their end character offsets.
add_end(train_answers, train_contexts)
add_end(val_answers, val_contexts)

In [None]:
from transformers import BertForQuestionAnswering, BertTokenizerFast

# Base 'bert-base-uncased' checkpoint with a freshly initialised QA head, plus its tokenizer.
# NOTE(review): `model` and `tokenizer` are both reassigned further down from the
# fine-tuned checkpoint on Drive; in the cells shown only the tokenizer loaded here
# is used in between (to encode the datasets) — confirm the base model load is needed.
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
import transformers
# Limit transformers logging to errors.
transformers.logging.set_verbosity_error()
# Encode every (context, question) pair with truncation and padding enabled;
# the context is passed as the first sequence and the question as the second.
# NOTE(review): get_prediction further down encodes (question, context) instead —
# confirm which order the fine-tuned checkpoint was trained with.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [None]:
# convert our character start and end positions to token positions, since the contexts and questions have been tokenized

def token_positions(encodings, answers):
  """Attach token-level answer labels to `encodings`, in place.

  For example i, the character offsets answers[i]['answer_start'] and
  answers[i]['answer_end'] - 1 are mapped to token indices with
  encodings.char_to_token. The resulting lists are stored on `encodings`
  under 'start_positions' and 'end_positions'.

  Args:
    encodings: tokenizer output offering char_to_token(batch_index, char_index)
      and update(dict).
    answers: list of dicts with 'answer_start' and 'answer_end' keys.
  """
  def locate(sample_idx, char_idx):
    token_idx = encodings.char_to_token(sample_idx, char_idx)
    # no token for this character: the answer passage has been truncated,
    # so fall back to the global tokenizer's model_max_length
    if token_idx is None:
      return tokenizer.model_max_length
    return token_idx

  start_positions, end_positions = [], []
  for sample_idx, answer in enumerate(answers):
    start_positions.append(locate(sample_idx, answer['answer_start']))
    # answer_end is exclusive, so the last answer character sits at answer_end - 1
    end_positions.append(locate(sample_idx, answer['answer_end'] - 1))

  encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

token_positions(train_encodings, train_answers)
token_positions(val_encodings, val_answers)

In [None]:
import torch
# now we define the dataset
class SQuAD_Dataset(torch.utils.data.Dataset):
  """Torch dataset view over tokenizer encodings.

  `encodings` must offer items() yielding (field name, per-example sequence)
  pairs and an `input_ids` attribute whose length is the number of examples.
  """

  def __init__(self, encodings):
    self.encodings = encodings

  def __len__(self):
    return len(self.encodings.input_ids)

  def __getitem__(self, idx):
    # build one example: every field's idx-th entry converted to a tensor
    sample = {}
    for field, column in self.encodings.items():
      sample[field] = torch.tensor(column[idx])
    return sample

In [None]:
# Wrap the encoded splits (including their start/end position labels) as torch datasets.
train_dataset = SQuAD_Dataset(train_encodings)
val_dataset = SQuAD_Dataset(val_encodings)

In [None]:
from torch.utils.data import DataLoader

# Define the dataloaders: batches of 32; only the training loader shuffles its examples.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(val_dataset, batch_size=32)

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive (Colab only); the fine-tuned checkpoint is read from there.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Replace the base model and tokenizer with the fine-tuned checkpoint stored on the mounted Drive.
model_path = "/content/drive/MyDrive/Bert-Base-FineTuned"
model = BertForQuestionAnswering.from_pretrained(model_path)
tokenizer = BertTokenizerFast.from_pretrained(model_path)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Working on {device}')

Working on cuda


In [None]:
from tqdm import tqdm

# Token-level accuracy of the model on the validation loader: the fraction of
# start and end predictions (two per example) that equal their labels.
model.to(device)  # loop-invariant, so move the model once instead of once per batch
model.eval()

correct = 0  # start/end predictions that match their label
total = 0    # start/end predictions made

with torch.no_grad():
  for batch in tqdm(valid_loader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_true = batch['start_positions'].to(device)
    end_true = batch['end_positions'].to(device)

    # NOTE(review): token_type_ids from the batch are not passed to the model here,
    # whereas get_prediction passes them via **inputs — confirm which is intended.
    outputs = model(input_ids, attention_mask=attention_mask)

    start_pred = torch.argmax(outputs['start_logits'], dim=1)
    end_pred = torch.argmax(outputs['end_logits'], dim=1)

    # Count matches instead of averaging per-batch ratios, so a smaller final
    # batch is not given the same weight as a full one.
    correct += (start_pred == start_true).sum().item()
    correct += (end_pred == end_true).sum().item()
    total += len(start_pred) + len(end_pred)

acc = correct / total
print(f"\nthe accuracy is the following: {acc}")

# Show labels next to predictions for the last batch only (the loop variables
# still hold that batch's tensors).
print("\n\nT/P\tanswer_start\tanswer_end\n")
for i in range(len(start_true)):
  print(f"true\t{start_true[i]}\t{end_true[i]}\n"
        f"pred\t{start_pred[i]}\t{end_pred[i]}\n")

100%|██████████| 635/635 [11:36<00:00,  1.10s/it]


the accuracy is the following: 0.6830216535902399


T/P	answer_start	answer_end

true	67	68
pred	50	68

true	67	68
pred	50	68

true	67	68
pred	50	68

true	66	68
pred	50	68

true	171	172
pred	171	172

true	171	172
pred	171	172

true	171	172
pred	171	172

true	171	172
pred	171	172

true	171	172
pred	171	172

true	158	160
pred	67	68

true	158	160
pred	67	68

true	158	160
pred	67	68

true	158	160
pred	67	68

true	158	160
pred	67	68






In [None]:
def get_prediction(context, question):
  """Return the model's answer span for `question` over `context` as a string.

  The pair is encoded as (question, context) with the global `tokenizer`, run
  through the global `model` on `device`, and the tokens from the highest-scoring
  start position through the highest-scoring end position are converted back to
  text. The slice is empty when the predicted end precedes the predicted start.

  Args:
    context: passage to search for the answer.
    question: question to answer.

  Returns:
    The predicted answer text as produced by the tokenizer's
    convert_tokens_to_string.
  """
  # Truncate only the context (second sequence) so passages longer than the
  # model's position limit do not make the forward pass fail.
  inputs = tokenizer(
      question,
      context,
      truncation='only_second',
      max_length=model.config.max_position_embeddings,
      return_tensors='pt',
  ).to(device)

  # inference only: do not build the autograd graph
  with torch.no_grad():
    outputs = model(**inputs)

  answer_start = torch.argmax(outputs['start_logits'])
  answer_end = torch.argmax(outputs['end_logits']) + 1  # +1 makes the slice include the end token

  answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))

  return answer

def normalize_text(s):
  """Normalize an answer string for comparison.

  Steps, in order: lowercase, delete every character in string.punctuation,
  replace the standalone words 'a', 'an' and 'the' with a space, then collapse
  all whitespace runs to single spaces and strip the ends.
  """
  import string, re

  lowered = s.lower()
  without_punct = lowered.translate(str.maketrans('', '', string.punctuation))
  without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punct, flags=re.UNICODE)
  return " ".join(without_articles.split())

def exact_match(prediction, truth):
    """Return True when prediction and truth are equal after normalize_text."""
    normalized_prediction = normalize_text(prediction)
    normalized_truth = normalize_text(truth)
    return normalized_prediction == normalized_truth

def compute_f1(prediction, truth):
  """Token-level F1 between a predicted and a gold answer, rounded to 2 decimals.

  Both strings are normalized with normalize_text and split on whitespace.
  Overlap is counted as a multiset intersection, so a token that occurs k times
  in both answers contributes k matches.

  Args:
    prediction: predicted answer text.
    truth: gold answer text.

  Returns:
    1 or 0 (int) when either side has no tokens, depending on whether both are
    empty; 0 when no token is shared; otherwise the F1 score as a float rounded
    to two decimals.
  """
  from collections import Counter

  pred_tokens = normalize_text(prediction).split()
  truth_tokens = normalize_text(truth).split()

  # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
  if len(pred_tokens) == 0 or len(truth_tokens) == 0:
    return int(pred_tokens == truth_tokens)

  # Counter & Counter keeps the minimum count per token; a plain set
  # intersection would count a repeated token only once
  overlap = Counter(pred_tokens) & Counter(truth_tokens)
  num_same = sum(overlap.values())

  # if there are no common tokens then f1 = 0
  if num_same == 0:
    return 0

  prec = num_same / len(pred_tokens)
  rec = num_same / len(truth_tokens)

  return round(2 * (prec * rec) / (prec + rec), 2)

def question_answer(context, question,answer):
  """Predict an answer for `question` over `context` and print it with its scores.

  Prints the question, the prediction from get_prediction, the expected
  `answer`, the exact-match flag and the F1 score, one item per line, followed
  by a blank line. Returns None.
  """
  prediction = get_prediction(context,question)
  em_score, f1_score = exact_match(prediction, answer), compute_f1(prediction, answer)

  report = (
      f'Question: {question}',
      f'Prediction: {prediction}',
      f'True Answer: {answer}',
      f'Exact match: {em_score}',
      f'F1 score: {f1_score}\n',
  )
  for line in report:
    print(line)

In [None]:
# Spot check: hand-written question/answer pairs over a passage about Beyoncé.
context = """Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer,
          songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing
          and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child.
          Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time.
          Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide,
          earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy"."""


questions = ["For whom the passage is talking about?",
             "When was Beyonce born?",
             "Where was Beyonce born?",
             "What is Beyonce's nationality?",
             "Who was Destiny's Child group manager?",
             "What name has the Beyoncé's debut album?",
             "How many Grammy Awards did Beyonce earn?",
             "When did the Beyoncé's debut album release?",
             "Who was the lead singer of R&B girl-group Destiny's Child?"]

# Expected answers, parallel to `questions`.
# NOTE(review): the name is spelled "Beyonce ... Knowles - Carter" (no accent, spaces
# around the hyphen) rather than as in the passage — presumably to match how the
# prediction is printed; confirm.
answers = ["Beyonce Giselle Knowles - Carter", "September 4, 1981", "Houston, Texas",
           "American", "Mathew Knowles", "Dangerously in Love", "five", "2003",
           "Beyonce Giselle Knowles - Carter"]

# Print prediction, exact match and F1 for every pair.
for question, answer in zip(questions, answers):
  question_answer(context, question, answer)

Question: For whom the passage is talking about?
Prediction: beyonce giselle knowles - carter
True Answer: Beyonce Giselle Knowles - Carter
Exact match: True
F1 score: 1.0

Question: When was Beyonce born?
Prediction: september 4, 1981
True Answer: September 4, 1981
Exact match: True
F1 score: 1.0

Question: Where was Beyonce born?
Prediction: houston, texas
True Answer: Houston, Texas
Exact match: True
F1 score: 1.0

Question: What is Beyonce's nationality?
Prediction: american
True Answer: American
Exact match: True
F1 score: 1.0

Question: Who was Destiny's Child group manager?
Prediction: beyonce giselle knowles - carter
True Answer: Mathew Knowles
Exact match: False
F1 score: 0.33

Question: What name has the Beyoncé's debut album?
Prediction: dangerously in love
True Answer: Dangerously in Love
Exact match: True
F1 score: 1.0

Question: How many Grammy Awards did Beyonce earn?
Prediction: five
True Answer: five
Exact match: True
F1 score: 1.0

Question: When did the Beyoncé's deb

In [None]:
# Spot check: hand-written question/answer pairs over a passage about the Amazon rainforest.
context = """ The Amazon rainforest (Portuguese: Floresta Amazônica or Amazônia; Spanish: Selva Amazónica, Amazonía or usually Amazonia;
French: Forêt amazonienne; Dutch: Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle,
is a moist broadleaf forest that covers most of the Amazon basin of South America.
This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest.
This region includes territory belonging to nine nations. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%,
Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain "Amazonas"
in their names. The Amazon represents over half of the planet's remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest
in the world, with an estimated 390 billion individual trees divided into 16,000 species."""

questions = ["Which name is also used to describe the Amazon rainforest in French?",
             "How large is the rainforest?",
             "How many nations does this region include?",
             "How many individual trees are there in the rainforest?",
             "How many nations contain Amazonas in their names?"]

# Expected answers, parallel to `questions`.
# NOTE(review): in the recorded output the second pair misses exact match only because
# the prediction is printed with spaces after the commas ("5, 500, 000 ...").
answers = ["Foret amazonienne",
           "5,500,000 square kilometres",
           "nine",
           "390 billion",
           "four"]

# Print prediction, exact match and F1 for every pair.
for question, answer in zip(questions, answers):
  question_answer(context, question, answer)

Question: Which name is also used to describe the Amazon rainforest in French?
Prediction: foret amazonienne
True Answer: Foret amazonienne
Exact match: True
F1 score: 1.0

Question: How large is the rainforest?
Prediction: 5, 500, 000 square kilometres
True Answer: 5,500,000 square kilometres
Exact match: False
F1 score: 0.5

Question: How many nations does this region include?
Prediction: nine
True Answer: nine
Exact match: True
F1 score: 1.0

Question: How many individual trees are there in the rainforest?
Prediction: 390 billion
True Answer: 390 billion
Exact match: True
F1 score: 1.0

Question: How many nations contain Amazonas in their names?
Prediction: four
True Answer: four
Exact match: True
F1 score: 1.0



In [None]:
# Spot check: hand-written question/answer pairs over a passage about computational complexity theory.
context = """ Closely related fields in theoretical computer science are analysis of algorithms and computability theory.
A key distinction between analysis of algorithms and computational complexity theory is that the former is devoted to analyzing
the amount of resources needed by a particular algorithm to solve a problem, whereas the latter asks a more general question about
all possible algorithms that could be used to solve the same problem. More precisely, it tries to classify problems that can or
cannot be solved with appropriately restricted resources. In turn, imposing restrictions on the available resources is
what distinguishes computational complexity from computability theory: the latter theory asks what kind of problems can,
in principle, be solved algorithmically."""

questions = ["what are closely related fields in theoretical computer science?",
             "What field of computer science analyzes the resource requirements of a specific algorithm isolated unto itself within a given problem?",
             "What process classifies problems that can and cannot be solved with approximately unlimited resources?"]

# Expected answers, parallel to `questions`.
# NOTE(review): the third expected answer repeats the second one — confirm it is intended.
answers = ["analysis of algorithms and computability theory",
           "analysis of algorithms",
           "analysis of algorithms" ]

# Print prediction, exact match and F1 for every pair.
for question, answer in zip(questions, answers):
  question_answer(context, question, answer)

Question: what are closely related fields in theoretical computer science?
Prediction: analysis of algorithms and computational complexity theory is that the former is devoted to analyzing the amount of resources needed by a particular algorithm to solve a problem, whereas the latter asks a more general question about all possible algorithms that could be used to solve the same problem
True Answer: analysis of algorithms and computability theory
Exact match: False
F1 score: 0.21

Question: What field of computer science analyzes the resource requirements of a specific algorithm isolated unto itself within a given problem?
Prediction: analyzing the amount of resources needed by a particular algorithm to solve a problem, whereas the latter asks a more general question about all possible algorithms that could be used to solve the same problem
True Answer: analysis of algorithms
Exact match: False
F1 score: 0.12

Question: What process classifies problems that can and cannot be solved with

In [None]:
# Spot check: hand-written question/answer pairs over a passage about Harvard.
context = """ Established originally by the Massachusetts legislature and soon thereafter named for John Harvard (its first benefactor),
Harvard is the United States' oldest institution of higher learning, and the Harvard Corporation (formally, the President and Fellows of Harvard College)
is its first chartered corporation. Although never formally affiliated with any denomination, the early College primarily trained Congregationalist and Unitarian clergy.
Its curriculum and student body were gradually secularized during the 18th century, and by the 19th century Harvard had emerged as the central cultural establishment among
 Boston elites. Following the American Civil War, President Charles W. Eliot's long tenure (1869–1909) transformed the college and affiliated professional schools into a
  modern research university; Harvard was a founding member of the Association of American Universities in 1900. James Bryant Conant led the university through the Great
   Depression and World War II and began to reform the curriculum and liberalize admissions after the war. The undergraduate college
   became coeducational after its 1977 merger with Radcliffe College."""

questions = ["who established Harvard University?",
             "Who named Harvard University?",
             "Who was harvard named after?",
             "What year did harvard become coeducational?",
             "What year did harvard join the Association of American Universities?"]
# Expected answers, parallel to `questions`.
# NOTE(review): the passage does not say who named Harvard; the second expected answer
# ("Charles W. Eliot") looks like it was taken from the model's output — confirm.
answers= ["the massachusetts legislature", "Charles W. Eliot", "John Harvard", "1977", "1900"]

# Print prediction, exact match and F1 for every pair.
for question, answer in zip(questions, answers):
  question_answer(context, question, answer)

Question: who established Harvard University?
Prediction: the massachusetts legislature
True Answer: the massachusetts legislature
Exact match: True
F1 score: 1.0

Question: Who named Harvard University?
Prediction: charles w. eliot
True Answer: Charles W. Eliot
Exact match: True
F1 score: 1.0

Question: Who was harvard named after?
Prediction: john harvard
True Answer: John Harvard
Exact match: True
F1 score: 1.0

Question: What year did harvard become coeducational?
Prediction: 1977
True Answer: 1977
Exact match: True
F1 score: 1.0

Question: What year did harvard join the Association of American Universities?
Prediction: 1900
True Answer: 1900
Exact match: True
F1 score: 1.0



In [None]:
# Spot check: a short custom passage (text not taken from the SQuAD examples above) with four question/answer pairs.
context = """Hello my name is George Kaceli. I am a 4th year student at the university of windsor. I like sports and video games and enjoy reading books.
my favourite hobbies include biking and programming among other things. I have one younger sibling and a pet dog named zizou."""

questions = ["What is my name?",
             "What is the name of my dog?",
             "What are some of my favourite hobbies?",
             "What university do I attend?"]
# Expected answers, parallel to `questions`.
answers = ["George Kaceli",
           "zizou",
           "biking and programming",
           "university of windsor"]
# Print prediction, exact match and F1 for every pair.
for question, answer in zip(questions, answers):
  question_answer(context, question, answer)


Question: What is my name?
Prediction: george kaceli
True Answer: George Kaceli
Exact match: True
F1 score: 1.0

Question: What is the name of my dog?
Prediction: zizou
True Answer: zizou
Exact match: True
F1 score: 1.0

Question: What are some of my favourite hobbies?
Prediction: biking and programming
True Answer: biking and programming
Exact match: True
F1 score: 1.0

Question: What university do I attend?
Prediction: university of windsor
True Answer: university of windsor
Exact match: True
F1 score: 1.0

