In [1]:
from transformers import DistilBertTokenizerFast, AutoTokenizer, AutoModelForQuestionAnswering, set_seed
from datasets import *
import numpy as np
import pandas as pd
import torch
import ast

In [2]:
SEED = 42
set_seed(SEED)

local_models_path = '../../data/models/BERT'

results = pd.DataFrame(columns=['Model', 'Train Data', 'Data Type', 'Bert.Precision', 'Bert.Recall', 'Bert.F1', 'BLEU', 'Squad.Exact', 'Squad.F1'])

In [3]:
# Define the prediction function
def inference_answer(question, context, tokenizer, model):
    question = question
    context = context
    test_feature = tokenizer(
        question,
        context,
        max_length=318
    )
    with torch.no_grad():
        outputs = model(torch.tensor([test_feature["input_ids"]]))
    start_logits = outputs.start_logits.cpu().numpy()
    end_logits = outputs.end_logits.cpu().numpy()
    answer_ids = test_feature["input_ids"][np.argmax(
        start_logits):np.argmax(end_logits)+1]
    return " ".join(tokenizer.batch_decode(answer_ids, skip_special_tokens=True))

## Load all data

2020

In [4]:
# Load the dataset from file and split it into train and test datasets
data_2020_full = load_dataset('csv', data_files=f"../../data/clean/sustainability-report-2020-squad-format.csv",
                    delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)

# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data_2020_full["test"] = data_2020_full["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
data_2020_full["test"] = data_2020_full["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                "text": example["text"], "answer_start": example["answer_start"]}})
# replace all "\n" with " " in the context, answers and questions
data_2020_full["test"] = data_2020_full["test"].map(lambda example: {"question": example["question"].replace("\n", " "), "context": example["context"].replace("\n", " "), "answers": {
                                "text": [example["answers"]["text"][0].replace("\n", " ")], "answer_start": example["answers"]["answer_start"]}})
data_2020_full["test"] = data_2020_full["test"].remove_columns(["text", "answer_start"])
# get ground truth answers
test_data_2020_full = data_2020_full["test"]
gt_answers_2020_full = [temp["answers"]["text"][0] for temp in test_data_2020_full]

# squad formatted data
references_2020 = [{"answers": {"answer_start": [answer["answer_start"][0]], "text": [answer["text"][0]]}, "id": str(id)} for id, answer in zip(data_2020_full["test"]["id"], data_2020_full["test"]["answers"])]

Found cached dataset csv (C:/Users/rjutr/.cache/huggingface/datasets/csv/default-dc9275c3697e5cc0/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-dc9275c3697e5cc0\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-0b40be795cd765b7.arrow and C:\Users\rjutr\.cache\huggingface\datasets\csv\default-dc9275c3697e5cc0\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-3e1cf7e5eca93f71.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-dc9275c3697e5cc0\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-1e92815268136ea2.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-dc9275c3697e5cc0\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-21e74bfcddde162b.arrow
Loading cached process

2022

In [5]:
data_2022_full = load_dataset('csv', data_files=f"../../data/clean/sustainability-report-2022-squad-format.csv",
                    delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)

# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data_2022_full["test"] = data_2022_full["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
data_2022_full["test"] = data_2022_full["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                "text": example["text"], "answer_start": example["answer_start"]}})
# replace all "\n" with " " in the context, answers and questions
data_2022_full["test"] = data_2022_full["test"].map(lambda example: {"question": example["question"].replace("\n", " "), "context": example["context"].replace("\n", " "), "answers": {
                                "text": [example["answers"]["text"][0].replace("\n", " ")], "answer_start": example["answers"]["answer_start"]}})
data_2022_full["test"] = data_2022_full["test"].remove_columns(["text", "answer_start"])

test_data_2022_full = data_2022_full["test"]
gt_answers_2022_full = [temp["answers"]["text"][0] for temp in test_data_2022_full]

# squad formatted data
references_2022 = [{"answers": {"answer_start": [answer["answer_start"][0]], "text": [answer["text"][0]]}, "id": str(id)} for id, answer in zip(data_2022_full["test"]["id"], data_2022_full["test"]["answers"])]

Found cached dataset csv (C:/Users/rjutr/.cache/huggingface/datasets/csv/default-a8af1b4c8d81fb1c/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-a8af1b4c8d81fb1c\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-7d8841dd72615495.arrow and C:\Users\rjutr\.cache\huggingface\datasets\csv\default-a8af1b4c8d81fb1c\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-8b1142ae44904f4e.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-a8af1b4c8d81fb1c\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-36096c12f970e2cd.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-a8af1b4c8d81fb1c\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-b621ad6a176502d0.arrow
Loading cached process

2020 + 2022

In [6]:
data_2020_2022 = load_dataset('csv', data_files="../../data/clean/sustainability-report-2042-squad-format.csv",
                                delimiter=";", split="train").train_test_split(test_size=0.3, shuffle=True, seed=SEED)

# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data_2020_2022["test"] = data_2020_2022["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
data_2020_2022["test"] = data_2020_2022["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                "text": example["text"], "answer_start": example["answer_start"]}})
# replace all "\n" with " " in the context, answers and questions
data_2020_2022["test"] = data_2020_2022["test"].map(lambda example: {"question": example["question"].replace("\n", " "), "context": example["context"].replace("\n", " "), "answers": {
                                "text": [example["answers"]["text"][0].replace("\n", " ")], "answer_start": example["answers"]["answer_start"]}})
data_2020_2022["test"] = data_2020_2022["test"].remove_columns(["text", "answer_start"])
# get ground truth answers
test_data_2020_2022 = data_2020_2022["test"]
gt_answers_2020_2022 = [temp["answers"]["text"][0] for temp in test_data_2020_2022]

# squad formatted data
references_2020_2022 = [{"answers": {"answer_start": [answer["answer_start"][0]], "text": [answer["text"][0]]}, "id": str(id)} for id, answer in zip(data_2020_2022["test"]["id"], data_2020_2022["test"]["answers"])]

Found cached dataset csv (C:/Users/rjutr/.cache/huggingface/datasets/csv/default-dc46deea403e6d7a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-dc46deea403e6d7a\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-8c4226c3883b9f9f.arrow and C:\Users\rjutr\.cache\huggingface\datasets\csv\default-dc46deea403e6d7a\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-24d9770f6403af3d.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-dc46deea403e6d7a\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-47e77242f5a0d387.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-dc46deea403e6d7a\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-cb788cd2e517f2b5.arrow
Loading cached process

2022 handwritten

In [7]:
data_2022_handwritten = load_dataset('csv', data_files=f"../../data/clean/QA_SR_2022_Expert-squad-format.csv",
                                        delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)

# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data_2022_handwritten["test"] = data_2022_handwritten["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
data_2022_handwritten["test"] = data_2022_handwritten["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                "text": example["text"], "answer_start": example["answer_start"]}})
# replace all "\n" with " " in the context, answers and questions
data_2022_handwritten["test"] = data_2022_handwritten["test"].map(lambda example: {"question": example["question"].replace("\n", " "), "context": example["context"].replace("\n", " "), "answers": {
                                "text": [example["answers"]["text"][0].replace("\n", " ")], "answer_start": example["answers"]["answer_start"]}})
data_2022_handwritten["test"] = data_2022_handwritten["test"].remove_columns(["text", "answer_start"])

test_data_2022_handwritten = data_2022_handwritten["test"]
gt_answers_2022_handwritten = [temp["answers"]["text"][0] for temp in test_data_2022_handwritten]

# squad formatted data
references_2022_handwritten = [{"answers": {"answer_start": [answer["answer_start"][0]], "text": [answer["text"][0]]}, "id": str(id)} for id, answer in enumerate(data_2022_handwritten["test"]["answers"])]

Found cached dataset csv (C:/Users/rjutr/.cache/huggingface/datasets/csv/default-094de926302c4454/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-094de926302c4454\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-a6b47fc8c478d969.arrow and C:\Users\rjutr\.cache\huggingface\datasets\csv\default-094de926302c4454\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-4ae4e0308bfc9ae2.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-094de926302c4454\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-b9d36a6ac093f305.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-094de926302c4454\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-c31eb97e51e66589.arrow
Loading cached process

## SIMPLE EVALUATION

In [8]:
import evaluate
bertscore = evaluate.load("bertscore")
bleu = evaluate.load("bleu")
squad_v2_metric = evaluate.load("squad_v2")

### Distilbert

In [9]:
model_name = "distilbert-base-cased-distilled-squad"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

answers_2020 = [inference_answer(data_2020_full["test"]["question"][idx], data_2020_full["test"]["context"][idx], tokenizer, model) for idx in range(data_2020_full["test"].shape[0])]
answers_2020_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2020_full["test"]["id"], answers_2020)]
answers_2022 = [inference_answer(data_2022_full["test"]["question"][idx], data_2022_full["test"]["context"][idx], tokenizer, model) for idx in range(data_2022_full["test"].shape[0])]
answers_2022_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2022_full["test"]["id"], answers_2022)]
answers_2020_2022 = [inference_answer(data_2020_2022["test"]["question"][idx], data_2020_2022["test"]["context"][idx], tokenizer, model) for idx in range(data_2020_2022["test"].shape[0])]
answers_2020_2022_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2020_2022["test"]["id"], answers_2020_2022)]
answers_2022_handwritten = [inference_answer(data_2022_handwritten["test"]["question"][idx], data_2022_handwritten["test"]["context"][idx], tokenizer, model) for idx in range(data_2022_handwritten["test"].shape[0])]
answers_2022_handwritten_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in enumerate(answers_2022_handwritten)]

# bertscore
bert_results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
bert_results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
bert_results_2020_2022 = bertscore.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022, lang="en")
bert_results_2022_handwritten = bertscore.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten, lang="en")
# print(f"Bertscore results 2020\nF1: {np.array(bert_results_2020['f1']).mean()}, Precision: {np.array(bert_results_2020['precision']).mean()}, Recall: {np.array(bert_results_2020['recall']).mean()}")
# print(f"Bertscore results 2022\nF1: {np.array(bert_results_2022['f1']).mean()}, Precision: {np.array(bert_results_2022['precision']).mean()}, Recall: {np.array(bert_results_2022['recall']).mean()}")
# print(f"Bertscore results 2020-2022\nF1: {np.array(bert_results_2020_2022['f1']).mean()}, Precision: {np.array(bert_results_2020_2022['precision']).mean()}, Recall: {np.array(bert_results_2020_2022['recall']).mean()}")
# print(f"Bertscore results 2022 handwritten\nF1: {np.array(bert_results_2022_handwritten['f1']).mean()}, Precision: {np.array(bert_results_2022_handwritten['precision']).mean()}, Recall: {np.array(bert_results_2022_handwritten['recall']).mean()}")

# bleu
bleu_results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
bleu_results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
bleu_results_2020_2022 = bleu.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022)
bleu_results_2022_handwritten = bleu.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten)
# print(f"Bleu results 2020\n{bleu_results_2020}")
# print(f"Bleu results 2022\n{bleu_results_2022}")
# print(f"Bleu results 2020-2022\n{bleu_results_2020_2022}")
# print(f"Bleu results 2022 handwritten\n{bleu_results_2022_handwritten}")

# squad_v2
squad_results_2020 = squad_v2_metric.compute(predictions=answers_2020_squad, references=references_2020)
squad_results_2022 = squad_v2_metric.compute(predictions=answers_2022_squad, references=references_2022)
squad_results_2020_2022 = squad_v2_metric.compute(predictions=answers_2020_2022_squad, references=references_2020_2022)
squad_results_2022_handwritten = squad_v2_metric.compute(predictions=answers_2022_handwritten_squad, references=references_2022_handwritten)
# print(f"Squad_v2 results 2020\n{squad_results_2020}")
# print(f"Squad_v2 results 2022\n{squad_results_2022}")
# print(f"Squad_v2 results 2020-2022\n{squad_results_2020_2022}")
# print(f"Squad_v2 results 2022 handwritten\n{squad_results_2022_handwritten}")


# add results to dataframe
results.loc[len(results)] = ['distilbert', None, '2020', np.array(bert_results_2020['precision']).mean(), np.array(bert_results_2020['recall']).mean(), np.array(bert_results_2020['f1']).mean(), bleu_results_2020['bleu'], squad_results_2020['exact'], squad_results_2020['f1']]
results.loc[len(results)] = ['distilbert', None, '2022', np.array(bert_results_2022['precision']).mean(), np.array(bert_results_2022['recall']).mean(), np.array(bert_results_2022['f1']).mean(), bleu_results_2022['bleu'], squad_results_2022['exact'], squad_results_2022['f1']]
results.loc[len(results)] = ['distilbert', None, '2020-2022', np.array(bert_results_2020_2022['precision']).mean(), np.array(bert_results_2020_2022['recall']).mean(), np.array(bert_results_2020_2022['f1']).mean(), bleu_results_2020_2022['bleu'], squad_results_2020_2022['exact'], squad_results_2020_2022['f1']]
results.loc[len(results)] = ['distilbert', None, '2022 handwritten', np.array(bert_results_2022_handwritten['precision']).mean(), np.array(bert_results_2022_handwritten['recall']).mean(), np.array(bert_results_2022_handwritten['f1']).mean(), bleu_results_2022_handwritten['bleu'], squad_results_2022_handwritten['exact'], squad_results_2022_handwritten['f1']]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


### Roberta

In [10]:
model_name = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

answers_2020 = [inference_answer(data_2020_full["test"]["question"][idx], data_2020_full["test"]["context"][idx], tokenizer, model) for idx in range(data_2020_full["test"].shape[0])]
answers_2020_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2020_full["test"]["id"], answers_2020)]
answers_2022 = [inference_answer(data_2022_full["test"]["question"][idx], data_2022_full["test"]["context"][idx], tokenizer, model) for idx in range(data_2022_full["test"].shape[0])]
answers_2022_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2022_full["test"]["id"], answers_2022)]
answers_2020_2022 = [inference_answer(data_2020_2022["test"]["question"][idx], data_2020_2022["test"]["context"][idx], tokenizer, model) for idx in range(data_2020_2022["test"].shape[0])]
answers_2020_2022_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2020_2022["test"]["id"], answers_2020_2022)]
answers_2022_handwritten = [inference_answer(data_2022_handwritten["test"]["question"][idx], data_2022_handwritten["test"]["context"][idx], tokenizer, model) for idx in range(data_2022_handwritten["test"].shape[0])]
answers_2022_handwritten_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in enumerate(answers_2022_handwritten)]

# bertscore
bert_results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
bert_results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
bert_results_2020_2022 = bertscore.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022, lang="en")
bert_results_2022_handwritten = bertscore.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten, lang="en")
# print(f"Bertscore results 2020\nF1: {np.array(bert_results_2020['f1']).mean()}, Precision: {np.array(bert_results_2020['precision']).mean()}, Recall: {np.array(bert_results_2020['recall']).mean()}")
# print(f"Bertscore results 2022\nF1: {np.array(bert_results_2022['f1']).mean()}, Precision: {np.array(bert_results_2022['precision']).mean()}, Recall: {np.array(bert_results_2022['recall']).mean()}")
# print(f"Bertscore results 2020-2022\nF1: {np.array(bert_results_2020_2022['f1']).mean()}, Precision: {np.array(bert_results_2020_2022['precision']).mean()}, Recall: {np.array(bert_results_2020_2022['recall']).mean()}")
# print(f"Bertscore results 2022 handwritten\nF1: {np.array(bert_results_2022_handwritten['f1']).mean()}, Precision: {np.array(bert_results_2022_handwritten['precision']).mean()}, Recall: {np.array(bert_results_2022_handwritten['recall']).mean()}")

# bleu
bleu_results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
bleu_results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
bleu_results_2020_2022 = bleu.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022)
bleu_results_2022_handwritten = bleu.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten)
# print(f"Bleu results 2020\n{bleu_results_2020}")
# print(f"Bleu results 2022\n{bleu_results_2022}")
# print(f"Bleu results 2020-2022\n{bleu_results_2020_2022}")
# print(f"Bleu results 2022 handwritten\n{bleu_results_2022_handwritten}")

# squad_v2
squad_results_2020 = squad_v2_metric.compute(predictions=answers_2020_squad, references=references_2020)
squad_results_2022 = squad_v2_metric.compute(predictions=answers_2022_squad, references=references_2022)
squad_results_2020_2022 = squad_v2_metric.compute(predictions=answers_2020_2022_squad, references=references_2020_2022)
squad_results_2022_handwritten = squad_v2_metric.compute(predictions=answers_2022_handwritten_squad, references=references_2022_handwritten)
# print(f"Squad_v2 results 2020\n{squad_results_2020}")
# print(f"Squad_v2 results 2022\n{squad_results_2022}")
# print(f"Squad_v2 results 2020-2022\n{squad_results_2020_2022}")
# print(f"Squad_v2 results 2022 handwritten\n{squad_results_2022_handwritten}")


# add results to dataframe
results.loc[len(results)] = ['roberta', None, '2020', np.array(bert_results_2020['precision']).mean(), np.array(bert_results_2020['recall']).mean(), np.array(bert_results_2020['f1']).mean(), bleu_results_2020['bleu'], squad_results_2020['exact'], squad_results_2020['f1']]
results.loc[len(results)] = ['roberta', None, '2022', np.array(bert_results_2022['precision']).mean(), np.array(bert_results_2022['recall']).mean(), np.array(bert_results_2022['f1']).mean(), bleu_results_2022['bleu'], squad_results_2022['exact'], squad_results_2022['f1']]
results.loc[len(results)] = ['roberta', None, '2020-2022', np.array(bert_results_2020_2022['precision']).mean(), np.array(bert_results_2020_2022['recall']).mean(), np.array(bert_results_2020_2022['f1']).mean(), bleu_results_2020_2022['bleu'], squad_results_2020_2022['exact'], squad_results_2020_2022['f1']]
results.loc[len(results)] = ['roberta', None, '2022 handwritten', np.array(bert_results_2022_handwritten['precision']).mean(), np.array(bert_results_2022_handwritten['recall']).mean(), np.array(bert_results_2022_handwritten['f1']).mean(), bleu_results_2022_handwritten['bleu'], squad_results_2022_handwritten['exact'], squad_results_2022_handwritten['f1']]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


### Distilbert - finetuned

In [11]:
model_name_2020 = f"{local_models_path}/distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2020-full"
tokenizer_2020 = DistilBertTokenizerFast.from_pretrained(model_name_2020)
model_2020 = AutoModelForQuestionAnswering.from_pretrained(model_name_2020)

model_name_2022 = f"{local_models_path}/distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2022-full"
tokenizer_2022 = DistilBertTokenizerFast.from_pretrained(model_name_2022)
model_2022 = AutoModelForQuestionAnswering.from_pretrained(model_name_2022)

model_name_2020_2022 = f"{local_models_path}/distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2042-full_combined"
tokenizer_2020_2022 = DistilBertTokenizerFast.from_pretrained(model_name_2020_2022)
model_2020_2022 = AutoModelForQuestionAnswering.from_pretrained(model_name_2020_2022)

model_name_2022_handwritten = f"{local_models_path}/distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2022-handwritten"
tokenizer_2022_handwritten = DistilBertTokenizerFast.from_pretrained(model_name_2022_handwritten)
model_2022_handwritten = AutoModelForQuestionAnswering.from_pretrained(model_name_2022_handwritten)

answers_2020 = [inference_answer(data_2020_full["test"]["question"][idx], data_2020_full["test"]["context"][idx], tokenizer_2020, model_2020) for idx in range(data_2020_full["test"].shape[0])]
answers_2020_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2020_full["test"]["id"], answers_2020)]
answers_2022 = [inference_answer(data_2022_full["test"]["question"][idx], data_2022_full["test"]["context"][idx], tokenizer_2022, model_2022) for idx in range(data_2022_full["test"].shape[0])]
answers_2022_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2022_full["test"]["id"], answers_2022)]
answers_2020_2022 = [inference_answer(data_2020_2022["test"]["question"][idx], data_2020_2022["test"]["context"][idx], tokenizer_2020_2022, model_2020_2022) for idx in range(data_2020_2022["test"].shape[0])]
answers_2020_2022_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2020_2022["test"]["id"], answers_2020_2022)]
answers_2022_handwritten = [inference_answer(data_2022_handwritten["test"]["question"][idx], data_2022_handwritten["test"]["context"][idx], tokenizer_2022_handwritten, model_2022_handwritten) for idx in range(data_2022_handwritten["test"].shape[0])]
answers_2022_handwritten_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in enumerate(answers_2022_handwritten)]

# bertscore
bert_results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
bert_results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
bert_results_2020_2022 = bertscore.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022, lang="en")
bert_results_2022_handwritten = bertscore.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten, lang="en")
# print(f"Bertscore results 2020\nF1: {np.array(bert_results_2020['f1']).mean()}, Precision: {np.array(bert_results_2020['precision']).mean()}, Recall: {np.array(bert_results_2020['recall']).mean()}")
# print(f"Bertscore results 2022\nF1: {np.array(bert_results_2022['f1']).mean()}, Precision: {np.array(bert_results_2022['precision']).mean()}, Recall: {np.array(bert_results_2022['recall']).mean()}")
# print(f"Bertscore results 2020-2022\nF1: {np.array(bert_results_2020_2022['f1']).mean()}, Precision: {np.array(bert_results_2020_2022['precision']).mean()}, Recall: {np.array(bert_results_2020_2022['recall']).mean()}")
# print(f"Bertscore results 2022 handwritten\nF1: {np.array(bert_results_2022_handwritten['f1']).mean()}, Precision: {np.array(bert_results_2022_handwritten['precision']).mean()}, Recall: {np.array(bert_results_2022_handwritten['recall']).mean()}")

# bleu
bleu_results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
bleu_results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
bleu_results_2020_2022 = bleu.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022)
bleu_results_2022_handwritten = bleu.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten)
# print(f"Bleu results 2020\n{bleu_results_2020}")
# print(f"Bleu results 2022\n{bleu_results_2022}")
# print(f"Bleu results 2020-2022\n{bleu_results_2020_2022}")
# print(f"Bleu results 2022 handwritten\n{bleu_results_2022_handwritten}")

# squad_v2
squad_results_2020 = squad_v2_metric.compute(predictions=answers_2020_squad, references=references_2020)
squad_results_2022 = squad_v2_metric.compute(predictions=answers_2022_squad, references=references_2022)
squad_results_2020_2022 = squad_v2_metric.compute(predictions=answers_2020_2022_squad, references=references_2020_2022)
squad_results_2022_handwritten = squad_v2_metric.compute(predictions=answers_2022_handwritten_squad, references=references_2022_handwritten)
# print(f"Squad_v2 results 2020\n{squad_results_2020}")
# print(f"Squad_v2 results 2022\n{squad_results_2022}")
# print(f"Squad_v2 results 2020-2022\n{squad_results_2020_2022}")
# print(f"Squad_v2 results 2022 handwritten\n{squad_results_2022_handwritten}")


# add results to dataframe
results.loc[len(results)] = ['distilbert', 'full', '2020', np.array(bert_results_2020['precision']).mean(), np.array(bert_results_2020['recall']).mean(), np.array(bert_results_2020['f1']).mean(), bleu_results_2020['bleu'], squad_results_2020['exact'], squad_results_2020['f1']]
results.loc[len(results)] = ['distilbert', 'full', '2022', np.array(bert_results_2022['precision']).mean(), np.array(bert_results_2022['recall']).mean(), np.array(bert_results_2022['f1']).mean(), bleu_results_2022['bleu'], squad_results_2022['exact'], squad_results_2022['f1']]
results.loc[len(results)] = ['distilbert', 'full', '2020-2022', np.array(bert_results_2020_2022['precision']).mean(), np.array(bert_results_2020_2022['recall']).mean(), np.array(bert_results_2020_2022['f1']).mean(), bleu_results_2020_2022['bleu'], squad_results_2020_2022['exact'], squad_results_2020_2022['f1']]
results.loc[len(results)] = ['distilbert', 'full', '2022 handwritten', np.array(bert_results_2022_handwritten['precision']).mean(), np.array(bert_results_2022_handwritten['recall']).mean(), np.array(bert_results_2022_handwritten['f1']).mean(), bleu_results_2022_handwritten['bleu'], squad_results_2022_handwritten['exact'], squad_results_2022_handwritten['f1']]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

### Roberta - finetuned

In [12]:
model_name_2020 = f"{local_models_path}/roberta-base-squad2-finetuned-NLB-QA-2020-full"
tokenizer_2020 = AutoTokenizer.from_pretrained(model_name_2020)
model_2020 = AutoModelForQuestionAnswering.from_pretrained(model_name_2020)

model_name_2022 = f"{local_models_path}/roberta-base-squad2-finetuned-NLB-QA-2022-full"
tokenizer_2022 = AutoTokenizer.from_pretrained(model_name_2022)
model_2022 = AutoModelForQuestionAnswering.from_pretrained(model_name_2022)

model_name_2020_2022 = f"{local_models_path}/roberta-base-squad2-finetuned-NLB-QA-2042-full_combined"
tokenizer_2020_2022 = AutoTokenizer.from_pretrained(model_name_2020_2022)
model_2020_2022 = AutoModelForQuestionAnswering.from_pretrained(model_name_2020_2022)

model_name_2022_handwritten = f"{local_models_path}/roberta-base-squad2-finetuned-NLB-QA-2022-handwritten"
tokenizer_2022_handwritten = AutoTokenizer.from_pretrained(model_name_2022_handwritten)
model_2022_handwritten = AutoModelForQuestionAnswering.from_pretrained(model_name_2022_handwritten)

answers_2020 = [inference_answer(data_2020_full["test"]["question"][idx], data_2020_full["test"]["context"][idx], tokenizer_2020, model_2020) for idx in range(data_2020_full["test"].shape[0])]
answers_2020_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2020_full["test"]["id"], answers_2020)]
answers_2022 = [inference_answer(data_2022_full["test"]["question"][idx], data_2022_full["test"]["context"][idx], tokenizer_2022, model_2022) for idx in range(data_2022_full["test"].shape[0])]
answers_2022_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2022_full["test"]["id"], answers_2022)]
answers_2020_2022 = [inference_answer(data_2020_2022["test"]["question"][idx], data_2020_2022["test"]["context"][idx], tokenizer_2020_2022, model_2020_2022) for idx in range(data_2020_2022["test"].shape[0])]
answers_2020_2022_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2020_2022["test"]["id"], answers_2020_2022)]
answers_2022_handwritten = [inference_answer(data_2022_handwritten["test"]["question"][idx], data_2022_handwritten["test"]["context"][idx], tokenizer_2022_handwritten, model_2022_handwritten) for idx in range(data_2022_handwritten["test"].shape[0])]
answers_2022_handwritten_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in enumerate(answers_2022_handwritten)]

# bertscore
bert_results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
bert_results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
bert_results_2020_2022 = bertscore.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022, lang="en")
bert_results_2022_handwritten = bertscore.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten, lang="en")
# print(f"Bertscore results 2020\nF1: {np.array(bert_results_2020['f1']).mean()}, Precision: {np.array(bert_results_2020['precision']).mean()}, Recall: {np.array(bert_results_2020['recall']).mean()}")
# print(f"Bertscore results 2022\nF1: {np.array(bert_results_2022['f1']).mean()}, Precision: {np.array(bert_results_2022['precision']).mean()}, Recall: {np.array(bert_results_2022['recall']).mean()}")
# print(f"Bertscore results 2020-2022\nF1: {np.array(bert_results_2020_2022['f1']).mean()}, Precision: {np.array(bert_results_2020_2022['precision']).mean()}, Recall: {np.array(bert_results_2020_2022['recall']).mean()}")
# print(f"Bertscore results 2022 handwritten\nF1: {np.array(bert_results_2022_handwritten['f1']).mean()}, Precision: {np.array(bert_results_2022_handwritten['precision']).mean()}, Recall: {np.array(bert_results_2022_handwritten['recall']).mean()}")

# bleu
bleu_results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
bleu_results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
bleu_results_2020_2022 = bleu.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022)
bleu_results_2022_handwritten = bleu.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten)
# print(f"Bleu results 2020\n{bleu_results_2020}")
# print(f"Bleu results 2022\n{bleu_results_2022}")
# print(f"Bleu results 2020-2022\n{bleu_results_2020_2022}")
# print(f"Bleu results 2022 handwritten\n{bleu_results_2022_handwritten}")

# squad_v2
squad_results_2020 = squad_v2_metric.compute(predictions=answers_2020_squad, references=references_2020)
squad_results_2022 = squad_v2_metric.compute(predictions=answers_2022_squad, references=references_2022)
squad_results_2020_2022 = squad_v2_metric.compute(predictions=answers_2020_2022_squad, references=references_2020_2022)
squad_results_2022_handwritten = squad_v2_metric.compute(predictions=answers_2022_handwritten_squad, references=references_2022_handwritten)
# print(f"Squad_v2 results 2020\n{squad_results_2020}")
# print(f"Squad_v2 results 2022\n{squad_results_2022}")
# print(f"Squad_v2 results 2020-2022\n{squad_results_2020_2022}")
# print(f"Squad_v2 results 2022 handwritten\n{squad_results_2022_handwritten}")


# add results to dataframe
results.loc[len(results)] = ['roberta', 'full', '2020', np.array(bert_results_2020['precision']).mean(), np.array(bert_results_2020['recall']).mean(), np.array(bert_results_2020['f1']).mean(), bleu_results_2020['bleu'], squad_results_2020['exact'], squad_results_2020['f1']]
results.loc[len(results)] = ['roberta', 'full', '2022', np.array(bert_results_2022['precision']).mean(), np.array(bert_results_2022['recall']).mean(), np.array(bert_results_2022['f1']).mean(), bleu_results_2022['bleu'], squad_results_2022['exact'], squad_results_2022['f1']]
results.loc[len(results)] = ['roberta', 'full', '2020-2022', np.array(bert_results_2020_2022['precision']).mean(), np.array(bert_results_2020_2022['recall']).mean(), np.array(bert_results_2020_2022['f1']).mean(), bleu_results_2020_2022['bleu'], squad_results_2020_2022['exact'], squad_results_2020_2022['f1']]
results.loc[len(results)] = ['roberta', 'full', '2022 handwritten', np.array(bert_results_2022_handwritten['precision']).mean(), np.array(bert_results_2022_handwritten['recall']).mean(), np.array(bert_results_2022_handwritten['f1']).mean(), bleu_results_2022_handwritten['bleu'], squad_results_2022_handwritten['exact'], squad_results_2022_handwritten['f1']]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

### Distilbert - finetuned  - train set halved

In [13]:
model_name_2020 = f"{local_models_path}/distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2020-smaller"
tokenizer_2020 = DistilBertTokenizerFast.from_pretrained(model_name_2020)
model_2020 = AutoModelForQuestionAnswering.from_pretrained(model_name_2020)

model_name_2022 = f"{local_models_path}/distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2022-smaller"
tokenizer_2022 = DistilBertTokenizerFast.from_pretrained(model_name_2022)
model_2022 = AutoModelForQuestionAnswering.from_pretrained(model_name_2022)

model_name_2020_2022 = f"{local_models_path}/distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2042-smaller_combined"
tokenizer_2020_2022 = DistilBertTokenizerFast.from_pretrained(model_name_2020_2022)
model_2020_2022 = AutoModelForQuestionAnswering.from_pretrained(model_name_2020_2022)

answers_2020 = [inference_answer(data_2020_full["test"]["question"][idx], data_2020_full["test"]["context"][idx], tokenizer_2020, model_2020) for idx in range(data_2020_full["test"].shape[0])]
answers_2020_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2020_full["test"]["id"], answers_2020)]
answers_2022 = [inference_answer(data_2022_full["test"]["question"][idx], data_2022_full["test"]["context"][idx], tokenizer_2022, model_2022) for idx in range(data_2022_full["test"].shape[0])]
answers_2022_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2022_full["test"]["id"], answers_2022)]
answers_2020_2022 = [inference_answer(data_2020_2022["test"]["question"][idx], data_2020_2022["test"]["context"][idx], tokenizer_2020_2022, model_2020_2022) for idx in range(data_2020_2022["test"].shape[0])]
answers_2020_2022_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2020_2022["test"]["id"], answers_2020_2022)]

# bertscore
bert_results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
bert_results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
bert_results_2020_2022 = bertscore.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022, lang="en")
# print(f"Bertscore results 2020\nF1: {np.array(bert_results_2020['f1']).mean()}, Precision: {np.array(bert_results_2020['precision']).mean()}, Recall: {np.array(bert_results_2020['recall']).mean()}")
# print(f"Bertscore results 2022\nF1: {np.array(bert_results_2022['f1']).mean()}, Precision: {np.array(bert_results_2022['precision']).mean()}, Recall: {np.array(bert_results_2022['recall']).mean()}")
# print(f"Bertscore results 2020-2022\nF1: {np.array(bert_results_2020_2022['f1']).mean()}, Precision: {np.array(bert_results_2020_2022['precision']).mean()}, Recall: {np.array(bert_results_2020_2022['recall']).mean()}")

# bleu
bleu_results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
bleu_results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
bleu_results_2020_2022 = bleu.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022)
# print(f"Bleu results 2020\n{bleu_results_2020}")
# print(f"Bleu results 2022\n{bleu_results_2022}")
# print(f"Bleu results 2020-2022\n{bleu_results_2020_2022}")

# squad_v2
squad_results_2020 = squad_v2_metric.compute(predictions=answers_2020_squad, references=references_2020)
squad_results_2022 = squad_v2_metric.compute(predictions=answers_2022_squad, references=references_2022)
squad_results_2020_2022 = squad_v2_metric.compute(predictions=answers_2020_2022_squad, references=references_2020_2022)
# print(f"Squad_v2 results 2020\n{squad_results_2020}")
# print(f"Squad_v2 results 2022\n{squad_results_2022}")
# print(f"Squad_v2 results 2020-2022\n{squad_results_2020_2022}")


# add results to dataframe
results.loc[len(results)] = ['distilbert', 'smaller', '2020', np.array(bert_results_2020['precision']).mean(), np.array(bert_results_2020['recall']).mean(), np.array(bert_results_2020['f1']).mean(), bleu_results_2020['bleu'], squad_results_2020['exact'], squad_results_2020['f1']]
results.loc[len(results)] = ['distilbert', 'smaller', '2022', np.array(bert_results_2022['precision']).mean(), np.array(bert_results_2022['recall']).mean(), np.array(bert_results_2022['f1']).mean(), bleu_results_2022['bleu'], squad_results_2022['exact'], squad_results_2022['f1']]
results.loc[len(results)] = ['distilbert', 'smaller', '2020-2022', np.array(bert_results_2020_2022['precision']).mean(), np.array(bert_results_2020_2022['recall']).mean(), np.array(bert_results_2020_2022['f1']).mean(), bleu_results_2020_2022['bleu'], squad_results_2020_2022['exact'], squad_results_2020_2022['f1']]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

### Roberta - finetuned - train set halved

In [14]:
model_name_2020 = f"{local_models_path}/roberta-base-squad2-finetuned-NLB-QA-2020-smaller"
tokenizer_2020 = AutoTokenizer.from_pretrained(model_name_2020)
model_2020 = AutoModelForQuestionAnswering.from_pretrained(model_name_2020)

model_name_2022 = f"{local_models_path}/roberta-base-squad2-finetuned-NLB-QA-2022-smaller"
tokenizer_2022 = AutoTokenizer.from_pretrained(model_name_2022)
model_2022 = AutoModelForQuestionAnswering.from_pretrained(model_name_2022)

model_name_2020_2022 = f"{local_models_path}/roberta-base-squad2-finetuned-NLB-QA-2042-smaller_combined"
tokenizer_2020_2022 = AutoTokenizer.from_pretrained(model_name_2020_2022)
model_2020_2022 = AutoModelForQuestionAnswering.from_pretrained(model_name_2020_2022)

answers_2020 = [inference_answer(data_2020_full["test"]["question"][idx], data_2020_full["test"]["context"][idx], tokenizer_2020, model_2020) for idx in range(data_2020_full["test"].shape[0])]
answers_2020_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2020_full["test"]["id"], answers_2020)]
answers_2022 = [inference_answer(data_2022_full["test"]["question"][idx], data_2022_full["test"]["context"][idx], tokenizer_2022, model_2022) for idx in range(data_2022_full["test"].shape[0])]
answers_2022_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2022_full["test"]["id"], answers_2022)]
answers_2020_2022 = [inference_answer(data_2020_2022["test"]["question"][idx], data_2020_2022["test"]["context"][idx], tokenizer_2020_2022, model_2020_2022) for idx in range(data_2020_2022["test"].shape[0])]
answers_2020_2022_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2020_2022["test"]["id"], answers_2020_2022)]

# bertscore
bert_results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
bert_results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
bert_results_2020_2022 = bertscore.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022, lang="en")
# print(f"Bertscore results 2020\nF1: {np.array(bert_results_2020['f1']).mean()}, Precision: {np.array(bert_results_2020['precision']).mean()}, Recall: {np.array(bert_results_2020['recall']).mean()}")
# print(f"Bertscore results 2022\nF1: {np.array(bert_results_2022['f1']).mean()}, Precision: {np.array(bert_results_2022['precision']).mean()}, Recall: {np.array(bert_results_2022['recall']).mean()}")
# print(f"Bertscore results 2020-2022\nF1: {np.array(bert_results_2020_2022['f1']).mean()}, Precision: {np.array(bert_results_2020_2022['precision']).mean()}, Recall: {np.array(bert_results_2020_2022['recall']).mean()}")

# bleu
bleu_results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
bleu_results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
bleu_results_2020_2022 = bleu.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022)
# print(f"Bleu results 2020\n{bleu_results_2020}")
# print(f"Bleu results 2022\n{bleu_results_2022}")
# print(f"Bleu results 2020-2022\n{bleu_results_2020_2022}")

# squad_v2
squad_results_2020 = squad_v2_metric.compute(predictions=answers_2020_squad, references=references_2020)
squad_results_2022 = squad_v2_metric.compute(predictions=answers_2022_squad, references=references_2022)
squad_results_2020_2022 = squad_v2_metric.compute(predictions=answers_2020_2022_squad, references=references_2020_2022)
# print(f"Squad_v2 results 2020\n{squad_results_2020}")
# print(f"Squad_v2 results 2022\n{squad_results_2022}")
# print(f"Squad_v2 results 2020-2022\n{squad_results_2020_2022}")


# add results to dataframe
results.loc[len(results)] = ['roberta', 'smaller', '2020', np.array(bert_results_2020['precision']).mean(), np.array(bert_results_2020['recall']).mean(), np.array(bert_results_2020['f1']).mean(), bleu_results_2020['bleu'], squad_results_2020['exact'], squad_results_2020['f1']]
results.loc[len(results)] = ['roberta', 'smaller', '2022', np.array(bert_results_2022['precision']).mean(), np.array(bert_results_2022['recall']).mean(), np.array(bert_results_2022['f1']).mean(), bleu_results_2022['bleu'], squad_results_2022['exact'], squad_results_2022['f1']]
results.loc[len(results)] = ['roberta', 'smaller', '2020-2022', np.array(bert_results_2020_2022['precision']).mean(), np.array(bert_results_2020_2022['recall']).mean(), np.array(bert_results_2020_2022['f1']).mean(), bleu_results_2020_2022['bleu'], squad_results_2020_2022['exact'], squad_results_2020_2022['f1']]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

## Display all results

In [15]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(results)

Unnamed: 0,Model,Train Data,Data Type,Bert.Precision,Bert.Recall,Bert.F1,BLEU,Squad.Exact,Squad.F1
0,distilbert,,2020,0.90593,0.933719,0.919075,0.151413,44.642857,57.656287
1,distilbert,,2022,0.869497,0.915677,0.88435,0.109171,31.775701,46.816539
2,distilbert,,2020-2022,0.85937,0.921706,0.874852,0.092217,37.323944,48.682393
3,distilbert,,2022 handwritten,0.877114,0.865025,0.870431,0.15355,15.789474,37.710726
4,roberta,,2020,0.739321,0.918992,0.755386,0.395476,46.428571,57.19739
5,roberta,,2022,0.728639,0.911008,0.743328,0.252076,40.186916,53.393092
6,roberta,,2020-2022,0.704602,0.911239,0.717773,0.264607,38.028169,46.469602
7,roberta,,2022 handwritten,0.568249,0.82547,0.571296,0.005261,5.263158,22.144303
8,distilbert,full,2020,0.904786,0.932842,0.918088,0.150651,42.857143,56.703906
9,distilbert,full,2022,0.869133,0.919125,0.88241,0.161154,37.383178,50.040854
