In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, set_seed
from datasets import *
import numpy as np
import pandas as pd
import torch
import ast

In [2]:
SEED = 42
set_seed(SEED)

local_models_path = '../../data/models/T5'

results = pd.DataFrame(columns=['Model', 'Train Data', 'Data Type', 'Bert.Precision', 'Bert.Recall', 'Bert.F1', 'BLEU'])

In [3]:
def get_answer(question, context, tokenizer, model):
    input_text = "question: %s  context: %s" % (question, context)
    features = tokenizer([input_text], return_tensors='pt')

    output = model.generate(input_ids=features['input_ids'], attention_mask=features['attention_mask'])

    return tokenizer.decode(output[0])

## Load all data

2020

In [4]:
# Load the dataset from file and split it into train and test datasets
data_2020_full = load_dataset('csv', data_files=f"../../data/clean/sustainability-report-2020-squad-format.csv",
                    delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)

# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data_2020_full["test"] = data_2020_full["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
data_2020_full["test"] = data_2020_full["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                "text": example["text"], "answer_start": example["answer_start"]}})
# replace all "\n" with " " in the context, answers and questions
data_2020_full["test"] = data_2020_full["test"].map(lambda example: {"question": example["question"].replace("\n", " "), "context": example["context"].replace("\n", " "), "answers": {
                                "text": [example["answers"]["text"][0].replace("\n", " ")], "answer_start": example["answers"]["answer_start"]}})
data_2020_full["test"] = data_2020_full["test"].remove_columns(["text", "answer_start"])
# get ground truth answers
test_data_2020_full = data_2020_full["test"]
gt_answers_2020_full = [temp["answers"]["text"][0] for temp in test_data_2020_full]

Found cached dataset csv (C:/Users/Luka/.cache/huggingface/datasets/csv/default-d8382661cd597e83/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-c261d5613d28d856.arrow and C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e61829c1e4a24b65.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-0b15501cefb41ff7.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e4de42d02343959f.arrow
Loading cached processed da

2022

In [5]:
data_2022_full = load_dataset('csv', data_files=f"../../data/clean/sustainability-report-2022-squad-format.csv",
                    delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)

# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data_2022_full["test"] = data_2022_full["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
data_2022_full["test"] = data_2022_full["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                "text": example["text"], "answer_start": example["answer_start"]}})
# replace all "\n" with " " in the context, answers and questions
data_2022_full["test"] = data_2022_full["test"].map(lambda example: {"question": example["question"].replace("\n", " "), "context": example["context"].replace("\n", " "), "answers": {
                                "text": [example["answers"]["text"][0].replace("\n", " ")], "answer_start": example["answers"]["answer_start"]}})
data_2022_full["test"] = data_2022_full["test"].remove_columns(["text", "answer_start"])

test_data_2022_full = data_2022_full["test"]
gt_answers_2022_full = [temp["answers"]["text"][0] for temp in test_data_2022_full]

Found cached dataset csv (C:/Users/Luka/.cache/huggingface/datasets/csv/default-003bb09dc8228b5f/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-003bb09dc8228b5f\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-515ab9eb5e89ae1b.arrow and C:\Users\Luka\.cache\huggingface\datasets\csv\default-003bb09dc8228b5f\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-bf44f2d0ce4c658e.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-003bb09dc8228b5f\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-0158f993dabee325.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-003bb09dc8228b5f\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-bee36876181b6213.arrow
Loading cached processed da

2020 + 2022

In [6]:
data_2020_2022 = load_dataset('csv', data_files="../../data/clean/sustainability-report-2042-squad-format.csv",
                                delimiter=";", split="train").train_test_split(test_size=0.3, shuffle=True, seed=SEED)

# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data_2020_2022["test"] = data_2020_2022["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
data_2020_2022["test"] = data_2020_2022["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                "text": example["text"], "answer_start": example["answer_start"]}})
# replace all "\n" with " " in the context, answers and questions
data_2020_2022["test"] = data_2020_2022["test"].map(lambda example: {"question": example["question"].replace("\n", " "), "context": example["context"].replace("\n", " "), "answers": {
                                "text": [example["answers"]["text"][0].replace("\n", " ")], "answer_start": example["answers"]["answer_start"]}})
data_2020_2022["test"] = data_2020_2022["test"].remove_columns(["text", "answer_start"])
# get ground truth answers
test_data_2020_2022 = data_2020_2022["test"]
gt_answers_2020_2022 = [temp["answers"]["text"][0] for temp in test_data_2020_2022]

Found cached dataset csv (C:/Users/Luka/.cache/huggingface/datasets/csv/default-a6bdc04297c1a3c2/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-a6bdc04297c1a3c2\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-4c58d6607e064ca8.arrow and C:\Users\Luka\.cache\huggingface\datasets\csv\default-a6bdc04297c1a3c2\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-6ec98b5ad96d608a.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-a6bdc04297c1a3c2\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-b60d97fbf617fa23.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-a6bdc04297c1a3c2\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-da9287d5acd53d81.arrow
Loading cached processed da

2022 handwritten

In [7]:
data_2022_handwritten = load_dataset('csv', data_files=f"../../data/clean/QA_SR_2022_Expert-squad-format.csv",
                                        delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)

# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data_2022_handwritten["test"] = data_2022_handwritten["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
data_2022_handwritten["test"] = data_2022_handwritten["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                "text": example["text"], "answer_start": example["answer_start"]}})
# replace all "\n" with " " in the context, answers and questions
data_2022_handwritten["test"] = data_2022_handwritten["test"].map(lambda example: {"question": example["question"].replace("\n", " "), "context": example["context"].replace("\n", " "), "answers": {
                                "text": [example["answers"]["text"][0].replace("\n", " ")], "answer_start": example["answers"]["answer_start"]}})
data_2022_handwritten["test"] = data_2022_handwritten["test"].remove_columns(["text", "answer_start"])

test_data_2022_handwritten = data_2022_handwritten["test"]
gt_answers_2022_handwritten = [temp["answers"]["text"][0] for temp in test_data_2022_handwritten]

Found cached dataset csv (C:/Users/Luka/.cache/huggingface/datasets/csv/default-853b320bab41342e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-853b320bab41342e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-690d708f98f9a3b4.arrow and C:\Users\Luka\.cache\huggingface\datasets\csv\default-853b320bab41342e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-b3c2cbaa1563558c.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-853b320bab41342e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-227bc1b6d7ce66b1.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-853b320bab41342e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-ea73fd3146b3fd5a.arrow
Loading cached processed da

## SIMPLE EVALUATION

In [8]:
import evaluate
bertscore = evaluate.load("bertscore")
bleu = evaluate.load("bleu")

### Small

In [9]:
model_name = "mrm8488/t5-small-finetuned-squadv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

answers_2020 = [get_answer(question, context, tokenizer, model) for question, context in zip(test_data_2020_full["question"], test_data_2020_full["context"])]
answers_2022 = [get_answer(question, context, tokenizer, model) for question, context in zip(test_data_2022_full["question"], test_data_2022_full["context"])]
answers_2020_2022 = [get_answer(question, context, tokenizer, model) for question, context in zip(test_data_2020_2022["question"], test_data_2020_2022["context"])]
answers_2022_handwritten = [get_answer(question, context, tokenizer, model) for question, context in zip(test_data_2022_handwritten["question"], test_data_2022_handwritten["context"])]

# bertscore
bert_results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
bert_results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
bert_results_2020_2022 = bertscore.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022, lang="en")
bert_results_2022_handwritten = bertscore.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten, lang="en")
# print(f"Bertscore results 2020\nF1: {np.array(bert_results_2020['f1']).mean()}, Precision: {np.array(bert_results_2020['precision']).mean()}, Recall: {np.array(bert_results_2020['recall']).mean()}")
# print(f"Bertscore results 2022\nF1: {np.array(bert_results_2022['f1']).mean()}, Precision: {np.array(bert_results_2022['precision']).mean()}, Recall: {np.array(bert_results_2022['recall']).mean()}")
# print(f"Bertscore results 2020-2022\nF1: {np.array(bert_results_2020_2022['f1']).mean()}, Precision: {np.array(bert_results_2020_2022['precision']).mean()}, Recall: {np.array(bert_results_2020_2022['recall']).mean()}")
# print(f"Bertscore results 2022 handwritten\nF1: {np.array(bert_results_2022_handwritten['f1']).mean()}, Precision: {np.array(bert_results_2022_handwritten['precision']).mean()}, Recall: {np.array(bert_results_2022_handwritten['recall']).mean()}")

# bleu
bleu_results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
bleu_results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
bleu_results_2020_2022 = bleu.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022)
bleu_results_2022_handwritten = bleu.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten)
# print(f"Bleu results 2020\n{bleu_results_2020}")
# print(f"Bleu results 2022\n{bleu_results_2022}")
# print(f"Bleu results 2020-2022\n{bleu_results_2020_2022}")
# print(f"Bleu results 2022 handwritten\n{bleu_results_2022_handwritten}")


# add results to dataframe
results.loc[len(results)] = ['t5-small', None, '2020', np.array(bert_results_2020['precision']).mean(), np.array(bert_results_2020['recall']).mean(), np.array(bert_results_2020['f1']).mean(), bleu_results_2020['bleu']]
results.loc[len(results)] = ['t5-small', None, '2022', np.array(bert_results_2022['precision']).mean(), np.array(bert_results_2022['recall']).mean(), np.array(bert_results_2022['f1']).mean(), bleu_results_2022['bleu']]
results.loc[len(results)] = ['t5-small', None, '2020-2022', np.array(bert_results_2020_2022['precision']).mean(), np.array(bert_results_2020_2022['recall']).mean(), np.array(bert_results_2020_2022['f1']).mean(), bleu_results_2020_2022['bleu']]
results.loc[len(results)] = ['t5-small', None, '2022 handwritten', np.array(bert_results_2022_handwritten['precision']).mean(), np.array(bert_results_2022_handwritten['recall']).mean(), np.array(bert_results_2022_handwritten['f1']).mean(), bleu_results_2022_handwritten['bleu']]

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


### Small - finetuned

In [10]:
model_name_2020 = f"{local_models_path}/t5-small-finetuned-squadv2-finetuned-NLB-QA-2020-full"
tokenizer_2020 = AutoTokenizer.from_pretrained(model_name_2020, local_files_only=True)
model_2020 = AutoModelForSeq2SeqLM.from_pretrained(model_name_2020, local_files_only=True)

model_name_2022 = f"{local_models_path}/t5-small-finetuned-squadv2-finetuned-NLB-QA-2022-full"
tokenizer_2022 = AutoTokenizer.from_pretrained(model_name_2022, local_files_only=True)
model_2022 = AutoModelForSeq2SeqLM.from_pretrained(model_name_2022, local_files_only=True)

model_name_2020_2022 = f"{local_models_path}/t5-small-finetuned-squadv2-finetuned-NLB-QA-2042-full_combined"
tokenizer_2020_2022 = AutoTokenizer.from_pretrained(model_name_2020_2022, local_files_only=True)
model_2020_2022 = AutoModelForSeq2SeqLM.from_pretrained(model_name_2020_2022, local_files_only=True)

model_name_2022_handwritten = f"{local_models_path}/t5-small-finetuned-squadv2-finetuned-NLB-QA-2022-handwritten"
tokenizer_2022_handwritten = AutoTokenizer.from_pretrained(model_name_2022_handwritten, local_files_only=True)
model_2022_handwritten = AutoModelForSeq2SeqLM.from_pretrained(model_name_2022_handwritten, local_files_only=True)

answers_2020 = [get_answer(question, context, tokenizer_2020, model_2020) for question, context in zip(test_data_2020_full["question"], test_data_2020_full["context"])]
answers_2022 = [get_answer(question, context, tokenizer_2022, model_2022) for question, context in zip(test_data_2022_full["question"], test_data_2022_full["context"])]
answers_2020_2022 = [get_answer(question, context, tokenizer_2020_2022, model_2020_2022) for question, context in zip(test_data_2020_2022["question"], test_data_2020_2022["context"])]
answers_2022_handwritten = [get_answer(question, context, tokenizer_2022_handwritten, model_2022_handwritten) for question, context in zip(test_data_2022_handwritten["question"], test_data_2022_handwritten["context"])]

# bertscore
bert_results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
bert_results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
bert_results_2020_2022 = bertscore.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022, lang="en")
bert_results_2022_handwritten = bertscore.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten, lang="en")
# print(f"Bertscore results 2020\nF1: {np.array(bert_results_2020['f1']).mean()}, Precision: {np.array(bert_results_2020['precision']).mean()}, Recall: {np.array(bert_results_2020['recall']).mean()}")
# print(f"Bertscore results 2022\nF1: {np.array(bert_results_2022['f1']).mean()}, Precision: {np.array(bert_results_2022['precision']).mean()}, Recall: {np.array(bert_results_2022['recall']).mean()}")
# print(f"Bertscore results 2020-2022\nF1: {np.array(bert_results_2020_2022['f1']).mean()}, Precision: {np.array(bert_results_2020_2022['precision']).mean()}, Recall: {np.array(bert_results_2020_2022['recall']).mean()}")
# print(f"Bertscore results 2022 handwritten\nF1: {np.array(bert_results_2022_handwritten['f1']).mean()}, Precision: {np.array(bert_results_2022_handwritten['precision']).mean()}, Recall: {np.array(bert_results_2022_handwritten['recall']).mean()}")

# bleu
bleu_results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
bleu_results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
bleu_results_2020_2022 = bleu.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022)
bleu_results_2022_handwritten = bleu.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten)
# print(f"Bleu results 2020\n{bleu_results_2020}")
# print(f"Bleu results 2022\n{bleu_results_2022}")
# print(f"Bleu results 2022 handwritten\n{bleu_results_2022_handwritten}")


# add results to dataframe
results.loc[len(results)] = ['t5-small', 'full', '2020', np.array(bert_results_2020['precision']).mean(), np.array(bert_results_2020['recall']).mean(), np.array(bert_results_2020['f1']).mean(), bleu_results_2020['bleu']]
results.loc[len(results)] = ['t5-small', 'full', '2022', np.array(bert_results_2022['precision']).mean(), np.array(bert_results_2022['recall']).mean(), np.array(bert_results_2022['f1']).mean(), bleu_results_2022['bleu']]
results.loc[len(results)] = ['t5-small', 'full', '2020-2022', np.array(bert_results_2020_2022['precision']).mean(), np.array(bert_results_2020_2022['recall']).mean(), np.array(bert_results_2020_2022['f1']).mean(), bleu_results_2020_2022['bleu']]
results.loc[len(results)] = ['t5-small', 'full', '2022 handwritten', np.array(bert_results_2022_handwritten['precision']).mean(), np.array(bert_results_2022_handwritten['recall']).mean(), np.array(bert_results_2022_handwritten['f1']).mean(), bleu_results_2022_handwritten['bleu']]



### Small - finetuned - train set halved

In [11]:
model_name_2020 = f"{local_models_path}/t5-small-finetuned-squadv2-finetuned-NLB-QA-2020-smaller"
tokenizer_2020 = AutoTokenizer.from_pretrained(model_name_2020, local_files_only=True)
model_2020 = AutoModelForSeq2SeqLM.from_pretrained(model_name_2020, local_files_only=True)

model_name_2022 = f"{local_models_path}/t5-small-finetuned-squadv2-finetuned-NLB-QA-2022-smaller"
tokenizer_2022 = AutoTokenizer.from_pretrained(model_name_2022, local_files_only=True)
model_2022 = AutoModelForSeq2SeqLM.from_pretrained(model_name_2022, local_files_only=True)

model_name_2020_2022 = f"{local_models_path}/t5-small-finetuned-squadv2-finetuned-NLB-QA-2042-smaller_combined"
tokenizer_2020_2022 = AutoTokenizer.from_pretrained(model_name_2020_2022, local_files_only=True)
model_2020_2022 = AutoModelForSeq2SeqLM.from_pretrained(model_name_2020_2022, local_files_only=True)

answers_2020 = [get_answer(question, context, tokenizer_2020, model_2020) for question, context in zip(test_data_2020_full["question"], test_data_2020_full["context"])]
answers_2022 = [get_answer(question, context, tokenizer_2022, model_2022) for question, context in zip(test_data_2022_full["question"], test_data_2022_full["context"])]
answers_2020_2022 = [get_answer(question, context, tokenizer_2020_2022, model_2020_2022) for question, context in zip(test_data_2020_2022["question"], test_data_2020_2022["context"])]

# bertscore
bert_results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
bert_results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
bert_results_2020_2022 = bertscore.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022, lang="en")
# print(f"Bertscore results 2020\nF1: {np.array(bert_results_2020['f1']).mean()}, Precision: {np.array(bert_results_2020['precision']).mean()}, Recall: {np.array(bert_results_2020['recall']).mean()}")
# print(f"Bertscore results 2022\nF1: {np.array(bert_results_2022['f1']).mean()}, Precision: {np.array(bert_results_2022['precision']).mean()}, Recall: {np.array(bert_results_2022['recall']).mean()}")
# print(f"Bertscore results 2020-2022\nF1: {np.array(bert_results_2020_2022['f1']).mean()}, Precision: {np.array(bert_results_2020_2022['precision']).mean()}, Recall: {np.array(bert_results_2020_2022['recall']).mean()}")

# bleu
bleu_results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
bleu_results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
bleu_results_2020_2022 = bleu.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022)
# print(f"Bleu results 2020\n{bleu_results_2020}")
# print(f"Bleu results 2022\n{bleu_results_2022}")
# print(f"Bleu results 2020-2022\n{bleu_results_2020_2022}")


# add results to dataframe
results.loc[len(results)] = ['t5-small', 'smaller', '2020', np.array(bert_results_2020['precision']).mean(), np.array(bert_results_2020['recall']).mean(), np.array(bert_results_2020['f1']).mean(), bleu_results_2020['bleu']]
results.loc[len(results)] = ['t5-small', 'smaller', '2022', np.array(bert_results_2022['precision']).mean(), np.array(bert_results_2022['recall']).mean(), np.array(bert_results_2022['f1']).mean(), bleu_results_2022['bleu']]
results.loc[len(results)] = ['t5-small', 'smaller', '2020-2022', np.array(bert_results_2020_2022['precision']).mean(), np.array(bert_results_2020_2022['recall']).mean(), np.array(bert_results_2020_2022['f1']).mean(), bleu_results_2020_2022['bleu']]



### Base

In [12]:
model_name = "mrm8488/t5-base-finetuned-squadv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

answers_2020 = [get_answer(question, context, tokenizer, model) for question, context in zip(test_data_2020_full["question"], test_data_2020_full["context"])]
answers_2022 = [get_answer(question, context, tokenizer, model) for question, context in zip(test_data_2022_full["question"], test_data_2022_full["context"])]
answers_2020_2022 = [get_answer(question, context, tokenizer, model) for question, context in zip(test_data_2020_2022["question"], test_data_2020_2022["context"])]
answers_2022_handwritten = [get_answer(question, context, tokenizer, model) for question, context in zip(test_data_2022_handwritten["question"], test_data_2022_handwritten["context"])]

# bertscore
bert_results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
bert_results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
bert_results_2020_2022 = bertscore.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022, lang="en")
bert_results_2022_handwritten = bertscore.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten, lang="en")
# print(f"Bertscore results 2020\nF1: {np.array(bert_results_2020['f1']).mean()}, Precision: {np.array(bert_results_2020['precision']).mean()}, Recall: {np.array(bert_results_2020['recall']).mean()}")
# print(f"Bertscore results 2022\nF1: {np.array(bert_results_2022['f1']).mean()}, Precision: {np.array(bert_results_2022['precision']).mean()}, Recall: {np.array(bert_results_2022['recall']).mean()}")
# print(f"Bertscore results 2020-2022\nF1: {np.array(bert_results_2020_2022['f1']).mean()}, Precision: {np.array(bert_results_2020_2022['precision']).mean()}, Recall: {np.array(bert_results_2020_2022['recall']).mean()}")
# print(f"Bertscore results 2022 handwritten\nF1: {np.array(bert_results_2022_handwritten['f1']).mean()}, Precision: {np.array(bert_results_2022_handwritten['precision']).mean()}, Recall: {np.array(bert_results_2022_handwritten['recall']).mean()}")

# bleu
bleu_results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
bleu_results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
bleu_results_2020_2022 = bleu.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022)
bleu_results_2022_handwritten = bleu.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten)
# print(f"Bleu results 2020\n{bleu_results_2020}")
# print(f"Bleu results 2022\n{bleu_results_2022}")
# print(f"Bleu results 2020-2022\n{bleu_results_2020_2022}")
# print(f"Bleu results 2022 handwritten\n{bleu_results_2022_handwritten}")


# add results to dataframe
results.loc[len(results)] = ['t5-base', None, '2020', np.array(bert_results_2020['precision']).mean(), np.array(bert_results_2020['recall']).mean(), np.array(bert_results_2020['f1']).mean(), bleu_results_2020['bleu']]
results.loc[len(results)] = ['t5-base', None, '2022', np.array(bert_results_2022['precision']).mean(), np.array(bert_results_2022['recall']).mean(), np.array(bert_results_2022['f1']).mean(), bleu_results_2022['bleu']]
results.loc[len(results)] = ['t5-base', None, '2020-2022', np.array(bert_results_2020_2022['precision']).mean(), np.array(bert_results_2020_2022['recall']).mean(), np.array(bert_results_2020_2022['f1']).mean(), bleu_results_2020_2022['bleu']]
results.loc[len(results)] = ['t5-base', None, '2022 handwritten', np.array(bert_results_2022_handwritten['precision']).mean(), np.array(bert_results_2022_handwritten['recall']).mean(), np.array(bert_results_2022_handwritten['f1']).mean(), bleu_results_2022_handwritten['bleu']]

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


### Base - finetuned

In [13]:
model_name_2020 = f"{local_models_path}/t5-base-finetuned-squadv2-finetuned-NLB-QA-2020-full"
tokenizer_2020 = AutoTokenizer.from_pretrained(model_name_2020, local_files_only=True)
model_2020 = AutoModelForSeq2SeqLM.from_pretrained(model_name_2020, local_files_only=True)

model_name_2022 = f"{local_models_path}/t5-base-finetuned-squadv2-finetuned-NLB-QA-2022-full"
tokenizer_2022 = AutoTokenizer.from_pretrained(model_name_2022, local_files_only=True)
model_2022 = AutoModelForSeq2SeqLM.from_pretrained(model_name_2022, local_files_only=True)

model_name_2020_2022 = f"{local_models_path}/t5-base-finetuned-squadv2-finetuned-NLB-QA-2042-full_combined"
tokenizer_2020_2022 = AutoTokenizer.from_pretrained(model_name_2020_2022, local_files_only=True)
model_2020_2022 = AutoModelForSeq2SeqLM.from_pretrained(model_name_2020_2022, local_files_only=True)

model_name_2022_handwritten = f"{local_models_path}/t5-base-finetuned-squadv2-finetuned-NLB-QA-2022-handwritten"
tokenizer_2022_handwritten = AutoTokenizer.from_pretrained(model_name_2022_handwritten, local_files_only=True)
model_2022_handwritten = AutoModelForSeq2SeqLM.from_pretrained(model_name_2022_handwritten, local_files_only=True)

answers_2020 = [get_answer(question, context, tokenizer_2020, model_2020) for question, context in zip(test_data_2020_full["question"], test_data_2020_full["context"])]
answers_2022 = [get_answer(question, context, tokenizer_2022, model_2022) for question, context in zip(test_data_2022_full["question"], test_data_2022_full["context"])]
answers_2020_2022 = [get_answer(question, context, tokenizer_2020_2022, model_2020_2022) for question, context in zip(test_data_2020_2022["question"], test_data_2020_2022["context"])]
answers_2022_handwritten = [get_answer(question, context, tokenizer_2022_handwritten, model_2022_handwritten) for question, context in zip(test_data_2022_handwritten["question"], test_data_2022_handwritten["context"])]

# bertscore
bert_results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
bert_results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
bert_results_2020_2022 = bertscore.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022, lang="en")
bert_results_2022_handwritten = bertscore.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten, lang="en")
# print(f"Bertscore results 2020\nF1: {np.array(bert_results_2020['f1']).mean()}, Precision: {np.array(bert_results_2020['precision']).mean()}, Recall: {np.array(bert_results_2020['recall']).mean()}")
# print(f"Bertscore results 2022\nF1: {np.array(bert_results_2022['f1']).mean()}, Precision: {np.array(bert_results_2022['precision']).mean()}, Recall: {np.array(bert_results_2022['recall']).mean()}")
# print(f"Bertscore results 2020-2022\nF1: {np.array(bert_results_2020_2022['f1']).mean()}, Precision: {np.array(bert_results_2020_2022['precision']).mean()}, Recall: {np.array(bert_results_2020_2022['recall']).mean()}")
# print(f"Bertscore results 2022 handwritten\nF1: {np.array(bert_results_2022_handwritten['f1']).mean()}, Precision: {np.array(bert_results_2022_handwritten['precision']).mean()}, Recall: {np.array(bert_results_2022_handwritten['recall']).mean()}")

# bleu
bleu_results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
bleu_results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
bleu_results_2020_2022 = bleu.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022)
bleu_results_2022_handwritten = bleu.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten)
# print(f"Bleu results 2020\n{bleu_results_2020}")
# print(f"Bleu results 2022\n{bleu_results_2022}")
# print(f"Bleu results 2020-2022\n{bleu_results_2020_2022}")
# print(f"Bleu results 2022 handwritten\n{bleu_results_2022_handwritten}")


# add results to dataframe
results.loc[len(results)] = ['t5-base', 'full', '2020', np.array(bert_results_2020['precision']).mean(), np.array(bert_results_2020['recall']).mean(), np.array(bert_results_2020['f1']).mean(), bleu_results_2020['bleu']]
results.loc[len(results)] = ['t5-base', 'full', '2022', np.array(bert_results_2022['precision']).mean(), np.array(bert_results_2022['recall']).mean(), np.array(bert_results_2022['f1']).mean(), bleu_results_2022['bleu']]
results.loc[len(results)] = ['t5-base', 'full', '2020-2022', np.array(bert_results_2020_2022['precision']).mean(), np.array(bert_results_2020_2022['recall']).mean(), np.array(bert_results_2020_2022['f1']).mean(), bleu_results_2020_2022['bleu']]
results.loc[len(results)] = ['t5-base', 'full', '2022 handwritten', np.array(bert_results_2022_handwritten['precision']).mean(), np.array(bert_results_2022_handwritten['recall']).mean(), np.array(bert_results_2022_handwritten['f1']).mean(), bleu_results_2022_handwritten['bleu']]



### Base - finetuned - train set halved

In [14]:
model_name_2020 = f"{local_models_path}/t5-base-finetuned-squadv2-finetuned-NLB-QA-2020-smaller"
tokenizer_2020 = AutoTokenizer.from_pretrained(model_name_2020, local_files_only=True)
model_2020 = AutoModelForSeq2SeqLM.from_pretrained(model_name_2020, local_files_only=True)

model_name_2022 = f"{local_models_path}/t5-base-finetuned-squadv2-finetuned-NLB-QA-2022-smaller"
tokenizer_2022 = AutoTokenizer.from_pretrained(model_name_2022, local_files_only=True)
model_2022 = AutoModelForSeq2SeqLM.from_pretrained(model_name_2022, local_files_only=True)

model_name_2020_2022 = f"{local_models_path}/t5-base-finetuned-squadv2-finetuned-NLB-QA-2042-smaller_combined"
tokenizer_2020_2022 = AutoTokenizer.from_pretrained(model_name_2020_2022, local_files_only=True)
model_2020_2022 = AutoModelForSeq2SeqLM.from_pretrained(model_name_2020_2022, local_files_only=True)

answers_2020 = [get_answer(question, context, tokenizer_2020, model_2020) for question, context in zip(test_data_2020_full["question"], test_data_2020_full["context"])]
answers_2022 = [get_answer(question, context, tokenizer_2022, model_2022) for question, context in zip(test_data_2022_full["question"], test_data_2022_full["context"])]
answers_2020_2022 = [get_answer(question, context, tokenizer_2020_2022, model_2020_2022) for question, context in zip(test_data_2020_2022["question"], test_data_2020_2022["context"])]

# bertscore
bert_results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
bert_results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
bert_results_2020_2022 = bertscore.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022, lang="en")
# print(f"Bertscore results 2020\nF1: {np.array(bert_results_2020['f1']).mean()}, Precision: {np.array(bert_results_2020['precision']).mean()}, Recall: {np.array(bert_results_2020['recall']).mean()}")
# print(f"Bertscore results 2022\nF1: {np.array(bert_results_2022['f1']).mean()}, Precision: {np.array(bert_results_2022['precision']).mean()}, Recall: {np.array(bert_results_2022['recall']).mean()}")
# print(f"Bertscore results 2020-2022\nF1: {np.array(bert_results_2020_2022['f1']).mean()}, Precision: {np.array(bert_results_2020_2022['precision']).mean()}, Recall: {np.array(bert_results_2020_2022['recall']).mean()}")

# bleu
bleu_results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
bleu_results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
bleu_results_2020_2022 = bleu.compute(predictions=answers_2020_2022, references=gt_answers_2020_2022)
# print(f"Bleu results 2020\n{bleu_results_2020}")
# print(f"Bleu results 2022\n{bleu_results_2022}")
# print(f"Bleu results 2020-2022\n{bleu_results_2020_2022}")


# add results to dataframe
results.loc[len(results)] = ['t5-base', 'smaller', '2020', np.array(bert_results_2020['precision']).mean(), np.array(bert_results_2020['recall']).mean(), np.array(bert_results_2020['f1']).mean(), bleu_results_2020['bleu']]
results.loc[len(results)] = ['t5-base', 'smaller', '2022', np.array(bert_results_2022['precision']).mean(), np.array(bert_results_2022['recall']).mean(), np.array(bert_results_2022['f1']).mean(), bleu_results_2022['bleu']]
results.loc[len(results)] = ['t5-base', 'smaller', '2020-2022', np.array(bert_results_2020_2022['precision']).mean(), np.array(bert_results_2020_2022['recall']).mean(), np.array(bert_results_2020_2022['f1']).mean(), bleu_results_2020_2022['bleu']]



## Display all results

In [15]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(results)

Unnamed: 0,Model,Train Data,Data Type,Bert.Precision,Bert.Recall,Bert.F1,BLEU
0,t5-small,,2020,0.874057,0.898828,0.886122,0.149324
1,t5-small,,2022,0.862691,0.879855,0.87094,0.102004
2,t5-small,,2020-2022,0.866785,0.890219,0.878066,0.100362
3,t5-small,,2022 handwritten,0.875987,0.854775,0.864843,0.19188
4,t5-small,full,2020,0.876334,0.903819,0.889727,0.156851
5,t5-small,full,2022,0.851697,0.891869,0.871092,0.117521
6,t5-small,full,2020-2022,0.850393,0.902251,0.875423,0.12556
7,t5-small,full,2022 handwritten,0.896634,0.878134,0.886905,0.327944
8,t5-small,smaller,2020,0.877085,0.902383,0.889428,0.155688
9,t5-small,smaller,2022,0.865651,0.888903,0.876829,0.108624
