In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, set_seed
from datasets import *
import numpy as np
import torch
import ast

In [2]:
SEED = 42
set_seed(SEED)

local_models_path = '../../data/models/T5'

In [3]:
def get_answer(question, context, tokenizer, model):
    input_text = "question: %s  context: %s" % (question, context)
    features = tokenizer([input_text], return_tensors='pt')

    output = model.generate(input_ids=features['input_ids'], attention_mask=features['attention_mask'])

    return tokenizer.decode(output[0])

## Load all data

2020

In [4]:
# Load the dataset from file and split it into train and test datasets
data_2020_full = load_dataset('csv', data_files=f"../../data/clean/sustainability-report-2020-squad-format.csv",
                    delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)

# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data_2020_full["test"] = data_2020_full["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
data_2020_full["test"] = data_2020_full["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                "text": example["text"], "answer_start": example["answer_start"]}})
# replace all "\n" with " " in the context, answers and questions
data_2020_full["test"] = data_2020_full["test"].map(lambda example: {"question": example["question"].replace("\n", " "), "context": example["context"].replace("\n", " "), "answers": {
                                "text": [example["answers"]["text"][0].replace("\n", " ")], "answer_start": example["answers"]["answer_start"]}})
data_2020_full["test"] = data_2020_full["test"].remove_columns(["text", "answer_start"])
# get ground truth answers
test_data_2020_full = data_2020_full["test"]
gt_answers_2020_full = [temp["answers"]["text"][0] for temp in test_data_2020_full]

Found cached dataset csv (C:/Users/Luka/.cache/huggingface/datasets/csv/default-d8382661cd597e83/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-c261d5613d28d856.arrow and C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e61829c1e4a24b65.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-0b15501cefb41ff7.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e4de42d02343959f.arrow
Loading cached processed da

2022

In [5]:
data_2022_full = load_dataset('csv', data_files=f"../../data/clean/sustainability-report-2022-squad-format.csv",
                    delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)

# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data_2022_full["test"] = data_2022_full["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
data_2022_full["test"] = data_2022_full["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                "text": example["text"], "answer_start": example["answer_start"]}})
# replace all "\n" with " " in the context, answers and questions
data_2022_full["test"] = data_2022_full["test"].map(lambda example: {"question": example["question"].replace("\n", " "), "context": example["context"].replace("\n", " "), "answers": {
                                "text": [example["answers"]["text"][0].replace("\n", " ")], "answer_start": example["answers"]["answer_start"]}})
data_2022_full["test"] = data_2022_full["test"].remove_columns(["text", "answer_start"])

test_data_2022_full = data_2022_full["test"]
gt_answers_2022_full = [temp["answers"]["text"][0] for temp in test_data_2022_full]

Found cached dataset csv (C:/Users/Luka/.cache/huggingface/datasets/csv/default-003bb09dc8228b5f/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-003bb09dc8228b5f\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-515ab9eb5e89ae1b.arrow and C:\Users\Luka\.cache\huggingface\datasets\csv\default-003bb09dc8228b5f\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-bf44f2d0ce4c658e.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-003bb09dc8228b5f\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-0158f993dabee325.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-003bb09dc8228b5f\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-bee36876181b6213.arrow
Loading cached processed da

2022 handwritten

In [6]:
data_2022_handwritten = load_dataset('csv', data_files=f"../../data/clean/QA_SR_2022_Expert-squad-format.csv",
                                        delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)

# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data_2022_handwritten["test"] = data_2022_handwritten["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
data_2022_handwritten["test"] = data_2022_handwritten["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                "text": example["text"], "answer_start": example["answer_start"]}})
# replace all "\n" with " " in the context, answers and questions
data_2022_handwritten["test"] = data_2022_handwritten["test"].map(lambda example: {"question": example["question"].replace("\n", " "), "context": example["context"].replace("\n", " "), "answers": {
                                "text": [example["answers"]["text"][0].replace("\n", " ")], "answer_start": example["answers"]["answer_start"]}})
data_2022_handwritten["test"] = data_2022_handwritten["test"].remove_columns(["text", "answer_start"])

test_data_2022_handwritten = data_2022_handwritten["test"]
gt_answers_2022_handwritten = [temp["answers"]["text"][0] for temp in test_data_2022_handwritten]

Found cached dataset csv (C:/Users/Luka/.cache/huggingface/datasets/csv/default-853b320bab41342e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-853b320bab41342e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-690d708f98f9a3b4.arrow and C:\Users\Luka\.cache\huggingface\datasets\csv\default-853b320bab41342e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-b3c2cbaa1563558c.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-853b320bab41342e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-227bc1b6d7ce66b1.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-853b320bab41342e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-ea73fd3146b3fd5a.arrow
Loading cached processed da

## SIMPLE EVALUATION

In [7]:
import evaluate
bertscore = evaluate.load("bertscore")
bleu = evaluate.load("bleu")

### Small

In [8]:
model_name = "mrm8488/t5-small-finetuned-squadv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

answers_2020 = [get_answer(question, context, tokenizer, model) for question, context in zip(test_data_2020_full["question"], test_data_2020_full["context"])]
answers_2022 = [get_answer(question, context, tokenizer, model) for question, context in zip(test_data_2022_full["question"], test_data_2022_full["context"])]
answers_2022_handwritten = [get_answer(question, context, tokenizer, model) for question, context in zip(test_data_2022_handwritten["question"], test_data_2022_handwritten["context"])]

# bertscore
results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
results_2022_handwritten = bertscore.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten, lang="en")
print(f"Bertscore results 2020\nF1: {np.array(results_2020['f1']).mean()}, Precision: {np.array(results_2020['precision']).mean()}, Recall: {np.array(results_2020['recall']).mean()}")
print(f"Bertscore results 2022\nF1: {np.array(results_2022['f1']).mean()}, Precision: {np.array(results_2022['precision']).mean()}, Recall: {np.array(results_2022['recall']).mean()}")
print(f"Bertscore results 2022 handwritten\nF1: {np.array(results_2022_handwritten['f1']).mean()}, Precision: {np.array(results_2022_handwritten['precision']).mean()}, Recall: {np.array(results_2022_handwritten['recall']).mean()}")

# bleu
results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
results_2022_handwritten = bleu.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten)
print(f"Bleu results 2020\n{results_2020}")
print(f"Bleu results 2022\n{results_2022}")
print(f"Bleu results 2022 handwritten\n{results_2022_handwritten}")

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


Bertscore results 2020
F1: 0.8861216125743729, Precision: 0.874057241848537, Recall: 0.8988280519843102
Bertscore results 2022
F1: 0.870940341570667, Precision: 0.8626913940795115, Recall: 0.8798553592690798
Bertscore results 2022 handwritten
F1: 0.8648425999440645, Precision: 0.8759872129088954, Recall: 0.8547752625063846
Bleu results 2020
{'bleu': 0.14932377828599544, 'precisions': [0.2303473491773309, 0.15885947046843177, 0.12873563218390804, 0.10554089709762533], 'brevity_penalty': 1.0, 'length_ratio': 3.529032258064516, 'translation_length': 547, 'reference_length': 155}
Bleu results 2022
{'bleu': 0.1020040212593282, 'precisions': [0.18302658486707565, 0.1113662456946039, 0.08115183246073299, 0.0654490106544901], 'brevity_penalty': 1.0, 'length_ratio': 3.5434782608695654, 'translation_length': 978, 'reference_length': 276}
Bleu results 2022 handwritten
{'bleu': 0.19187955903950352, 'precisions': [0.3286384976525822, 0.29381443298969073, 0.26857142857142857, 0.26282051282051283], '

### Small - finetuned

In [9]:
model_name_2020 = f"{local_models_path}/t5-small-finetuned-squadv2-finetuned-NLB-QA-2020-full"
tokenizer_2020 = AutoTokenizer.from_pretrained(model_name_2020, local_files_only=True)
model_2020 = AutoModelForSeq2SeqLM.from_pretrained(model_name_2020, local_files_only=True)

model_name_2022 = f"{local_models_path}/t5-small-finetuned-squadv2-finetuned-NLB-QA-2022-full"
tokenizer_2022 = AutoTokenizer.from_pretrained(model_name_2022, local_files_only=True)
model_2022 = AutoModelForSeq2SeqLM.from_pretrained(model_name_2022, local_files_only=True)

model_name_2022_handwritten = f"{local_models_path}/t5-small-finetuned-squadv2-finetuned-NLB-QA-2022-handwritten"
tokenizer_2022_handwritten = AutoTokenizer.from_pretrained(model_name_2022_handwritten, local_files_only=True)
model_2022_handwritten = AutoModelForSeq2SeqLM.from_pretrained(model_name_2022_handwritten, local_files_only=True)

answers_2020 = [get_answer(question, context, tokenizer_2020, model_2020) for question, context in zip(test_data_2020_full["question"], test_data_2020_full["context"])]
answers_2022 = [get_answer(question, context, tokenizer_2022, model_2022) for question, context in zip(test_data_2022_full["question"], test_data_2022_full["context"])]
answers_2022_handwritten = [get_answer(question, context, tokenizer_2022_handwritten, model_2022_handwritten) for question, context in zip(test_data_2022_handwritten["question"], test_data_2022_handwritten["context"])]

# bertscore
results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
results_2022_handwritten = bertscore.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten, lang="en")
print(f"Bertscore results 2020\nF1: {np.array(results_2020['f1']).mean()}, Precision: {np.array(results_2020['precision']).mean()}, Recall: {np.array(results_2020['recall']).mean()}")
print(f"Bertscore results 2022\nF1: {np.array(results_2022['f1']).mean()}, Precision: {np.array(results_2022['precision']).mean()}, Recall: {np.array(results_2022['recall']).mean()}")
print(f"Bertscore results 2022 handwritten\nF1: {np.array(results_2022_handwritten['f1']).mean()}, Precision: {np.array(results_2022_handwritten['precision']).mean()}, Recall: {np.array(results_2022_handwritten['recall']).mean()}")

# bleu
results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
results_2022_handwritten = bleu.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten)
print(f"Bleu results 2020\n{results_2020}")
print(f"Bleu results 2022\n{results_2022}")
print(f"Bleu results 2022 handwritten\n{results_2022_handwritten}")

Bertscore results 2020
F1: 0.8897274445210185, Precision: 0.8763338114534106, Recall: 0.9038189798593521
Bertscore results 2022
F1: 0.8710922827230436, Precision: 0.8516964990401936, Recall: 0.8918689511646735
Bertscore results 2022 handwritten
F1: 0.8869054850779081, Precision: 0.8966338007073653, Recall: 0.8781342726004752
Bleu results 2020
{'bleu': 0.1568508288241227, 'precisions': [0.24150268336314848, 0.16699801192842942, 0.13646532438478748, 0.10997442455242967], 'brevity_penalty': 1.0, 'length_ratio': 3.606451612903226, 'translation_length': 559, 'reference_length': 155}
Bleu results 2022
{'bleu': 0.11752063190477298, 'precisions': [0.21484375, 0.13304252998909488, 0.09382716049382717, 0.07112375533428165], 'brevity_penalty': 1.0, 'length_ratio': 3.710144927536232, 'translation_length': 1024, 'reference_length': 276}
Bleu results 2022 handwritten
{'bleu': 0.3279438249818612, 'precisions': [0.4628099173553719, 0.4260089686098655, 0.39215686274509803, 0.3837837837837838], 'brevity

### Small - finetuned - train set halved

In [10]:
model_name_2020 = f"{local_models_path}/t5-small-finetuned-squadv2-finetuned-NLB-QA-2020-smaller"
tokenizer_2020 = AutoTokenizer.from_pretrained(model_name_2020, local_files_only=True)
model_2020 = AutoModelForSeq2SeqLM.from_pretrained(model_name_2020, local_files_only=True)

model_name_2022 = f"{local_models_path}/t5-small-finetuned-squadv2-finetuned-NLB-QA-2022-smaller"
tokenizer_2022 = AutoTokenizer.from_pretrained(model_name_2022, local_files_only=True)
model_2022 = AutoModelForSeq2SeqLM.from_pretrained(model_name_2022, local_files_only=True)

answers_2020 = [get_answer(question, context, tokenizer_2020, model_2020) for question, context in zip(test_data_2020_full["question"], test_data_2020_full["context"])]
answers_2022 = [get_answer(question, context, tokenizer_2022, model_2022) for question, context in zip(test_data_2022_full["question"], test_data_2022_full["context"])]

# bertscore
results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
print(f"Bertscore results 2020\nF1: {np.array(results_2020['f1']).mean()}, Precision: {np.array(results_2020['precision']).mean()}, Recall: {np.array(results_2020['recall']).mean()}")
print(f"Bertscore results 2022\nF1: {np.array(results_2022['f1']).mean()}, Precision: {np.array(results_2022['precision']).mean()}, Recall: {np.array(results_2022['recall']).mean()}")

# bleu
results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
print(f"Bleu results 2020\n{results_2020}")
print(f"Bleu results 2022\n{results_2022}")

Bertscore results 2020
F1: 0.8894282504916191, Precision: 0.8770850726536342, Recall: 0.9023833402565548
Bertscore results 2022
F1: 0.8768286537901263, Precision: 0.865651259355456, Recall: 0.8889033811114658
Bleu results 2020
{'bleu': 0.1556884614710667, 'precisions': [0.2392086330935252, 0.164, 0.13513513513513514, 0.11082474226804123], 'brevity_penalty': 1.0, 'length_ratio': 3.587096774193548, 'translation_length': 556, 'reference_length': 155}
Bleu results 2022
{'bleu': 0.10862415967904492, 'precisions': [0.2023928215353938, 0.12276785714285714, 0.08491761723700887, 0.06598240469208211], 'brevity_penalty': 1.0, 'length_ratio': 3.6340579710144927, 'translation_length': 1003, 'reference_length': 276}


### Base

In [11]:
model_name = "mrm8488/t5-base-finetuned-squadv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

answers_2020 = [get_answer(question, context, tokenizer, model) for question, context in zip(test_data_2020_full["question"], test_data_2020_full["context"])]
answers_2022 = [get_answer(question, context, tokenizer, model) for question, context in zip(test_data_2022_full["question"], test_data_2022_full["context"])]
answers_2022_handwritten = [get_answer(question, context, tokenizer, model) for question, context in zip(test_data_2022_handwritten["question"], test_data_2022_handwritten["context"])]

# bertscore
results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
results_2022_handwritten = bertscore.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten, lang="en")
print(f"Bertscore results 2020\nF1: {np.array(results_2020['f1']).mean()}, Precision: {np.array(results_2020['precision']).mean()}, Recall: {np.array(results_2020['recall']).mean()}")
print(f"Bertscore results 2022\nF1: {np.array(results_2022['f1']).mean()}, Precision: {np.array(results_2022['precision']).mean()}, Recall: {np.array(results_2022['recall']).mean()}")
print(f"Bertscore results 2022 handwritten\nF1: {np.array(results_2022_handwritten['f1']).mean()}, Precision: {np.array(results_2022_handwritten['precision']).mean()}, Recall: {np.array(results_2022_handwritten['recall']).mean()}")

# bleu
results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
results_2022_handwritten = bleu.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten)
print(f"Bleu results 2020\n{results_2020}")
print(f"Bleu results 2022\n{results_2022}")
print(f"Bleu results 2022 handwritten\n{results_2022_handwritten}")

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


Bertscore results 2020
F1: 0.8447753435799054, Precision: 0.824425468487399, Recall: 0.8669386197413717
Bertscore results 2022
F1: 0.8554230839292579, Precision: 0.8355540934010087, Recall: 0.8770849186683369
Bertscore results 2022 handwritten
F1: 0.8365662003818312, Precision: 0.8323138230725339, Recall: 0.8418080493023521
Bleu results 2020
{'bleu': 0.08073612217075588, 'precisions': [0.11964735516372796, 0.08536585365853659, 0.07038123167155426, 0.05910543130990415], 'brevity_penalty': 1.0, 'length_ratio': 5.122580645161291, 'translation_length': 794, 'reference_length': 155}
Bleu results 2022
{'bleu': 0.06065554146750149, 'precisions': [0.1196769456681351, 0.07410358565737052, 0.04965156794425087, 0.03073967339097022], 'brevity_penalty': 1.0, 'length_ratio': 4.934782608695652, 'translation_length': 1362, 'reference_length': 276}
Bleu results 2022 handwritten
{'bleu': 0.1417131660796983, 'precisions': [0.17421602787456447, 0.14925373134328357, 0.14056224899598393, 0.13043478260869565

### Base - finetuned

In [12]:
model_name_2020 = f"{local_models_path}/t5-base-finetuned-squadv2-finetuned-NLB-QA-2020-full"
tokenizer_2020 = AutoTokenizer.from_pretrained(model_name_2020, local_files_only=True)
model_2020 = AutoModelForSeq2SeqLM.from_pretrained(model_name_2020, local_files_only=True)

model_name_2022 = f"{local_models_path}/t5-base-finetuned-squadv2-finetuned-NLB-QA-2022-full"
tokenizer_2022 = AutoTokenizer.from_pretrained(model_name_2022, local_files_only=True)
model_2022 = AutoModelForSeq2SeqLM.from_pretrained(model_name_2022, local_files_only=True)

model_name_2022_handwritten = f"{local_models_path}/t5-base-finetuned-squadv2-finetuned-NLB-QA-2022-handwritten"
tokenizer_2022_handwritten = AutoTokenizer.from_pretrained(model_name_2022_handwritten, local_files_only=True)
model_2022_handwritten = AutoModelForSeq2SeqLM.from_pretrained(model_name_2022_handwritten, local_files_only=True)

answers_2020 = [get_answer(question, context, tokenizer_2020, model_2020) for question, context in zip(test_data_2020_full["question"], test_data_2020_full["context"])]
answers_2022 = [get_answer(question, context, tokenizer_2022, model_2022) for question, context in zip(test_data_2022_full["question"], test_data_2022_full["context"])]
answers_2022_handwritten = [get_answer(question, context, tokenizer_2022_handwritten, model_2022_handwritten) for question, context in zip(test_data_2022_handwritten["question"], test_data_2022_handwritten["context"])]

# bertscore
results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
results_2022_handwritten = bertscore.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten, lang="en")
print(f"Bertscore results 2020\nF1: {np.array(results_2020['f1']).mean()}, Precision: {np.array(results_2020['precision']).mean()}, Recall: {np.array(results_2020['recall']).mean()}")
print(f"Bertscore results 2022\nF1: {np.array(results_2022['f1']).mean()}, Precision: {np.array(results_2022['precision']).mean()}, Recall: {np.array(results_2022['recall']).mean()}")
print(f"Bertscore results 2022 handwritten\nF1: {np.array(results_2022_handwritten['f1']).mean()}, Precision: {np.array(results_2022_handwritten['precision']).mean()}, Recall: {np.array(results_2022_handwritten['recall']).mean()}")

# bleu
results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
results_2022_handwritten = bleu.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten)
print(f"Bleu results 2020\n{results_2020}")
print(f"Bleu results 2022\n{results_2022}")
print(f"Bleu results 2022 handwritten\n{results_2022_handwritten}")

Bertscore results 2020
F1: 0.8806309082678386, Precision: 0.8578400526727948, Recall: 0.9048874878457615
Bertscore results 2022
F1: 0.8735402399134413, Precision: 0.8510980578226464, Recall: 0.8974188555066831
Bertscore results 2022 handwritten
F1: 0.9095499013599596, Precision: 0.9105994732756364, Recall: 0.9091016179636905
Bleu results 2020
{'bleu': 0.177079091797076, 'precisions': [0.26654740608228983, 0.18687872763419483, 0.15436241610738255, 0.1278772378516624], 'brevity_penalty': 1.0, 'length_ratio': 3.606451612903226, 'translation_length': 559, 'reference_length': 155}
Bleu results 2022
{'bleu': 0.14729439117167095, 'precisions': [0.24539282250242483, 0.16125541125541126, 0.12239902080783353, 0.09718309859154929], 'brevity_penalty': 1.0, 'length_ratio': 3.7355072463768115, 'translation_length': 1031, 'reference_length': 276}
Bleu results 2022 handwritten
{'bleu': 0.47295300683074787, 'precisions': [0.5152439024390244, 0.48220064724919093, 0.4586206896551724, 0.43911439114391143]

### Base - finetuned - train set halved

In [13]:
model_name_2020 = f"{local_models_path}/t5-base-finetuned-squadv2-finetuned-NLB-QA-2020-smaller"
tokenizer_2020 = AutoTokenizer.from_pretrained(model_name_2020, local_files_only=True)
model_2020 = AutoModelForSeq2SeqLM.from_pretrained(model_name_2020, local_files_only=True)

model_name_2022 = f"{local_models_path}/t5-base-finetuned-squadv2-finetuned-NLB-QA-2022-smaller"
tokenizer_2022 = AutoTokenizer.from_pretrained(model_name_2022, local_files_only=True)
model_2022 = AutoModelForSeq2SeqLM.from_pretrained(model_name_2022, local_files_only=True)

answers_2020 = [get_answer(question, context, tokenizer_2020, model_2020) for question, context in zip(test_data_2020_full["question"], test_data_2020_full["context"])]
answers_2022 = [get_answer(question, context, tokenizer_2022, model_2022) for question, context in zip(test_data_2022_full["question"], test_data_2022_full["context"])]

# bertscore
results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
print(f"Bertscore results 2020\nF1: {np.array(results_2020['f1']).mean()}, Precision: {np.array(results_2020['precision']).mean()}, Recall: {np.array(results_2020['recall']).mean()}")
print(f"Bertscore results 2022\nF1: {np.array(results_2022['f1']).mean()}, Precision: {np.array(results_2022['precision']).mean()}, Recall: {np.array(results_2022['recall']).mean()}")

# bleu
results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
print(f"Bleu results 2020\n{results_2020}")
print(f"Bleu results 2022\n{results_2022}")

Bertscore results 2020
F1: 0.882076063326427, Precision: 0.8593617251941136, Recall: 0.9062652587890625
Bertscore results 2022
F1: 0.8726239822735297, Precision: 0.8505396414025922, Recall: 0.8961357507750253
Bleu results 2020
{'bleu': 0.17848622686317264, 'precisions': [0.26833631484794274, 0.18886679920477137, 0.15659955257270694, 0.1278772378516624], 'brevity_penalty': 1.0, 'length_ratio': 3.606451612903226, 'translation_length': 559, 'reference_length': 155}
Bleu results 2022
{'bleu': 0.14435510274740104, 'precisions': [0.24271844660194175, 0.15926327193932827, 0.11887254901960784, 0.09449929478138223], 'brevity_penalty': 1.0, 'length_ratio': 3.7318840579710146, 'translation_length': 1030, 'reference_length': 276}
