In [33]:
from transformers import DistilBertTokenizerFast, AutoModelForQuestionAnswering, set_seed
from datasets import *
import numpy as np
import torch
import ast

In [34]:
SEED = 42
set_seed(SEED)

local_models_path = '../../data/models/BERT'

In [35]:
# Define the prediction function
def inference_answer(question, context, tokenizer, model):
    question = question
    context = context
    test_feature = tokenizer(
        question,
        context,
        max_length=318
    )
    with torch.no_grad():
        outputs = model(torch.tensor([test_feature["input_ids"]]))
    start_logits = outputs.start_logits.cpu().numpy()
    end_logits = outputs.end_logits.cpu().numpy()
    answer_ids = test_feature["input_ids"][np.argmax(
        start_logits):np.argmax(end_logits)+1]
    return " ".join(tokenizer.batch_decode(answer_ids))

## Load all data

2020

In [36]:
# Load the dataset from file and split it into train and test datasets
data_2020_full = load_dataset('csv', data_files=f"../../data/clean/sustainability-report-2020-squad-format.csv",
                    delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)

# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data_2020_full["test"] = data_2020_full["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
data_2020_full["test"] = data_2020_full["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                "text": example["text"], "answer_start": example["answer_start"]}})
# replace all "\n" with " " in the context, answers and questions
data_2020_full["test"] = data_2020_full["test"].map(lambda example: {"question": example["question"].replace("\n", " "), "context": example["context"].replace("\n", " "), "answers": {
                                "text": [example["answers"]["text"][0].replace("\n", " ")], "answer_start": example["answers"]["answer_start"]}})
data_2020_full["test"] = data_2020_full["test"].remove_columns(["text", "answer_start"])
# get ground truth answers
test_data_2020_full = data_2020_full["test"]
gt_answers_2020_full = [temp["answers"]["text"][0] for temp in test_data_2020_full]

# squad formatted data
references_2020 = [{"answers": {"answer_start": [answer["answer_start"][0]], "text": [answer["text"][0]]}, "id": str(id)} for id, answer in zip(data_2020_full["test"]["id"], data_2020_full["test"]["answers"])]

Found cached dataset csv (C:/Users/Luka/.cache/huggingface/datasets/csv/default-d8382661cd597e83/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-c261d5613d28d856.arrow and C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e61829c1e4a24b65.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-0b15501cefb41ff7.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e4de42d02343959f.arrow
Loading cached processed da

2022

In [37]:
data_2022_full = load_dataset('csv', data_files=f"../../data/clean/sustainability-report-2022-squad-format.csv",
                    delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)

# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data_2022_full["test"] = data_2022_full["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
data_2022_full["test"] = data_2022_full["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                "text": example["text"], "answer_start": example["answer_start"]}})
# replace all "\n" with " " in the context, answers and questions
data_2022_full["test"] = data_2022_full["test"].map(lambda example: {"question": example["question"].replace("\n", " "), "context": example["context"].replace("\n", " "), "answers": {
                                "text": [example["answers"]["text"][0].replace("\n", " ")], "answer_start": example["answers"]["answer_start"]}})
data_2022_full["test"] = data_2022_full["test"].remove_columns(["text", "answer_start"])

test_data_2022_full = data_2022_full["test"]
gt_answers_2022_full = [temp["answers"]["text"][0] for temp in test_data_2022_full]

# squad formatted data
references_2022 = [{"answers": {"answer_start": [answer["answer_start"][0]], "text": [answer["text"][0]]}, "id": str(id)} for id, answer in zip(data_2022_full["test"]["id"], data_2022_full["test"]["answers"])]

Found cached dataset csv (C:/Users/Luka/.cache/huggingface/datasets/csv/default-003bb09dc8228b5f/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-003bb09dc8228b5f\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-515ab9eb5e89ae1b.arrow and C:\Users\Luka\.cache\huggingface\datasets\csv\default-003bb09dc8228b5f\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-bf44f2d0ce4c658e.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-003bb09dc8228b5f\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-0158f993dabee325.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-003bb09dc8228b5f\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-bee36876181b6213.arrow
Loading cached processed da

2022 handwritten

In [38]:
data_2022_handwritten = load_dataset('csv', data_files=f"../../data/clean/QA_SR_2022_Expert-squad-format.csv",
                                        delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)

# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data_2022_handwritten["test"] = data_2022_handwritten["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
data_2022_handwritten["test"] = data_2022_handwritten["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                "text": example["text"], "answer_start": example["answer_start"]}})
# replace all "\n" with " " in the context, answers and questions
data_2022_handwritten["test"] = data_2022_handwritten["test"].map(lambda example: {"question": example["question"].replace("\n", " "), "context": example["context"].replace("\n", " "), "answers": {
                                "text": [example["answers"]["text"][0].replace("\n", " ")], "answer_start": example["answers"]["answer_start"]}})
data_2022_handwritten["test"] = data_2022_handwritten["test"].remove_columns(["text", "answer_start"])

test_data_2022_handwritten = data_2022_handwritten["test"]
gt_answers_2022_handwritten = [temp["answers"]["text"][0] for temp in test_data_2022_handwritten]

# squad formatted data
references_2022_handwritten = [{"answers": {"answer_start": [answer["answer_start"][0]], "text": [answer["text"][0]]}, "id": str(id)} for id, answer in enumerate(data_2022_handwritten["test"]["answers"])]

Found cached dataset csv (C:/Users/Luka/.cache/huggingface/datasets/csv/default-7e0bd965690926b0/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-7e0bd965690926b0\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-9bb74a86dcf2fec4.arrow and C:\Users\Luka\.cache\huggingface\datasets\csv\default-7e0bd965690926b0\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-3da9507e622ee022.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-7e0bd965690926b0\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-23042aa7dfb34243.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-7e0bd965690926b0\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-9d063bd963fc89f9.arrow
Loading cached processed da

## SIMPLE EVALUATION

In [39]:
import evaluate
bertscore = evaluate.load("bertscore")
bleu = evaluate.load("bleu")
squad_v2_metric = evaluate.load("squad_v2")

### Baseline

In [40]:
model_name = "distilbert-base-cased-distilled-squad"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

answers_2020 = [inference_answer(data_2020_full["test"]["question"][idx], data_2020_full["test"]["context"][idx], tokenizer, model) for idx in range(data_2020_full["test"].shape[0])]
answers_2020_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2020_full["test"]["id"], answers_2020)]
answers_2022 = [inference_answer(data_2022_full["test"]["question"][idx], data_2022_full["test"]["context"][idx], tokenizer, model) for idx in range(data_2022_full["test"].shape[0])]
answers_2022_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2022_full["test"]["id"], answers_2022)]
answers_2022_handwritten = [inference_answer(data_2022_handwritten["test"]["question"][idx], data_2022_handwritten["test"]["context"][idx], tokenizer, model) for idx in range(data_2022_handwritten["test"].shape[0])]
answers_2022_handwritten_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in enumerate(answers_2022_handwritten)]

# bertscore
results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
results_2022_handwritten = bertscore.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten, lang="en")
print(f"Bertscore results 2020\nF1: {np.array(results_2020['f1']).mean()}, Precision: {np.array(results_2020['precision']).mean()}, Recall: {np.array(results_2020['recall']).mean()}")
print(f"Bertscore results 2022\nF1: {np.array(results_2022['f1']).mean()}, Precision: {np.array(results_2022['precision']).mean()}, Recall: {np.array(results_2022['recall']).mean()}")
print(f"Bertscore results 2022 handwritten\nF1: {np.array(results_2022_handwritten['f1']).mean()}, Precision: {np.array(results_2022_handwritten['precision']).mean()}, Recall: {np.array(results_2022_handwritten['recall']).mean()}")

# bleu
results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
results_2022_handwritten = bleu.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten)
print(f"Bleu results 2020\n{results_2020}")
print(f"Bleu results 2022\n{results_2022}")
print(f"Bleu results 2022 handwritten\n{results_2022_handwritten}")

# squad_v2
results_2020 = squad_v2_metric.compute(predictions=answers_2020_squad, references=references_2020)
results_2022 = squad_v2_metric.compute(predictions=answers_2022_squad, references=references_2022)
results_2022_handwritten = squad_v2_metric.compute(predictions=answers_2022_handwritten_squad, references=references_2022_handwritten)
print(f"Squad_v2 results 2020\n{results_2020}")
print(f"Squad_v2 results 2022\n{results_2022}")
print(f"Squad_v2 results 2022 handwritten\n{results_2022_handwritten}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Bertscore results 2020
F1: 0.9191093551261085, Precision: 0.9054063920463834, Recall: 0.9349026339394706
Bertscore results 2022
F1: 0.888749390562004, Precision: 0.8703976368235651, Recall: 0.9097076999806912
Bertscore results 2022 handwritten
F1: 0.8637932695840534, Precision: 0.8706748579677782, Recall: 0.8577684540497629
Bleu results 2020
{'bleu': 0.15141296248186284, 'precisions': [0.29705882352941176, 0.1936619718309859, 0.11934156378600823, 0.07655502392344497], 'brevity_penalty': 1.0, 'length_ratio': 2.193548387096774, 'translation_length': 340, 'reference_length': 155}
Bleu results 2022
{'bleu': 0.10917104830220783, 'precisions': [0.22283356258596973, 0.13344051446945338, 0.08333333333333333, 0.05732484076433121], 'brevity_penalty': 1.0, 'length_ratio': 2.6340579710144927, 'translation_length': 727, 'reference_length': 276}
Bleu results 2022 handwritten
{'bleu': 0.1535497194269064, 'precisions': [0.49056603773584906, 0.39285714285714285, 0.3387096774193548, 0.2882882882882883],

In [41]:
display(gt_answers_2022_handwritten)
display(answers_2022_handwritten)

['10 reports of suspicions of harmful behaviour were categorised as whistle-blow cases',
 'ESG risks do not represent a new risk category, but rather one of risk drivers of the existing type of risks, such as credit, liquidity, market and operational risk.',
 'organise regular education and training courses',
 'NLB Group rejects all forms of bribery and corruption',
 'By the year 2022 the NLG Group decreased the number of paper prints by 43% compared to 2019',
 'a standardised document that describes ethical business conduct, outlines values, and sets conduct guidelines for relationships with clients, competitors, business partners, state authorities, regulators, shareholders, and internal relationships',
 'In NLB Group various (whistleblowing) channels',
 'the bank managed to reduce the use of paper by up to 19% in 2022 compared to the previous year',
 '70%',
 'According to ESMS, ESG risk management is considered on three levels',
 'Our goal is to raise awareness in among the public o

['10',
 'E ##S ##G risks do not',
 'organise regular education and training courses',
 'unfair , illegal , and de ##tri ##mental to countries with corrupt practices and society in general',
 '43 %',
 'a standard ##ised document that describes ethical business conduct',
 'internally and publicly available',
 '19 %',
 '70 %',
 'three',
 'to raise awareness in among the public of the importance of physical exercise for preserving health',
 'zero - carbon sources',
 'The Bank Association of Slovenia , Am ##C ##ham , the Chamber of Commerce and Industry of Slovenia',
 'one',
 'https : / / whistle ##r . n ##l ##b . si / f ##aq - en ##g .',
 'January 202 ##3',
 'only in legally permitted ways',
 'companies ad ##here to the constitution and internationally recognized human rights',
 '7 to 8 million']

### Finetuned

In [42]:
model_name_2020 = f"{local_models_path}/distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2020-full"
tokenizer_2020 = DistilBertTokenizerFast.from_pretrained(model_name_2020)
model_2020 = AutoModelForQuestionAnswering.from_pretrained(model_name_2020)

model_name_2022 = f"{local_models_path}/distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2022-full"
tokenizer_2022 = DistilBertTokenizerFast.from_pretrained(model_name_2022)
model_2022 = AutoModelForQuestionAnswering.from_pretrained(model_name_2022)

model_name_2022_handwritten = f"{local_models_path}/distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2022-handwritten"
tokenizer_2022_handwritten = DistilBertTokenizerFast.from_pretrained(model_name_2022_handwritten)
model_2022_handwritten = AutoModelForQuestionAnswering.from_pretrained(model_name_2022_handwritten)

answers_2020 = [inference_answer(data_2020_full["test"]["question"][idx], data_2020_full["test"]["context"][idx], tokenizer_2020, model_2020) for idx in range(data_2020_full["test"].shape[0])]
answers_2020_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2020_full["test"]["id"], answers_2020)]

answers_2022 = [inference_answer(data_2022_full["test"]["question"][idx], data_2022_full["test"]["context"][idx], tokenizer_2022, model_2022) for idx in range(data_2022_full["test"].shape[0])]
answers_2022_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2022_full["test"]["id"], answers_2022)]

answers_2022_handwritten = [inference_answer(data_2022_handwritten["test"]["question"][idx], data_2022_handwritten["test"]["context"][idx], tokenizer_2022_handwritten, model_2022_handwritten) for idx in range(data_2022_handwritten["test"].shape[0])]
answers_2022_handwritten_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in enumerate(answers_2022_handwritten)]

# bertscore
results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
results_2022_handwritten = bertscore.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten, lang="en")
print(f"Bertscore results 2020\nF1: {np.array(results_2020['f1']).mean()}, Precision: {np.array(results_2020['precision']).mean()}, Recall: {np.array(results_2020['recall']).mean()}")
print(f"Bertscore results 2022\nF1: {np.array(results_2022['f1']).mean()}, Precision: {np.array(results_2022['precision']).mean()}, Recall: {np.array(results_2022['recall']).mean()}")
print(f"Bertscore results 2022 handwritten\nF1: {np.array(results_2022_handwritten['f1']).mean()}, Precision: {np.array(results_2022_handwritten['precision']).mean()}, Recall: {np.array(results_2022_handwritten['recall']).mean()}")

# bleu
results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
results_2022_handwritten = bleu.compute(predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten)
print(f"Bleu results 2020\n{results_2020}")
print(f"Bleu results 2022\n{results_2022}")
print(f"Bleu results 2022 handwritten\n{results_2022_handwritten}")

# squad_v2
results_2020 = squad_v2_metric.compute(predictions=answers_2020_squad, references=references_2020)
results_2022 = squad_v2_metric.compute(predictions=answers_2022_squad, references=references_2022)
results_2022_handwritten = squad_v2_metric.compute(predictions=answers_2022_handwritten_squad, references=references_2022_handwritten)
print(f"Squad_v2 results 2020\n{results_2020}")
print(f"Squad_v2 results 2022\n{results_2022}")
print(f"Squad_v2 results 2022 handwritten\n{results_2022_handwritten}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

Bertscore results 2020
F1: 0.9174406975507736, Precision: 0.902937830558845, Recall: 0.9340311182396752
Bertscore results 2022
F1: 0.8868488758523888, Precision: 0.8694146097263443, Recall: 0.9062445141444696
Bertscore results 2022 handwritten
F1: 0.6164656752034238, Precision: 0.6088139759866815, Recall: 0.625061317494041
Bleu results 2020
{'bleu': 0.1506511045315903, 'precisions': [0.2934472934472934, 0.18983050847457628, 0.11857707509881422, 0.0779816513761468], 'brevity_penalty': 1.0, 'length_ratio': 2.264516129032258, 'translation_length': 351, 'reference_length': 155}
Bleu results 2022
{'bleu': 0.16115398022423177, 'precisions': [0.2727272727272727, 0.1834862385321101, 0.13174946004319654, 0.10230179028132992], 'brevity_penalty': 1.0, 'length_ratio': 2.351449275362319, 'translation_length': 649, 'reference_length': 276}
Bleu results 2022 handwritten
{'bleu': 0.09798153053436642, 'precisions': [0.3422818791946309, 0.2814814814814815, 0.2459016393442623, 0.21818181818181817], 'brev



In [43]:
display(gt_answers_2022_handwritten)
display(answers_2022_handwritten)

['10 reports of suspicions of harmful behaviour were categorised as whistle-blow cases',
 'ESG risks do not represent a new risk category, but rather one of risk drivers of the existing type of risks, such as credit, liquidity, market and operational risk.',
 'organise regular education and training courses',
 'NLB Group rejects all forms of bribery and corruption',
 'By the year 2022 the NLG Group decreased the number of paper prints by 43% compared to 2019',
 'a standardised document that describes ethical business conduct, outlines values, and sets conduct guidelines for relationships with clients, competitors, business partners, state authorities, regulators, shareholders, and internal relationships',
 'In NLB Group various (whistleblowing) channels',
 'the bank managed to reduce the use of paper by up to 19% in 2022 compared to the previous year',
 '70%',
 'According to ESMS, ESG risk management is considered on three levels',
 'Our goal is to raise awareness in among the public o

['10',
 'E ##S ##G risks do not represent a new risk category ,',
 'we organise regular education and training courses',
 'These forms of actions are unfair , illegal , and de ##tri ##mental to countries with corrupt practices and society in general .',
 '[CLS]',
 'The NL ##B Group Code of Con ##duct is a standard ##ised document that describes ethical business conduct ,',
 'internally and publicly available',
 '',
 '[CLS]',
 'three levels',
 '[CLS]',
 '',
 '',
 'On the level of NL ##B Group , there was one attempted incident of corruption ( in NL ##B d . d . ) in 202 ##2 .',
 '',
 'January 202 ##3 .',
 'We op ##ti ##mise taxes only in legally permitted ways .',
 '[CLS]',
 '']

### Finetuned  - train set halved

In [44]:
model_name_2020 = f"{local_models_path}/distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2020-smaller"
tokenizer_2020 = DistilBertTokenizerFast.from_pretrained(model_name_2020)
model_2020 = AutoModelForQuestionAnswering.from_pretrained(model_name_2020)

model_name_2022 = f"{local_models_path}/distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2022-smaller"
tokenizer_2022 = DistilBertTokenizerFast.from_pretrained(model_name_2022)
model_2022 = AutoModelForQuestionAnswering.from_pretrained(model_name_2022)

answers_2020 = [inference_answer(data_2020_full["test"]["question"][idx], data_2020_full["test"]["context"][idx], tokenizer_2020, model_2020) for idx in range(data_2020_full["test"].shape[0])]
answers_2020_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2020_full["test"]["id"], answers_2020)]

answers_2022 = [inference_answer(data_2022_full["test"]["question"][idx], data_2022_full["test"]["context"][idx], tokenizer_2022, model_2022) for idx in range(data_2022_full["test"].shape[0])]
answers_2022_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2022_full["test"]["id"], answers_2022)]

# bertscore
results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
print(f"Bertscore results 2020\nF1: {np.array(results_2020['f1']).mean()}, Precision: {np.array(results_2020['precision']).mean()}, Recall: {np.array(results_2020['recall']).mean()}")
print(f"Bertscore results 2022\nF1: {np.array(results_2022['f1']).mean()}, Precision: {np.array(results_2022['precision']).mean()}, Recall: {np.array(results_2022['recall']).mean()}")

# bleu
results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
print(f"Bleu results 2020\n{results_2020}")
print(f"Bleu results 2022\n{results_2022}")

# squad_v2
results_2020 = squad_v2_metric.compute(predictions=answers_2020_squad, references=references_2020)
results_2022 = squad_v2_metric.compute(predictions=answers_2022_squad, references=references_2022)
print(f"Squad_v2 results 2020\n{results_2020}")
print(f"Squad_v2 results 2022\n{results_2022}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Bertscore results 2020
F1: 0.916775752391134, Precision: 0.9032407285911697, Recall: 0.9322482645511627
Bertscore results 2022
F1: 0.8936889589389908, Precision: 0.8761515416831613, Recall: 0.9134437742634355
Bleu results 2020
{'bleu': 0.1573921816360192, 'precisions': [0.29878048780487804, 0.19852941176470587, 0.12608695652173912, 0.08205128205128205], 'brevity_penalty': 1.0, 'length_ratio': 2.1161290322580646, 'translation_length': 328, 'reference_length': 155}
Bleu results 2022
{'bleu': 0.1478251376103963, 'precisions': [0.2547584187408492, 0.16782006920415224, 0.11895161290322581, 0.09389671361502347], 'brevity_penalty': 1.0, 'length_ratio': 2.4746376811594204, 'translation_length': 683, 'reference_length': 276}
Squad_v2 results 2020
{'exact': 42.857142857142854, 'f1': 55.334858162886114, 'total': 56, 'HasAns_exact': 42.857142857142854, 'HasAns_f1': 55.334858162886114, 'HasAns_total': 56, 'best_exact': 42.857142857142854, 'best_exact_thresh': 0.0, 'best_f1': 55.334858162886114, 'be

