In [1]:
from transformers import DistilBertTokenizerFast, AutoModelForQuestionAnswering, set_seed
from datasets import *
import numpy as np
import torch
import ast

In [2]:
SEED = 42
set_seed(SEED)

local_models_path = '../../data/models'

In [3]:
# Define the prediction function
def inference_answer(question, context, tokenizer, model):
    question = question
    context = context
    test_feature = tokenizer(
        question,
        context,
        max_length=318
    )
    with torch.no_grad():
        outputs = model(torch.tensor([test_feature["input_ids"]]))
    start_logits = outputs.start_logits.cpu().numpy()
    end_logits = outputs.end_logits.cpu().numpy()
    answer_ids = test_feature["input_ids"][np.argmax(
        start_logits):np.argmax(end_logits)+1]
    return " ".join(tokenizer.batch_decode(answer_ids))

## Load all data

2020

In [4]:
# Load the dataset from file and split it into train and test datasets
data_2020_full = load_dataset('csv', data_files=f"../../data/clean/sustainability-report-2020-squad-format.csv",
                    delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)

# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data_2020_full["test"] = data_2020_full["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
data_2020_full["test"] = data_2020_full["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                "text": example["text"], "answer_start": example["answer_start"]}})
data_2020_full["test"] = data_2020_full["test"].remove_columns(["text", "answer_start"])
# get ground truth answers
test_data_2020_full = data_2020_full["test"]
gt_answers_2020_full = [temp["answers"]["text"][0] for temp in test_data_2020_full]

# squad formatted data
references_2020 = [{"answers": {"answer_start": [answer["answer_start"][0]], "text": [answer["text"][0]]}, "id": str(id)} for id, answer in zip(data_2020_full["test"]["id"], data_2020_full["test"]["answers"])]

Found cached dataset csv (C:/Users/Luka/.cache/huggingface/datasets/csv/default-d8382661cd597e83/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-c261d5613d28d856.arrow and C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e61829c1e4a24b65.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-0b15501cefb41ff7.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e4de42d02343959f.arrow


2022

In [5]:
data_2022_full = load_dataset('csv', data_files=f"../../data/clean/sustainability-report-2022-squad-format.csv",
                    delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)

# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data_2022_full["test"] = data_2022_full["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
data_2022_full["test"] = data_2022_full["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                "text": example["text"], "answer_start": example["answer_start"]}})
data_2022_full["test"] = data_2022_full["test"].remove_columns(["text", "answer_start"])

test_data_2022_full = data_2022_full["test"]
gt_answers_2022_full = [temp["answers"]["text"][0] for temp in test_data_2022_full]

# squad formatted data
references_2022 = [{"answers": {"answer_start": [answer["answer_start"][0]], "text": [answer["text"][0]]}, "id": str(id)} for id, answer in zip(data_2022_full["test"]["id"], data_2022_full["test"]["answers"])]

Found cached dataset csv (C:/Users/Luka/.cache/huggingface/datasets/csv/default-003bb09dc8228b5f/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-003bb09dc8228b5f\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-515ab9eb5e89ae1b.arrow and C:\Users\Luka\.cache\huggingface\datasets\csv\default-003bb09dc8228b5f\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-bf44f2d0ce4c658e.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-003bb09dc8228b5f\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-0158f993dabee325.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-003bb09dc8228b5f\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-bee36876181b6213.arrow


## SIMPLE EVALUATION

In [6]:
import evaluate
bertscore = evaluate.load("bertscore")
bleu = evaluate.load("bleu")
squad_v2_metric = evaluate.load("squad_v2")

### Baseline

In [7]:
model_name = "distilbert-base-cased-distilled-squad"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

answers_2020 = [inference_answer(data_2020_full["test"]["question"][idx], data_2020_full["test"]["context"][idx], tokenizer, model) for idx in range(data_2020_full["test"].shape[0])]
answers_2020_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2020_full["test"]["id"], answers_2020)]
answers_2022 = [inference_answer(data_2022_full["test"]["question"][idx], data_2022_full["test"]["context"][idx], tokenizer, model) for idx in range(data_2022_full["test"].shape[0])]
answers_2022_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2022_full["test"]["id"], answers_2022)]

# bertscore
results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
print(f"Bertscore results 2020\nF1: {np.array(results_2020['f1']).mean()}, Precision: {np.array(results_2020['precision']).mean()}, Recall: {np.array(results_2020['recall']).mean()}")
print(f"Bertscore results 2022\nF1: {np.array(results_2022['f1']).mean()}, Precision: {np.array(results_2022['precision']).mean()}, Recall: {np.array(results_2022['recall']).mean()}")

# bleu
results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
print(f"Bleu results 2020\n{results_2020}")
print(f"Bleu results 2022\n{results_2022}")

# squad_v2
results_2020 = squad_v2_metric.compute(predictions=answers_2020_squad, references=references_2020)
results_2022 = squad_v2_metric.compute(predictions=answers_2022_squad, references=references_2022)
print(f"Squad_v2 results 2020\n{results_2020}")
print(f"Squad_v2 results 2022\n{results_2022}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Bertscore results 2020
F1: 0.9191093625766891, Precision: 0.9054063952394894, Recall: 0.9349026360682079
Bertscore results 2022
F1: 0.8887493989177954, Precision: 0.8703976446223036, Recall: 0.9097077100076408
Bleu results 2020
{'bleu': 0.15141296248186284, 'precisions': [0.29705882352941176, 0.1936619718309859, 0.11934156378600823, 0.07655502392344497], 'brevity_penalty': 1.0, 'length_ratio': 2.193548387096774, 'translation_length': 340, 'reference_length': 155}
Bleu results 2022
{'bleu': 0.10917104830220783, 'precisions': [0.22283356258596973, 0.13344051446945338, 0.08333333333333333, 0.05732484076433121], 'brevity_penalty': 1.0, 'length_ratio': 2.6340579710144927, 'translation_length': 727, 'reference_length': 276}
Squad_v2 results 2020
{'exact': 44.642857142857146, 'f1': 57.656286734314676, 'total': 56, 'HasAns_exact': 44.642857142857146, 'HasAns_f1': 57.656286734314676, 'HasAns_total': 56, 'best_exact': 44.642857142857146, 'best_exact_thresh': 0.0, 'best_f1': 57.656286734314676, '



### Finetuned

In [8]:
model_name_2020 = f"{local_models_path}/distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2020-full"
tokenizer_2020 = DistilBertTokenizerFast.from_pretrained(model_name_2020)
model_2020 = AutoModelForQuestionAnswering.from_pretrained(model_name_2020)

model_name_2022 = f"{local_models_path}/distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2022-full"
tokenizer_2022 = DistilBertTokenizerFast.from_pretrained(model_name_2022)
model_2022 = AutoModelForQuestionAnswering.from_pretrained(model_name_2022)

answers_2020 = [inference_answer(data_2020_full["test"]["question"][idx], data_2020_full["test"]["context"][idx], tokenizer_2020, model_2020) for idx in range(data_2020_full["test"].shape[0])]
answers_2020_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2020_full["test"]["id"], answers_2020)]

answers_2022 = [inference_answer(data_2022_full["test"]["question"][idx], data_2022_full["test"]["context"][idx], tokenizer_2022, model_2022) for idx in range(data_2022_full["test"].shape[0])]
answers_2022_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2022_full["test"]["id"], answers_2022)]

# bertscore
results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
print(f"Bertscore results 2020\nF1: {np.array(results_2020['f1']).mean()}, Precision: {np.array(results_2020['precision']).mean()}, Recall: {np.array(results_2020['recall']).mean()}")
print(f"Bertscore results 2022\nF1: {np.array(results_2022['f1']).mean()}, Precision: {np.array(results_2022['precision']).mean()}, Recall: {np.array(results_2022['recall']).mean()}")

# bleu
results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
print(f"Bleu results 2020\n{results_2020}")
print(f"Bleu results 2022\n{results_2022}")

# squad_v2
results_2020 = squad_v2_metric.compute(predictions=answers_2020_squad, references=references_2020)
results_2022 = squad_v2_metric.compute(predictions=answers_2022_squad, references=references_2022)
print(f"Squad_v2 results 2020\n{results_2020}")
print(f"Squad_v2 results 2022\n{results_2022}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Bertscore results 2020
F1: 0.9174407050013542, Precision: 0.9029378337519509, Recall: 0.9340311203684125
Bertscore results 2022
F1: 0.8868488780805998, Precision: 0.8694146191962412, Recall: 0.9062445197149972
Bleu results 2020
{'bleu': 0.1506511045315903, 'precisions': [0.2934472934472934, 0.18983050847457628, 0.11857707509881422, 0.0779816513761468], 'brevity_penalty': 1.0, 'length_ratio': 2.264516129032258, 'translation_length': 351, 'reference_length': 155}
Bleu results 2022
{'bleu': 0.16115398022423177, 'precisions': [0.2727272727272727, 0.1834862385321101, 0.13174946004319654, 0.10230179028132992], 'brevity_penalty': 1.0, 'length_ratio': 2.351449275362319, 'translation_length': 649, 'reference_length': 276}
Squad_v2 results 2020
{'exact': 42.857142857142854, 'f1': 56.70390578193373, 'total': 56, 'HasAns_exact': 42.857142857142854, 'HasAns_f1': 56.70390578193373, 'HasAns_total': 56, 'best_exact': 42.857142857142854, 'best_exact_thresh': 0.0, 'best_f1': 56.70390578193373, 'best_f1_



### Finetuned  - train set halved

In [9]:
model_name_2020 = f"{local_models_path}/distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2020-smaller"
tokenizer_2020 = DistilBertTokenizerFast.from_pretrained(model_name_2020)
model_2020 = AutoModelForQuestionAnswering.from_pretrained(model_name_2020)

model_name_2022 = f"{local_models_path}/distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2022-smaller"
tokenizer_2022 = DistilBertTokenizerFast.from_pretrained(model_name_2022)
model_2022 = AutoModelForQuestionAnswering.from_pretrained(model_name_2022)

answers_2020 = [inference_answer(data_2020_full["test"]["question"][idx], data_2020_full["test"]["context"][idx], tokenizer_2020, model_2020) for idx in range(data_2020_full["test"].shape[0])]
answers_2020_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2020_full["test"]["id"], answers_2020)]

answers_2022 = [inference_answer(data_2022_full["test"]["question"][idx], data_2022_full["test"]["context"][idx], tokenizer_2022, model_2022) for idx in range(data_2022_full["test"].shape[0])]
answers_2022_squad = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data_2022_full["test"]["id"], answers_2022)]

# bertscore
results_2020 = bertscore.compute(predictions=answers_2020, references=gt_answers_2020_full, lang="en")
results_2022 = bertscore.compute(predictions=answers_2022, references=gt_answers_2022_full, lang="en")
print(f"Bertscore results 2020\nF1: {np.array(results_2020['f1']).mean()}, Precision: {np.array(results_2020['precision']).mean()}, Recall: {np.array(results_2020['recall']).mean()}")
print(f"Bertscore results 2022\nF1: {np.array(results_2022['f1']).mean()}, Precision: {np.array(results_2022['precision']).mean()}, Recall: {np.array(results_2022['recall']).mean()}")

# bleu
results_2020 = bleu.compute(predictions=answers_2020, references=gt_answers_2020_full)
results_2022 = bleu.compute(predictions=answers_2022, references=gt_answers_2022_full)
print(f"Bleu results 2020\n{results_2020}")
print(f"Bleu results 2022\n{results_2022}")

# squad_v2
results_2020 = squad_v2_metric.compute(predictions=answers_2020_squad, references=references_2020)
results_2022 = squad_v2_metric.compute(predictions=answers_2022_squad, references=references_2022)
print(f"Squad_v2 results 2020\n{results_2020}")
print(f"Squad_v2 results 2022\n{results_2022}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Bertscore results 2020
F1: 0.9167757598417146, Precision: 0.903240730719907, Recall: 0.932248269873006
Bertscore results 2022
F1: 0.8936889656236239, Precision: 0.8761515533812693, Recall: 0.9134437837333322
Bleu results 2020
{'bleu': 0.1573921816360192, 'precisions': [0.29878048780487804, 0.19852941176470587, 0.12608695652173912, 0.08205128205128205], 'brevity_penalty': 1.0, 'length_ratio': 2.1161290322580646, 'translation_length': 328, 'reference_length': 155}
Bleu results 2022
{'bleu': 0.1478251376103963, 'precisions': [0.2547584187408492, 0.16782006920415224, 0.11895161290322581, 0.09389671361502347], 'brevity_penalty': 1.0, 'length_ratio': 2.4746376811594204, 'translation_length': 683, 'reference_length': 276}
Squad_v2 results 2020
{'exact': 42.857142857142854, 'f1': 55.334858162886114, 'total': 56, 'HasAns_exact': 42.857142857142854, 'HasAns_f1': 55.334858162886114, 'HasAns_total': 56, 'best_exact': 42.857142857142854, 'best_exact_thresh': 0.0, 'best_f1': 55.334858162886114, 'bes

