In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, set_seed
from datasets import *
import numpy as np
import pandas as pd
import torch
import ast
import evaluate
import os

In [9]:
# Directory that is listed later to discover the fine-tuned checkpoints to evaluate.
local_models_path = '../data/models/T5/SE'

# Empty accumulator; the evaluation loops append one row of metrics per checkpoint.
results = pd.DataFrame(
    columns=[
        'Model',
        'Data Type',
        'Bert.Precision',
        'Bert.Recall',
        'Bert.F1',
        'BLEU',
    ]
)

In [4]:
# Load both text-similarity metrics once; they are reused for every checkpoint.
bertscore, bleu = (evaluate.load(metric_id) for metric_id in ("bertscore", "bleu"))

In [5]:
def get_answer(question, context, tokenizer, model):
    """Generate an answer for a single question/context pair.

    The question and context are joined into one prompt of the form
    ``"question: <question>  context: <context>"``, tokenized as a batch of
    one, passed to ``model.generate`` and the first generated sequence is
    decoded back to text with special tokens stripped.

    Args:
        question: The question to answer.
        context: The passage the answer should be drawn from.
        tokenizer: Callable tokenizer that returns ``input_ids`` and
            ``attention_mask`` and provides ``decode``.
        model: Object exposing ``generate(input_ids=..., attention_mask=...)``.

    Returns:
        The decoded text of the first generated sequence.
    """
    prompt = f"question: {question!s}  context: {context!s}"
    encoded = tokenizer([prompt], return_tensors='pt')

    generated = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
    )

    # Only one prompt was submitted, so only the first sequence is relevant.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [10]:
# Discover the checkpoint directories to evaluate.
# os.listdir() returns entries in arbitrary order and also yields stray files
# and hidden entries (e.g. ".DS_Store", ".ipynb_checkpoints"), which cannot be
# loaded as models. Keep only visible sub-directories and sort them so the
# evaluation order (and therefore the row order in `results`) is reproducible.
models = sorted(
    entry
    for entry in os.listdir(local_models_path)
    if not entry.startswith(".")
    and os.path.isdir(os.path.join(local_models_path, entry))
)

In [13]:
# Partition the checkpoint names by whether they mention 'handwritten'.
handwritten = list(filter(lambda name: 'handwritten' in name, models))
rest = list(filter(lambda name: 'handwritten' not in name, models))

In [None]:
# Evaluate every checkpoint whose directory name does not contain 'handwritten'
# on the held-out 30% split of the 2020-2022 dataset, appending one row of
# BERTScore / BLEU metrics per checkpoint to `results`.
model = None
tokenizer = None

for model_dir in rest:
    print(f"Evaluating model {model_dir}...")
    model_name = f"{local_models_path}/{model_dir}"

    # The seed is the last "-"-separated token of the directory name. Convert it
    # once: both set_seed() and train_test_split() need an integer, not a string.
    seed = int(model_dir.split("-")[-1])
    set_seed(seed)

    # NOTE(review): the file name says "2042" while everything else in this cell
    # refers to 2020-2022 -- confirm the path is correct.
    data_2020_2022 = load_dataset(
        'csv',
        data_files="../data/clean/sustainability-report-2042-squad-format.csv",
        delimiter=";",
        split="train",
    ).train_test_split(test_size=0.3, shuffle=True, seed=seed)

    # Reformat the test set so that it adheres to the SQuAD format
    # (reading from csv loads strings, not objects as expected).
    # Step 1: parse the stringified answers dict; its keys become columns.
    data_2020_2022["test"] = data_2020_2022["test"].map(
        lambda example: ast.literal_eval(example["answers"])
    )
    # Step 2: rebuild the nested "answers" field from the parsed columns.
    data_2020_2022["test"] = data_2020_2022["test"].map(
        lambda example: {
            "question": example["question"],
            "context": example["context"],
            "answers": {
                "text": example["text"],
                "answer_start": example["answer_start"],
            },
        }
    )
    # Step 3: replace all "\n" with " " in the context, answers and questions.
    # Only the first answer text is kept.
    data_2020_2022["test"] = data_2020_2022["test"].map(
        lambda example: {
            "question": example["question"].replace("\n", " "),
            "context": example["context"].replace("\n", " "),
            "answers": {
                "text": [example["answers"]["text"][0].replace("\n", " ")],
                "answer_start": example["answers"]["answer_start"],
            },
        }
    )
    data_2020_2022["test"] = data_2020_2022["test"].remove_columns(["text", "answer_start"])

    # get ground truth answers
    test_data_2020_2022 = data_2020_2022["test"]
    gt_answers_2020_2022 = [row["answers"]["text"][0] for row in test_data_2020_2022]

    tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, local_files_only=True)

    answers_2020_2022 = [
        get_answer(question, context, tokenizer, model)
        for question, context in zip(test_data_2020_2022["question"], test_data_2020_2022["context"])
    ]

    bert_results_2020_2022 = bertscore.compute(
        predictions=answers_2020_2022, references=gt_answers_2020_2022, lang="en"
    )
    bleu_results_2020_2022 = bleu.compute(
        predictions=answers_2020_2022, references=gt_answers_2020_2022
    )

    # BERTScore returns one value per example; store the mean over the test set.
    results.loc[len(results)] = [
        't5-small',
        '2020-2022',
        np.array(bert_results_2020_2022['precision']).mean(),
        np.array(bert_results_2020_2022['recall']).mean(),
        np.array(bert_results_2020_2022['f1']).mean(),
        bleu_results_2020_2022['bleu'],
    ]

    # Drop the references to the loaded checkpoint before loading the next one.
    del tokenizer
    del model
    torch.cuda.empty_cache()

In [None]:
# Evaluate every checkpoint whose directory name contains 'handwritten' on the
# held-out 30% split of the expert-written 2022 dataset, appending one row of
# BERTScore / BLEU metrics per checkpoint to `results`.
tokenizer = None
model = None

for model_dir in handwritten:
    print(f"Evaluating model {model_dir}...")
    model_name = f"{local_models_path}/{model_dir}"

    # The seed is the last "-"-separated token of the directory name. It is used
    # for both set_seed() and the split below; the previously referenced `SEED`
    # is not defined anywhere in this notebook.
    seed = int(model_dir.split("-")[-1])
    set_seed(seed)

    # NOTE(review): this path starts with "../../data" whereas the models and the
    # 2020-2022 dataset are read from "../data" -- confirm which one is correct.
    data_2022_handwritten = load_dataset(
        'csv',
        data_files="../../data/clean/QA_SR_2022_Expert-squad-format.csv",
        delimiter=";",
        split='train',
    ).train_test_split(test_size=0.3, shuffle=True, seed=seed)

    # Reformat the test set so that it adheres to the SQuAD format
    # (reading from csv loads strings, not objects as expected).
    # Step 1: parse the stringified answers dict; its keys become columns.
    data_2022_handwritten["test"] = data_2022_handwritten["test"].map(
        lambda example: ast.literal_eval(example["answers"])
    )
    # Step 2: rebuild the nested "answers" field from the parsed columns.
    data_2022_handwritten["test"] = data_2022_handwritten["test"].map(
        lambda example: {
            "question": example["question"],
            "context": example["context"],
            "answers": {
                "text": example["text"],
                "answer_start": example["answer_start"],
            },
        }
    )
    # Step 3: replace all "\n" with " " in the context, answers and questions.
    # Only the first answer text is kept.
    data_2022_handwritten["test"] = data_2022_handwritten["test"].map(
        lambda example: {
            "question": example["question"].replace("\n", " "),
            "context": example["context"].replace("\n", " "),
            "answers": {
                "text": [example["answers"]["text"][0].replace("\n", " ")],
                "answer_start": example["answers"]["answer_start"],
            },
        }
    )
    data_2022_handwritten["test"] = data_2022_handwritten["test"].remove_columns(["text", "answer_start"])

    # get ground truth answers
    test_data_2022_handwritten = data_2022_handwritten["test"]
    gt_answers_2022_handwritten = [row["answers"]["text"][0] for row in test_data_2022_handwritten]

    tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, local_files_only=True)

    answers_2022_handwritten = [
        get_answer(question, context, tokenizer, model)
        for question, context in zip(test_data_2022_handwritten["question"], test_data_2022_handwritten["context"])
    ]

    bert_results_2022_handwritten = bertscore.compute(
        predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten, lang="en"
    )
    bleu_results_2022_handwritten = bleu.compute(
        predictions=answers_2022_handwritten, references=gt_answers_2022_handwritten
    )

    # BERTScore returns one value per example; store the mean over the test set.
    results.loc[len(results)] = [
        't5-small',
        'handwritten',
        np.array(bert_results_2022_handwritten['precision']).mean(),
        np.array(bert_results_2022_handwritten['recall']).mean(),
        np.array(bert_results_2022_handwritten['f1']).mean(),
        bleu_results_2022_handwritten['bleu'],
    ]

    # Drop the references to the loaded checkpoint before loading the next one.
    del tokenizer
    del model
    torch.cuda.empty_cache()

In [None]:
# Separate the per-checkpoint rows by the dataset they were evaluated on.
data_type = results['Data Type']
df_2020_2022 = results.loc[data_type == '2020-2022']
df_handwritten = results.loc[data_type == 'handwritten']

In [None]:
# get average results and standard error
# Aggregate down the rows (axis=0) so that each metric is averaged across the
# evaluated checkpoints. Only the numeric metric columns are used: the 'Model'
# and 'Data Type' columns hold strings and cannot be averaged. The cast to
# float is needed because `results` is built row by row from an empty frame,
# which can leave the metric columns with object dtype.
metric_columns = ['Bert.Precision', 'Bert.Recall', 'Bert.F1', 'BLEU']

metrics_2020_2022 = df_2020_2022[metric_columns].astype(float)
df_2020_2022_mean = metrics_2020_2022.mean(axis=0)
df_2020_2022_sem = metrics_2020_2022.sem(axis=0)

metrics_handwritten = df_handwritten[metric_columns].astype(float)
df_handwritten_mean = metrics_handwritten.mean(axis=0)
df_handwritten_sem = metrics_handwritten.sem(axis=0)

In [None]:
# display results
report_sections = (
    ("2020-2022", df_2020_2022_mean, df_2020_2022_sem),
    ("Handwritten", df_handwritten_mean, df_handwritten_sem),
)

for section_title, mean_values, sem_values in report_sections:
    print(section_title)
    print(f"Mean metrics: {mean_values}")
    print(f"Standard Error: {sem_values}")