In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, set_seed
from datasets import *
import numpy as np
import torch
import ast

In [2]:
SEED = 42
set_seed(SEED)

year = 2020
# year = 2022

local_models_path = '../../data/models'

In [3]:
# Load the dataset from file and split it into train and test datasets
data = load_dataset('csv', data_files=f"../../data/clean/sustainability-report-{year}-squad-format.csv",
                    delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)

Found cached dataset csv (C:/Users/Luka/.cache/huggingface/datasets/csv/default-d8382661cd597e83/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-c261d5613d28d856.arrow and C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e61829c1e4a24b65.arrow


In [4]:
# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data["test"] = data["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
data["test"] = data["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                "text": example["text"], "answer_start": example["answer_start"]}})
data["test"] = data["test"].remove_columns(["text", "answer_start"])

test_data = data["test"]
gt_answers = [temp["answers"]["text"][0] for temp in test_data]
gt_answers

Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-0b15501cefb41ff7.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e4de42d02343959f.arrow


['382',
 'green/sustainability financing',
 '1',
 '2021',
 'law, collectiveagreements and internal regulations',
 'Social and EnvironmentalPolicy',
 'equal opportunities, justice',
 'a visit by Santa Claus',
 '2020',
 'three',
 '30th December 2020',
 '2020',
 '6.7%,',
 'Komercijalna Banka a.d. Beograd',
 '2020',
 '2021',
 'Bogdan DarmanoviÄ‡',
 '4,769',
 'World Institute forSustainability and Ethics in Rising Economies',
 '2018',
 '2020',
 '23 million EUR',
 '307',
 'a higher quality of life of the wider society',
 'EUR 340 million',
 'Retail Banking in Slovenia, Corporate Bankingin Slovenia, and Strategic Foreign Markets',
 '30.12.2020',
 '69% women and 31% men',
 'More than 200',
 '31%',
 'corruption and bribery',
 '307',
 '2.11million',
 '17,297',
 '58%',
 '45',
 '2017',
 'annually',
 'CRS',
 '2019',
 '97%',
 'EUR 340 million',
 '94',
 'Beograd',
 '2,914',
 '4 Sep',
 '2%.',
 '69% women and 31% men',
 'to invest in a systematicdevelopment of employees',
 '17,295',
 'by e-mail, via th

In [5]:
def get_answer(question, context, tokenizer, model):
    input_text = "question: %s  context: %s" % (question, context)
    features = tokenizer([input_text], return_tensors='pt')

    output = model.generate(input_ids=features['input_ids'], attention_mask=features['attention_mask'])

    return tokenizer.decode(output[0])

## SIMPLE EVALUATION

In [6]:
import evaluate
bertscore = evaluate.load("bertscore")
bleu = evaluate.load("bleu")

### Small

In [7]:
model_name = "mrm8488/t5-small-finetuned-squadv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

answers = [get_answer(question, context, tokenizer, model) for question, context in zip(test_data["question"], test_data["context"])]

# bertscore
results = bertscore.compute(predictions=answers, references=gt_answers, lang="en")
print(f"Bertscore\nF1: {np.array(results['f1']).mean()}, Precision: {np.array(results['precision']).mean()}, Recall: {np.array(results['recall']).mean()}")

# bleu
results = bleu.compute(predictions=answers, references=gt_answers)
print(f"Bleu\n{results}")

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


Bertscore
F1: 0.8861216104456356, Precision: 0.8740572397197995, Recall: 0.8988280519843102
Bleu
{'bleu': 0.14932377828599544, 'precisions': [0.2303473491773309, 0.15885947046843177, 0.12873563218390804, 0.10554089709762533], 'brevity_penalty': 1.0, 'length_ratio': 3.529032258064516, 'translation_length': 547, 'reference_length': 155}


### Small - finetuned

In [8]:
model_name = f"{local_models_path}/t5-small-finetuned-squadv2-finetuned-NLB-QA-{year}" # TODO: path (best to put all models in data/models folder)?
tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, local_files_only=True)

answers = [get_answer(question, context, tokenizer, model) for question, context in zip(test_data["question"], test_data["context"])]

# bertscore
results = bertscore.compute(predictions=answers, references=gt_answers, lang="en")
print(f"Bertscore\nF1: {np.array(results['f1']).mean()}, Precision: {np.array(results['precision']).mean()}, Recall: {np.array(results['recall']).mean()}")

# bleu
results = bleu.compute(predictions=answers, references=gt_answers)
print(f"Bleu\n{results}")

Bertscore
F1: 0.8897274541003364, Precision: 0.8763338146465165, Recall: 0.9038189905030387
Bleu
{'bleu': 0.1568508288241227, 'precisions': [0.24150268336314848, 0.16699801192842942, 0.13646532438478748, 0.10997442455242967], 'brevity_penalty': 1.0, 'length_ratio': 3.606451612903226, 'translation_length': 559, 'reference_length': 155}


### Small - finetuned - (TODO: train set size)

In [9]:
# TODO

### Base

In [10]:
model_name = "mrm8488/t5-base-finetuned-squadv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

answers = [get_answer(question, context, tokenizer, model) for question, context in zip(test_data["question"], test_data["context"])]

# bertscore
results = bertscore.compute(predictions=answers, references=gt_answers, lang="en")
print(f"Bertscore\nF1: {np.array(results['f1']).mean()}, Precision: {np.array(results['precision']).mean()}, Recall: {np.array(results['recall']).mean()}")

# bleu
results = bleu.compute(predictions=answers, references=gt_answers)
print(f"Bleu\n{results}")

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


Bertscore
F1: 0.8447753446442741, Precision: 0.8244254706161362, Recall: 0.8669386165482658
Bleu
{'bleu': 0.08073612217075588, 'precisions': [0.11964735516372796, 0.08536585365853659, 0.07038123167155426, 0.05910543130990415], 'brevity_penalty': 1.0, 'length_ratio': 5.122580645161291, 'translation_length': 794, 'reference_length': 155}


### Base - finetuned

In [11]:
model_name = f"{local_models_path}/t5-base-finetuned-squadv2-finetuned-NLB-QA-{year}" # TODO: path (best to put all models in data/models folder)?
tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, local_files_only=True)

answers = [get_answer(question, context, tokenizer, model) for question, context in zip(test_data["question"], test_data["context"])]

# bertscore
results = bertscore.compute(predictions=answers, references=gt_answers, lang="en")
print(f"Bertscore\nF1: {np.array(results['f1']).mean()}, Precision: {np.array(results['precision']).mean()}, Recall: {np.array(results['recall']).mean()}")

# bleu
results = bleu.compute(predictions=answers, references=gt_answers)
print(f"Bleu\n{results}")

Bertscore
F1: 0.8797242726598468, Precision: 0.8563968539237976, Recall: 0.9045447430440358
Bleu
{'bleu': 0.17734617872420755, 'precisions': [0.2669039145907473, 0.18774703557312253, 0.15555555555555556, 0.12690355329949238], 'brevity_penalty': 1.0, 'length_ratio': 3.6258064516129034, 'translation_length': 562, 'reference_length': 155}


### Base - finetuned - (TODO: train set size)

In [12]:
# TODO