In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration

from datasets import *
import numpy as np

import torch
import ast


SEED = 42

Reference notebooks:

https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=KdmKlMkfcLa0

https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb

### Load tokenizer and model

In [None]:
model_name = "mrm8488/t5-small-finetuned-squadv2" # small model
# model_name = "mrm8488/t5-base-finetuned-squadv2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

### Load the dataset

In [None]:
# Load the dataset from file and split it into train and test datasets
data = load_dataset('csv', data_files="../../data/clean/sustainability-report-2020-squad-format.csv", delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)

Found cached dataset csv (C:/Users/Luka/.cache/huggingface/datasets/csv/default-d8382661cd597e83/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-c261d5613d28d856.arrow and C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e61829c1e4a24b65.arrow


In [None]:
# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data["test"] = data["test"].map(lambda example: ast.literal_eval(example["answers"]))
data["test"] = data["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {"text": example["text"], "answer_start": example["answer_start"]}})
data["test"] = data["test"].remove_columns(["text", "answer_start"])

data["train"] = data["train"].map(lambda example: ast.literal_eval(example["answers"]))
data["train"] = data["train"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {"text": example["text"], "answer_start": example["answer_start"]}})
data["train"] = data["train"].remove_columns(["text", "answer_start"])

Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-0b15501cefb41ff7.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-6c4455904f60e079.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-65eb14b3b79cbed9.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-559c811f459458f4.arrow


In [None]:
data["train"][0]

{'question': 'How does the Bank prevent or manage cyber risks arising from the increased volume of work from home?',
 'context': 'nages cyber risks arising from the increased volumeof work from home through implemented measures, some of which areexplained below, namelyPromoting Freedom of Expr',
 'answers': {'answer_start': [69], 'text': ['through implemented measures']},
 'id': 132}

### Tokenize the dataset

In [None]:
# process the examples in input and target text format and the eos token at the end 
def add_eos_to_examples(example):
    example['input_text'] = 'question: %s  context: %s </s>' % (example['question'], example['context'])
    example['target_text'] = '%s </s>' % example['answers']['text'][0]
    return example

# tokenize the examples
def convert_to_features(examples):
    model_inputs = tokenizer(examples['input_text'], pad_to_max_length=True, max_length=512, truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['target_text'], pad_to_max_length=True, max_length=128, truncation=True)
        temp = np.array(labels["input_ids"])
        temp[temp == tokenizer.pad_token_id] = -100
        labels["input_ids"] = temp.tolist()

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
train_data, test_data = data["train"], data["test"]

train_data = train_data.map(add_eos_to_examples, load_from_cache_file=False)
train_data = train_data.map(convert_to_features, batched=True, load_from_cache_file=False)

test_data = test_data.map(add_eos_to_examples, load_from_cache_file=False)
test_data = test_data.map(convert_to_features, batched=True, load_from_cache_file=False)

Map:   0%|          | 0/129 [00:00<?, ? examples/s]

Map:   0%|          | 0/129 [00:00<?, ? examples/s]



Map:   0%|          | 0/56 [00:00<?, ? examples/s]

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

In [None]:
train_data[0]

{'question': 'How does the Bank prevent or manage cyber risks arising from the increased volume of work from home?',
 'context': 'nages cyber risks arising from the increased volumeof work from home through implemented measures, some of which areexplained below, namelyPromoting Freedom of Expr',
 'answers': {'answer_start': [69], 'text': ['through implemented measures']},
 'id': 132,
 'input_text': 'question: How does the Bank prevent or manage cyber risks arising from the increased volume of work from home?  context: nages cyber risks arising from the increased volumeof work from home through implemented measures, some of which areexplained below, namelyPromoting Freedom of Expr </s>',
 'target_text': 'through implemented measures </s>',
 'input_ids': [822,
  10,
  571,
  405,
  8,
  1925,
  1709,
  42,
  1865,
  9738,
  5217,
  3,
  14739,
  45,
  8,
  1936,
  2908,
  13,
  161,
  45,
  234,
  58,
  2625,
  10,
  3,
  9761,
  7,
  9738,
  5217,
  3,
  14739,
  45,
  8,
  1936,
  2908

### Fine tunning

In [None]:
import numpy as np
import torch
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments


name = model_name.split("/")[-1]
training_args = Seq2SeqTrainingArguments(
    output_dir = f"./models",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=25,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False
)

# with this batch size the base model fits on a GPU with 8GB of memory
# training_args = Seq2SeqTrainingArguments(
#     output_dir = f"./models",
#     evaluation_strategy = "epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     weight_decay=0.01,
#     save_total_limit=3,
#     num_train_epochs=25,
#     predict_with_generate=True,
#     fp16=True,
#     push_to_hub=False
# )

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()
trainer.save_model(f"./models/{name}-finetuned-NLB-QA")



  0%|          | 0/425 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 2.19038724899292, 'eval_runtime': 0.3839, 'eval_samples_per_second': 145.857, 'eval_steps_per_second': 18.232, 'epoch': 1.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 1.8614195585250854, 'eval_runtime': 0.3766, 'eval_samples_per_second': 148.718, 'eval_steps_per_second': 18.59, 'epoch': 2.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 1.6485927104949951, 'eval_runtime': 0.3677, 'eval_samples_per_second': 152.296, 'eval_steps_per_second': 19.037, 'epoch': 3.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 1.5159841775894165, 'eval_runtime': 0.3797, 'eval_samples_per_second': 147.481, 'eval_steps_per_second': 18.435, 'epoch': 4.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 1.4004179239273071, 'eval_runtime': 0.3746, 'eval_samples_per_second': 149.506, 'eval_steps_per_second': 18.688, 'epoch': 5.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 1.297468900680542, 'eval_runtime': 0.3701, 'eval_samples_per_second': 151.329, 'eval_steps_per_second': 18.916, 'epoch': 6.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 1.1983356475830078, 'eval_runtime': 0.382, 'eval_samples_per_second': 146.589, 'eval_steps_per_second': 18.324, 'epoch': 7.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 1.1087299585342407, 'eval_runtime': 0.3731, 'eval_samples_per_second': 150.085, 'eval_steps_per_second': 18.761, 'epoch': 8.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 1.0139919519424438, 'eval_runtime': 0.3742, 'eval_samples_per_second': 149.648, 'eval_steps_per_second': 18.706, 'epoch': 9.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.9129347205162048, 'eval_runtime': 0.3732, 'eval_samples_per_second': 150.069, 'eval_steps_per_second': 18.759, 'epoch': 10.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.8196895718574524, 'eval_runtime': 0.3743, 'eval_samples_per_second': 149.614, 'eval_steps_per_second': 18.702, 'epoch': 11.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.7290547490119934, 'eval_runtime': 0.3774, 'eval_samples_per_second': 148.376, 'eval_steps_per_second': 18.547, 'epoch': 12.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.6496738195419312, 'eval_runtime': 0.3794, 'eval_samples_per_second': 147.62, 'eval_steps_per_second': 18.453, 'epoch': 13.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.5828622579574585, 'eval_runtime': 0.3681, 'eval_samples_per_second': 152.137, 'eval_steps_per_second': 19.017, 'epoch': 14.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.5341923832893372, 'eval_runtime': 0.3756, 'eval_samples_per_second': 149.087, 'eval_steps_per_second': 18.636, 'epoch': 15.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.5010005831718445, 'eval_runtime': 0.3847, 'eval_samples_per_second': 145.583, 'eval_steps_per_second': 18.198, 'epoch': 16.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.47875767946243286, 'eval_runtime': 0.375, 'eval_samples_per_second': 149.338, 'eval_steps_per_second': 18.667, 'epoch': 17.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.46184012293815613, 'eval_runtime': 0.381, 'eval_samples_per_second': 146.967, 'eval_steps_per_second': 18.371, 'epoch': 18.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.44914335012435913, 'eval_runtime': 0.3823, 'eval_samples_per_second': 146.467, 'eval_steps_per_second': 18.308, 'epoch': 19.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.43888458609580994, 'eval_runtime': 0.3824, 'eval_samples_per_second': 146.428, 'eval_steps_per_second': 18.304, 'epoch': 20.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.4320794641971588, 'eval_runtime': 0.3708, 'eval_samples_per_second': 151.012, 'eval_steps_per_second': 18.876, 'epoch': 21.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.42618927359580994, 'eval_runtime': 0.3768, 'eval_samples_per_second': 148.604, 'eval_steps_per_second': 18.576, 'epoch': 22.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.42218831181526184, 'eval_runtime': 0.3775, 'eval_samples_per_second': 148.363, 'eval_steps_per_second': 18.545, 'epoch': 23.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.41946396231651306, 'eval_runtime': 0.371, 'eval_samples_per_second': 150.96, 'eval_steps_per_second': 18.87, 'epoch': 24.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.41794899106025696, 'eval_runtime': 0.3789, 'eval_samples_per_second': 147.778, 'eval_steps_per_second': 18.472, 'epoch': 25.0}
{'train_runtime': 85.9523, 'train_samples_per_second': 37.521, 'train_steps_per_second': 4.945, 'train_loss': 0.7614494054457721, 'epoch': 25.0}


### Evaluation

In [None]:
def get_answer(question, context):
    input_text = "question: %s  context: %s" % (question, context)
    features = tokenizer([input_text], return_tensors='pt')

    output = model.generate(input_ids=features['input_ids'], attention_mask=features['attention_mask'])

    return tokenizer.decode(output[0])

In [None]:
question = test_data[0]["question"]
context = test_data[0]["context"]
answer = test_data[0]["answers"]["text"][0]
print(f"Question: {question} \nContext: {context} \nAnswer: {answer}")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
print(f"Squad model answer: {get_answer(question, context)}")

tokenizer = AutoTokenizer.from_pretrained(f"./models/{name}-finetuned-NLB-QA", local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained(f"./models/{name}-finetuned-NLB-QA", local_files_only=True)
print(f"Our model answer: {get_answer(question, context)}")

Question: How many employees departed from NLB Group in 2020? 
Context: NLB Group In total, 382 employeesdeparted from NLB Group in 2020.In total, 162 employees de 
Answer: 382


The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


Squad model answer: <pad> subtract(n1,n0)|divide(#0,n2)|</s>
Our model answer: <pad> 382 </s>


In [None]:
answers = [temp["answers"]["text"][0] for temp in test_data]

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
squad_answers = [get_answer(question, context) for question, context in zip(test_data["question"], test_data["context"])]

tokenizer = AutoTokenizer.from_pretrained(f"./models/{name}-finetuned-NLB-QA", local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained(f"./models/{name}-finetuned-NLB-QA", local_files_only=True)
our_answers = [get_answer(question, context) for question, context in zip(test_data["question"], test_data["context"])]

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


In [None]:
squad_answers

['<pad> subtract(n1,n0)|divide(#0,n2)|</s>',
 '<pad> green/sustainability financing</s>',
 '<pad> multiply(n0,const_100)|divide(#0,n0',
 '<pad> 2021</s>',
 '<pad> law, collectiveagreements and internal regulations</s>',
 '<pad> Social and EnvironmentalPolicy</s>',
 '<pad> equal opportunities, justice, and promotingculture focused on performance</s>',
 '<pad> Santa Claus</s>',
 '<pad> end of 2020</s>',
 '<pad> multiply(n0,const_1000)|divide(#0,n1)',
 '<pad> December 2020</s>',
 '<pad> 2020</s>',
 '<pad> divide(n2,const_100)|divide(n3,const_100',
 '<pad> Komercijalna Banka a.d.</s>',
 '<pad> 2020</s>',
 '<pad> fically adopted for anticorruptionarea will be implemented in KB in 2021</s>',
 '<pad> Bogdan Darmanovi<unk></s>',
 '<pad> 4,769</s>',
 '<pad> World Institute forSustainability and Ethics in Rising Economies</s>',
 '<pad> 2018</s>',
 '<pad> 2020</s>',
 '<pad> multiply(n1,const_1000)|divide(#0,n0)',
 '<pad> divide(n2,const_100)|divide(n3,const_100',
 '<pad> a higher quality of life 

In [None]:
our_answers

['<pad> 382 </s>',
 '<pad> green/sustainability financing </s>',
 '<pad> 1 </s>',
 '<pad> 2021 </s>',
 '<pad> law, collectiveagreements and internal regulations </s>',
 '<pad> Social and EnvironmentalPolicy </s>',
 '<pad> equal opportunities, justice, and promotingculture focused on performance </s>',
 '<pad> a visit by Santa Claus </s>',
 '<pad> end of 2020 </s>',
 '<pad> three </s>',
 '<pad> 30th December 2020 </s>',
 '<pad> 2020 </s>',
 '<pad> 6.7% </s>',
 '<pad> Komercijalna Banka a.d. Beograd KB ',
 '<pad> 2020 </s>',
 '<pad> 2021 </s>',
 '<pad> Bogdan Darmanovi<unk> </s>',
 '<pad> 4,769 </s>',
 '<pad> World Institute forSustainability and Ethics in Rising Economies </s>',
 '<pad> 2018 </s>',
 '<pad> 2020 </s>',
 '<pad> 23 million EUR </s>',
 '<pad> 307 </s>',
 '<pad> a higher quality of life of the wider society </s>',
 '<pad> EUR 340 million </s>',
 '<pad> Retail Banking in Slovenia, Corporate Bankingin Slovenia, and Strategic Foreign Markets </s>',
 '<pad> 30.12.2020 </s>',
 '<

In [None]:
import evaluate
bertscore = evaluate.load("bertscore")

results = bertscore.compute(predictions=squad_answers, references=answers, lang="en")
# Embeddings bases evaluation
print(f"Squad\nF1: {np.array(results['f1']).mean()}, Precision: {np.array(results['precision']).mean()}, Recall: {np.array(results['recall']).mean()}")

results = bertscore.compute(predictions=our_answers, references=answers, lang="en")
# Embeddings bases evaluation
print(f"Our model\nF1: {np.array(results['f1']).mean()}, Precision: {np.array(results['precision']).mean()}, Recall: {np.array(results['recall']).mean()}")

Squad
F1: 0.8447753403867994, Precision: 0.8244254706161362, Recall: 0.866938612290791
Our model
F1: 0.8797242705311094, Precision: 0.8563968539237976, Recall: 0.9045447419796672


In [None]:
bleu = evaluate.load("bleu")

results = bleu.compute(predictions=squad_answers, references=answers)
print(f"Squad\n{results}")
results = bleu.compute(predictions=our_answers, references=answers)
print(f"Our model\n{results}")

Squad
{'bleu': 0.08073612217075588, 'precisions': [0.11964735516372796, 0.08536585365853659, 0.07038123167155426, 0.05910543130990415], 'brevity_penalty': 1.0, 'length_ratio': 5.122580645161291, 'translation_length': 794, 'reference_length': 155}
Our model
{'bleu': 0.17734617872420755, 'precisions': [0.2669039145907473, 0.18774703557312253, 0.15555555555555556, 0.12690355329949238], 'brevity_penalty': 1.0, 'length_ratio': 3.6258064516129034, 'translation_length': 562, 'reference_length': 155}
