In [16]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration

from datasets import *
import numpy as np

import torch
import ast


SEED = 42
year = 2022

Reference notebooks:

https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=KdmKlMkfcLa0

https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb

### Load tokenizer and model

In [2]:
model_name = "mrm8488/t5-small-finetuned-squadv2"  # small model
# model_name = "mrm8488/t5-base-finetuned-squadv2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


### Load the dataset

In [3]:
# Load the dataset from file and split it into train and test datasets
data = load_dataset('csv', data_files="../../data/clean/sustainability-report-2022-squad-format.csv",
                    delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)

Found cached dataset csv (C:/Users/rjutr/.cache/huggingface/datasets/csv/default-e3048f1bd60b5c4e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-e3048f1bd60b5c4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e96c5bd318352c3f.arrow and C:\Users\rjutr\.cache\huggingface\datasets\csv\default-e3048f1bd60b5c4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-3f782815aab69336.arrow


In [4]:
# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data["test"] = data["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
data["test"] = data["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                "text": example["text"], "answer_start": example["answer_start"]}})
data["test"] = data["test"].remove_columns(["text", "answer_start"])

data["train"] = data["train"].map(
    lambda example: ast.literal_eval(example["answers"]))
data["train"] = data["train"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                  "text": example["text"], "answer_start": example["answer_start"]}})
data["train"] = data["train"].remove_columns(["text", "answer_start"])

Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-e3048f1bd60b5c4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-0e58ef57ff25bb58.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-e3048f1bd60b5c4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-944b27bfd97247d4.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-e3048f1bd60b5c4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-d92cbfe0185ed3c0.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-e3048f1bd60b5c4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e00fcfca42a8136e.arrow


In [5]:
data["train"][0]

{'index': 358,
 'question': 'When was the first banking institution established?',
 'context': 'on is based on people,banking knowledge and culture reaching back to 1820 whenthe first banking institution  the Carniola Savings Bank  was',
 'answers': {'answer_start': [69], 'text': ['1820']},
 'id': 143}

### Tokenize the dataset

In [6]:
# process the examples in input and target text format and the eos token at the end
def add_eos_to_examples(example):
    example['input_text'] = 'question: %s  context: %s </s>' % (
        example['question'], example['context'])
    example['target_text'] = '%s </s>' % example['answers']['text'][0]
    return example

# tokenize the examples


def convert_to_features(examples):
    model_inputs = tokenizer(
        examples['input_text'], pad_to_max_length=True, max_length=512, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples['target_text'], pad_to_max_length=True, max_length=128, truncation=True)
        temp = np.array(labels["input_ids"])
        temp[temp == tokenizer.pad_token_id] = -100
        labels["input_ids"] = temp.tolist()

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [7]:
train_data, test_data = data["train"], data["test"]

train_data = train_data.map(add_eos_to_examples, load_from_cache_file=False)
train_data = train_data.map(
    convert_to_features, batched=True, load_from_cache_file=False)

test_data = test_data.map(add_eos_to_examples, load_from_cache_file=False)
test_data = test_data.map(
    convert_to_features, batched=True, load_from_cache_file=False)

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]



Map:   0%|          | 0/107 [00:00<?, ? examples/s]

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

In [8]:
train_data[0]

{'index': 358,
 'question': 'When was the first banking institution established?',
 'context': 'on is based on people,banking knowledge and culture reaching back to 1820 whenthe first banking institution  the Carniola Savings Bank  was',
 'answers': {'answer_start': [69], 'text': ['1820']},
 'id': 143,
 'input_text': 'question: When was the first banking institution established?  context: on is based on people,banking knowledge and culture reaching back to 1820 whenthe first banking institution  the Carniola Savings Bank  was </s>',
 'target_text': '1820 </s>',
 'input_ids': [822,
  10,
  366,
  47,
  8,
  166,
  8175,
  6568,
  2127,
  58,
  2625,
  10,
  30,
  19,
  3,
  390,
  30,
  151,
  6,
  4739,
  53,
  1103,
  11,
  1543,
  7232,
  223,
  12,
  507,
  1755,
  116,
  532,
  166,
  8175,
  6568,
  8,
  1184,
  29,
  23578,
  19063,
  7,
  1925,
  47,
  3,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


### Fine tunning

In [9]:
import numpy as np
import torch
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments


name = model_name.split("/")[-1]
training_args = Seq2SeqTrainingArguments(
    output_dir=f"{name}-finetuned-NLB-QA-{year}",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=25,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False
)

# with this batch size the base model fits on a GPU with 8GB of memory
# training_args = Seq2SeqTrainingArguments(
#     output_dir = f"./models",
#     evaluation_strategy = "epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     weight_decay=0.01,
#     save_total_limit=3,
#     num_train_epochs=25,
#     predict_with_generate=True,
#     fp16=True,
#     push_to_hub=False
# )

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [10]:
trainer.train()
trainer.save_model(f"{name}-finetuned-NLB-QA-{year}")



  0%|          | 0/775 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 1.8605215549468994, 'eval_runtime': 0.547, 'eval_samples_per_second': 195.613, 'eval_steps_per_second': 25.594, 'epoch': 1.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 1.465985894203186, 'eval_runtime': 0.5327, 'eval_samples_per_second': 200.854, 'eval_steps_per_second': 26.28, 'epoch': 2.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 1.1761937141418457, 'eval_runtime': 0.5292, 'eval_samples_per_second': 202.197, 'eval_steps_per_second': 26.456, 'epoch': 3.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.8798961639404297, 'eval_runtime': 0.5279, 'eval_samples_per_second': 202.706, 'eval_steps_per_second': 26.522, 'epoch': 4.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5622873902320862, 'eval_runtime': 0.5421, 'eval_samples_per_second': 197.369, 'eval_steps_per_second': 25.824, 'epoch': 5.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.3936600983142853, 'eval_runtime': 0.5567, 'eval_samples_per_second': 192.221, 'eval_steps_per_second': 25.15, 'epoch': 6.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.3170520067214966, 'eval_runtime': 0.545, 'eval_samples_per_second': 196.327, 'eval_steps_per_second': 25.688, 'epoch': 7.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.26753655076026917, 'eval_runtime': 0.578, 'eval_samples_per_second': 185.121, 'eval_steps_per_second': 24.221, 'epoch': 8.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.2362598329782486, 'eval_runtime': 0.5486, 'eval_samples_per_second': 195.029, 'eval_steps_per_second': 25.518, 'epoch': 9.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.21938280761241913, 'eval_runtime': 0.5457, 'eval_samples_per_second': 196.095, 'eval_steps_per_second': 25.657, 'epoch': 10.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.2066628783941269, 'eval_runtime': 0.5408, 'eval_samples_per_second': 197.846, 'eval_steps_per_second': 25.886, 'epoch': 11.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.20078468322753906, 'eval_runtime': 0.5582, 'eval_samples_per_second': 191.678, 'eval_steps_per_second': 25.079, 'epoch': 12.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.19411641359329224, 'eval_runtime': 0.5386, 'eval_samples_per_second': 198.679, 'eval_steps_per_second': 25.995, 'epoch': 13.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.1893462985754013, 'eval_runtime': 0.577, 'eval_samples_per_second': 185.437, 'eval_steps_per_second': 24.263, 'epoch': 14.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.18639765679836273, 'eval_runtime': 0.55, 'eval_samples_per_second': 194.546, 'eval_steps_per_second': 25.455, 'epoch': 15.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.18122951686382294, 'eval_runtime': 0.5354, 'eval_samples_per_second': 199.847, 'eval_steps_per_second': 26.148, 'epoch': 16.0}
{'loss': 0.7045, 'learning_rate': 7.2000000000000005e-06, 'epoch': 16.13}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.17755372822284698, 'eval_runtime': 0.5571, 'eval_samples_per_second': 192.057, 'eval_steps_per_second': 25.129, 'epoch': 17.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.17584510147571564, 'eval_runtime': 0.5589, 'eval_samples_per_second': 191.443, 'eval_steps_per_second': 25.049, 'epoch': 18.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.17383332550525665, 'eval_runtime': 0.576, 'eval_samples_per_second': 185.764, 'eval_steps_per_second': 24.306, 'epoch': 19.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.17242954671382904, 'eval_runtime': 0.566, 'eval_samples_per_second': 189.046, 'eval_steps_per_second': 24.735, 'epoch': 20.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.17186328768730164, 'eval_runtime': 0.5619, 'eval_samples_per_second': 190.44, 'eval_steps_per_second': 24.917, 'epoch': 21.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.17117920517921448, 'eval_runtime': 0.5647, 'eval_samples_per_second': 189.482, 'eval_steps_per_second': 24.792, 'epoch': 22.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.17101497948169708, 'eval_runtime': 0.5546, 'eval_samples_per_second': 192.929, 'eval_steps_per_second': 25.243, 'epoch': 23.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.17068082094192505, 'eval_runtime': 0.554, 'eval_samples_per_second': 193.14, 'eval_steps_per_second': 25.271, 'epoch': 24.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.17061103880405426, 'eval_runtime': 0.5495, 'eval_samples_per_second': 194.717, 'eval_steps_per_second': 25.477, 'epoch': 25.0}
{'train_runtime': 124.3145, 'train_samples_per_second': 49.874, 'train_steps_per_second': 6.234, 'train_loss': 0.5058726156911543, 'epoch': 25.0}


### Evaluation

In [11]:
def get_answer(question, context):
    input_text = "question: %s  context: %s" % (question, context)
    features = tokenizer([input_text], return_tensors='pt')

    output = model.generate(
        input_ids=features['input_ids'], attention_mask=features['attention_mask'])

    return tokenizer.decode(output[0])

In [15]:
question = test_data[0]["question"]
context = test_data[0]["context"]
answer = test_data[0]["answers"]["text"][0]
print(f"Question: {question} \nContext: {context} \nAnswer: {answer}")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
print(f"Squad model answer: {get_answer(question, context)}")

tokenizer = AutoTokenizer.from_pretrained(
    f"./{name}-finetuned-NLB-QA-{year}", local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained(
    f"./{name}-finetuned-NLB-QA-{year}", local_files_only=True)
print(f"Our model answer: {get_answer(question, context)}")

Question: What are absolute emissions and emissions intensity in line with? 
Context: ly publishing absolute emissions and emissionsintensity in line with best practice and, within a year ofsetting targets, disclosing progress against a  
Answer: best practice


The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


Squad model answer: <pad> best practice</s>
Our model answer: <pad> best practice </s>


In [18]:
answers = [temp["answers"]["text"][0] for temp in test_data]

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
squad_answers = [get_answer(question, context) for question, context in zip(
    test_data["question"], test_data["context"])]

tokenizer = AutoTokenizer.from_pretrained(
    f"./{name}-finetuned-NLB-QA-{year}", local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained(
    f"./{name}-finetuned-NLB-QA-{year}", local_files_only=True)
our_answers = [get_answer(question, context) for question, context in zip(
    test_data["question"], test_data["context"])]

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


In [19]:
squad_answers

['<pad> best practice</s>',
 '<pad> Magda</s>',
 '<pad> more than 400</s>',
 '<pad> Sustainability Development unit</s>',
 '<pad> None</s>',
 '<pad> Top Employer award</s>',
 '<pad> annual performance evaluation</s>',
 '<pad> three levels</s>',
 '<pad> six</s>',
 '<pad>ESG risks and opportunities</s>',
 '<pad> six</s>',
 '<pad> 2022</s>',
 '<pad> 5</s>',
 '<pad> leading initiatives within theassociations</s>',
 '<pad> None</s>',
 '<pad> the EBRD</s>',
 '<pad> 100%</s>',
 '<pad> 10%</s>',
 '<pad> Relationship Manager</s>',
 '<pad> None</s>',
 '<pad> three</s>',
 '<pad> managerial, professional or young talent</s>',
 '<pad> None</s>',
 '<pad> 254</s>',
 '<pad> composition, performance, potential conflict ofinterest</s>',
 '<pad> 2022</s>',
 '<pad> 14.5%</s>',
 '<pad> 3</s>',
 '<pad> fromzero carbon energysourceoperationalcarbonfootprintreductiontrees</s>',
 '<pad> 6%</s>',
 '<pad> 2023</s>',
 '<pad> 50%</s>',
 '<pad> ILO standards</s>',
 '<pad> 2050</s>',
 '<pad> ransparentprocurement</s

In [20]:
our_answers

['<pad> best practice </s>',
 '<pad> Magda </s>',
 '<pad> 400 </s>',
 '<pad> Sustainability Development unit </s>',
 '<pad> Cyberse</s>',
 '<pad> Top Employer award </s>',
 '<pad> annual performance evaluation</s>',
 '<pad> three levels</s>',
 '<pad> six</s>',
 '<pad>ESG risks and opportunities </s>',
 '<pad> six </s>',
 '<pad> 2022 </s>',
 '<pad> 5 </s>',
 '<pad> leading initiatives within theassociations </s>',
 '<pad>2022 </s>',
 '<pad> EBRD </s>',
 '<pad> 100%</s>',
 '<pad> 10% </s>',
 '<pad> Relationship Manager </s>',
 '<pad> ESG regulatory developments </s>',
 '<pad> three</s>',
 '<pad> managerial, professional or young talent </s>',
 '<pad> ESMA </s>',
 '<pad> 254 </s>',
 '<pad> its composition, performance, potential conflict ofinterest of individual members of the Supervis</s>',
 '<pad> 2022 </s>',
 '<pad> 14.5%.</s>',
 '<pad> 3 </s>',
 '<pad> new sustainable carbon energysourceoperationalcarbonfootprintreductiontrees</s>',
 '<pad> 6%,</s>',
 '<pad> 2023 </s>',
 '<pad> 50% </

In [21]:
import evaluate
bertscore = evaluate.load("bertscore")

results = bertscore.compute(
    predictions=squad_answers, references=answers, lang="en")
# Embeddings bases evaluation
print(
    f"Squad\nF1: {np.array(results['f1']).mean()}, Precision: {np.array(results['precision']).mean()}, Recall: {np.array(results['recall']).mean()}")

results = bertscore.compute(predictions=our_answers,
                            references=answers, lang="en")
# Embeddings bases evaluation
print(
    f"Our model\nF1: {np.array(results['f1']).mean()}, Precision: {np.array(results['precision']).mean()}, Recall: {np.array(results['recall']).mean()}")

Squad
F1: 0.868855295894302, Precision: 0.8768738697622424, Recall: 0.8617404877582443
Our model
F1: 0.8580790713568714, Precision: 0.8443797642939559, Recall: 0.8726573970830329


In [22]:
bleu = evaluate.load("bleu")

results = bleu.compute(predictions=squad_answers, references=answers)
print(f"Squad\n{results}")
results = bleu.compute(predictions=our_answers, references=answers)
print(f"Our model\n{results}")

Squad
{'bleu': 0.1020040212593282, 'precisions': [0.18302658486707565, 0.1113662456946039, 0.08115183246073299, 0.0654490106544901], 'brevity_penalty': 1.0, 'length_ratio': 3.5434782608695654, 'translation_length': 978, 'reference_length': 276}
Our model
{'bleu': 0.11752063190477298, 'precisions': [0.21484375, 0.13304252998909488, 0.09382716049382717, 0.07112375533428165], 'brevity_penalty': 1.0, 'length_ratio': 3.710144927536232, 'translation_length': 1024, 'reference_length': 276}
