In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments, set_seed
from datasets import *
import numpy as np
import torch
import ast

In [2]:
# One seed for the whole notebook: handed to transformers' set_seed here and
# reused later for the train/test split.
SEED = 42
set_seed(SEED)

# Pretrained checkpoint that gets fine-tuned
# model_name = "mrm8488/t5-small-finetuned-squadv2" # small model
model_name = "mrm8488/t5-base-finetuned-squadv2"

# Root directory under which the fine-tuned model is stored
local_models_path = '../../data/models/T5'

# Dataset selection
# -----------------
# Report-derived datasets: any pairing of these years and types is valid
# year = 2020
# year = 2022
# dataset_type = "full"
# dataset_type = "smaller"

# Handwritten dataset: only this exact year/type pairing is valid
year = 2022
dataset_type = "handwritten"

Reference notebooks:

https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=KdmKlMkfcLa0

https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb

### Load tokenizer and model

In [3]:
# Load the pretrained seq2seq model and its matching tokenizer; both are
# resolved from the same checkpoint name (model_name).
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


### Load the dataset

In [4]:
# Load the dataset from file and split it into train and test datasets.
#
# Step 1: resolve the CSV file for the requested dataset type. Validation
# happens here, before any file is read, so a typo in dataset_type fails fast.
if dataset_type in ("full", "smaller"):
    data_file = f"../../data/clean/sustainability-report-{year}-squad-format.csv"
elif dataset_type == "handwritten":
    # This file name is fixed; `year` is not part of the path.
    data_file = "../../data/clean/QA_SR_2022_Expert-squad-format.csv"
else:
    # ValueError (a subclass of Exception) matches the error raised for an
    # unsupported model name later in the notebook.
    raise ValueError("Invalid dataset type")

# Step 2: one shared load + 70/30 split, seeded for reproducibility.
data = load_dataset('csv', data_files=data_file, delimiter=";", split='train').train_test_split(
    test_size=0.3, shuffle=True, seed=SEED)

# Step 3: the "smaller" variant trains on the first half of the train split;
# the test split is left untouched.
if dataset_type == "smaller":
    data["train"] = data["train"].select(range(len(data["train"]) // 2))

Found cached dataset csv (C:/Users/Luka/.cache/huggingface/datasets/csv/default-7e0bd965690926b0/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-7e0bd965690926b0\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-9bb74a86dcf2fec4.arrow and C:\Users\Luka\.cache\huggingface\datasets\csv\default-7e0bd965690926b0\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-3da9507e622ee022.arrow


In [5]:
# Reformat the train and test sets so that they adhere to the SQuAD format
# (reading from CSV loads the "answers" column as strings, not as objects).
def parse_answers(example):
    """Return the example's ``answers`` as a SQuAD-style dict.

    Args:
        example: A single dataset row. Its "answers" value is either the
            string form of a dict with "text" and "answer_start" keys (as read
            from the CSV) or that dict itself (if the row was already parsed).

    Returns:
        A dict with one key, "answers", holding
        ``{"text": ..., "answer_start": ...}``; `map` uses it to replace the
        "answers" column.

    Raises:
        ValueError / SyntaxError: propagated from ``ast.literal_eval`` when
            the string is not a valid Python literal.
    """
    answers = example["answers"]
    # Rows that were already converted pass through unchanged, so re-running
    # this cell does not fail on the second execution.
    if isinstance(answers, str):
        answers = ast.literal_eval(answers)
    return {"answers": {"text": answers["text"], "answer_start": answers["answer_start"]}}


# Mapping over the whole DatasetDict converts every split ("train" and "test")
# in one pass, without temporary "text"/"answer_start" columns.
data = data.map(parse_answers)

Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-7e0bd965690926b0\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-23042aa7dfb34243.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-7e0bd965690926b0\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-9d063bd963fc89f9.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-7e0bd965690926b0\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-0d917c3b398933a7.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-7e0bd965690926b0\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-4d188acf2b5e0de5.arrow


In [6]:
# Show the first training example to check that "answers" is now a dict
# with "text" and "answer_start" entries.
data["train"][0]

{'question': 'How many new green loan offers did NLB introduce for legal entities? ',
 'context': 'NLB introduced 3 new green loan offers for legal entities (Micro and SME):\n• NLB Green partner loan to finance the construction of turnkey solar power plants\n• NLB Green Loan for investments in the energy efficiency of business buildings\n• NLB Green Loan for reducing the carbon footprint, that cover the area of energy efficiency, circular\neconomy and sustainable agriculture',
 'answers': {'answer_start': [1],
  'text': ['NLB introduced 3 new green loan offers for legal entities (Micro and SME)']}}

### Tokenize the dataset

In [7]:
# Build the text-to-text input/target pair for one example, each ending with
# the "</s>" end-of-sequence marker.
def add_eos_to_examples(example):
    """Attach 'input_text' and 'target_text' fields to a single example.

    The input is the prompt "question: <question>  context: <context> </s>";
    the target is the first answer text followed by " </s>". The example is
    modified in place and also returned.

    NOTE(review): the tokenizer may append "</s>" on its own, which would
    yield a duplicated end-of-sequence token - confirm against the installed
    tokenizer before changing the format.
    """
    prompt = 'question: %s  context: %s </s>' % (
        example['question'], example['context'])
    example['input_text'] = prompt

    # Only the first listed answer is used as the generation target.
    first_answer = example['answers']['text'][0]
    example['target_text'] = '%s </s>' % first_answer
    return example

# tokenize the examples
def convert_to_features(examples, max_input_length=512, max_target_length=128):
    """Tokenize a batch into fixed-length model inputs and labels.

    Args:
        examples: Batch of examples (column name -> list of values) that
            contains the 'input_text' and 'target_text' columns.
        max_input_length: Length every encoded input is padded/truncated to.
        max_target_length: Length every encoded target is padded/truncated to.

    Returns:
        The tokenizer output for the inputs with an added "labels" entry: the
        target token ids, where every padding id is replaced by -100.
    """
    # padding="max_length" is the current spelling of the deprecated
    # pad_to_max_length=True.
    model_inputs = tokenizer(
        examples['input_text'], padding="max_length", max_length=max_input_length, truncation=True)

    # `text_target` replaces the deprecated `as_target_tokenizer()` context
    # manager for encoding the targets.
    labels = tokenizer(
        text_target=examples['target_text'], padding="max_length", max_length=max_target_length, truncation=True)

    # Swap padding ids for -100 so padded label positions are not scored
    # (-100 is the ignore index of PyTorch's cross-entropy loss).
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

In [8]:
def _to_model_features(split):
    """Add the prompt/target text to a split, then tokenize it in batches."""
    # load_from_cache_file=False makes both steps recompute instead of reading
    # previously cached results.
    with_text = split.map(add_eos_to_examples, load_from_cache_file=False)
    return with_text.map(
        convert_to_features, batched=True, load_from_cache_file=False)


# Same two-step preprocessing for both splits, train first.
train_data = _to_model_features(data["train"])
test_data = _to_model_features(data["test"])

Map:   0%|          | 0/43 [00:00<?, ? examples/s]

Map:   0%|          | 0/43 [00:00<?, ? examples/s]



Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

In [9]:
# Show the first preprocessed training example (original fields plus the
# generated input/target text and tokenized features).
train_data[0]

{'question': 'How many new green loan offers did NLB introduce for legal entities? ',
 'context': 'NLB introduced 3 new green loan offers for legal entities (Micro and SME):\n• NLB Green partner loan to finance the construction of turnkey solar power plants\n• NLB Green Loan for investments in the energy efficiency of business buildings\n• NLB Green Loan for reducing the carbon footprint, that cover the area of energy efficiency, circular\neconomy and sustainable agriculture',
 'answers': {'answer_start': [1],
  'text': ['NLB introduced 3 new green loan offers for legal entities (Micro and SME)']},
 'input_text': 'question: How many new green loan offers did NLB introduce for legal entities?   context: NLB introduced 3 new green loan offers for legal entities (Micro and SME):\n• NLB Green partner loan to finance the construction of turnkey solar power plants\n• NLB Green Loan for investments in the energy efficiency of business buildings\n• NLB Green Loan for reducing the carbon footpr

### Fine-tuning

In [10]:
# Output location encodes the base checkpoint, the year and the dataset type.
name = model_name.split("/")[-1]
output_dir = f"{local_models_path}/{name}-finetuned-NLB-QA-{year}-{dataset_type}"

# The two supported model sizes differ only in batch size, so pick that first
# and build the training arguments once.
if "small" in model_name:
    batch_size = 8
elif "base" in model_name:
    # with this batch size the base model fits on a GPU with 8GB of memory
    batch_size = 4
else:
    raise ValueError("Model name not supported")

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    # Evaluate and checkpoint once per epoch; the two strategies are kept
    # identical because load_best_model_at_end is enabled.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=25,
    predict_with_generate=True,
    # fp16 mixed precision needs a CUDA device; fall back to full precision
    # instead of failing when no GPU is available.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    load_best_model_at_end=True
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# NOTE(review): eval_dataset is the held-out test split, so with
# load_best_model_at_end=True the checkpoint is selected on test data -
# consider a separate validation split.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [11]:
# Run fine-tuning with the trainer configured earlier, then write the
# resulting model to output_dir.
trainer.train()
trainer.save_model(output_dir)



  0%|          | 0/275 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.9833294749259949, 'eval_runtime': 0.3502, 'eval_samples_per_second': 54.255, 'eval_steps_per_second': 14.278, 'epoch': 1.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.7590473294258118, 'eval_runtime': 0.3541, 'eval_samples_per_second': 53.656, 'eval_steps_per_second': 14.12, 'epoch': 2.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.62278813123703, 'eval_runtime': 0.3483, 'eval_samples_per_second': 54.551, 'eval_steps_per_second': 14.355, 'epoch': 3.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.5308075547218323, 'eval_runtime': 0.349, 'eval_samples_per_second': 54.438, 'eval_steps_per_second': 14.326, 'epoch': 4.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.4693298935890198, 'eval_runtime': 0.347, 'eval_samples_per_second': 54.749, 'eval_steps_per_second': 14.408, 'epoch': 5.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.4325083792209625, 'eval_runtime': 0.3496, 'eval_samples_per_second': 54.351, 'eval_steps_per_second': 14.303, 'epoch': 6.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.4057804048061371, 'eval_runtime': 0.351, 'eval_samples_per_second': 54.133, 'eval_steps_per_second': 14.246, 'epoch': 7.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.3878600299358368, 'eval_runtime': 0.353, 'eval_samples_per_second': 53.826, 'eval_steps_per_second': 14.165, 'epoch': 8.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.37303969264030457, 'eval_runtime': 0.3419, 'eval_samples_per_second': 55.569, 'eval_steps_per_second': 14.623, 'epoch': 9.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.36076536774635315, 'eval_runtime': 0.3427, 'eval_samples_per_second': 55.439, 'eval_steps_per_second': 14.589, 'epoch': 10.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.3502790629863739, 'eval_runtime': 0.3518, 'eval_samples_per_second': 54.008, 'eval_steps_per_second': 14.213, 'epoch': 11.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.3469579517841339, 'eval_runtime': 0.3553, 'eval_samples_per_second': 53.47, 'eval_steps_per_second': 14.071, 'epoch': 12.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.34655535221099854, 'eval_runtime': 0.4015, 'eval_samples_per_second': 47.318, 'eval_steps_per_second': 12.452, 'epoch': 13.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.3442915976047516, 'eval_runtime': 0.3578, 'eval_samples_per_second': 53.099, 'eval_steps_per_second': 13.973, 'epoch': 14.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.3450705409049988, 'eval_runtime': 0.3653, 'eval_samples_per_second': 52.006, 'eval_steps_per_second': 13.686, 'epoch': 15.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.3506848216056824, 'eval_runtime': 0.3566, 'eval_samples_per_second': 53.287, 'eval_steps_per_second': 14.023, 'epoch': 16.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.3552534282207489, 'eval_runtime': 0.3532, 'eval_samples_per_second': 53.792, 'eval_steps_per_second': 14.156, 'epoch': 17.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.35668933391571045, 'eval_runtime': 0.3582, 'eval_samples_per_second': 53.039, 'eval_steps_per_second': 13.958, 'epoch': 18.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.3588699698448181, 'eval_runtime': 0.36, 'eval_samples_per_second': 52.78, 'eval_steps_per_second': 13.889, 'epoch': 19.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.36043480038642883, 'eval_runtime': 0.3637, 'eval_samples_per_second': 52.241, 'eval_steps_per_second': 13.748, 'epoch': 20.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.3609529137611389, 'eval_runtime': 0.3606, 'eval_samples_per_second': 52.696, 'eval_steps_per_second': 13.867, 'epoch': 21.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.36137065291404724, 'eval_runtime': 0.3546, 'eval_samples_per_second': 53.587, 'eval_steps_per_second': 14.102, 'epoch': 22.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.3612312376499176, 'eval_runtime': 0.3564, 'eval_samples_per_second': 53.31, 'eval_steps_per_second': 14.029, 'epoch': 23.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.36143380403518677, 'eval_runtime': 0.3654, 'eval_samples_per_second': 51.998, 'eval_steps_per_second': 13.684, 'epoch': 24.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.3610367476940155, 'eval_runtime': 0.3609, 'eval_samples_per_second': 52.642, 'eval_steps_per_second': 13.853, 'epoch': 25.0}
{'train_runtime': 162.3821, 'train_samples_per_second': 6.62, 'train_steps_per_second': 1.694, 'train_loss': 0.26920485063032673, 'epoch': 25.0}
