In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration

from datasets import *
import numpy as np

import torch
import ast


SEED = 42

### Load tokenizer and model

In [2]:
model_name = "mrm8488/t5-small-finetuned-squadv2" # small model
# model_name = "mrm8488/t5-base-finetuned-squadv2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


### Load the dataset

In [3]:
# Load the dataset from file and split it into train and test datasets
data = load_dataset('csv', data_files="../../data/clean/sustainability-report-2020-squad-format.csv", delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)

Found cached dataset csv (C:/Users/Luka/.cache/huggingface/datasets/csv/default-d8382661cd597e83/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-c261d5613d28d856.arrow and C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e61829c1e4a24b65.arrow


In [4]:
# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data["test"] = data["test"].map(lambda example: ast.literal_eval(example["answers"]))
data["test"] = data["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {"text": example["text"], "answer_start": example["answer_start"]}})
data["test"] = data["test"].remove_columns(["text", "answer_start"])

data["train"] = data["train"].map(lambda example: ast.literal_eval(example["answers"]))
data["train"] = data["train"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {"text": example["text"], "answer_start": example["answer_start"]}})
data["train"] = data["train"].remove_columns(["text", "answer_start"])

Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-0b15501cefb41ff7.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-6c4455904f60e079.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-65eb14b3b79cbed9.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-559c811f459458f4.arrow


In [5]:
data["train"][0]

{'question': 'How does the Bank prevent or manage cyber risks arising from the increased volume of work from home?',
 'context': 'nages cyber risks arising from the increased volumeof work from home through implemented measures, some of which areexplained below, namelyPromoting Freedom of Expr',
 'answers': {'answer_start': [69], 'text': ['through implemented measures']},
 'id': 132}

### Tokenize the dataset

In [6]:
# process the examples in input and target text format and the eos token at the end 
def add_eos_to_examples(example):
    example['input_text'] = 'question: %s  context: %s </s>' % (example['question'], example['context'])
    example['target_text'] = '%s </s>' % example['answers']['text'][0]
    return example

# tokenize the examples
def convert_to_features(examples):
    model_inputs = tokenizer(examples['input_text'], pad_to_max_length=True, max_length=512, truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['target_text'], pad_to_max_length=True, max_length=128, truncation=True)

    # encodings = {
    #     'input_ids': input_encodings['input_ids'], 
    #     'attention_mask': input_encodings['attention_mask'],
    #     'target_ids': target_encodings['input_ids'],
    #     'target_attention_mask': target_encodings['attention_mask']
    # }

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [7]:
train_data, test_data = data["train"], data["test"]

train_data = train_data.map(add_eos_to_examples, load_from_cache_file=False)
train_data = train_data.map(convert_to_features, batched=True, load_from_cache_file=False)

test_data = test_data.map(add_eos_to_examples, load_from_cache_file=False)
test_data = test_data.map(convert_to_features, batched=True, load_from_cache_file=False)

Map:   0%|          | 0/129 [00:00<?, ? examples/s]

Map:   0%|          | 0/129 [00:00<?, ? examples/s]



Map:   0%|          | 0/56 [00:00<?, ? examples/s]

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

In [8]:
train_data[0]

{'question': 'How does the Bank prevent or manage cyber risks arising from the increased volume of work from home?',
 'context': 'nages cyber risks arising from the increased volumeof work from home through implemented measures, some of which areexplained below, namelyPromoting Freedom of Expr',
 'answers': {'answer_start': [69], 'text': ['through implemented measures']},
 'id': 132,
 'input_text': 'question: How does the Bank prevent or manage cyber risks arising from the increased volume of work from home?  context: nages cyber risks arising from the increased volumeof work from home through implemented measures, some of which areexplained below, namelyPromoting Freedom of Expr </s>',
 'target_text': 'through implemented measures </s>',
 'input_ids': [822,
  10,
  571,
  405,
  8,
  1925,
  1709,
  42,
  1865,
  9738,
  5217,
  3,
  14739,
  45,
  8,
  1936,
  2908,
  13,
  161,
  45,
  234,
  58,
  2625,
  10,
  3,
  9761,
  7,
  9738,
  5217,
  3,
  14739,
  45,
  8,
  1936,
  2908

In [9]:
# set the tensor type and the columns which the dataset should return
# columns = ['input_ids', 'target_ids', 'attention_mask', 'target_attention_mask']
# train_data.set_format(type='torch', columns=columns)
# test_data.set_format(type='torch', columns=columns)
# torch.save(train_data, 'train_data.pt')
# torch.save(test_data, 'valid_data.pt')

### Fine tunning

In [10]:
# import dataclasses
# import logging
# import os
# import sys
# from dataclasses import dataclass, field
# from typing import Dict, List, Optional

import numpy as np
import torch

from transformers import T5ForConditionalGeneration, T5Tokenizer, EvalPrediction
from transformers import (
    DefaultDataCollator,
    DataCollatorForSeq2Seq,
    Trainer,
    Seq2SeqTrainer,
    TrainingArguments,
    Seq2SeqTrainingArguments,
    # set_seed,
)


# logger = logging.getLogger(__name__)

training_args = Seq2SeqTrainingArguments(
    output_dir = f"./models/{model_name}-finetuned-NLB-QA",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
)


# def main():
#     # See all possible arguments in src/transformers/training_args.py
#     # or by passing the --help flag to this script.
#     # We now keep distinct sets of args, for a cleaner separation of concerns.

#     # we will load the arguments from a json file, 
#     #make sure you save the arguments in at ./args.json
#     model_args, data_args, training_args = ...

#     if (
#         os.path.exists(training_args.output_dir)
#         and os.listdir(training_args.output_dir)
#         and training_args.do_train
#         and not training_args.overwrite_output_dir
#     ):
#         raise ValueError(
#             f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
#         )

#     # Setup logging
#     logging.basicConfig(
#         format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
#         datefmt="%m/%d/%Y %H:%M:%S",
#         level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
#     )
#     logger.warning(
#         "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
#         training_args.local_rank,
#         training_args.device,
#         training_args.n_gpu,
#         bool(training_args.local_rank != -1),
#         training_args.fp16,
#     )
#     logger.info("Training/evaluation parameters %s", training_args)

#     # Set seed
#     set_seed(training_args.seed)

#     # Load pretrained model and tokenizer
#     #
#     # Distributed training:
#     # The .from_pretrained methods guarantee that only one local process can concurrently
#     # download model & vocab.

#     # tokenizer = T5Tokenizer.from_pretrained(
#     #     model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
#     #     cache_dir=model_args.cache_dir,
#     # )
#     # model = T5ForConditionalGeneration.from_pretrained(
#     #     model_args.model_name_or_path,
#     #     cache_dir=model_args.cache_dir,
#     # )

#     # Get datasets
#     # print('loading data')
#     # train_dataset  = torch.load(data_args.train_file_path)
#     # valid_dataset = torch.load(data_args.valid_file_path)
#     # print('loading done')

#     # TODO
#     # just test the defualt data collator for now
#     # data_collator = T2TDataCollator()
#     # data_collator = DefaultDataCollator()
#     data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

#     # Initialize our Trainer
#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=train_data,
#         eval_dataset=test_data,
#         data_collator=data_collator,
#         # prediction_loss_only=True, 
#         tokenizer=tokenizer # TODO: why is this not in the original script
#     )

#     # Training
#     if training_args.do_train:
#         trainer.train(
#             model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
#         )
#         trainer.save_model()
#         # For convenience, we also re-save the tokenizer to the same directory,
#         # so that you can share your model easily on huggingface.co/models =)
#         if trainer.is_world_master():
#             tokenizer.save_pretrained(training_args.output_dir)

#     # Evaluation
#     results = {}
#     if training_args.do_eval and training_args.local_rank in [-1, 0]:
#         logger.info("*** Evaluate ***")

#         eval_output = trainer.evaluate()

#         output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
#         with open(output_eval_file, "w") as writer:
#             logger.info("***** Eval results *****")
#             for key in sorted(eval_output.keys()):
#                 logger.info("  %s = %s", key, str(eval_output[key]))
#                 writer.write("%s = %s\n" % (key, str(eval_output[key])))
    
#         results.update(eval_output)
    
#     return results

In [11]:
trainer.train()



  0%|          | 0/170 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 20.1995849609375, 'eval_runtime': 0.3888, 'eval_samples_per_second': 144.021, 'eval_steps_per_second': 18.003, 'epoch': 1.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 15.202910423278809, 'eval_runtime': 0.3772, 'eval_samples_per_second': 148.471, 'eval_steps_per_second': 18.559, 'epoch': 2.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 10.675898551940918, 'eval_runtime': 0.3816, 'eval_samples_per_second': 146.767, 'eval_steps_per_second': 18.346, 'epoch': 3.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 6.612198352813721, 'eval_runtime': 0.3757, 'eval_samples_per_second': 149.039, 'eval_steps_per_second': 18.63, 'epoch': 4.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 3.352875232696533, 'eval_runtime': 0.3786, 'eval_samples_per_second': 147.911, 'eval_steps_per_second': 18.489, 'epoch': 5.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 1.3994090557098389, 'eval_runtime': 0.3795, 'eval_samples_per_second': 147.545, 'eval_steps_per_second': 18.443, 'epoch': 6.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.5980759263038635, 'eval_runtime': 0.3786, 'eval_samples_per_second': 147.93, 'eval_steps_per_second': 18.491, 'epoch': 7.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.34437328577041626, 'eval_runtime': 0.3792, 'eval_samples_per_second': 147.671, 'eval_steps_per_second': 18.459, 'epoch': 8.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.27598103880882263, 'eval_runtime': 0.381, 'eval_samples_per_second': 146.998, 'eval_steps_per_second': 18.375, 'epoch': 9.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.2601204216480255, 'eval_runtime': 0.3831, 'eval_samples_per_second': 146.181, 'eval_steps_per_second': 18.273, 'epoch': 10.0}
{'train_runtime': 40.5121, 'train_samples_per_second': 31.842, 'train_steps_per_second': 4.196, 'train_loss': 6.508491067325368, 'epoch': 10.0}


TrainOutput(global_step=170, training_loss=6.508491067325368, metrics={'train_runtime': 40.5121, 'train_samples_per_second': 31.842, 'train_steps_per_second': 4.196, 'train_loss': 6.508491067325368, 'epoch': 10.0})

In [11]:
# import json

# args_dict = {
#     # "num_cores": 8,
#     # 'training_script': 'train_t5_squad.py',
#     "model_name_or_path": model_name,
#     "max_len": 512 ,
#     "target_max_len": 16,
#     "output_dir": './models',
#     "overwrite_output_dir": True,
#     "per_gpu_train_batch_size": 8,
#     "per_gpu_eval_batch_size": 8,
#     "gradient_accumulation_steps": 4,
#     "learning_rate": 1e-4,
#     "tpu_num_cores": 8,
#     "num_train_epochs": 4,
#     "do_train": True
# }

# with open('args.json', 'w') as f:
#     json.dump(args_dict, f)

In [12]:
# results = main()

05/06/2023 15:33:41 - INFO - __main__ -   Training/evaluation parameters Seq2SeqTrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
generation_max_length=None,
generation_num_beams=None,
gradient_accumulation_steps=4,
gradient_checkpointing=False,
greater_is_better=None,
group_by_lengt

  0%|          | 0/16 [00:00<?, ?it/s]

ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds