In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments, set_seed
from datasets import *
import numpy as np
import torch
import ast

torch.cuda.empty_cache()

In [2]:
SEED = 42
set_seed(SEED)

# any combination of these years and dataset types can be used
# year = 2020
# year = 2022
# dataset_type = "full"
# dataset_type = "smaller"

# only combination of these years and dataset types can be used
# year = 2022
# dataset_type = "handwritten"

# only combination of these years and dataset types can be used
year = 2042 # idk some random number for file names
dataset_type = "full-combined"
# dataset_type = "smaller-combined"

local_models_path = '../../data/models/T5'

# model_name = "mrm8488/t5-small-finetuned-squadv2" # small model
model_name = "mrm8488/t5-base-finetuned-squadv2"

Reference notebooks:

https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=KdmKlMkfcLa0

https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb

### Load tokenizer and model

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


### Load the dataset

In [4]:
# Load the dataset from file and split it into train and test datasets
if dataset_type == "full":
    data = load_dataset('csv', data_files=f"../../data/clean/sustainability-report-{year}-squad-format.csv",
                        delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)
elif dataset_type == "smaller":
    data = load_dataset('csv', data_files=f"../../data/clean/sustainability-report-{year}-squad-format.csv",
                        delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)
    data["train"] = data["train"].select(range(len(data["train"]) // 2))
elif dataset_type == "full-combined":
    data = load_dataset('csv', data_files="../../data/clean/sustainability-report-2042-squad-format.csv",
                        delimiter=";", split="train").train_test_split(test_size=0.3, shuffle=True, seed=SEED)
elif dataset_type == "smaller-combined":
    data = load_dataset('csv', data_files="../../data/clean/sustainability-report-2042-squad-format.csv",
                        delimiter=";", split="train").train_test_split(test_size=0.3, shuffle=True, seed=SEED)
    data["train"] = data["train"].select(range(len(data["train"]) // 2))
elif dataset_type == "handwritten":
    data = load_dataset('csv', data_files=f"../../data/clean/QA_SR_2022_Expert-squad-format.csv",
                        delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)
else:
    raise Exception("Invalid dataset type")

Found cached dataset csv (C:/Users/rjutr/.cache/huggingface/datasets/csv/default-dc46deea403e6d7a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-dc46deea403e6d7a\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-8c4226c3883b9f9f.arrow and C:\Users\rjutr\.cache\huggingface\datasets\csv\default-dc46deea403e6d7a\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-24d9770f6403af3d.arrow


In [5]:
# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data["test"] = data["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
data["test"] = data["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                "text": example["text"], "answer_start": example["answer_start"]}})
# replace all "\n" with " " in the context, answers and questions
data["test"] = data["test"].map(lambda example: {"question": example["question"].replace("\n", " "), "context": example["context"].replace("\n", " "), "answers": {
                                "text": [example["answers"]["text"][0].replace("\n", " ")], "answer_start": example["answers"]["answer_start"]}})
data["test"] = data["test"].remove_columns(["text", "answer_start"])

data["train"] = data["train"].map(
    lambda example: ast.literal_eval(example["answers"]))
data["train"] = data["train"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                  "text": example["text"], "answer_start": example["answer_start"]}})
# replace all "\n" with " " in the context, answers and questions
data["train"] = data["train"].map(lambda example: {"question": example["question"].replace("\n", " "), "context": example["context"].replace("\n", " "), "answers": {
                                "text": [example["answers"]["text"][0].replace("\n", " ")], "answer_start": example["answers"]["answer_start"]}})
data["train"] = data["train"].remove_columns(["text", "answer_start"])

Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-dc46deea403e6d7a\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-47e77242f5a0d387.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-dc46deea403e6d7a\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-cb788cd2e517f2b5.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-dc46deea403e6d7a\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-4957f1c2e4c5cb13.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-dc46deea403e6d7a\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-80903bafe3ee6c44.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-dc46deea403e6d7a\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853b

In [6]:
data["train"][0]

{'question': 'What are the three types of talent we are implementing?',
 'context': 'ting a common talent management strategy, payingspecial attention to managerial, professional or young talentand successors in management positions. ',
 'answers': {'answer_start': [69],
  'text': ['managerial, professional or young talent']},
 'id': 327}

### Tokenize the dataset

In [7]:
# process the examples in input and target text format and the eos token at the end
def add_eos_to_examples(example):
    example['input_text'] = 'question: %s  context: %s </s>' % (
        example['question'], example['context'])
    example['target_text'] = '%s </s>' % example['answers']['text'][0]
    return example

# tokenize the examples


def convert_to_features(examples):
    model_inputs = tokenizer(
        examples['input_text'], pad_to_max_length=True, max_length=512, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples['target_text'], pad_to_max_length=True, max_length=128, truncation=True)
        temp = np.array(labels["input_ids"])
        temp[temp == tokenizer.pad_token_id] = -100
        labels["input_ids"] = temp.tolist()

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
train_data, test_data = data["train"], data["test"]

train_data = train_data.map(add_eos_to_examples, load_from_cache_file=False)
train_data = train_data.map(
    convert_to_features, batched=True, load_from_cache_file=False)

test_data = test_data.map(add_eos_to_examples, load_from_cache_file=False)
test_data = test_data.map(
    convert_to_features, batched=True, load_from_cache_file=False)

Map:   0%|          | 0/189 [00:00<?, ? examples/s]

Map:   0%|          | 0/189 [00:00<?, ? examples/s]



Map:   0%|          | 0/162 [00:00<?, ? examples/s]

Map:   0%|          | 0/162 [00:00<?, ? examples/s]

In [9]:
train_data[0]

{'question': 'What are the three types of talent we are implementing?',
 'context': 'ting a common talent management strategy, payingspecial attention to managerial, professional or young talentand successors in management positions. ',
 'answers': {'answer_start': [69],
  'text': ['managerial, professional or young talent']},
 'id': 327,
 'input_text': 'question: What are the three types of talent we are implementing?  context: ting a common talent management strategy, payingspecial attention to managerial, professional or young talentand successors in management positions.  </s>',
 'target_text': 'managerial, professional or young talent </s>',
 'input_ids': [822,
  10,
  363,
  33,
  8,
  386,
  1308,
  13,
  3683,
  62,
  33,
  3,
  10311,
  58,
  2625,
  10,
  3,
  1222,
  3,
  9,
  1017,
  3683,
  758,
  1998,
  6,
  3788,
  17434,
  1388,
  12,
  29957,
  6,
  771,
  42,
  1021,
  3683,
  232,
  22261,
  7,
  16,
  758,
  4655,
  5,
  3,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
 

### Fine tunning

In [10]:
name = model_name.split("/")[-1]
output_dir = f"{local_models_path}/{name}-finetuned-NLB-QA-{year}-{dataset_type}"
if "small" in model_name:
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=25,
        predict_with_generate=True,
        fp16=True,
        push_to_hub=False,
        load_best_model_at_end=True
    )
elif "base" in model_name:
    # with this batch size the base model fits on a GPU with 8GB of memory
    training_args = Seq2SeqTrainingArguments(
        output_dir = output_dir,
        evaluation_strategy = "epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=25,
        predict_with_generate=True,
        fp16=True,
        push_to_hub=False,
        load_best_model_at_end=True
    )
else:
    raise ValueError("Model name not supported")

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [11]:
trainer.train()
trainer.save_model(output_dir)



  0%|          | 0/1200 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.8727893829345703, 'eval_runtime': 2.9235, 'eval_samples_per_second': 55.413, 'eval_steps_per_second': 14.024, 'epoch': 1.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.28397178649902344, 'eval_runtime': 4.7129, 'eval_samples_per_second': 34.373, 'eval_steps_per_second': 8.699, 'epoch': 2.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.25794029235839844, 'eval_runtime': 3.916, 'eval_samples_per_second': 41.369, 'eval_steps_per_second': 10.47, 'epoch': 3.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.25176313519477844, 'eval_runtime': 3.5277, 'eval_samples_per_second': 45.922, 'eval_steps_per_second': 11.622, 'epoch': 4.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.2521061897277832, 'eval_runtime': 3.1846, 'eval_samples_per_second': 50.869, 'eval_steps_per_second': 12.874, 'epoch': 5.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.23818056285381317, 'eval_runtime': 3.8573, 'eval_samples_per_second': 41.998, 'eval_steps_per_second': 10.629, 'epoch': 6.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.2422378957271576, 'eval_runtime': 3.2632, 'eval_samples_per_second': 49.645, 'eval_steps_per_second': 12.564, 'epoch': 7.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.24052278697490692, 'eval_runtime': 3.8926, 'eval_samples_per_second': 41.617, 'eval_steps_per_second': 10.533, 'epoch': 8.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.23568932712078094, 'eval_runtime': 3.5813, 'eval_samples_per_second': 45.236, 'eval_steps_per_second': 11.449, 'epoch': 9.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.23433895409107208, 'eval_runtime': 8.2862, 'eval_samples_per_second': 19.551, 'eval_steps_per_second': 4.948, 'epoch': 10.0}
{'loss': 0.3099, 'learning_rate': 1.18e-05, 'epoch': 10.42}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.233358696103096, 'eval_runtime': 11.1372, 'eval_samples_per_second': 14.546, 'eval_steps_per_second': 3.681, 'epoch': 11.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.23610953986644745, 'eval_runtime': 13.2611, 'eval_samples_per_second': 12.216, 'eval_steps_per_second': 3.092, 'epoch': 12.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.23389215767383575, 'eval_runtime': 8.7524, 'eval_samples_per_second': 18.509, 'eval_steps_per_second': 4.684, 'epoch': 13.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.22835618257522583, 'eval_runtime': 8.1611, 'eval_samples_per_second': 19.85, 'eval_steps_per_second': 5.024, 'epoch': 14.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.22783802449703217, 'eval_runtime': 8.9405, 'eval_samples_per_second': 18.12, 'eval_steps_per_second': 4.586, 'epoch': 15.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.23263968527317047, 'eval_runtime': 8.5505, 'eval_samples_per_second': 18.946, 'eval_steps_per_second': 4.795, 'epoch': 16.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.23932774364948273, 'eval_runtime': 9.115, 'eval_samples_per_second': 17.773, 'eval_steps_per_second': 4.498, 'epoch': 17.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.24685056507587433, 'eval_runtime': 8.4411, 'eval_samples_per_second': 19.192, 'eval_steps_per_second': 4.857, 'epoch': 18.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.24762453138828278, 'eval_runtime': 9.095, 'eval_samples_per_second': 17.812, 'eval_steps_per_second': 4.508, 'epoch': 19.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.2499370127916336, 'eval_runtime': 8.062, 'eval_samples_per_second': 20.094, 'eval_steps_per_second': 5.086, 'epoch': 20.0}
{'loss': 0.0117, 'learning_rate': 3.4833333333333336e-06, 'epoch': 20.83}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.24866707623004913, 'eval_runtime': 8.2159, 'eval_samples_per_second': 19.718, 'eval_steps_per_second': 4.99, 'epoch': 21.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.2467965930700302, 'eval_runtime': 7.6427, 'eval_samples_per_second': 21.197, 'eval_steps_per_second': 5.365, 'epoch': 22.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.2440795600414276, 'eval_runtime': 18.1662, 'eval_samples_per_second': 8.918, 'eval_steps_per_second': 2.257, 'epoch': 23.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.24404074251651764, 'eval_runtime': 8.9515, 'eval_samples_per_second': 18.098, 'eval_steps_per_second': 4.58, 'epoch': 24.0}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 0.24345065653324127, 'eval_runtime': 9.8538, 'eval_samples_per_second': 16.44, 'eval_steps_per_second': 4.161, 'epoch': 25.0}
{'train_runtime': 2928.1856, 'train_samples_per_second': 1.614, 'train_steps_per_second': 0.41, 'train_loss': 0.13519883126020432, 'epoch': 25.0}
