In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration

from datasets import *
import numpy as np

import torch
import ast


# Seed handed to the shuffled train/test split so the partition is reproducible.
SEED = 42

Reference notebooks:

https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=KdmKlMkfcLa0

https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb

### Load tokenizer and model

In [None]:
# Hub checkpoint that serves both as the starting point for fine-tuning and
# as the baseline in the evaluation cells.
model_name = "mrm8488/t5-small-finetuned-squadv2" # small model
# Base-sized alternative (swap the two assignments to use it):
# model_name = "mrm8488/t5-base-finetuned-squadv2"

# Tokenizer and seq2seq model for the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

### Load the dataset

In [None]:
# Read the ';'-separated CSV as one dataset, then carve it into a shuffled
# 70/30 train/test partition seeded with SEED.
csv_dataset = load_dataset(
    'csv',
    data_files="../../data/clean/sustainability-report-2020-squad-format.csv",
    delimiter=";",
    split='train',
)
data = csv_dataset.train_test_split(test_size=0.3, shuffle=True, seed=SEED)

In [None]:
# Reading from CSV loads the "answers" column as a string (the textual form of
# a dict) instead of a nested object, so it has to be parsed back before the
# rows adhere to the SQuAD format.
def parse_answers(example):
    """Return the SQuAD-style ``answers`` field for a single example.

    Args:
        example: One dataset row. Its ``"answers"`` value is either the
            string form of a dict with ``"text"`` and ``"answer_start"`` keys
            (as read from the CSV) or that dict itself.

    Returns:
        ``{"answers": {"text": ..., "answer_start": ...}}``; ``Dataset.map``
        uses it to overwrite the ``"answers"`` column while leaving every
        other column untouched.
    """
    answers = example["answers"]
    # Only parse when the value is still a string, so re-running this cell on
    # already converted data is a no-op instead of a literal_eval error.
    if isinstance(answers, str):
        answers = ast.literal_eval(answers)
    return {"answers": {"text": answers["text"], "answer_start": answers["answer_start"]}}


# Same conversion for both splits; no temporary columns are created.
for split in ("train", "test"):
    data[split] = data[split].map(parse_answers)

In [None]:
# Show the first training example to check the reformatted "answers" field.
data["train"][0]

### Tokenize the dataset

In [None]:
def add_eos_to_examples(example):
    """Attach the text-to-text prompt and target strings to one example.

    Writes two new keys into ``example`` and returns that same mapping:

    * ``input_text``  - ``"question: <question>  context: <context> </s>"``
    * ``target_text`` - the first entry of ``answers["text"]`` followed by
      ``" </s>"``

    The literal ``</s>`` is the end-of-sequence marker appended to both strings.
    """
    prompt = f"question: {example['question']}  context: {example['context']} </s>"
    example['input_text'] = prompt

    first_answer = example['answers']['text'][0]
    example['target_text'] = f"{first_answer} </s>"
    return example

def convert_to_features(examples, max_input_length=512, max_target_length=128):
    """Tokenize a batch of examples into model inputs and training labels.

    Relies on the notebook-level ``tokenizer``.

    Args:
        examples: Batch (as passed by ``Dataset.map(..., batched=True)``)
            providing ``input_text`` and ``target_text`` columns.
        max_input_length: Length the tokenized inputs are padded/truncated to.
        max_target_length: Length the tokenized targets are padded/truncated to.

    Returns:
        The tokenizer output for the inputs, extended with a ``labels`` entry
        holding the target token ids in which every padding id is replaced
        by -100.
    """
    # `padding="max_length"` is the supported spelling of the deprecated
    # `pad_to_max_length=True` flag.
    model_inputs = tokenizer(
        examples['input_text'],
        padding="max_length",
        max_length=max_input_length,
        truncation=True,
    )

    # NOTE(review): `as_target_tokenizer` is reported as deprecated by recent
    # transformers releases in favour of `tokenizer(text_target=...)`; it is
    # kept here so the cell still runs on older installations - confirm the
    # pinned version before switching.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples['target_text'],
            padding="max_length",
            max_length=max_target_length,
            truncation=True,
        )

    # Mask padding positions with -100 so they do not contribute to the loss
    # (-100 is the label value the loss ignores by default).
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

In [None]:
# Build the prompt/target strings first, then tokenize in batches. Caching is
# disabled so edits to either helper always take effect on re-run.
train_data = (
    data["train"]
    .map(add_eos_to_examples, load_from_cache_file=False)
    .map(convert_to_features, batched=True, load_from_cache_file=False)
)

test_data = (
    data["test"]
    .map(add_eos_to_examples, load_from_cache_file=False)
    .map(convert_to_features, batched=True, load_from_cache_file=False)
)

In [None]:
# Show the first tokenized training example (text columns plus the features
# added by convert_to_features).
train_data[0]

### Fine-tuning

In [None]:
import numpy as np
import torch
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments


# Short checkpoint name (the part after the last "/"); reused for the
# directory the fine-tuned model is saved to and loaded from.
name = model_name.split("/")[-1]

# Per-device batch size for both training and evaluation.
# With a batch size of 4 the base model fits on a GPU with 8GB of memory.
BATCH_SIZE = 8

training_args = Seq2SeqTrainingArguments(
    output_dir="./models",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=25,
    predict_with_generate=True,
    # Mixed precision requires a CUDA device; enabling it unconditionally
    # makes the argument validation fail on CPU-only machines.
    fp16=torch.cuda.is_available(),
    # Use the notebook-wide seed for training as well as for the data split.
    seed=SEED,
    push_to_hub=False,
)

# Pads each batch dynamically; receives the model so it can prepare the
# decoder inputs from the labels.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning, then save the result to the directory that the evaluation
# cells load the fine-tuned checkpoint from.
trainer.train()
trainer.save_model(f"./models/{name}-finetuned-NLB-QA")

### Evaluation

In [None]:
def get_answer(question, context):
    """Generate an answer to ``question`` given ``context``.

    Uses whichever ``tokenizer`` and ``model`` are currently bound at notebook
    level, so callers select the checkpoint by rebinding those two names.

    Args:
        question: Question text.
        context: Passage the answer should be taken from.

    Returns:
        The decoded answer string, without special tokens.
    """
    input_text = "question: %s  context: %s" % (question, context)
    # Truncate to the same 512-token limit used when tokenizing the training
    # data, so over-long contexts are handled the same way as in training.
    features = tokenizer([input_text], return_tensors='pt', truncation=True, max_length=512)
    # Keep the inputs on the model's device (the model may live on the GPU,
    # e.g. right after training).
    features = features.to(model.device)

    output = model.generate(input_ids=features['input_ids'], attention_mask=features['attention_mask'])

    # Drop special tokens such as padding / end-of-sequence markers: they are
    # not part of the answer and would distort the text-similarity metrics.
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Use the first test example as a qualitative spot check.
sample = test_data[0]
question, context = sample["question"], sample["context"]
answer = sample["answers"]["text"][0]
print(f"Question: {question} \nContext: {context} \nAnswer: {answer}")

# get_answer reads the notebook-level tokenizer/model, so both names are
# rebound for each checkpoint: first the original SQuAD model, then the
# locally saved fine-tuned one.
finetuned_dir = f"./models/{name}-finetuned-NLB-QA"
for prefix, checkpoint, load_options in (
    ("Squad model answer: ", model_name, {}),
    ("Our model answer: ", finetuned_dir, {"local_files_only": True}),
):
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, **load_options)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, **load_options)
    print(f"{prefix}{get_answer(question, context)}")

In [None]:
# Reference answers: the first answer text of every test example.
answers = [example["answers"]["text"][0] for example in test_data]

questions = test_data["question"]
contexts = test_data["context"]
finetuned_dir = f"./models/{name}-finetuned-NLB-QA"

# Baseline predictions from the original SQuAD checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
squad_answers = [get_answer(q, c) for q, c in zip(questions, contexts)]

# Predictions from the locally saved fine-tuned checkpoint.
tokenizer = AutoTokenizer.from_pretrained(finetuned_dir, local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained(finetuned_dir, local_files_only=True)
our_answers = [get_answer(q, c) for q, c in zip(questions, contexts)]

In [None]:
# Predictions of the original SQuAD checkpoint on the test questions.
squad_answers

In [None]:
# Predictions of the fine-tuned checkpoint on the same test questions.
our_answers

In [None]:
import evaluate
bertscore = evaluate.load("bertscore")

# Embedding-based evaluation: report the mean F1 / precision / recall of each
# model's predictions against the reference answers.
for label, predictions in (("Squad", squad_answers), ("Our model", our_answers)):
    results = bertscore.compute(predictions=predictions, references=answers, lang="en")
    f1, precision, recall = (
        np.array(results[key]).mean() for key in ("f1", "precision", "recall")
    )
    print(f"{label}\nF1: {f1}, Precision: {precision}, Recall: {recall}")

In [None]:
bleu = evaluate.load("bleu")

# N-gram overlap evaluation of each model's predictions against the
# reference answers.
for label, predictions in (("Squad", squad_answers), ("Our model", our_answers)):
    results = bleu.compute(predictions=predictions, references=answers)
    print(f"{label}\n{results}")