# Reference search paper


[Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/pdf/1910.10683)

# Dependencies installation and required libraries


In [None]:
!pip install pytesseract transformers datasets rouge-score nltk py7zr evaluate --upgrade
!sudo apt-get install git-lfs --yes   # for pushing model and logs to the hugging face hub TO COMMENT
!pip install transformers[torch] accelerate -U

Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Collecting transformers
  Downloading transformers-4.42.3-py3-none-any.whl (9.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py7zr
  Downloading py7zr-0.21.1-py3-none-any.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint
import numpy as np

from huggingface_hub import HfFolder

from datasets import load_dataset, concatenate_datasets
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from tqdm import tqdm

import evaluate
import nltk
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
!nvidia-smi

Sun Jul  7 08:29:15 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8               9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Setting Hugging Face's Access Token

Useful for pushing the model/dataset to Hugging Face's platform

In [None]:
# from google.colab import drive
# import os
# drive.mount('/content/drive')

# os.makedirs('/content/drive/MyDrive/NLP_Project')

# # Save your token in a text file on your Google Drive
# with open('/content/drive/MyDrive/NLP_Project/hf_token.txt', 'w') as f:
#     f.write("INSERT YOUR TOKEN HERE")  # replace with your access token

# # Read the token from the file and set it as an environment variable
# with open('/content/drive/MyDrive/NLP_Project/hf_token.txt', 'r') as f:
#     token = f.read().strip()

# os.environ['HF_TOKEN'] = token

Mounted at /content/drive


# Loading Samsum dataset

The dataset that will be used to fine-tuned our model is SAMSum dataset, that contains messenger-like conversations with summaries in English.

*(Analysis of Samsum dataset can be seen in a separate file)*

In [None]:
dataset_id = "samsum"

dataset = load_dataset(dataset_id, trust_remote_code=True)

# Removal of missing null row in train
train_data_list = dataset['train']
train_dataset_filtered = dataset['train'].filter(lambda example: example['id'] != 13828807)
dataset['train'] = train_dataset_filtered

print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")
print(f"Validation dataset size: {len(dataset['validation'])}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14732 [00:00<?, ? examples/s]

Train dataset size: 14732
Test dataset size: 819
Validation dataset size: 818


# Initialize the model

In [None]:
model_id="google/flan-t5-base"

# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
# Load FLAN-t5-base from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

# Testing the loaded model into the test data




Here we try a first experiment: test the chat summarization task using the pretrained model, i.e. not fine-tuned, on chat summarization task, this is done just to look at the result that the model give.

In [None]:
sample = dataset['test'][0]
print(f"Dialogue: \n{sample['dialogue']}\n---------------")
print(f"Summary: \n{sample['summary']}\n---------------")

Dialogue: 
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
---------------
Summary: 
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.
---------------


In [None]:
# Creation of a pipeline that perform a task, with the model and tokenizer given in input
pipe = pipeline("summarization", model=model, tokenizer=tokenizer)
output = pipe(sample['dialogue'])

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Your max_length is set to 200, but your input_length is only 133. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=66)


In [None]:
pprint(output)

[{'summary_text': "Amanda can't find Betty's number. Amanda asks Larry to call "
                  'her last time they were at the park together. Hannah '
                  "doesn't know him well. Amanda prefers to text him."}]


The result is already better than BART model, however it is not understading completely the dialogue making evident comprehension mistakes! At least it is already understanding to not consider the names along with the ":" punctuation.

Let's now see how the loaded model deal with the test set of the dataset, in order to compare the result after the fine tuning to see how much the model improved.

In [None]:
# Load ROUGE metric
metric = evaluate.load("rouge")

def evaluate_model(model, tokenizer, texts, references, batch_size=8):
    model.eval()
    all_predictions = []
    all_references = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Evaluating", unit="batch"):
        batch_texts = texts[i:i+batch_size]
        batch_references = references[i:i+batch_size]

        inputs = tokenizer(batch_texts, return_tensors="pt", max_length=1024, truncation=True, padding="longest").to(device)
        with torch.no_grad():
            summary_ids = model.generate(inputs['input_ids'])

        predictions = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in summary_ids]

        all_predictions.extend(predictions)
        all_references.extend(batch_references)

    result = metric.compute(predictions=all_predictions, references=all_references, use_stemmer=True)
    return result

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
model_test = model
model_test.to(device)
rouge_scores = evaluate_model(model_test, tokenizer, dataset['test']['dialogue'], dataset['test']['summary'], batch_size=8)

Evaluating: 100%|██████████| 103/103 [01:16<00:00,  1.35batch/s]


In [None]:
pd.DataFrame(rouge_scores, index=[f'{model_id}'])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
google/flan-t5-base,0.452405,0.214928,0.378381,0.378496


Rouge metrics of flan-T5 with respect to the experimental results of the BART model are much higher.

# Data preprocessing


With the following function we tokenize each dataset example and then we get the maximum length value of the biggest input example size, and same for the output.
Therefore in order to give an input, to the encoder, of the same length by padding/truncating the length.

In [None]:
def get_max_length(dataset, text_field):
    concatenated_dataset = concatenate_datasets([dataset["train"], dataset["validation"], dataset["test"]])

    # Tokenize the dataset and get input_ids
    tokenized_dataset = concatenated_dataset.map(
        lambda x: tokenizer(x[text_field], truncation=True),
        batched=True,
        remove_columns=[text_field]
    )

    # Find the maximum length of the tokenized input_ids
    max_length = max(len(x) for x in tokenized_dataset["input_ids"])

    return max_length

# Get maximum lengths for source and target fields
max_source_length = get_max_length(dataset, "dialogue")
print(f"Max source length: {max_source_length}")

max_target_length = get_max_length(dataset, "summary")
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/16369 [00:00<?, ? examples/s]

Max source length: 512


Map:   0%|          | 0/16369 [00:00<?, ? examples/s]

Max target length: 95


As stated in the reference paper, since T5 is capable of performing many NLP tasks.
Thanks the *text-to-text approach*, instead of treating different NLP tasks separately, T5 converts each task into a text-to-text format (i.e. `<task>: <input_text>`), so it is necessary to follow it, adding a prefix that defines the task that T5 must perform.
It therefore make easier to generalize across task.

Then the padding added to the samples, they must not to be taken into account, to do this we can simply replace that padding with `-100`, that is the *ignore index*.

In [None]:
def preprocess_function(sample, padding="max_length"):
    # Add prefix to the input for t5
    inputs = ["summarize: " + item for item in sample["dialogue"]]

    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    labels = tokenizer(text_target=sample["summary"], max_length=max_target_length, padding=padding, truncation=True)

    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["dialogue", "summary", "id"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
print(tokenized_dataset['train'][0])

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']
{'input_ids': [21603, 10, 21542, 10, 27, 13635, 5081, 5, 531, 25, 241, 128, 58, 16637, 10, 10625, 55, 21542, 10, 27, 31, 195, 830, 25, 5721, 3, 10, 18, 61, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

# Fine-tuning on Samsum

Here we compute two main things:

1. decode predictions and labels;
2. clean the outputs in order to compute the ROUGE scores, by removing the leading/trailing whitespace;
3. compute ROUGE scores.

In [None]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # Decode predictions and labels
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Postprocess text: remove leading/trailing whitespace and tokenize at the sentence level
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]
    decoded_preds = ["\n".join(sent_tokenize(pred)) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label)) for label in decoded_labels]

    # Compute ROUGE metrics
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}

    # Calculate average length of predictions
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return result

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

We start defining a DataCollator instance, that is a utility for batching and formatting inputs during training or evaluation.
It adds padding to ensure uniform input lengths and handles special tokens appropriately.

In this setup:
- `label_pad_token_id` is set to -100 to ignore padding tokens in the loss computation.
- Sequences are optionally padded to a multiple of 8 for efficiency.

This ensures data is properly prepared for the model.

In [None]:
# We want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

Definition of hyperparameters configuration:


*   hyperparameters' values for training
*   values for logging, evaluation and saving strategies

The arguments for training the model where:
- **output_dir**: where the model checkpoints and predictions should be saved;
- **per_device_train_batch_size**: define the number of examplea at the time will be processed per device during training;
- **per_device_eval_batch_size**: same but for validation;
- **predict_with_generate**: whether to use generate to calculate generative metrics;
- **fp16**: refers to using 16-bit precision for floating-point numbers in model training. This approach, known as mixed precision training, combines 16-bit and 32-bit computations to increase training speed and reduce memory usage while maintaining model accuracy;
- **learning_rate**;
- **num_train_epochs**: the number of times the entire training dataset will be passed through the model;
- **logging_dir**: directory to save the training logs.
- **logging_steps**: interval of steps between logging events;
- **eval_strategy**: strategy for evaluation during training (e.g., "steps" or "epoch");
- **save_strategy**: strategy for saving model checkpoints (e.g., "steps" or "epoch"),
- **save_total_limit**: maximum number of checkpoints to keep;
- **load_best_model_at_end**: load the best model (based on evaluation metric) at the end of training;
- **push_to_hub**: whether to push the model to the HuggingFace;
- **hub_strategy**: strategy for pushing to the Hub (e.g., "every_save");
- **hub_model_id**: identifier for the model in the Hugging Face Hub;
- **hub_token**: authentication token for the HuggingFace.

In [None]:
# Hugging Face repository id
repository_id = f"{model_id.split('/')[1]}-{dataset_id}"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False,
    learning_rate=5e-5,
    num_train_epochs=3,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    push_to_hub=False,
    #hub_strategy="every_save",
    #hub_model_id=repository_id,
    #hub_token=HfFolder.get_token(),
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)

In [None]:
# Start training
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.4116,1.385111,48.539,25.203,39.2261,45.201,36.875306
2,1.3387,1.374263,48.562,25.0327,39.1235,45.0049,36.836186


Non-default generation parameters: {'max_length': 200, 'min_length': 30, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max_length': 200, 'min_length': 30, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.4116,1.385111,48.539,25.203,39.2261,45.201,36.875306
2,1.3387,1.374263,48.562,25.0327,39.1235,45.0049,36.836186
3,1.3189,1.374832,48.7279,25.0598,39.1658,45.1284,36.943765


Non-default generation parameters: {'max_length': 200, 'min_length': 30, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max_length': 200, 'min_length': 30, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=5526, training_loss=1.3661933185999868, metrics={'train_runtime': 8084.3483, 'train_samples_per_second': 5.467, 'train_steps_per_second': 0.684, 'total_flos': 3.026353594879181e+16, 'train_loss': 1.3661933185999868, 'epoch': 3.0})

In [None]:
validation_results = trainer.evaluate()

In [None]:
pd.DataFrame(validation_results, index=[f'{model_id}'])

Unnamed: 0,eval_loss,eval_rouge1,eval_rouge2,eval_rougeL,eval_rougeLsum,eval_gen_len,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
google/flan-t5-base,1.374263,48.562,25.0327,39.1235,45.0049,36.836186,315.5705,2.592,0.326,3.0


# Saving model

Here we can save the result model after the fine-tuning to Hugging Face platform in order to use it later.

In [None]:
# Save our tokenizer and create model card
# tokenizer.save_pretrained(repository_id)
# trainer.create_model_card()
# Push the results to the hub
# trainer.push_to_hub()

Non-default generation parameters: {'max_length': 200, 'min_length': 30, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


events.out.tfevents.1720289711.087ef5761dd7.169.0:   0%|          | 0.00/124k [00:00<?, ?B/s]

events.out.tfevents.1720298111.087ef5761dd7.169.1:   0%|          | 0.00/613 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/Seba213/flan-t5-base-samsum/commit/d2ddcf01eed2854ca27a54e0f561a73a678799a7', commit_message='End of training', commit_description='', oid='d2ddcf01eed2854ca27a54e0f561a73a678799a7', pr_url=None, pr_revision=None, pr_num=None)

# Evaluate fine-tuned model

In [None]:
model_fine_tuned = AutoModelForSeq2SeqLM.from_pretrained("Seba213/flan-t5-base-samsum").to(device)
tokenizer_fine_tuned = AutoTokenizer.from_pretrained("Seba213/flan-t5-base-samsum")

rouge_scores_final = evaluate_model(model_fine_tuned, tokenizer_fine_tuned, dataset['test']['dialogue'], dataset['test']['summary'], batch_size=8)

Evaluating: 100%|██████████| 103/103 [05:07<00:00,  2.99s/batch]


In [None]:
pd.DataFrame(rouge_scores_final, index=['google/flan-t5-base'])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
google/flan-t5-base,0.473751,0.237624,0.378463,0.378495
