# Reference research paper

[BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461)

## Importing libraries and installing dependencies

In [None]:
!pip install pytesseract transformers datasets rouge-score nltk py7zr evaluate --upgrade
!sudo apt-get install git-lfs --yes   # for pushing model and logs to the hugging face hub
!pip install transformers[torch] accelerate -U

Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Collecting transformers
  Downloading transformers-4.42.3-py3-none-any.whl (9.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py7zr
  Downloading py7zr-0.21.1-py3-none-any.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m12.9 MB/s[0m eta [36m0:0

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint
import numpy as np

from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, GenerationConfig
from datasets import load_dataset
import evaluate

from huggingface_hub import HfFolder

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

from tqdm import tqdm

import torch

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Show the GPU attached to this runtime (model, memory usage, driver and CUDA versions).
!nvidia-smi

Mon Jul  8 05:27:30 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8               9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Setting Hugging Face's Access Token

Useful for pushing the model/dataset to Hugging Face's platform

In [None]:
# from google.colab import drive
# import os
# drive.mount('/content/drive')

# os.makedirs('/content/drive/MyDrive/NLP_Project')

# # Save your token in a text file on your Google Drive
# with open('/content/drive/MyDrive/NLP_Project/hf_token.txt', 'w') as f:
#     f.write("INSERT YOUR TOKEN HERE")  # replace with your access token

# # Read the token from the file and set it as an environment variable
# with open('/content/drive/MyDrive/NLP_Project/hf_token.txt', 'r') as f:
#     token = f.read().strip()

# os.environ['HF_TOKEN'] = token

Mounted at /content/drive


# Loading Samsum dataset

The dataset used to fine-tune our model is the SAMSum dataset, which contains messenger-like conversations in English together with their summaries.

*(Analysis of Samsum dataset can be seen in a separate file)*

In [None]:
dataset_id = "samsum"

dataset = load_dataset(dataset_id, trust_remote_code=True)

# Remove the missing/null row from the train split.
# The id is compared as a string: in the recorded run the comparison against the
# integer 13828807 removed nothing (the split stayed at 14732 rows), which is what
# happens when `id` is stored as a string. `str(...)` keeps this working either way.
EMPTY_DIALOGUE_ID = "13828807"
train_dataset_filtered = dataset['train'].filter(
    lambda example: str(example['id']) != EMPTY_DIALOGUE_ID
)
dataset['train'] = train_dataset_filtered

print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")
print(f"Validation dataset size: {len(dataset['validation'])}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14732 [00:00<?, ? examples/s]

Train dataset size: 14732
Test dataset size: 819
Validation dataset size: 818


# Initialize the model




In [None]:
# Hub id of the pretrained checkpoint used as the starting point for fine-tuning.
model_id = "facebook/bart-large-cnn"

# Tokenizer loaded from the same checkpoint, so its vocabulary matches the model.
tokenizer = AutoTokenizer.from_pretrained(model_id)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Load the pretrained seq2seq weights, then move the model to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
model = model.to(device)

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

## Testing the loaded model into a dialogue

In this section we run the pretrained model, without any changes, on the Samsung/samsum dataset to get a rough idea of how it performs.




In [None]:
# Take the first test example and show its dialogue next to the reference summary.
sample = dataset['test'][0]
print(f"Dialogue: \n{sample['dialogue']}\n---------------")
print(f"Summary: \n{sample['summary']}\n---------------")

Dialogue: 
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
---------------
Summary: 
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.
---------------


In [None]:
# Pass `device` explicitly: without it the pipeline warns that a GPU is available but
# the model will be placed on CPU, which slows inference and can leave the model on a
# different device than the one the rest of the notebook sends its inputs to.
pipe = pipeline("summarization", model=model, tokenizer=tokenizer, device=device)
output = pipe(sample['dialogue'])

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Your max_length is set to 142, but your input_length is only 139. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=69)


In [None]:
pprint(output)

[{'summary_text': "Hannah asks Amanda for Betty's number. Amanda can't find "
                  'it. Hannah asks Larry. Amanda asks Larry to text him. '
                  "Hannah says she'll text him back. Hannah calls it a day and "
                  'says she\'s going to go home. Hannah: "Bye bye"'}]


As a first test of the model before fine-tuning, it does not lose much of the information in the dialogue; however, the quality of the summary it produces is poor. As a final result we want a fluent and exhaustive text with correct punctuation.

In [None]:
# Load ROUGE metric
rouge_metric = evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
def evaluate_model(model, tokenizer, texts, references, batch_size=8):
    """Generate a summary for every text and score the summaries with ROUGE.

    Args:
        model: Seq2seq model exposing ``eval()``, ``generate()`` and a ``device`` attribute.
        tokenizer: Tokenizer paired with ``model``; used to encode the texts and to
            decode the generated ids.
        texts: Sequence of input texts (dialogues) to summarize.
        references: Sequence of reference summaries, aligned with ``texts``.
        batch_size: Number of texts encoded and generated per step.

    Returns:
        The result of ``rouge_metric.compute`` over all predictions and references.

    Raises:
        ValueError: If ``texts`` and ``references`` do not have the same length.
    """
    if len(texts) != len(references):
        raise ValueError(
            f"texts and references must have the same length, got {len(texts)} and {len(references)}"
        )

    model.eval()
    # Send inputs to wherever the model actually lives instead of relying on a
    # notebook-level variable that may not match the model's placement.
    model_device = model.device
    all_predictions = []
    all_references = []

    for start in tqdm(range(0, len(texts), batch_size), desc="Evaluating", unit="batch"):
        batch_texts = list(texts[start:start + batch_size])
        batch_references = list(references[start:start + batch_size])

        inputs = tokenizer(batch_texts, return_tensors="pt", max_length=1024, truncation=True, padding="longest").to(model_device)
        with torch.no_grad():
            # The batch is padded to its longest member, so the attention mask must be
            # passed along; otherwise the padding tokens of shorter inputs are attended to.
            summary_ids = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
            )

        predictions = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

        all_predictions.extend(predictions)
        all_references.extend(batch_references)

    return rouge_metric.compute(predictions=all_predictions, references=all_references, use_stemmer=True)

In [None]:
# Baseline: ROUGE of the model on the test split before any fine-tuning.
rouge_scores = evaluate_model(
    model=model,
    tokenizer=tokenizer,
    texts=dataset['test']['dialogue'],
    references=dataset['test']['summary'],
    batch_size=8,
)

Evaluating: 100%|██████████| 103/103 [05:49<00:00,  3.39s/batch]


In [None]:
pd.DataFrame(rouge_scores, index=[f'Bart {model_id}'])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
Bart facebook/bart-large-cnn,0.307515,0.102575,0.23109,0.230958


The results are not bad; we now prepare the data to fine-tune the model on the Samsung/samsum dataset.

## Prepare the data to fine tune the model with dialogue dataset

We do this step for each dialogue in Samsung/samsum: we use the model's tokenizer to produce tokens and prepare the input for the model.

In [None]:
def convert_examples_to_tokens(example_batch):
    """Tokenize a batch of examples into model inputs and labels.

    Uses the notebook-level ``tokenizer``.

    Args:
        example_batch: Batch mapping with a ``'dialogue'`` list (source texts) and a
            ``'summary'`` list (gold summaries).

    Returns:
        A dict with ``'input_ids'`` and ``'attention_mask'`` for the dialogues
        (truncated to 1024 tokens) and ``'labels'`` holding the token ids of the
        summaries (truncated to 128 tokens).
    """
    input_encodings = tokenizer(example_batch['dialogue'], truncation=True, max_length=1024)

    # `text_target=` already selects target-side tokenization, so the deprecated
    # `tokenizer.as_target_tokenizer()` context manager is redundant and is not used.
    target_encodings = tokenizer(text_target=example_batch['summary'], truncation=True, max_length=128)

    return {
        'input_ids': input_encodings['input_ids'],
        # these are the dialogues that are transformed into tokens (words,subwords,..) and converted into IDs (numbers that correspond to words in vocabulary)
        'attention_mask': input_encodings['attention_mask'],
        # Indicate which tokens should be attended, so if it's 1 it will considered.
        # Indicates which tokens are actual input data and which are padding. This helps the model focus on the relevant tokens and ignore the padding during training.
        'labels': target_encodings['input_ids']  # represent tokens for gold summaries
    }

# Tokenize every split; with batched=True the function receives a batch of examples per call.
tokenized_dataset = dataset.map(convert_examples_to_tokens, batched=True)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]



Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

# Fine-tuning on Samsum

Here we define the data collator, a utility that batches and collates the inputs during training or evaluation of a model. Its primary purpose is to format the data in a way that suits the specific requirements of the model being trained or evaluated; in practice, it adds padding to the dialogues where necessary.


Also we have the arguments for training the model where:
- **output_dir**: where the model checkpoints and predictions should be saved;
- **num_train_epochs**: the number of times the entire training dataset will be passed through the model;
- **fp16**:  refers to using 16-bit precision for floating-point numbers in model training. This approach, known as mixed precision training, combines 16-bit and 32-bit computations to increase training speed and reduce memory usage while maintaining model accuracy;
- **per_device_train_batch_size**: number of examples processed per device in each training step (8 in this notebook);
- **per_device_eval_batch_size**: same but for validation;
- **predict_with_generate**: whether to use generate to calculate generative metrics;
- **weight_decay**: L2 regularization;
- **learning_rate**;
- **logging_dir**: directory to save the training logs.
- **logging_steps**: interval of steps between logging events;
- **eval_strategy**: strategy for evaluation during training (e.g., "steps" or "epoch");
- **save_strategy**: strategy for saving model checkpoints (e.g., "steps" or "epoch"),
- **save_total_limit**: maximum number of checkpoints to keep;
- **load_best_model_at_end**: load the best model (based on evaluation metric) at the end of training;
- **eval_steps**: the model will be evaluated every X steps;
- **save_steps**: the frequency at which the model checkpoints will be saved;
- **gradient_accumulation_steps**: number of steps to accumulate gradients before performing an update of model parameters. The effective batch size becomes per_device_train_batch_size * gradient_accumulation_steps. For example, if per_device_train_batch_size is 1 and gradient_accumulation_steps is 16, the effective batch size is 16. This approach allows you to use a smaller batch size that fits in memory, while still benefiting from the stability and performance improvements of a larger batch size. (This notebook uses 8 and 4, i.e. an effective batch size of 32.) In the example setup:
  - Each GPU processes 1 sample per batch.
  - Gradients are accumulated for 16 batches.
  - The model parameters are updated after every 16 batches.

In [None]:
# Collator that assembles tokenized examples into batches for the seq2seq model,
# padding them where necessary.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Hugging Face repository id
repository_id = f"{model_id.split('/')[1]}-{dataset_id}"

trainer_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    learning_rate=1e-05,
    predict_with_generate=True,
    fp16=True,
    gradient_accumulation_steps=4,
    # logging and evaluation strategies
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=400,
    save_steps=1e6,
    save_total_limit=3,
    load_best_model_at_end=True,
    logging_dir=f"{repository_id}/logs",
)

In [None]:
# Wire together the training configuration, the model with its tokenizer,
# the tokenized train/validation splits and the batch collator.
trainer = Seq2SeqTrainer(
    args=trainer_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
)

In [None]:
# Run fine-tuning with the configured arguments; returns a TrainOutput holding the
# final global step, the training loss and runtime metrics.
trainer.train()

Step,Training Loss,Validation Loss
400,1.3477,1.373215
800,1.1696,1.370361
1200,1.136,1.371425


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=1380, training_loss=1.216053763679836, metrics={'train_runtime': 1944.6172, 'train_samples_per_second': 22.727, 'train_steps_per_second': 0.71, 'total_flos': 3.265419456744653e+16, 'train_loss': 1.216053763679836, 'epoch': 2.996742671009772})

In [None]:
# Evaluate on the validation split that was passed to the trainer as `eval_dataset`;
# the result is a dict of metrics (loss, runtime, throughput, epoch).
validation_results = trainer.evaluate()

In [None]:
pd.DataFrame(validation_results, index=[f'{model_id}'])

Unnamed: 0,eval_loss,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
facebook/bart-large-cnn,1.37096,10.1038,80.959,10.194,2.996743


## Saving the model

In [None]:
# Save our tokenizer and create model card
#tokenizer.save_pretrained(repository_id)
#trainer.create_model_card()
# Push the results to the hub
#trainer.push_to_hub()

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


events.out.tfevents.1720416908.f9cf3738eb70.329.0:   0%|          | 0.00/7.20k [00:00<?, ?B/s]

events.out.tfevents.1720417028.f9cf3738eb70.329.1:   0%|          | 0.00/36.2k [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

events.out.tfevents.1720418983.f9cf3738eb70.329.2:   0%|          | 0.00/359 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Seba213/bart-large-cnn-samsum/commit/b9aeba4b6f2d8859e6e31e5f3140469080ffe37e', commit_message='End of training', commit_description='', oid='b9aeba4b6f2d8859e6e31e5f3140469080ffe37e', pr_url=None, pr_revision=None, pr_num=None)

# Evaluate fine-tuned model

In [None]:
# Hub repository that holds the fine-tuned checkpoint.
fine_tuned_model_id = "Seba213/bart-large-cnn-samsum"

# Download the fine-tuned weights and tokenizer, placing the model on the selected device.
model_fine_tuned = AutoModelForSeq2SeqLM.from_pretrained(fine_tuned_model_id)
model_fine_tuned = model_fine_tuned.to(device)
tokenizer_fine_tuned = AutoTokenizer.from_pretrained(fine_tuned_model_id)

# Score the fine-tuned model on the same test split used for the baseline.
rouge_scores_final = evaluate_model(
    model=model_fine_tuned,
    tokenizer=tokenizer_fine_tuned,
    texts=dataset['test']['dialogue'],
    references=dataset['test']['summary'],
    batch_size=8,
)

config.json:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/358 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

Evaluating: 100%|██████████| 103/103 [04:37<00:00,  2.70s/batch]


In [None]:
pd.DataFrame(rouge_scores_final, index=[fine_tuned_model_id])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
Seba213/bart-large-cnn-samsum,0.432004,0.215871,0.333226,0.333454
