In [None]:
!pip install datasets
!pip install evaluate
!pip install rouge_score


Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━

In [None]:
import numpy as np
import pandas as pd
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import evaluate


Vediamo alcuni metodi che sono utilizzati per calcolare e valutare la qualità di un modello generativo!


# ROUGE scores

È una metrica molto usata per valutare le performance in task generativi come Summarization o Translation, perché si basa sul confronto con un testo di riferimento (una risposta ottimale al problema) da utilizzare come Gold Standard o Ground Truth.
Si mettono a confronto gli n-grammi del testo di riferimento con quelli del testo generato dal modello generativo.

## ROUGE-1

Proviamo a calcolare a mano la ROUGE-1 di questi esempi:

In [None]:
# Toy example: one generated sentence scored against two alternative references
candidate = "Summarization is cool"
reference_1, reference_2 = (
    "Summarization is beneficial and cool",
    "Summarization saves time",
)


In [None]:
# Whitespace-tokenize each text once and reuse the token lists below
candidate_tokens = candidate.split()
reference_1_tokens = reference_1.split()
reference_2_tokens = reference_2.split()

# Clipped unigram overlap: a word contributes at most as many matches as the
# number of times it occurs in the reference, so a candidate that repeats a
# matching word cannot inflate its own score.
n_overlapping_unigrams_reference_1 = sum(
    min(candidate_tokens.count(token), reference_1_tokens.count(token))
    for token in set(candidate_tokens)
)
n_overlapping_unigrams_reference_2 = sum(
    min(candidate_tokens.count(token), reference_2_tokens.count(token))
    for token in set(candidate_tokens)
)

# Denominators for precision (candidate length) and recall (reference length)
total_unigrams_candidate = len(candidate_tokens)
total_unigrams_reference_1 = len(reference_1_tokens)
total_unigrams_reference_2 = len(reference_2_tokens)

In [None]:
# ROUGE-1 against reference_1.
# Precision: overlapping unigrams over the candidate length.
# Recall: overlapping unigrams over the reference length.
# An empty text would make the denominator 0, so the ratio is defined as 0.0 there.
precision_reference_1 = (
    n_overlapping_unigrams_reference_1 / total_unigrams_candidate
    if total_unigrams_candidate
    else 0.0
)
recall_reference_1 = (
    n_overlapping_unigrams_reference_1 / total_unigrams_reference_1
    if total_unigrams_reference_1
    else 0.0
)

# F1 is the harmonic mean of precision and recall; when both are 0 (no overlap)
# the formula would divide by zero, so the score is 0.0 in that case.
f1_score_reference_1 = (
    (2* precision_reference_1 * recall_reference_1)/(recall_reference_1 + precision_reference_1)
    if (recall_reference_1 + precision_reference_1) > 0
    else 0.0
)

print(f"Recall reference_1: {recall_reference_1}")
print(f"Precision reference_1: {precision_reference_1}")
print(f"ROUGE-1 Score reference_1: {f1_score_reference_1}")

Recall reference_1: 0.6
Precision reference_1: 1.0
ROUGE-1 Score reference_1: 0.7499999999999999


In [None]:
# ROUGE-1 against reference_2.
# Precision: overlapping unigrams over the candidate length.
# Recall: overlapping unigrams over the reference length.
# An empty text would make the denominator 0, so the ratio is defined as 0.0 there.
precision_reference_2 = (
    n_overlapping_unigrams_reference_2 / total_unigrams_candidate
    if total_unigrams_candidate
    else 0.0
)
recall_reference_2 = (
    n_overlapping_unigrams_reference_2 / total_unigrams_reference_2
    if total_unigrams_reference_2
    else 0.0
)

# F1 is the harmonic mean of precision and recall; when both are 0 (no overlap)
# the formula would divide by zero, so the score is 0.0 in that case.
f1_score_reference_2 = (
    (2* precision_reference_2 * recall_reference_2)/(recall_reference_2 + precision_reference_2)
    if (recall_reference_2 + precision_reference_2) > 0
    else 0.0
)

print(f"Recall reference_2: {recall_reference_2}")
print(f"Precision reference_2: {precision_reference_2}")
print(f"ROUGE-1 Score reference_2: {f1_score_reference_2}")

Recall reference_2: 0.3333333333333333
Precision reference_2: 0.3333333333333333
ROUGE-1 Score reference_2: 0.3333333333333333


Calcoliamo ora la ROUGE score utilizzando la funzione nella libreria 'evaluate' di HuggingFace

In [None]:
# Load the "rouge" metric through the Hugging Face `evaluate` library
rouge_score = evaluate.load("rouge")


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# Library ROUGE for the candidate against the first reference
# (one prediction paired with one reference)
rouge_score.compute(
    predictions=[candidate],
    references=[reference_1],
)

{'rouge1': 0.7499999999999999,
 'rouge2': 0.3333333333333333,
 'rougeL': 0.7499999999999999,
 'rougeLsum': 0.7499999999999999}

In [None]:
# Library ROUGE for the candidate against the second reference
# (one prediction paired with one reference)
rouge_score.compute(
    predictions=[candidate],
    references=[reference_2],
)

{'rouge1': 0.3333333333333333,
 'rouge2': 0.0,
 'rougeL': 0.3333333333333333,
 'rougeLsum': 0.3333333333333333}

Vediamo ora un esempio di come calcolarla per un task di Summarization, confrontando un modello base con un modello addestrato (fine-tuned) per questo task.

In [None]:
# Checkpoint name handed to `from_pretrained` for both the tokenizer and the model
model_name = "t5-base"

In [None]:
# Instantiate the tokenizer and the seq2seq model from the same checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
def create_summaries(texts_list, tokenizer, model, max_l=125,
                     prefix="Summarize this news: ", max_input_length=1024):
    """Generate one summary per input text with a seq2seq model.

    Each text is prefixed with ``prefix``, tokenized on its own (truncated to
    ``max_input_length`` tokens), passed to ``model.generate`` with 2-beam
    search, and decoded back to a string.

    Args:
        texts_list: Iterable of strings to summarize.
        tokenizer: Callable that accepts a string plus ``max_length``,
            ``return_tensors``, ``padding`` and ``truncation`` keyword
            arguments and returns an object exposing ``input_ids`` and
            ``attention_mask`` tensors; it must also provide ``batch_decode``.
        model: Object providing ``generate(input_ids=..., attention_mask=...,
            max_length=..., num_beams=..., early_stopping=...)``.
        max_l: Value forwarded to ``generate`` as ``max_length``.
        prefix: Instruction prepended to every text so the model knows which
            task to perform.
            NOTE(review): T5 checkpoints are usually prompted with
            "summarize: " -- confirm whether the default here is intentional.
        max_input_length: Truncation length (in tokens) applied to each
            prefixed input text.

    Returns:
        list[str]: The decoded summaries, in the same order as ``texts_list``.
        An empty input yields an empty list.
    """
    summaries_list = []  # Will contain all summaries

    # Generation must run on the device that holds the model weights; objects
    # that expose no `device` attribute get the tensors exactly as tokenized.
    device = getattr(model, "device", None)

    for text in texts_list:
        # Calculate the encodings for the prefixed text
        input_encodings = tokenizer(prefix + text,
                                    max_length=max_input_length,
                                    return_tensors='pt',
                                    padding=True,
                                    truncation=True)

        input_ids = input_encodings.input_ids
        attention_mask = input_encodings.attention_mask
        if device is not None:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

        # Generate the summary; no gradients are needed at inference time
        with torch.no_grad():
            output = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_l,  # Maximum length of the generated summary
                num_beams=2,       # Number of beams for beam search
                early_stopping=True
            )

        # Decode to get the text and add it to the summaries list
        summaries_list.extend(
            tokenizer.batch_decode(output, skip_special_tokens=True)
        )
    return summaries_list


In [None]:
# Load the "cnn_dailymail" dataset in its "3.0.0" configuration
cnn_dataset = load_dataset("cnn_dailymail", "3.0.0")

# Keep only the first 10 rows of the test split as a small sample to experiment on
sample_cnn = cnn_dataset["test"].select(range(10))

sample_cnn


Downloading builder script:   0%|          | 0.00/8.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 10
})

In [None]:
# This value is later passed as `max_l` to `create_summaries`, which forwards it to
# `model.generate(max_length=...)`. That limit is counted in tokens, so the budget is
# measured on the tokenized reference summaries rather than on their character length.
max_length = max(len(tokenizer(item['highlights']).input_ids) for item in sample_cnn)
# Leave a small margin above the longest reference summary
max_length = max_length + 10


In [None]:
# Loads the "rouge" metric again; `rouge_score` was already bound by an identical
# `evaluate.load("rouge")` call earlier in the notebook.
rouge_score = evaluate.load("rouge")


In [None]:
# Summarize the sampled articles with the loaded checkpoint,
# capping the generation length at `max_length`
summaries_t5_base = create_summaries(
    sample_cnn["article"], tokenizer, model, max_l=max_length
)

In [None]:
# Reference summaries: the `highlights` column of the sampled dataset
real_summaries = sample_cnn['highlights']

In [None]:
# Side-by-side table: generated summary ("base") next to the reference for each article
summaries = pd.DataFrame(
    {"base": summaries_t5_base, "reference": real_summaries}
)
summaries.head()

Unnamed: 0,base,reference
0,a preliminary examination into alleged crimes ...,Membership gives the ICC jurisdiction over all...
1,a stray pooch in Washington state has been bur...,"Theia, a bully breed mix, was apparently hit b..."
2,mohammad Javad Zarif is the foreign minister o...,Mohammad Javad Zarif has spent more time with ...
3,five americans who were monitored for three we...,17 Americans were exposed to the Ebola virus w...
4,university officials say student admitted to h...,Student is no longer on Duke University campus...


In [None]:
# ROUGE of the generated summaries against the reference summaries
rouge_score.compute(
    predictions=summaries_t5_base,
    references=real_summaries,
)

{'rouge1': 0.35096872887967234,
 'rouge2': 0.11788605316988776,
 'rougeL': 0.22862199059003951,
 'rougeLsum': 0.24943038242108984}