## T5

In [1]:
import pandas as pd
from rouge import Rouge
from transformers import T5ForConditionalGeneration, T5Tokenizer

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the reference dataset from the CSV file "df_ref.csv" into a DataFrame
df_ref = pd.read_csv("df_ref.csv")
# Bare last expression: the DataFrame is rendered as this cell's output
df_ref

Unnamed: 0,article,abstract,article_length,abstract_length,top_words,similarity,similarity_percentage
0,direct central nervous system ( cns ) involvem...,a 25-year - old male patient presented to our ...,1394,293,"['of', 'and', 'the', 'in', 'with', 'a', 's', '...",0.955112,95.511218
1,primary nocturnal enuresis is intermittent noc...,several therapeutic options have been describe...,2014,247,"['the', 'in', 'of', 'and', 'desmopressin', 'to...",0.882804,88.280379
2,guillain - barr syndrome ( gbs ) is an immune ...,psychiatric symptoms in guillain - barr syndro...,4195,108,"['and', 'of', 'the', 'her', 'to', 'with', 'was...",0.877994,87.799449
3,progress in three - dimensional ( 3d ) echocar...,this review covers the role of three - dimensi...,4951,121,"['the', 'and', 'of', 'in', 'with', 'a', 'valve...",0.905861,90.586089
4,municipal solid waste workers ( mswws ) or ref...,background : solid waste management has emerge...,6167,375,"['and', 'the', 'of', 'to', 'waste', 'in', 'wor...",0.872562,87.256238
5,stroke is a clinical emergency that commonly r...,to assess the stroke workload of italian neuro...,7252,220,"['the', 'of', 'and', 'stroke', 'in', 'nu', 'fo...",0.748964,74.896378
6,chronic kidney disease ( ckd ) is one of the m...,glomerular filtration rate ( gfr ) is still a ...,4526,227,"['the', 'of', 'and', 'a', 'to', 'in', 'gfr', '...",0.866245,86.62448
7,"over the past decade , it has increasingly bec...",study backgroundpatients who experience a recu...,1344,268,"['the', 'of', 'a', 'patients', 'to', 'and', 'v...",0.936864,93.686421
8,prostate cancer is one of the most prevalent t...,"purposein clinical practice , atypical small a...",2279,279,"['the', 'in', 'of', 'cancer', 'and', 'prostate...",0.954418,95.441754
9,congenital cervical swelling constitutes a het...,an 18-year - old male patient presented with a...,841,146,"['the', 'of', 'in', 'was', 'a', 'and', 'cyst',...",0.926442,92.644246


In [3]:
# Rouge object used to compute the ROUGE metric
rouge = Rouge()

In [4]:
# Pretrained T5 model and its tokenizer; both are loaded from the same
# checkpoint name so the vocabulary matches the model weights
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 6.99MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
tokenizer.json: 100%|██████████| 1.39M/1.39M [00:00<00:00, 4.14MB/s]
config.json: 100%|██████████| 1.21k/1.21k [00:00<00:00, 1.22MB/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and

In [5]:
# Accumulators for the ROUGE results: three independent, initially empty lists
# (one per metric variant), created in a single unpacking assignment
rouge_1_scores, rouge_2_scores, rouge_l_scores = [], [], []

In [6]:
# Generate a T5 summary for every article and score it against its reference
# abstract with ROUGE (F-measure of ROUGE-1, ROUGE-2 and ROUGE-L).
#
# The three score lists are (re)created here so that re-running this cell
# starts from empty lists instead of appending a second copy of the results
# to whatever is still in the kernel.
rouge_1_scores = []
rouge_2_scores = []
rouge_l_scores = []

# Only the 'article' and 'abstract' columns are needed, so iterate over them
# directly instead of materialising a full row Series with iterrows().
for article, original_summary in zip(df_ref['article'], df_ref['abstract']):
    # "summarize: " is prepended to the article as the task prefix
    input_text = "summarize: " + article

    # Tokenize the prompt, truncating it to at most 512 tokens
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Beam search with 4 beams; the generated summary is capped at 150 tokens
    summary_ids = model.generate(input_ids, max_length=150, num_beams=4, early_stopping=True)

    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    if generated_summary.strip() and original_summary.strip():
        scores = rouge.get_scores(generated_summary, original_summary)[0]
        rouge_1_f = scores['rouge-1']['f']
        rouge_2_f = scores['rouge-2']['f']
        rouge_l_f = scores['rouge-l']['f']
    else:
        # Rouge.get_scores raises on an empty hypothesis/reference, which would
        # abort the whole evaluation; a blank text has no overlap, so score it 0.
        rouge_1_f = rouge_2_f = rouge_l_f = 0.0

    rouge_1_scores.append(rouge_1_f)
    rouge_2_scores.append(rouge_2_f)
    rouge_l_scores.append(rouge_l_f)

In [7]:
# Tabulate the ROUGE results: one row per scored article, one column per
# metric, pairing each column label with its list of scores
rouge_scores_df = pd.DataFrame(
    dict(
        zip(
            ('ROUGE-1', 'ROUGE-2', 'ROUGE-L'),
            (rouge_1_scores, rouge_2_scores, rouge_l_scores),
        )
    )
)

In [9]:
# Display the ROUGE score table as this cell's output
rouge_scores_df

Unnamed: 0,ROUGE-1,ROUGE-2,ROUGE-L
0,0.170854,0.026403,0.150754
1,0.214286,0.015504,0.178571
2,0.224138,0.097902,0.206897
3,0.384615,0.17341,0.338462
4,0.222222,0.065934,0.204444
5,0.217391,0.017544,0.202899
6,0.211765,0.052632,0.2
7,0.379487,0.137705,0.358974
8,0.161074,0.023904,0.134228
9,0.235294,0.031088,0.220588


In [10]:
# Export the ROUGE scores to 'rouge_scores_T5_df.csv' for later calculations;
# index=False leaves the DataFrame index out of the file
rouge_scores_df.to_csv('rouge_scores_T5_df.csv', index=False)

In [11]:
# Column-wise mean of every ROUGE metric over all rows of rouge_scores_df
print("Promedio de métricas ROUGE para todo el dataset:")
average_rouge_scores = rouge_scores_df.mean(axis=0)
average_rouge_scores

Promedio de métricas ROUGE para todo el dataset:


ROUGE-1    0.240686
ROUGE-2    0.066609
ROUGE-L    0.221416
dtype: float64