In [8]:
import gc
import os
from dataclasses import dataclass
from typing import Dict, Any, List, Tuple

import numpy as np
import torch
import evaluate
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    TrainingArguments,
    Trainer
)


In [9]:
@dataclass
class FineTuneConfig:
    """
    Settings for one fine-tuning run of a summarization model.

    ``model_name`` and ``output_dir`` are required; every other field has a
    default and may be overridden positionally or by keyword.
    """
    # --- required ---------------------------------------------------------
    model_name: str
    output_dir: str

    # --- token budgets for source text and target summary ------------------
    max_source_length: int = 1024
    max_target_length: int = 128

    # --- optimisation -----------------------------------------------------
    batch_size: int = 4
    num_train_epochs: int = 1
    learning_rate: float = 2e-5
    weight_decay: float = 0.01

    # --- evaluation / generation -------------------------------------------
    eval_strategy: str = "epoch"
    predict_with_generate: bool = True
    num_beams: int = 4


In [10]:
class SummarizationPreprocessor:
    """
    Tokenizes CNN/DailyMail batches for summarization fine-tuning.

    Articles become the model inputs and highlights become the labels. Both are
    truncated and padded to fixed lengths; padding positions in the labels are
    replaced by ``LABEL_IGNORE_INDEX`` so they do not contribute to the loss.
    """

    # Label value that Hugging Face seq2seq models exclude from the loss.
    LABEL_IGNORE_INDEX = -100

    def __init__(self, tokenizer, max_source_length: int = 1024, max_target_length: int = 128):
        """
        Args:
            tokenizer: Tokenizer used for both articles and highlights.
            max_source_length (int): Length articles are truncated/padded to.
            max_target_length (int): Length highlights are truncated/padded to.
        """
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

    def preprocess_batch(self, batch: Dict[str, List[str]]) -> Dict[str, Any]:
        """
        Tokenize one batch of CNN/DailyMail examples.

        Args:
            batch (Dict[str, List[str]]): Batch with keys "article" and "highlights".

        Returns:
            Dict[str, Any]: The tokenizer output for the articles plus a
            "labels" int64 tensor with one row per highlight, padded to
            ``max_target_length`` with ``LABEL_IGNORE_INDEX``.
        """
        sources = batch["article"]
        summaries = batch["highlights"]

        model_inputs = self.tokenizer(
            sources,
            max_length=self.max_source_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        # `text_target` makes the tokenizer encode the summaries as targets,
        # which matters for tokenizers that treat source and target differently.
        label_ids = np.asarray(
            self.tokenizer(
                text_target=summaries,
                max_length=self.max_target_length,
                truncation=True,
                padding="max_length",
                return_tensors="np"
            )["input_ids"]
        )

        # Mask padding so the loss is only computed on real summary tokens.
        pad_token_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_token_id is not None:
            label_ids = np.where(label_ids == pad_token_id, self.LABEL_IGNORE_INDEX, label_ids)

        model_inputs["labels"] = torch.tensor(label_ids, dtype=torch.int64)
        return model_inputs


In [11]:
class SummarizationModel:
    """
    Loads a seq2seq model together with its tokenizer from a single name,
    so different LLMs (T5, BART, PEGASUS) are initialised the same way.
    """

    def __init__(self, model_name: str):
        """
        Load the tokenizer and the model for ``model_name``.

        Args:
            model_name (str): Model name on the Hugging Face Hub.
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)

    def get_components(self) -> Tuple[Any, Any]:
        """
        Give access to both loaded objects at once.

        Returns:
            (tokenizer, model)
        """
        tokenizer, model = self.tokenizer, self.model
        return tokenizer, model


In [12]:
class SummarizationTrainer:
    """
    Trains, evaluates and saves a summarization model.

    Uses ``Seq2SeqTrainer`` so that, when ``config.predict_with_generate`` is
    set, evaluation scores generated summaries instead of raw logits.
    """

    def __init__(
            self,
            model_name: str,
            config: "FineTuneConfig",
            train_dataset,
            val_dataset,
            metric_name: str = "rouge"
    ):
        """
        Load the model, tokenizer and metric, and build the training arguments.

        Args:
            model_name (str): Model name on the Hugging Face Hub.
            config (FineTuneConfig): Fine-tuning configuration.
            train_dataset: Tokenized training dataset.
            val_dataset: Tokenized validation dataset.
            metric_name (str): Name of the metric to load ('rouge' by default).
        """
        self.model_name = model_name
        self.config = config
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.metric = evaluate.load(metric_name)
        self.tokenizer, self.model = SummarizationModel(model_name).get_components()

        self.data_collator = DataCollatorForSeq2Seq(
            self.tokenizer, model=self.model, padding="longest"
        )

        self.training_args = Seq2SeqTrainingArguments(
            output_dir=self.config.output_dir,
            eval_strategy=self.config.eval_strategy,
            per_device_train_batch_size=self.config.batch_size,
            per_device_eval_batch_size=self.config.batch_size,
            learning_rate=self.config.learning_rate,
            weight_decay=self.config.weight_decay,
            num_train_epochs=self.config.num_train_epochs,
            overwrite_output_dir=True,
            # Generation settings taken from the config.
            predict_with_generate=self.config.predict_with_generate,
            generation_max_length=self.config.max_target_length,
            generation_num_beams=self.config.num_beams
        )

    @staticmethod
    def _to_decodable_ids(token_ids, pad_token_id):
        """Replace negative placeholder ids (such as -100) with the pad id so they can be decoded."""
        token_ids = np.asarray(token_ids)
        return np.where(token_ids < 0, pad_token_id, token_ids)

    def compute_metrics(self, eval_preds):
        """
        Compute metrics from the model predictions.

        Args:
            eval_preds: Tuple (predictions, labels) for the validation set.
                Predictions may be token ids or a 3-D array of logits.

        Returns:
            Dict[str, float]: Metric values (ROUGE) scaled to 0-100 and
            rounded to two decimals.
        """
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]

        preds = np.asarray(preds)
        if preds.ndim == 3:
            # Logits of shape (batch, sequence, vocabulary): keep the best token.
            preds = preds.argmax(axis=-1)

        pad_token_id = self.tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = 0

        # -100 marks ignored/padded positions and is not a valid token id.
        preds = self._to_decodable_ids(preds, pad_token_id)
        labels = self._to_decodable_ids(labels, pad_token_id)

        decoded_preds = self.tokenizer.batch_decode(
            preds, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        decoded_labels = self.tokenizer.batch_decode(
            labels, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        decoded_preds = [text.strip() for text in decoded_preds]
        decoded_labels = [text.strip() for text in decoded_labels]

        # ROUGE eval
        raw_scores = self.metric.compute(
            predictions=decoded_preds,
            references=decoded_labels,
            use_stemmer=True
        )

        # The metric may return plain floats or aggregate objects exposing
        # `.mid.fmeasure`; accept both.
        result = {}
        for key, value in raw_scores.items():
            score = value.mid.fmeasure if hasattr(value, "mid") else value
            result[key] = round(float(score) * 100, 2)
        return result

    def train_and_evaluate(self):
        """
        Train the model, evaluate it on the validation set,
        then save the fine-tuned model and tokenizer to ``config.output_dir``.
        """
        trainer = Seq2SeqTrainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.val_dataset,
            tokenizer=self.tokenizer,
            data_collator=self.data_collator,
            compute_metrics=self.compute_metrics
        )
        trainer.train()
        eval_results = trainer.evaluate()
        print(f"Evaluation results for {self.model_name}: {eval_results}")

        # Save the fine-tuned model
        trainer.save_model(self.config.output_dir)
        self.tokenizer.save_pretrained(self.config.output_dir)
        print(f"Modelo guardado en: {self.config.output_dir}")


In [13]:
def main():
    """
    Run the full pipeline:
    1. Load and preprocess the CNN/DailyMail dataset.
    2. Train and evaluate T5, BART and PEGASUS.
    3. Save the fine-tuned models.
    """
    # Load the dataset
    dataset = load_dataset("cnn_dailymail", "3.0.0")
    train_dataset_raw = dataset["train"]
    val_dataset_raw = dataset["validation"]

    # Per-model settings
    models_to_train = [
        {
            "model_name": "t5-small",
            "output_dir": "resources/t5_finetuned_cnn",
        },
        {
            "model_name": "facebook/bart-large-cnn",
            "output_dir": "resources/bart_finetuned_cnn",
        },
        {
            "model_name": "google/pegasus-cnn_dailymail",
            "output_dir": "resources/pegasus_finetuned_cnn",
        }
    ]
    # Checkpoint names noted by the author for each family:
    # t5-small | google-t5/t5-large
    # facebook/bart-large-cnn | facebook/bart-large
    # google/pegasus-cnn_dailymail | google/pegasus-large

    for model_info in models_to_train:
        # Build the config first so the preprocessor and the trainer share the
        # same sequence lengths instead of repeating the literals.
        # NOTE(review): 100 epochs over the full training split for each of the
        # three models is very expensive -- confirm this value is intended.
        config = FineTuneConfig(
            model_name=model_info["model_name"],
            output_dir=model_info["output_dir"],
            max_source_length=1024,
            max_target_length=128,
            batch_size=10,
            num_train_epochs=100,
            learning_rate=2e-5,
            weight_decay=0.01,
            eval_strategy="epoch",
            predict_with_generate=True,
            num_beams=4
        )

        # Initialise the preprocessor
        tokenizer = AutoTokenizer.from_pretrained(config.model_name, use_fast=True)
        preprocessor = SummarizationPreprocessor(
            tokenizer=tokenizer,
            max_source_length=config.max_source_length,
            max_target_length=config.max_target_length
        )

        # Tokenize both splits the same way, dropping every raw column.
        tokenized_splits = []
        for raw_split in (train_dataset_raw, val_dataset_raw):
            tokenized_split = raw_split.map(
                preprocessor.preprocess_batch,
                batched=True,
                remove_columns=raw_split.column_names
            )
            tokenized_split.set_format("torch")
            tokenized_splits.append(tokenized_split)
        tokenized_train, tokenized_val = tokenized_splits

        # Train and evaluate the model
        trainer = SummarizationTrainer(
            model_name=config.model_name,
            config=config,
            train_dataset=tokenized_train,
            val_dataset=tokenized_val
        )
        try:
            trainer.train_and_evaluate()
        finally:
            # Drop this model before the next one is loaded so memory does not
            # accumulate across the three runs.
            del trainer, tokenized_splits, tokenized_train, tokenized_val
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()


In [None]:
# Run the whole pipeline: load data, fine-tune each model, save the results.
main()

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]