In [19]:
import nltk
import evaluate
import numpy as np
import wandb
import logging
import torch
import huggingface_hub
from datasets import load_dataset
from transformers import (
    AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments, Seq2SeqTrainer, EncoderDecoderCache
)
from colorlog import ColoredFormatter

import warnings

# Install a global "ignore" filter: every Python warning raised after this
# point is suppressed for the whole process, including deprecation notices
# from the libraries imported above.
# NOTE(review): presumably done to keep notebook output short - confirm that
# hiding library deprecation warnings is acceptable here.
warnings.filterwarnings("ignore")

In [23]:
class FineTuner:
    """
    Fine-tune a seq2seq (T5-style) model for abstractive text summarization.

    The methods are meant to be called in this order, each one storing its
    results on the instance for the next step:
    ``load_datasets`` -> ``preprocess_datasets`` -> ``configure_training`` ->
    ``train`` -> ``save_to_hub``. ``summarize`` can be used afterwards to
    generate a summary with the current model weights.
    """

    def __init__(self, model_name, project_name):
        """
        Load the model and tokenizer, and set up logging and Weights & Biases.

        Args:
            model_name (str): Name of the pretrained checkpoint on Hugging Face.
            project_name (str): Name of the Weights & Biases project.
        """
        self.model_name = model_name
        self.project_name = project_name
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Logging first, so every later step can report progress.
        self._configure_logging()
        self.logger.info("✨ Configuración de logging completada.")

        # Start the Weights & Biases run.
        wandb.init(project=self.project_name)
        self.logger.info("🚀 Weights & Biases inicializado.")

        # Load tokenizer and model; the model is moved to the selected device.
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name).to(self.device)
        self.logger.info("🤖 Modelo y tokenizer cargados.")

    def _configure_logging(self):
        """
        Configure colored logging output and create ``self.logger``.
        """
        formatter = ColoredFormatter(
            "%(log_color)s%(asctime)s - %(levelname)s: %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
            log_colors={
                'DEBUG': 'cyan',
                'INFO': 'green',
                'WARNING': 'yellow',
                'ERROR': 'red',
                'CRITICAL': 'bold_red',
            },
            secondary_log_colors={},
            style='%'
        )
        handler = logging.StreamHandler()
        handler.setFormatter(formatter)
        logging.basicConfig(level=logging.INFO, handlers=[handler])
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _subsample(dataset, size, seed):
        """
        Return a shuffled subset of ``dataset`` with at most ``size`` rows.

        Args:
            dataset: Dataset split to subsample.
            size (int | None): Number of rows to keep; ``None`` keeps the
                split untouched (no shuffle, no selection).
            seed (int): Seed used for the shuffle.

        Returns:
            The original split when ``size`` is ``None``, otherwise the
            shuffled and truncated split.
        """
        if size is None:
            return dataset
        # Clamp so that asking for more rows than exist does not fail.
        return dataset.shuffle(seed=seed).select(range(min(size, len(dataset))))

    def load_datasets(self, train_size=None, val_size=None, seed=42):
        """
        Load CNN/DailyMail (3.0.0) and keep its train and validation splits.

        Args:
            train_size (int | None): Optional number of training examples to
                keep (after shuffling). ``None`` (default) uses the full split.
            val_size (int | None): Optional number of validation examples to
                keep (after shuffling). ``None`` (default) uses the full split.
            seed (int): Shuffle seed used when a size is given.
        """
        self.logger.info("📚 Cargando dataset...")
        self.cnn_dailymail = load_dataset("cnn_dailymail", "3.0.0")
        self.train_dataset = self._subsample(self.cnn_dailymail["train"], train_size, seed)
        self.val_dataset = self._subsample(self.cnn_dailymail["validation"], val_size, seed)
        self.logger.info("🗂️ Datasets de entrenamiento y validación preparados.")

    def preprocess_function(self, examples):
        """
        Prefix each article with the task prompt and tokenize inputs and labels.

        Args:
            examples (dict): Batch with an ``"article"`` list (inputs) and a
                ``"highlights"`` list (reference summaries).

        Returns:
            dict: Tokenized inputs with the label token ids under ``"labels"``.
        """
        prefix = "summarize: "
        inputs = [prefix + doc for doc in examples["article"]]
        model_inputs = self.tokenizer(inputs, max_length=1024, truncation=True)
        labels = self.tokenizer(text_target=examples["highlights"], max_length=256, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def preprocess_datasets(self):
        """
        Apply ``preprocess_function`` to the training and validation splits.
        """
        self.logger.info("🔄 Preprocesando datasets...")
        self.tokenized_train = self.train_dataset.map(self.preprocess_function, batched=True)
        self.tokenized_val = self.val_dataset.map(self.preprocess_function, batched=True)
        self.logger.info("📝 Preprocesamiento completado.")

    @staticmethod
    def _sanitize_token_ids(preds, labels, pad_token_id):
        """
        Make prediction and label id arrays safe to pass to ``batch_decode``.

        Args:
            preds: Array of predicted token ids, or a tuple whose first
                element is that array.
            labels: Array of label token ids, with -100 on ignored positions.
            pad_token_id (int): Id substituted for every non-decodable entry.

        Returns:
            tuple: ``(preds, labels)`` as arrays without negative ids in
            ``preds`` and without -100 in ``labels``.
        """
        if isinstance(preds, tuple):
            # Only the first element holds token ids; the rest is auxiliary
            # model output that cannot be decoded.
            preds = preds[0]
        preds = np.asarray(preds)
        labels = np.asarray(labels)
        # Negative ids (e.g. -100 padding) are not valid vocabulary entries.
        preds = np.where(preds < 0, pad_token_id, preds)
        labels = np.where(labels != -100, labels, pad_token_id)
        return preds, labels

    def configure_training(self):
        """
        Build the training arguments, the ROUGE metric and the Seq2SeqTrainer.
        """
        self.logger.info("⚙️ Configurando entrenamiento...")
        self.metric = evaluate.load("rouge")

        def compute_metrics(eval_preds):
            """
            Compute ROUGE scores for a batch of predictions and labels.

            Args:
                eval_preds (tuple): ``(predictions, labels)`` token id arrays.

            Returns:
                dict: ROUGE metrics as returned by the ``rouge`` metric.
            """
            preds, labels = eval_preds
            preds, labels = self._sanitize_token_ids(
                preds, labels, self.tokenizer.pad_token_id
            )
            decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
            decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
            # One sentence per line, so the summary-level ROUGE-L variant
            # sees sentence boundaries.
            decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
            decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
            return self.metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

        self.training_args = Seq2SeqTrainingArguments(
            run_name=f"{self.model_name}-cnn_dailymail",
            output_dir=f"./models/{self.model_name}/results",
            eval_strategy="epoch",
            logging_strategy="epoch",
            logging_dir=f"./models/{self.model_name}/logs",
            report_to=["wandb"],
            learning_rate=2e-5,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=32,
            weight_decay=0.01,
            save_total_limit=3,
            num_train_epochs=100,
            # Mixed precision needs a GPU; requesting it on CPU makes the
            # training arguments fail validation.
            fp16=self.device == "cuda",
            predict_with_generate=True,
            generation_max_length=64
        )

        self.trainer = Seq2SeqTrainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.tokenized_train,
            eval_dataset=self.tokenized_val,
            processing_class=self.tokenizer,
            data_collator=DataCollatorForSeq2Seq(tokenizer=self.tokenizer, model=self.model),
            compute_metrics=compute_metrics
        )
        self.logger.info("🏋️‍♂️ Entrenador configurado.")

    def train(self):
        """
        Run training with the trainer built by ``configure_training``.
        """
        self.logger.info("🚦 Iniciando el entrenamiento...")
        self.trainer.train()
        self.logger.info("✅ Entrenamiento completado.")

    def save_to_hub(self):
        """
        Push the model and the tokenizer to the Hugging Face Hub.

        The repository is named ``<model basename>-cnn-dailymail``. Any
        ``org/`` prefix of ``model_name`` is dropped so that a namespaced
        base checkpoint does not become a push into that organisation.
        """
        self.logger.info("📤 Guardando modelo en Hugging Face Hub...")
        repo_name = f"{self.model_name.split('/')[-1]}-cnn-dailymail"
        self.model.push_to_hub(repo_name)
        self.tokenizer.push_to_hub(repo_name)
        self.logger.info("✅ Modelo y tokenizer guardados.")

    def summarize(self, article):
        """
        Generate a summary for a single article.

        The input is truncated to 512 tokens and decoded with 4-beam search,
        producing at most 128 tokens.

        Args:
            article (str): Text of the article to summarize.

        Returns:
            str: Summary generated by the model.
        """
        self.logger.info("✏️ Generando resumen...")
        input_text = f"summarize: {article}"
        inputs = self.tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(self.device)
        outputs = self.model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=128,
            num_beams=4,
            early_stopping=True
        )
        summary = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        self.logger.info("📰 Resumen generado:")
        self.logger.info(summary)
        return summary

In [24]:
# Access tokens.
# SECURITY: credentials must never be hard-coded in source or notebooks.
# They are read from the environment instead (HF_TOKEN / WANDB_API_KEY).
# The tokens that were previously committed here must be treated as leaked
# and revoked in the Hugging Face and Weights & Biases account settings.
import os

from huggingface_hub import login

hf_token = os.environ.get("HF_TOKEN")
wandb_token = os.environ.get("WANDB_API_KEY")

# Log in to Weights & Biases and Hugging Face. When a variable is unset the
# value is None and each library falls back to its own credential lookup.
wandb.login(key=wandb_token)
login(hf_token)



In [25]:
# Entry point: run the whole fine-tuning pipeline end to end.
if __name__ == "__main__":
    fine_tuner = FineTuner(
        model_name="t5-small",
        project_name="FineTune-T5-Abstract-Summary-CNN-DailyMail",
    )

    # 1. Data: download the dataset, then tokenize both splits.
    fine_tuner.load_datasets()
    fine_tuner.preprocess_datasets()

    # 2. Training: build the trainer, then run it.
    fine_tuner.configure_training()
    fine_tuner.train()

    # 3. Publishing: upload the model and tokenizer.
    fine_tuner.save_to_hub()


2024-12-12 04:47:27 - INFO: ✨ Configuración de logging completada.
2024-12-12 04:47:27 - INFO: 🚀 Weights & Biases inicializado.
2024-12-12 04:47:28 - INFO: 🤖 Modelo y tokenizer cargados.
2024-12-12 04:47:28 - INFO: 📚 Cargando dataset...
2024-12-12 04:47:33 - INFO: 🗂️ Datasets de entrenamiento y validación preparados.
2024-12-12 04:47:33 - INFO: 🔄 Preprocesando datasets...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

2024-12-12 04:47:37 - INFO: 📝 Preprocesamiento completado.
2024-12-12 04:47:37 - INFO: ⚙️ Configurando entrenamiento...
2024-12-12 04:47:38 - INFO: 🏋️‍♂️ Entrenador configurado.
2024-12-12 04:47:38 - INFO: 🚦 Iniciando el entrenamiento...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,2.3157,1.883471,0.396875,0.185417,0.289142,0.371881
2,2.1152,1.830926,0.396201,0.179915,0.283186,0.369113
3,2.0828,1.817722,0.399056,0.184794,0.288457,0.372571
4,2.0639,1.81122,0.401587,0.185681,0.289056,0.374514
5,2.0581,1.806868,0.403782,0.187435,0.290574,0.377623
6,2.0404,1.805469,0.403452,0.187158,0.290286,0.376683
7,2.0347,1.804703,0.404205,0.187647,0.291513,0.377506
8,2.0333,1.802167,0.40601,0.189242,0.292565,0.379832
9,2.0299,1.802107,0.405317,0.18869,0.2917,0.378986
10,2.025,1.802013,0.40602,0.189275,0.292739,0.379256


2024-12-12 04:49:49 - INFO: Using default tokenizer.
2024-12-12 04:51:30 - INFO: Using default tokenizer.
2024-12-12 04:53:53 - INFO: Using default tokenizer.
2024-12-12 04:56:41 - INFO: Using default tokenizer.
2024-12-12 04:58:59 - INFO: Using default tokenizer.
2024-12-12 05:00:34 - INFO: Using default tokenizer.
2024-12-12 05:02:09 - INFO: Using default tokenizer.
2024-12-12 05:03:49 - INFO: Using default tokenizer.
2024-12-12 05:05:31 - INFO: Using default tokenizer.
2024-12-12 05:07:13 - INFO: Using default tokenizer.
2024-12-12 05:07:14 - INFO: ✅ Entrenamiento completado.
2024-12-12 05:07:14 - INFO: 📤 Guardando modelo en Hugging Face Hub...


README.md:   0%|          | 0.00/5.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.
2024-12-12 05:07:30 - INFO: ✅ Modelo y tokenizer guardados.
2024-12-12 05:07:30 - INFO: ✏️ Generando resumen...
2024-12-12 05:07:30 - INFO: 📰 Resumen generado:
2024-12-12 05:07:30 - INFO: Aqu va un texto de prueba.
