In [30]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset
from evaluate import load as load_evaluate


In [31]:

class AbstractSummarizerTrainer:
    """
    Entrena y ajusta finamente un modelo T5 para resumen abstractivo utilizando CNN/DailyMail.
    Incluye técnicas para controlar la longitud del resumen y mantener coherencia.
    """

    def __init__(self, model_name: str = "t5-small", max_input_length: int = 512, max_output_length: int = 50):
        """
        Inicializa el tokenizador y modelo preentrenado.
        :param model_name: Nombre del modelo T5 en Hugging Face (e.g., 't5-small').
        :param max_input_length: Longitud máxima de los textos de entrada.
        :param max_output_length: Longitud máxima de los textos de salida.
        """
        self.tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name, ignore_mismatched_sizes=True)
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def preprocess_function(self, examples):
        """
        Preprocesa los datos para que sean compatibles con el modelo T5.
        :param examples: Ejemplos del dataset.
        :return: Diccionario con textos tokenizados.
        """
        inputs = ["summarize: " + doc for doc in examples["article"]]
        model_inputs = self.tokenizer(
            inputs, max_length=self.max_input_length, truncation=True, padding="max_length"
        )
        labels = self.tokenizer(
            text_target=examples["highlights"], max_length=self.max_output_length, truncation=True, padding="max_length"
        )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def train(self, train_dataset, eval_dataset, output_dir: str = "models/t5-cnn", epochs: int = 3, batch_size: int = 8):
        """
        Ajusta el modelo utilizando el dataset de entrenamiento y evaluación.
        :param train_dataset: Dataset de entrenamiento.
        :param eval_dataset: Dataset de evaluación.
        :param output_dir: Directorio donde guardar el modelo ajustado.
        :param epochs: Número de épocas de entrenamiento.
        :param batch_size: Tamaño del batch.
        """
        training_args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            eval_strategy="epoch",
            learning_rate=5e-5,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=epochs,
            weight_decay=0.01,
            predict_with_generate=True,
            generation_max_length=self.max_output_length,
            save_strategy="no",
            logging_steps=50,
        )

        trainer = Seq2SeqTrainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=self.tokenizer,
        )

        trainer.train()
        self.save_model(output_dir)

    def save_model(self, output_dir: str):
        """
        Guarda el modelo ajustado en el directorio especificado.
        :param output_dir: Ruta para guardar el modelo.
        """
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)

    def evaluate(self, eval_dataset, min_length: int = 30):
        """
        Evalúa el modelo en el dataset de prueba, controlando la longitud mínima.
        :param eval_dataset: Dataset de prueba.
        :param min_length: Longitud mínima del resumen.
        :return: Métricas de evaluación (ROUGE).
        """
        metric = load_evaluate("rouge")

        def generate_summary(batch):
            """
            Genera resúmenes con restricciones de longitud mínima.
            :param batch: Batch del dataset.
            :return: Resúmenes generados.
            """
            inputs = self.tokenizer(
                batch["article"], max_length=self.max_input_length, truncation=True, padding="max_length",
                return_tensors="pt"
            )
            inputs = {key: value.to(self.model.device) for key, value in inputs.items()}
            summaries = self.model.generate(
                inputs["input_ids"], max_length=self.max_output_length, min_length=min_length, num_beams=4
            )
            return self.tokenizer.batch_decode(summaries, skip_special_tokens=True)

        # Generar resúmenes para todo el conjunto de evaluación
        predictions = [generate_summary({"article": [example["article"]]})[0] for example in eval_dataset]
        references = [example["highlights"] for example in eval_dataset]

        # Calcular métricas ROUGE
        results = metric.compute(predictions=predictions, references=references)
        return results

    def infer(self, text: str, min_length: int = 30) -> str:
        """
        Genera un resumen para un texto de entrada.
        :param text: Texto de entrada.
        :param min_length: Longitud mínima del resumen.
        :return: Resumen generado.
        """
        input_text = "summarize: " + text
        inputs = self.tokenizer(input_text, max_length=self.max_input_length, truncation=True, return_tensors="pt")
        inputs = {key: value.to(self.model.device) for key, value in inputs.items()}
        summary_ids = self.model.generate(
            inputs["input_ids"], max_length=self.max_output_length, min_length=min_length, num_beams=4
        )
        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [32]:
# Cargar el dataset CNN/DailyMail y usar un subconjunto para optimizar tiempo
dataset = load_dataset("cnn_dailymail", "3.0.0")
train_dataset = dataset["train"].shuffle(seed=42).select(range(1000))
eval_dataset = dataset["validation"].shuffle(seed=42).select(range(200))
test_dataset = dataset["test"].shuffle(seed=42).select(range(200))


In [33]:
# Instanciar el entrenador y preprocesar los datasets
trainer = AbstractSummarizerTrainer(model_name="t5-small")
train_dataset = train_dataset.map(trainer.preprocess_function, batched=True)
eval_dataset = eval_dataset.map(trainer.preprocess_function, batched=True)
test_dataset = test_dataset.map(trainer.preprocess_function, batched=True)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [34]:
# Entrenar y guardar el modelo ajustado
trainer.train(train_dataset, eval_dataset, output_dir="models", epochs=10, batch_size=8)


Epoch,Training Loss,Validation Loss
1,2.2658,1.929556
2,2.0476,1.879679
3,1.968,1.855671
4,1.9573,1.841938
5,1.8464,1.837652
6,1.853,1.830479
7,1.8266,1.832381
8,1.8228,1.82959
9,1.8055,1.828883
10,1.793,1.829309


In [35]:
# Evaluar el modelo en el conjunto de prueba
metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Métricas ROUGE en el conjunto de prueba:", metrics)


Métricas ROUGE en el conjunto de prueba: {'rouge1': 0.3403463037773344, 'rouge2': 0.1379420423174355, 'rougeL': 0.23670830358883072, 'rougeLsum': 0.29149551729812473}


In [36]:
metrics.items()

dict_items([('rouge1', 0.3403463037773344), ('rouge2', 0.1379420423174355), ('rougeL', 0.23670830358883072), ('rougeLsum', 0.29149551729812473)])

In [37]:
## 4. Pruebas de Inferencia
example_text = ("""
    Deputy police commissioner Nick Kaldas is giving evidence at an inquiry . Kaldas, 57, is a counter terrorism expert who has trained Iraqi police . He arrived in Australia aged 12 and fluent in English, French and Arabic . The inquiry is into a illegal police bugging operation of 114 people in 2000 . Kaldas is the highest ranking officer secretly bugged by his rival Kath Burn . He has 'explosive' evidence about bugging which has 'denigrated' his career . He has suffered reprisals for speaking out about the bugging scandal . The bugging operation threatens to blow apart NSW police hierarchy . He said independent inquiry into bugging scandal has left him fearful . Claimed Operation Prospect had sided with the officers being complained about and targeted him and other victims .
    """
)


In [38]:
generated_summary = trainer.infer(example_text)
print("\nTexto Original:\n", example_text)
print("\nResumen Generado:\n", generated_summary)



Texto Original:
 
    Deputy police commissioner Nick Kaldas is giving evidence at an inquiry . Kaldas, 57, is a counter terrorism expert who has trained Iraqi police . He arrived in Australia aged 12 and fluent in English, French and Arabic . The inquiry is into a illegal police bugging operation of 114 people in 2000 . Kaldas is the highest ranking officer secretly bugged by his rival Kath Burn . He has 'explosive' evidence about bugging which has 'denigrated' his career . He has suffered reprisals for speaking out about the bugging scandal . The bugging operation threatens to blow apart NSW police hierarchy . He said independent inquiry into bugging scandal has left him fearful . Claimed Operation Prospect had sided with the officers being complained about and targeted him and other victims .
    

Resumen Generado:
 Deputy police commissioner Nick Kaldas is giving evidence at an inquiry. Kaldas, 57, is a counter terrorism expert who has trained Iraqi police. He arrived in Australi