# Proyecto Final - Modelos de Lenguaje
Este cuaderno reemplaza los scripts planos de Python y concentra en un solo lugar todo el flujo de trabajo del proyecto final.

## Cargar dependencias
Importamos las bibliotecas necesarias para manipular datos, entrenar y evaluar modelos de lenguaje.

In [None]:
from pathlib import Path
from typing import Dict, Optional

from distutils.version import LooseVersion

import transformers
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# NOTE: no model is built or trained in this cell on purpose.
# This cell only loads dependencies. `model_name` and `tokenized_dataset` are
# defined further down (tokenization section), so constructing the model and
# the Trainer here raised a NameError when the notebook was run top to bottom.
# The complete training setup lives in the "Entrenamiento" section.


## Preparar datos
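Cargamos el corpus CNN/DailyMail directamente desde `datasets` y derivamos una etiqueta binaria a partir de la longitud del resumen (1 si supera las 60 palabras, 0 en caso contrario), adecuada para una tarea de clasificación binaria.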


In [None]:
def build_classification_dataset(
    split: str = "train",
    sample_size: Optional[int] = 2000,
    length_threshold: int = 60,
) -> Dataset:
    """Load a CNN/DailyMail subset and derive binary labels from summary length.

    Args:
        split: Dataset split passed to ``load_dataset``.
        sample_size: Number of examples to keep after a seeded shuffle. ``None``
            keeps the whole split. Values larger than the split are clamped to
            the split size instead of failing.
        length_threshold: Word count above which a summary is labelled ``1``;
            summaries with this many words or fewer are labelled ``0``.

    Returns:
        A dataset containing only the columns ``text`` (the article) and
        ``label`` (0 or 1).
    """
    dataset = load_dataset("abisee/cnn_dailymail", "1.0.0", split=split)
    if sample_size is not None:
        # Clamp the request: selecting indices beyond the end of the split
        # would raise instead of returning the available examples.
        subset_size = min(sample_size, len(dataset))
        dataset = dataset.shuffle(seed=42).select(range(subset_size))

    def convert_example(example: Dict[str, str]) -> Dict[str, object]:
        # Whitespace-separated word count of the reference summary.
        summary_length = len(example["highlights"].split())
        label = 1 if summary_length > length_threshold else 0
        return {"text": example["article"], "label": label}

    # Drop every original column so only `text` and `label` remain.
    return dataset.map(convert_example, remove_columns=dataset.column_names)

# Build the working corpus with the function's default settings spelled out:
# the training split, subsampled to 2000 examples.
dataset = build_classification_dataset(split="train", sample_size=2000)
# Last expression of the cell, so the notebook displays the dataset summary.
dataset


## Tokenización
Tokenizamos los ejemplos con el tokenizer adecuado al modelo base seleccionado.

In [None]:
# Base checkpoint shared by the tokenizer here and the model loaded later.
# NOTE(review): the checkpoint name indicates a Spanish model, while the
# dataset is built from CNN/DailyMail articles - confirm this pairing is intended.
model_name = "dccuchile/bert-base-spanish-wwm-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(batch: Dict[str, str]):
    """Tokenize the ``text`` column of a batch.

    Every example is truncated and padded to the tokenizer's maximum length,
    so all encoded examples end up with the same length.

    NOTE(review): this function is mapped with ``batched=True``, so each value
    in ``batch`` is presumably a list of strings rather than a single string;
    the annotation is narrower than what is actually passed.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True, padding='max_length')


# Apply the tokenizer batch-wise; the original columns are kept alongside
# the new tokenizer outputs.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset


## Entrenamiento
Se definen los argumentos y se lanza el proceso de fine-tuning del modelo de lenguaje.

In [None]:
# `inspect` replaces the former distutils-based version check: distutils was
# removed from the standard library in Python 3.12 (PEP 632).
import inspect

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Hold out part of the data for evaluation. Evaluating on the very examples
# the model was trained on reports over-optimistic metrics. The fixed seed
# keeps the split reproducible across runs.
dataset_splits = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

training_args_kwargs = dict(
    output_dir="models/spanish-bert-finetuned",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# The option that enables per-epoch evaluation has been renamed across
# transformers releases. Ask TrainingArguments which spelling it accepts
# instead of comparing version strings, newest name first.
supported_args = inspect.signature(TrainingArguments.__init__).parameters
if "eval_strategy" in supported_args:
    training_args_kwargs["eval_strategy"] = "epoch"
elif "evaluation_strategy" in supported_args:
    training_args_kwargs["evaluation_strategy"] = "epoch"
else:
    # Oldest releases only expose a boolean switch.
    training_args_kwargs["evaluate_during_training"] = True

training_args = TrainingArguments(**training_args_kwargs)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_splits["train"],
    eval_dataset=dataset_splits["test"],
)

trainer.train()


## Evaluación y guardado
Calculamos métricas y guardamos los artefactos del modelo para su posterior reutilización.

In [None]:
metrics = trainer.evaluate()
# Print explicitly: a notebook cell only echoes its last expression, so a bare
# `metrics` in the middle of the cell would never be displayed.
print(metrics)

# Single source of truth for the export location, so the model and the
# tokenizer are guaranteed to land in the same directory.
output_dir = Path("models/spanish-bert-finetuned")
output_dir.mkdir(parents=True, exist_ok=True)
trainer.save_model(str(output_dir))
tokenizer.save_pretrained(str(output_dir))
