# 1. Importar Librerías

In [1]:
# Si es Google Colab
#!pip install transformers datasets scikit-learn
import os

import torch

from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from huggingface_hub import login

# Report the PyTorch build and whether it can see a CUDA-capable GPU.
cuda_available = torch.cuda.is_available()
print(f"PyTorch versión: {torch.__version__}")
print(f"CUDA disponible: {cuda_available}")
print(f"Dispositivos CUDA: {torch.cuda.device_count()}")
if cuda_available:
    # These queries are only made when a CUDA device was detected.
    print(f"Nombre GPU: {torch.cuda.get_device_name(0)}")
    print(f"Versión de CUDA: {torch.version.cuda}")

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if cuda_available else "cpu")
print("Usando dispositivo:", device)

PyTorch versión: 2.5.1+cu121
CUDA disponible: True
Dispositivos CUDA: 1
Nombre GPU: NVIDIA GeForce RTX 3050 Laptop GPU
Versión de CUDA: 12.1
Usando dispositivo: cuda


# 2. Cargamos el Conjunto de datos (Dataset)
https://huggingface.co/datasets/manueltonneau/spanish-hate-speech-superset/tree/main

In [3]:
# Authenticate against the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable; when the
# variable is unset, `token` is None and login() shows its interactive prompt.
# SECURITY: never write usernames, passwords or tokens in the notebook. The
# credentials that used to be written on this line must be considered
# compromised and should be rotated/revoked.
login(token=os.environ.get("HF_TOKEN"))
dataset = load_dataset("manueltonneau/spanish-hate-speech-superset")

# Show the available splits and their columns
print(dataset)
print(dataset['train'].column_names)

# Print one sample record
print(dataset['train'][0])

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'source', 'dataset', 'nb_annotators', 'tweet_id', 'post_author_country_location'],
        num_rows: 29855
    })
})
['text', 'labels', 'source', 'dataset', 'nb_annotators', 'tweet_id', 'post_author_country_location']
{'text': 'Eran tan pero tan feministas que invisibilizaban constantemente a las trabajadoras sexuales, haciéndole creer al mundo que eran incapaces de decidir y que cada vez que ejercían su derecho a hacerlo estaban siendo violadas.', 'labels': 0.0, 'source': 'Twitter', 'dataset': 'chileno', 'nb_annotators': 3, 'tweet_id': 1401281102535442433, 'post_author_country_location': 'unknown'}


# 3. Preprocesamiento

In [4]:
# Some datasets ship their labels as strings or floats; normalise them to ints.
# In this dataset the column is 'labels', with 0 = no hate and 1 = hate.
def clean_labels(example):
    """Add an integer ``label`` field derived from the record's ``labels`` value.

    Args:
        example: a single dataset record (dict-like) with a ``labels`` entry
            holding a number or a numeric string (e.g. ``0.0``, ``1``, ``"1.0"``).

    Returns:
        The same record, with ``example["label"]`` set to the integer value.

    Raises:
        ValueError: if ``labels`` is a string that is not numeric.
        TypeError: if ``labels`` is ``None`` or another non-numeric type.
    """
    # Go through float() first: int("0.0") raises ValueError, so a plain int()
    # would break on string-encoded float labels.
    example["label"] = int(float(example["labels"]))
    return example

# Add the integer `label` column to every record.
dataset = dataset.map(clean_labels)

# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
dataset_split = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = dataset_split["train"], dataset_split["test"]

print(f"Tamaño train: {len(train_dataset)}  |  eval: {len(eval_dataset)}")

Tamaño train: 26869  |  eval: 2986


# 4. Cargar modelo BETO (BERT ES)

In [5]:
# Hub identifier of the pretrained Spanish BERT (BETO) checkpoint to fine-tune.
model_name = "dccuchile/bert-base-spanish-wwm-uncased"

tokenizer = BertTokenizer.from_pretrained(model_name)

# Load the encoder with a fresh 2-class classification head and move it to `device`.
# `trust_remote_code` is deliberately NOT enabled: BertForSequenceClassification is
# a built-in architecture class, so no code from the Hub repository is needed, and
# opting in to running repository code is an avoidable risk.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    use_safetensors=True,  # only load weights stored in the safetensors format
).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 5. Tokenización y preparación de datos

In [6]:
def tokenize_fn(batch):
    """Tokenize the ``text`` field of a batch with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded examples have the same length.

    Args:
        batch: mapping with a ``"text"`` entry (the texts to encode).

    Returns:
        Whatever ``tokenizer(...)`` returns for those texts.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(batch["text"], **encoding_options)

# Tokenize both splits in batches (train first, then eval).
train_dataset, eval_dataset = (
    split_ds.map(tokenize_fn, batched=True)
    for split_ds in (train_dataset, eval_dataset)
)

# Expose only the columns the model needs, returned as torch tensors.
cols = ['input_ids', 'attention_mask', 'label']
for split_ds in (train_dataset, eval_dataset):
    split_ds.set_format(type='torch', columns=cols)

# 6. Configuración de entrenamiento

In [7]:
# Hyper-parameters and bookkeeping options for the Trainer.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Pick the "best" checkpoint by the F1 reported by compute_metrics rather
    # than by the implicit default (evaluation loss): F1 reflects the
    # classification decisions that the notebook actually reports.
    metric_for_best_model="f1",
    greater_is_better=True,
    # Only pin host memory when a GPU is available
    dataloader_pin_memory=torch.cuda.is_available(),
)

# 7. Definir métricas

In [8]:
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over axis 1 is
            taken as the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``;
        precision/recall/F1 use ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)
    # Returned tuple layout: (precision, recall, f-score, support)
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": prf[2],
        "precision": prf[0],
        "recall": prf[1],
    }

# 8. Fine-tuning

In [10]:
# Wire the model, the training configuration, both data splits and the
# metric function into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

# Run the fine-tuning loop; as the last expression of the cell, the returned
# TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1621,0.498647,0.864032,0.698366,0.802048,0.618421
2,0.2016,0.443739,0.862358,0.720218,0.746121,0.696053
3,0.0677,0.661492,0.862693,0.710042,0.767584,0.660526


TrainOutput(global_step=5040, training_loss=0.1686083081222716, metrics={'train_runtime': 2112.2116, 'train_samples_per_second': 38.162, 'train_steps_per_second': 2.386, 'total_flos': 5302148209850880.0, 'train_loss': 0.1686083081222716, 'epoch': 3.0})

# 9. Evaluación y guardado

In [11]:
# Run a final evaluation pass and show the resulting metrics
results = trainer.evaluate()
print("Resultados de evaluación:", results)

# Write the model and the tokenizer into the same output directory
save_dir = "./DetectorDeOdio-finetuned"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

Resultados de evaluación: {'eval_loss': 0.4437393248081207, 'eval_accuracy': 0.8623576691225721, 'eval_f1': 0.720217835262083, 'eval_precision': 0.7461212976022567, 'eval_recall': 0.6960526315789474, 'eval_runtime': 20.397, 'eval_samples_per_second': 146.394, 'eval_steps_per_second': 2.304, 'epoch': 3.0}


('./DetectorDeOdio-finetuned\\tokenizer_config.json',
 './DetectorDeOdio-finetuned\\special_tokens_map.json',
 './DetectorDeOdio-finetuned\\vocab.txt',
 './DetectorDeOdio-finetuned\\added_tokens.json')