

# ==>   **MODELO MULTICLASE**



1. Modelo personalizado: RobertaMultitaskClassifier

In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    AutoModel
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# ========== PARTE 2: Modelo multitarea ==========
class RobertaMultitaskClassifier(nn.Module):
    def __init__(self, model_name, num_labels_age=3, num_labels_gender=2, dropout=0.3):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(
            model_name,
            trust_remote_code=True,
            use_safetensors=True
        )
        self.dropout = nn.Dropout(dropout)
        hidden_size = self.encoder.config.hidden_size
        self.classifier_age = nn.Linear(hidden_size, num_labels_age)
        self.classifier_gender = nn.Linear(hidden_size, num_labels_gender)

    def forward(self, input_ids, attention_mask, labels_age=None, labels_gender=None):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0, :]
        pooled = self.dropout(pooled)
        logits_age = self.classifier_age(pooled)
        logits_gender = self.classifier_gender(pooled)

        loss = None
        if labels_age is not None and labels_gender is not None:
            loss_age = nn.CrossEntropyLoss()(logits_age, labels_age)
            loss_gender = nn.CrossEntropyLoss()(logits_gender, labels_gender)
            loss = loss_age + loss_gender

        return {"loss": loss, "logits_age": logits_age, "logits_gender": logits_gender}

In [None]:
# ========== PARTE 1: Preparar dataset ==========

# Cargar CSV
# Asegúrate de que el CSV tenga: text_es, age_group, gender
df = pd.read_csv("publico_blog_authorship_translated.csv")

# Mapear labels
df = df[df['age_group'].isin(['18-29', '30-39', '40-49'])]
df = df[df['gender'].isin(['male', 'female'])]
df['label_age'] = df['age_group'].map({'18-29': 0, '30-39': 1, '40-49': 2})
df['label_gender'] = df['gender'].map({'male': 0, 'female': 1})

# Crear dataset de Hugging Face
dataset = Dataset.from_pandas(df)
tokenizer = AutoTokenizer.from_pretrained("PlanTL-GOB-ES/roberta-base-bne")

def tokenize(example):
    tokens = tokenizer(example['text_es'], padding='max_length', truncation=True, max_length=128)
    tokens['labels_age'] = example['label_age']
    tokens['labels_gender'] = example['label_gender']
    return tokens

dataset = dataset.map(tokenize)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels_age', 'labels_gender'])

split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split['train']
test_dataset = split['test']


# ========== PARTE 3: Métricas ==========
def compute_metrics_multitask(eval_pred):
    logits_age, logits_gender = eval_pred.predictions
    labels_age, labels_gender = eval_pred.label_ids
    preds_age = np.argmax(logits_age, axis=1)
    preds_gender = np.argmax(logits_gender, axis=1)

    return {
        "accuracy_age": accuracy_score(labels_age, preds_age),
        "f1_age": f1_score(labels_age, preds_age, average="macro"),
        "accuracy_gender": accuracy_score(labels_gender, preds_gender),
        "f1_gender": f1_score(labels_gender, preds_gender, average="macro")
    }

# ========== PARTE 4: Trainer personalizado ==========
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels_age = inputs.pop("labels_age")
        labels_gender = inputs.pop("labels_gender")
        outputs = model(**inputs, labels_age=labels_age, labels_gender=labels_gender)
        loss = outputs["loss"]
        return (loss, outputs) if return_outputs else loss

# ========== PARTE 5: Entrenamiento ==========
model = RobertaMultitaskClassifier("PlanTL-GOB-ES/roberta-base-bne")

training_args = TrainingArguments(
    output_dir="./modelo_publico",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=6,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1_age",
    greater_is_better=True
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics_multitask
)

trainer.train()



Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Some weights of RobertaModel were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy Age,F1 Age,Accuracy Gender,F1 Gender
1,1.6667,1.689277,0.425,0.348831,0.621667,0.621287
2,1.3553,1.712081,0.485,0.480164,0.6425,0.641747
3,0.8439,2.040062,0.476667,0.463638,0.6275,0.626535
4,0.2375,2.796715,0.4725,0.472489,0.618333,0.618265
5,0.0778,3.412782,0.466667,0.46497,0.611667,0.61119
6,0.0248,3.704114,0.455833,0.455596,0.619167,0.618974


AttributeError: 'RobertaMultitaskClassifier' object has no attribute 'save_pretrained'

In [None]:
# Guardar modelo y tokenizer
output_dir = "modelo_publico"

trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Modelo y tokenizer guardados en {output_dir}")

Modelo y tokenizer guardados en modelo_publico


In [3]:
import torch
from safetensors.torch import load_file
from transformers import AutoTokenizer

# Cargar tokenizer
tokenizer = AutoTokenizer.from_pretrained("modelo_publico")

# Reconstruir modelo multitarea
model = RobertaMultitaskClassifier(
    model_name="PlanTL-GOB-ES/roberta-base-bne",
    num_labels_age=3,
    num_labels_gender=2
)

# Cargar pesos desde archivo .safetensors
state_dict = load_file("modelo_publico/model.safetensors", device="cuda")  # o "cpu"
model.load_state_dict(state_dict)
model.to("cuda")  # si estás usando GPU
model.eval()


Some weights of RobertaModel were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaMultitaskClassifier(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50262, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [7]:
# Evaluación final
from transformers import Trainer

trainer = Trainer(
    model=model,
    tokenizer=tokenizer
    # Puedes agregar args, dataset, etc.
)
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
print(classification_report(test_df['labels'], preds))

  trainer = Trainer(


NameError: name 'test_dataset' is not defined

In [None]:
import torch
import torch.nn.functional as F

def predecir_batch(textos):
    model.eval()
    inputs = tokenizer(textos, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Si estás usando GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        probs_age = F.softmax(outputs["logits_age"], dim=1)
        probs_gender = F.softmax(outputs["logits_gender"], dim=1)

        preds_age = torch.argmax(probs_age, dim=1).tolist()
        preds_gender = torch.argmax(probs_gender, dim=1).tolist()

    resultados = []
    for i in range(len(textos)):
        resultados.append({
            "texto": textos[i],
            "edad": ["18-29", "30-39", "40-49"][preds_age[i]],
            "probs_edad": [round(p, 4) for p in probs_age[i].tolist()],
            "genero": ["male", "female"][preds_gender[i]],
            "probs_genero": [round(p, 4) for p in probs_gender[i].tolist()]
        })

    return resultados

In [None]:
textos = [
    "No entiendo cómo usan TikTok, me siento viejo.",
    "Ey bro, qué onda ese juego nuevo que salió ayer?",
    "Prefiero leer el diario en papel, como antes."
]

resultados = predecir_batch(textos)

for r in resultados:
    print(f"Texto: {r['texto']}")
    print(f"Edad predicha: {r['edad']}  -  Probabilidades: {r['probs_edad']}")
    print(f"Género predicho: {r['genero']}  -  Probabilidades: {r['probs_genero']}")
    print("-" * 60)


Texto: No entiendo cómo usan TikTok, me siento viejo.
Edad predicha: 18-29  -  Probabilidades: [0.555, 0.3704, 0.0746]
Género predicho: male  -  Probabilidades: [0.7323, 0.2677]
------------------------------------------------------------
Texto: Ey bro, qué onda ese juego nuevo que salió ayer?
Edad predicha: 18-29  -  Probabilidades: [0.7734, 0.1396, 0.087]
Género predicho: female  -  Probabilidades: [0.4795, 0.5205]
------------------------------------------------------------
Texto: Prefiero leer el diario en papel, como antes.
Edad predicha: 18-29  -  Probabilidades: [0.464, 0.4135, 0.1225]
Género predicho: male  -  Probabilidades: [0.728, 0.272]
------------------------------------------------------------
