# TAREA 2. POLAR SemEval-2026
## mLLMs
### Juan Carlos Perez Ramirez

### Descripción de la propuesta:
Se propone un modelo basado en RoBERTuito, con un paradigma de entrenamiento inspirado en PPBERT (preentrenamiento + posentrenamiento + fine-tuning), donde el fine-tuning hace uso de adapters dada la reducida cantidad de datos de entrenamiento.

Datasets usados en el posentrenamiento:
- Análisis de sentimientos TASS 2020
- Análisis de emociones TASS 2020
- Datos en español de HatEval 2019

#### Carga de librerías

In [1]:
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset, Dataset
import pandas as pd
import os
import numpy as np
import random
import torch
import torch.nn as nn
from dataclasses import dataclass
from typing import Dict, List

from transformers import Trainer, TrainingArguments, set_seed
import matplotlib.pyplot as plt

# Mount Google Drive at /content/drive (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

# Root folder on Drive for this task.
BASE_DIR = "/content/drive/MyDrive/polar/"
# Folder used as `output_dir` by the post-training TrainingArguments below.
POSTTRAIN_DIR = os.path.join(BASE_DIR, "posttrain")
os.makedirs(POSTTRAIN_DIR, exist_ok=True)

def set_all_seeds(seed: int = 42):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and CUDA), exports
    ``PYTHONHASHSEED``, switches cuDNN to deterministic mode and finally
    calls the Hugging Face ``set_seed`` helper.

    Args:
        seed: Value applied to all generators.
    """
    # Environment variable (affects some backends).
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Python and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU, current CUDA device, and all CUDA devices.
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Deterministic cuDNN behaviour (optional but recommended).
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # Hugging Face helper (repeats the above internally, useful for Trainer).
    set_seed(seed)


# Call once at the start of the notebook/script.
set_all_seeds(42)


Mounted at /content/drive


In [2]:
# Expose the Hugging Face token through the environment, then log in to the Hub.
# NOTE(review): "TOKEN" is a literal placeholder string here; presumably the
# real token was removed before sharing — confirm it is supplied via a secret.
os.environ["HF_TOKEN"] = "TOKEN"
from huggingface_hub import login
login(new_session=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Posentrenamiento
#### Carga de datasets

In [3]:
# TASS 2020 sentiment data: tab-separated file on Drive, exposed as the "train" split.
sentiment_ds = load_dataset(
    "csv",
    data_files={"train": "/content/drive/MyDrive/polar/tass2020sen.csv"},
    delimiter="\t"
)

# TASS 2020 emotion data: same tab-separated layout.
emotion_ds = load_dataset(
    "csv",
    data_files={"train": "/content/drive/MyDrive/polar/tass2020emo.csv"},
    delimiter="\t"
)

# HatEval from the Hugging Face Hub; keep only training rows whose
# `language` field is "es".
dataset_hateval = load_dataset("valeriobasile/HatEval")
hateval_ds = dataset_hateval["train"].filter(lambda x: x["language"] == "es")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

README.md:   0%|          | 0.00/4.87k [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/1.51M [00:00<?, ?B/s]

data/dev-00000-of-00001.parquet:   0%|          | 0.00/184k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/496k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13500 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4570 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13500 [00:00<?, ? examples/s]

#### Normalización y unificación de datasets

In [4]:
def to_common_df(ds, text, label, task_name):
    """Project a dataset onto the shared ``text`` / ``labels`` / ``task`` schema.

    Args:
        ds: Object exposing ``to_pandas()`` (e.g. a Hugging Face dataset split).
        text: Name of the source column holding the text.
        label: Name of the source column holding the label.
        task_name: Constant written to the ``task`` column of every row.

    Returns:
        A new DataFrame with columns ``text``, ``labels`` and ``task``.
    """
    selected = ds.to_pandas().loc[:, [text, label]].copy()
    selected.columns = ["text", "labels"]
    return selected.assign(task=task_name)


In [5]:
# Bring the three post-training corpora to the common text/labels/task schema.
sentiment_task = to_common_df(sentiment_ds["train"],
                                      text="text",
                                      label="sentiment",
                                      task_name="sentiment")
# NOTE(review): the label column is named "label " with a trailing space;
# presumably this matches the CSV header — confirm against the file.
emotion_task = to_common_df(emotion_ds["train"],
                                    text="tweet",
                                    label="label ",
                                    task_name="emotion")
hate_task = to_common_df(hateval_ds,
                                    text="text",
                                    label="HS",
                                    task_name="hate")

# When labels are stored as strings (object dtype), replace them with integer
# ids assigned in sorted order of the distinct label values.
if sentiment_task["labels"].dtype == object:
    mapping_sent = {label: i for i, label in enumerate(sorted(sentiment_task["labels"].unique()))}
    sentiment_task["labels"] = sentiment_task["labels"].map(mapping_sent)

if emotion_task["labels"].dtype == object:
    mapping_emo = {label: i for i, label in enumerate(sorted(emotion_task["labels"].unique()))}
    emotion_task["labels"] = emotion_task["labels"].map(mapping_emo)

# Stack the three tasks into a single training dataset.
multitask_train = Dataset.from_pandas(pd.concat([sentiment_task, emotion_task, hate_task], ignore_index=True))

In [6]:
# Task name <-> integer id lookups
task_names = ["sentiment", "emotion", "hate"]
task2id = {name: idx for idx, name in enumerate(task_names)}
id2task = dict(enumerate(task_names))


def add_task_id(batch):
    """Add a ``task_id`` column derived from the batch's ``task`` names.

    Args:
        batch: Mapping with a ``task`` entry holding a sequence of task names.

    Returns:
        The same ``batch`` object, with ``task_id`` added.

    Raises:
        KeyError: If a task name is not present in ``task2id``.
    """
    batch["task_id"] = list(map(task2id.__getitem__, batch["task"]))
    return batch

# Attach the numeric task_id column to every row, processed in batches.
multitask_train = multitask_train.map(add_task_id, batched=True)


Map:   0%|          | 0/18488 [00:00<?, ? examples/s]

#### Carga de modelo

In [7]:
# Checkpoint name used for the tokenizer (and later for the encoder).
MODEL_NAME = "pysentimiento/robertuito-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_batch(batch):
    """Tokenize the ``text`` entries of a batch to fixed-length inputs.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        batch: Mapping with a ``text`` entry holding the raw strings.

    Returns:
        The tokenizer output for the batch.
    """
    texts = batch["text"]
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=128,
        padding="max_length",
    )
    return encoded

# Tokenize the whole multitask corpus in batches.
multitask_train = multitask_train.map(tokenize_batch, batched=True)

# Return only the model inputs, as torch tensors. The original cell issued this
# exact call twice in a row; the second call changed nothing, so it is dropped.
multitask_train.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels", "task_id"],
)

tokenizer_config.json:   0%|          | 0.00/323 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Map:   0%|          | 0/18488 [00:00<?, ? examples/s]

In [8]:
# Model used during the post-training phase
class MultiTaskRoberto(nn.Module):
    """Shared transformer encoder with one linear classification head per task.

    Each example carries a ``task_id`` that selects the head scoring it. All
    heads write into a common ``(batch, max_labels)`` logits tensor; columns
    beyond a task's own label count are filled with ``-1e9`` so those
    "phantom" classes never win.

    Args:
        model_name: Checkpoint name/path given to ``AutoModel.from_pretrained``.
        num_labels_dict: Mapping ``task name -> number of labels``.
        dropout: Dropout probability applied to the first-token representation.
        task_id2name: Mapping ``task id -> task name``. When ``None``, tasks
            are numbered in the order of the classification heads (the
            iteration order of ``num_labels_dict``).
    """

    def __init__(self, model_name, num_labels_dict, dropout=0.1, task_id2name=None):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        self.hidden_size = self.encoder.config.hidden_size

        self.num_labels_dict = num_labels_dict
        self.max_labels = max(num_labels_dict.values())

        self.classifiers = nn.ModuleDict({
            task: nn.Linear(self.hidden_size, n_labels)
            for task, n_labels in num_labels_dict.items()
        })

        self.task_id2name = task_id2name

    def _task_names(self):
        """Return the ``task id -> task name`` mapping used to pick heads."""
        if self.task_id2name is not None:
            return self.task_id2name
        # Fallback for the default ``task_id2name=None``: number the heads in
        # their insertion order instead of failing on ``None[...]``.
        return dict(enumerate(self.classifiers.keys()))

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        task_id=None,
        labels=None,
        **kwargs,
    ):
        """Encode the batch and score every example with its task's head.

        Args:
            input_ids: Token ids, passed to the encoder.
            attention_mask: Attention mask, passed to the encoder.
            task_id: Integer tensor with one task id per example.
            labels: Optional target class indices; when given, a
                cross-entropy loss over the padded logits is returned.
            **kwargs: Ignored.

        Returns:
            Dict with ``loss`` (``None`` when ``labels`` is ``None``) and
            ``logits`` of shape ``(batch, max_labels)``.

        Raises:
            KeyError: If a task id is missing from the id -> name mapping.
        """
        enc_out = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # First-token (CLS) representation, shape (batch_size, hidden).
        cls = self.dropout(enc_out.last_hidden_state[:, 0])

        # Logits pre-filled with -1e9 so the "phantom" classes never win.
        logits = cls.new_full((cls.size(0), self.max_labels), fill_value=-1e9)

        id2name = self._task_names()
        task_ids = task_id.reshape(-1)

        # One head call per task present in the batch (instead of one per
        # example). Heads of absent tasks are not run, as before.
        for t_id in torch.unique(task_ids).tolist():
            head = self.classifiers[id2name[t_id]]
            rows = task_ids == t_id
            # Cast so the masked write also works if the head output dtype
            # differs from the logits buffer (e.g. under autocast).
            logits[rows, : head.out_features] = head(cls[rows]).to(logits.dtype)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.max_labels), labels.view(-1))

        return {"loss": loss, "logits": logits}

In [9]:
# Number of classes per post-training task; one classification head is
# created for each entry.
num_labels_dict = {
    "sentiment": 3,  # negative / neutral / positive
    "emotion": 7,      # anger / disgust / fear / joy / sadness / surprise / others
    "hate": 2,       # no-hate / hate
}

# Build the multitask model; `id2task` maps each example's task_id to the
# name of the head that scores it.
model = MultiTaskRoberto(
    MODEL_NAME,
    num_labels_dict=num_labels_dict,
    task_id2name=id2task
)


config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/435M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at pysentimiento/robertuito-base-uncased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
@dataclass
class MultiTaskCollator:
    """Collate per-example tensors into one batch dictionary.

    Only ``input_ids``, ``attention_mask``, ``labels`` and ``task_id`` are
    kept; any other entry of the examples is dropped.
    """

    def __call__(self, features: List[Dict]):
        """Stack each kept field of ``features`` along a new first dimension."""
        kept_fields = ("input_ids", "attention_mask", "labels", "task_id")
        return {
            name: torch.stack([example[name] for example in features])
            for name in kept_fields
        }


data_collator = MultiTaskCollator()

#### Posentrenamiento

In [12]:
# Post-training configuration: 3 epochs, batch size 16 per device, with a
# checkpoint written to POSTTRAIN_DIR every 100 steps and only the most
# recent one kept (save_total_limit=1).
training_args = TrainingArguments(
    output_dir=POSTTRAIN_DIR,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=100,
    save_steps=100,
    save_total_limit=1,
)

# Trainer over the combined multitask dataset; the custom collator keeps the
# task_id field so the model can route each example to its head.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=multitask_train,
    data_collator=data_collator,
)

In [None]:
# Run the post-training loop.
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjuanc-perez-rmz[0m ([33mjuanc-perez-rmz-cimat[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
100,1.1362
200,0.9161
300,0.771


Step,Training Loss
100,1.1362
200,0.9161
300,0.771
400,0.7171
500,0.6787
600,0.6722
700,0.6626
800,0.6429
900,0.6636
1000,0.6589




In [13]:
# Resume post-training from the step-2000 checkpoint.
# NOTE(review): this path sits directly under polar/, while output_dir is
# POSTTRAIN_DIR (polar/posttrain); presumably the checkpoint was moved or an
# earlier output_dir was used — confirm the location before re-running.
trainer.train(resume_from_checkpoint="/content/drive/MyDrive/polar/checkpoint-2000")

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 [REDACTED: W&B API key pasted at the menu prompt]


[34m[1mwandb[0m: Enter your choice:

 2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjuanc-perez-rmz[0m ([33mjuanc-perez-rmz-cimat[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Didn't manage to set back the RNG states of the CUDA because of the following error:
 'cuda'
This won't yield the same results as if the training had not been interrupted.


Step,Training Loss
2100,0.3753
2200,0.3481
2300,0.3612
2400,0.3228
2500,0.3762
2600,0.3449
2700,0.3453
2800,0.3466
2900,0.3369
3000,0.3424


TrainOutput(global_step=3468, training_loss=0.14570220476478304, metrics={'train_runtime': 835.9018, 'train_samples_per_second': 66.352, 'train_steps_per_second': 4.149, 'total_flos': 0.0, 'train_loss': 0.14570220476478304, 'epoch': 3.0})

In [14]:
# Destination folder on Drive for the post-trained encoder.
FINAL_ENCODER_DIR = os.path.join(BASE_DIR, "robertuito-posttrained-ppbert-style")
os.makedirs(FINAL_ENCODER_DIR, exist_ok=True)

# Save only the post-trained encoder (the task heads are not saved), plus
# the tokenizer, in the same folder.
model.encoder.save_pretrained(FINAL_ENCODER_DIR)
tokenizer.save_pretrained(FINAL_ENCODER_DIR)


('/content/drive/MyDrive/polar/robertuito-posttrained-ppbert-style/tokenizer_config.json',
 '/content/drive/MyDrive/polar/robertuito-posttrained-ppbert-style/special_tokens_map.json',
 '/content/drive/MyDrive/polar/robertuito-posttrained-ppbert-style/tokenizer.json')

### Adapter

In [20]:
!pip uninstall -y peft
!pip install -U adapters
from transformers.trainer_utils import get_last_checkpoint
from transformers import AutoTokenizer, TrainingArguments, Trainer, EvalPrediction
from adapters import AutoAdapterModel, AdapterConfig
import os
from datasets import load_dataset
import torch
import torch.nn as nn
import random
from sklearn.metrics import f1_score

def set_all_seeds(seed: int = 42):
    """Seed the random number generators used by the adapter stage.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and CUDA), exports
    ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed: Value applied to all generators.
    """
    # Imported locally: this cell's import list does not bring in NumPy, so
    # the function must not rely on an earlier cell having defined `np`.
    import numpy as np

    # 1. Python
    random.seed(seed)

    # 2. Environment variable (affects some backends)
    os.environ["PYTHONHASHSEED"] = str(seed)

    # 3. NumPy
    np.random.seed(seed)

    # 4. PyTorch (CPU and CUDA)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # multi-GPU

    # 5. PyTorch - deterministic behaviour (optional but recommended)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # NOTE: the Hugging Face `set_seed` helper is not called in this cell
    # (the call is disabled in the original).


# Call at the start of the notebook/script
set_all_seeds(42)

# Mount Google Drive at /content/drive (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### Definición de directorios de origen

In [21]:
# Root folder on Drive for this task.
BASE_DIR = "/content/drive/MyDrive/polar/"
# Folder holding the encoder saved at the end of post-training.
FINAL_ENCODER_DIR = os.path.join(BASE_DIR, "robertuito-posttrained-ppbert-style")
# Folder for the adapter run's outputs; created if missing.
ADAPTER_RUN_DIR = os.path.join(BASE_DIR, "polarization_with_adapter")
os.makedirs(ADAPTER_RUN_DIR, exist_ok=True)

print("Encoder dir:", FINAL_ENCODER_DIR)
print("Adapter run dir:", ADAPTER_RUN_DIR)

Encoder dir: /content/drive/MyDrive/polar/robertuito-posttrained-ppbert-style
Adapter run dir: /content/drive/MyDrive/polar/polarization_with_adapter


#### Configuración de adapter

In [43]:
# Name shared by the adapter and its classification head, and the number of
# target classes.
TASK_NAME = "polarization"
NUM_LABELS = 2

# Adapter configuration loaded by its "pfeiffer" identifier.
config = AdapterConfig.load("pfeiffer")

# Load the post-trained encoder from Drive
tokenizer = AutoTokenizer.from_pretrained(FINAL_ENCODER_DIR)
model = AutoAdapterModel.from_pretrained(FINAL_ENCODER_DIR)

# Add the task adapter
model.add_adapter(TASK_NAME, config=config)

# Add the classification head for polarization
model.add_classification_head(
    head_name   = TASK_NAME,
    num_labels  = NUM_LABELS,
    id2label    = {0: "No polarizado", 1: "Polarizado"}
)

# Activate this adapter by default
model.set_active_adapters(TASK_NAME)

# Train only the adapter + head (backbone frozen)
model.train_adapter(TASK_NAME)





#### Carga de dataset

In [44]:
from torch.utils.data import Dataset
import pandas as pd

# Dataset wrapper around the polarization CSV splits
class polar(Dataset):
    """Tokenized view over ``<Dir>/<split>.csv``.

    The CSV must contain a ``text`` column and, when ``use_labels`` is true,
    a ``polarization`` column. All texts are tokenized once at construction
    time (truncated to 128 tokens, padded via ``padding=True``).

    Args:
        Dir: Folder containing the CSV file.
        split: File name without the ``.csv`` extension.
        tokenizer: Callable invoked as
            ``tokenizer(texts, max_length=..., truncation=..., padding=...)``.
        sep: Field separator of the CSV file.
        use_labels: Whether to read and return the ``polarization`` labels.
    """

    def __init__(self, Dir, split, tokenizer, sep='\t', use_labels=True):
        self.use_labels = use_labels

        self.df = pd.read_csv(os.path.join(Dir, split + '.csv'), sep=sep)

        if use_labels:
            self.labels = self.df['polarization']

        self.texts = self.df['text'].tolist()

        tokenizer_options = dict(max_length=128, truncation=True, padding=True)
        self.encodings = tokenizer(self.texts, **tokenizer_options)

    def __len__(self):
        """Return the number of texts in the split."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tensors, raw text and (optionally) label of one row."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['text'] = self.texts[idx]
        if self.use_labels:
            sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [45]:
# Labeled training split (train.csv) and unlabeled test split (test.csv);
# the latter is built without labels and is used below for prediction.
train_dataset = polar(BASE_DIR, 'train', tokenizer)
val_dataset   = polar(BASE_DIR, 'test'  , tokenizer, use_labels=False)

#### Entrenamiento del adapter

In [None]:
# Adapter fine-tuning configuration: 5 epochs, batch size 32, lr 1e-4.
# NOTE(review): checkpoints go to the local "./training_output" folder rather
# than ADAPTER_RUN_DIR on Drive — confirm this is intended.
# NOTE(review): metric_for_best_model is set, but no evaluation schedule or
# best-model loading option is configured here — presumably it has no effect
# in this run; confirm.
training_args = TrainingArguments(
    learning_rate               = 1e-4,
    #weight_decay                 = 0.01,
    num_train_epochs            = 5,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size  = 32,
    logging_steps               = 100,
    output_dir                  = "./training_output",
    overwrite_output_dir        = True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns       = False,
    metric_for_best_model       = "macro_f1"
)

def compute_metrics(p: EvalPrediction):
    """Compute macro-F1 and accuracy from an evaluation prediction.

    Args:
        p: Object exposing ``predictions`` (per-class scores, arg-maxed over
            axis 1) and ``label_ids`` (gold labels).

    Returns:
        Dict with the keys ``macro_f1`` and ``accuracy``.
    """
    predicted = np.argmax(p.predictions, axis=1)
    gold = p.label_ids
    return dict(
        macro_f1=f1_score(gold, predicted, average="macro"),
        accuracy=(predicted == gold).mean(),
    )

# Trainer for the adapter + classification head.
# NOTE(review): val_dataset is built with use_labels=False, so its items carry
# no 'labels'; compute_metrics presumably cannot be evaluated on it — confirm
# before enabling evaluation.
trainer = Trainer(
    model           = model,
    args            = training_args,
    train_dataset   = train_dataset,
    eval_dataset    = val_dataset,
    compute_metrics = compute_metrics,
)
# Overrides a private TrainingArguments field.
# NOTE(review): presumably forces single-GPU execution — confirm intent.
trainer.args._n_gpu = 1


In [55]:
# Run adapter training and keep the value returned by Trainer.train().
train_result = trainer.train()

Step,Training Loss
100,0.3846


#### Guardado del modelo final

In [None]:
# Save the full model (encoder + adapter + head) and the tokenizer to Drive
FULL_MODEL_DIR = os.path.join(ADAPTER_RUN_DIR, "final_full_model")
os.makedirs(FULL_MODEL_DIR, exist_ok=True)
trainer.save_model(FULL_MODEL_DIR)
tokenizer.save_pretrained(FULL_MODEL_DIR)

print("Modelo completo guardado en:", FULL_MODEL_DIR)


#### Evaluación

In [56]:
import numpy as np

# Run inference on the unlabeled split. `predictions.predictions` holds the
# raw outputs of the classification head (logits), one row per example.
predictions = trainer.predict(val_dataset)
logits = np.asarray(predictions.predictions)
predicted_labels = np.argmax(logits, axis=1)

# Turn the logits into probabilities with a numerically stable softmax, so the
# `predicted_prob_*` columns really contain probabilities (the original stored
# the raw logits under those names).
shifted = logits - logits.max(axis=1, keepdims=True)
exp_scores = np.exp(shifted)
predicted_probabilities = exp_scores / exp_scores.sum(axis=1, keepdims=True)

# Per-class columns are only filled when there is more than one output column.
has_class_columns = predicted_probabilities.shape[1] > 1

results_df = pd.DataFrame({
    # Raw texts are already stored on the dataset; no need to index (and
    # tensorize) every item just to read them back.
    'text': list(val_dataset.texts),
    'predicted_label': predicted_labels,
    'predicted_prob_0': predicted_probabilities[:, 0] if has_class_columns else None,
    'predicted_prob_1': predicted_probabilities[:, 1] if has_class_columns else None,
})

# Show how many examples fall into each predicted class
print("\nDistribución de predicciones:")
print(results_df['predicted_label'].value_counts().sort_index())


Distribución de predicciones:
predicted_label
0    175
1    156
Name: count, dtype: int64


#### Almacenamiento de resultados

In [57]:
# Run inference on the unlabeled split and take the arg-max class per example
predictions = trainer.predict(val_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Build the output frame in the same format as the input file
output_df = pd.DataFrame()

# Reuse the original ids when the source CSV had an 'id' column
if hasattr(val_dataset, 'df') and 'id' in val_dataset.df.columns:
    output_df['id'] = val_dataset.df['id'].tolist()
else:
    # Otherwise create sequential ids
    output_df['id'] = [f'pred_{i:06d}' for i in range(len(predicted_labels))]

# The text column is not written to the output (the line adding it is
# disabled in the original); only the prediction is added, as 'polarization'.
output_df['polarization'] = predicted_labels

# Save to CSV
output_file = '/content/test_predictions_with_polarization.csv'
output_df.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"✅ CSV guardado: {output_file}")
# Report the columns actually written; the original message also listed
# 'text', which is not part of the file.
print(f"📊 Formato: {', '.join(output_df.columns)}")
print("\n📄 Vista previa:")
print(output_df[['id', 'polarization']].head())

✅ CSV guardado: /content/test_predictions_with_polarization.csv
📊 Formato: id, text, polarization

📄 Vista previa:
                                     id  polarization
0  spa_b5517c1f058b477c72b68aa6691cfbff             0
1  spa_05984d49e2365ba0542e1a05df578114             1
2  spa_ab648cde892ea2dc2148424020467487             0
3  spa_2750f54273a1914c2ed42d39a45ca02a             1
4  spa_2584268795f274a38c85414c77893dce             1
