In [None]:
import pandas as pd
import json

# Option 1: the file holds a list of JSON objects.
with open("/content/dataset_politico_auto_2000.json", mode="r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Build the frame and discard every row that has at least one missing value.
df = pd.DataFrame(data).dropna()
df.head()

In [None]:
# Integer ids for the three sentiment classes, plus the inverse lookup.
label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {v: k for k, v in label2id.items()}

# Series.map yields NaN for any value that is not a key of label2id, which
# would silently turn the column into floats and only fail much later
# (stratified split / loss computation). Fail loudly here instead.
label_ids = df["label"].map(label2id)
unmapped = label_ids.isna()
if unmapped.any():
    unknown = sorted(df.loc[unmapped, "label"].astype(str).unique())
    raise ValueError(f"Labels missing from label2id: {unknown}")

# Store the ids as real integers and show the class balance.
df["label_id"] = label_ids.astype("int64")
df["label_id"].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage stratified split: hold out 20% of the rows, then cut that hold-out
# in half to obtain the validation and test sets.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label_id"],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["label_id"],
    random_state=42,
)

# Report the number of rows in each split (train, validation, test).
split_sizes = [len(part) for part in (train_df, val_df, test_df)]
print(*split_sizes)

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

model_ckpt = "pysentimiento/robertuito-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def tokenize(batch):
    """Tokenize a batch of examples, truncating each text to at most 128 tokens.

    No padding is applied here: sequences keep their own length and are padded
    per batch by the padding collator the trainers use, which avoids spending
    compute on fixed-length padding.

    Args:
        batch: Mapping with a "text" entry holding the raw strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(batch["text"], truncation=True, max_length=128)


def _build_split(frame):
    """Turn one split DataFrame into a tokenized, torch-formatted Dataset."""
    # preserve_index=False keeps the shuffled pandas index from being stored
    # as an extra "__index_level_0__" column.
    split = Dataset.from_pandas(frame[["text", "label_id"]], preserve_index=False)
    # The label column is exposed under the name "labels".
    split = split.rename_column("label_id", "labels")
    # Tokenize in batches and drop the raw text in the same pass.
    split = split.map(tokenize, batched=True, remove_columns=["text"])
    split.set_format("torch")
    return split


# Same pipeline for the three splits.
train_ds = _build_split(train_df)
val_ds = _build_split(val_df)
test_ds = _build_split(test_df)


In [None]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint for 3-way classification and pass this project's
# label <-> id mappings along with it.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

In [None]:
!pip install -q datasets evaluate accelerate scikit-learn


In [None]:
import evaluate
import numpy as np

# Load both metric implementations once, up front, so compute_metrics can reuse them.
accuracy, f1 = (evaluate.load(metric_name) for metric_name in ("accuracy", "f1"))

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the class with the highest
            logit along the last axis is taken as the prediction.

    Returns:
        dict with two entries: "accuracy" and "f1_macro" (macro-averaged F1).
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    # Both metrics are fed the same prediction/reference pair.
    shared = {"predictions": predicted_ids, "references": labels}
    accuracy_result = accuracy.compute(**shared)
    f1_result = f1.compute(average="macro", **shared)
    return {
        "accuracy": accuracy_result["accuracy"],
        "f1_macro": f1_result["f1"],
    }

In [None]:
!pip install -q "transformers>=4.30.0" "datasets" "evaluate" "accelerate" "scikit-learn"


In [None]:
# Sanity check: print the version of the transformers package that this kernel imports.
import transformers
print(transformers.__version__)

In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback, DataCollatorWithPadding


import inspect

import torch

# Mixed precision is only requested when a CUDA device is present; with
# fp16=True hard-coded the configuration fails on a CPU-only runtime.
use_fp16 = torch.cuda.is_available()

# Evaluate and checkpoint once per epoch; at the end, reload the checkpoint
# with the best (highest) macro-F1 on the validation set.
args = TrainingArguments(
    output_dir="beto-politica-ec",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    logging_steps=50,
    fp16=use_fp16,
    report_to="none",
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Newer transformers releases deprecate Trainer's `tokenizer` argument in
# favour of `processing_class`; use whichever name the installed version
# exposes so the cell runs without warnings or errors on either.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stop early after 2 evaluations without improvement of the tracked metric.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    **{tokenizer_kwarg: tokenizer},
)

In [None]:
# Launch fine-tuning; as the last expression of the cell, the call's return value is shown as output.
trainer.train()

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

import inspect

model_ckpt = "pysentimiento/robertuito-sentiment-analysis"

# Reload the checkpoint as published (no label overrides, no fine-tuning)
# to serve as the baseline.
base_model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)

# Evaluation-only configuration: nothing is trained with these arguments here.
eval_args = TrainingArguments(
    output_dir="beto-base-eval",
    per_device_eval_batch_size=32,
    report_to="none",
)

# Newer transformers releases deprecate Trainer's `tokenizer` argument in
# favour of `processing_class`; use whichever name the installed version
# exposes.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

base_trainer = Trainer(
    model=base_model,
    args=eval_args,
    eval_dataset=test_ds,  # the same test_ds used for the fine-tuned model
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)

base_metrics = base_trainer.evaluate()
print(base_metrics)

In [None]:
import pandas as pd

# Baseline metrics as a one-row frame.
base_df = pd.DataFrame([base_metrics])

# Score the fine-tuned model on the same test split as the baseline.
ft_metrics = trainer.evaluate(test_ds)

# One row per model, then show only the three headline columns.
row_names = ["modelo_base", "modelo_finetuned"]
res = pd.DataFrame([base_metrics, ft_metrics], index=row_names)
res.loc[:, ["eval_accuracy", "eval_f1_macro", "eval_loss"]]


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predictions of the fine-tuned model on the test split.
preds = trainer.predict(test_ds)
y_true = preds.label_ids
y_pred = np.argmax(preds.predictions, axis=-1)

# Pin the class ids explicitly (derived from id2label instead of a literal 3)
# so both reports always cover every class in a fixed order; without `labels=`
# classification_report raises when a class is absent from this split because
# target_names no longer matches the classes it finds.
class_ids = sorted(id2label)
class_names = [id2label[i] for i in class_ids]

print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names))
print(confusion_matrix(y_true, y_pred, labels=class_ids))

In [None]:
# Write the trained model and its tokenizer into the same output directory.
final_dir = "fineTuning/modelo_final"
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)