In [None]:
nvidia-smi

NameError: name 'nvidia' is not defined

In [None]:
!pip install transformers[torch] datasets scikit-learn --quiet

# Asegurarnos de tener una versión reciente de huggingface_hub si es necesario, aunque suele instalarse con transformers
!pip install --upgrade huggingface_hub --quiet



In [1]:
import torch
print(torch.__version__)
print(torch.cuda.get_device_name(0))
print(torch.cuda.is_available())

2.5.1
NVIDIA GeForce RTX 3050
True


In [2]:
import transformers
import huggingface_hub

print(transformers.__version__)
print(huggingface_hub.__version__)


4.52.4
0.33.0


In [7]:
# Importación de librerías
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate
import torch

In [34]:
# Cargar el dataset (puedes cambiar aquí el nombre del archivo para cada tarea)
df = pd.read_csv("agresividad_civil_comments_traducido.csv")  # Cambia el archivo para cada tarea
df = df.rename(columns={"agresividad_nivel": "labels"})
# Asegúrate que tus columnas son: text, label (numéricas)
# Convertir la columna 'labels' a tipo entero y restar 1 para hacerlas cero-indexadas
df['labels'] = df['labels'].astype(int) - 1
df['labels'] = df['labels'].replace({3: 2, 4: 2})

# --- Added check for label values ---
print("Unique label values after subtraction:", df['labels'].unique())
if not all(label in [0, 1, 2] for label in df['labels'].unique()):
    print("Warning: Labels are not in the expected range [0, 1, 2]. Please check your data.")
# --- End of added check ---


print(df.head())

Unique label values after subtraction: [0 1 2]
                                            texto_en  labels  \
0  This is so cool. It's like, 'would you want yo...       0   
1  Thank you!! This would make my life a lot less...       0   
2  This is such an urgent design problem; kudos t...       0   
3  Is this something I'll be able to install on m...       0   
4                        hahahahahahahahhha suck it.       0   

                                            texto_es  
0  Esto es tan genial. Es como, '¿Quieres que tu ...  
1  Gracias!! Esto haría mi vida mucho menos ansie...  
2  Este es un problema de diseño tan urgente; gra...  
3  ¿Es esto algo que podré instalar en mi sitio? ...  
4                                Jajajajaja chúpalo.  


In [37]:
print(df["labels"].unique())
print(df.dtypes)
print(df['labels'].value_counts())
print(df['labels'].value_counts(normalize=True) * 100)

[0 1 2]
texto_en    object
labels       int64
texto_es    object
dtype: object
labels
0    10000
1    10000
2     6407
Name: count, dtype: int64
labels
0    37.868747
1    37.868747
2    24.262506
Name: proportion, dtype: float64


In [40]:
# ========= PARTE 1: Preparar datos =========
# División de datos
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["labels"], random_state=42)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# ========= PARTE 2: Tokenización =========

model_name = "Brandon-h/distilbert-finetuned-spanish-offensive-language"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["texto_es"], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# ========= PARTE 3: Métricas =========

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

# ========= PARTE 4: Cargar modelo y congelar encoder =========

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    ignore_mismatched_sizes=True
)

# Fase 1: congelar encoder (entrenar solo la capa de clasificación)
for param in model.distilbert.parameters():
    param.requires_grad = False


Map:   0%|          | 0/21125 [00:00<?, ? examples/s]

Map:   0%|          | 0/5282 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at Brandon-h/distilbert-finetuned-spanish-offensive-language and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
# Argumentos de entrenamiento
training_args = TrainingArguments(
    output_dir="./resultados_agresividad",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

# Entrenador
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Entrenar
trainer.train()

# Evaluación final
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
print(classification_report(test_df['labels'], preds))


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7236,0.787347,0.629496
2,0.767,0.760686,0.649186
3,0.8229,0.754214,0.652026


              precision    recall  f1-score   support

           0       0.71      0.82      0.76      2000
           1       0.59      0.61      0.60      2000
           2       0.66      0.45      0.54      1282

    accuracy                           0.65      5282
   macro avg       0.65      0.63      0.63      5282
weighted avg       0.65      0.65      0.64      5282



In [42]:
# Guardar modelo y tokenizer
output_dir = "modelo_agresividad"

trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Modelo y tokenizer guardados en {output_dir}")


Modelo y tokenizer guardados en modelo_agresividad


BLOQUE 2 – Cargar el modelo y tokenizer guardados (sin reentrenar)

In [43]:
from transformers import BertTokenizer, BertForSequenceClassification

# Dispositivo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Ruta del modelo entrenado
model_path = "modelo_agresividad"

# Cargar tokenizer y modelo
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.to(device)
model.eval()

# Ahora puedes usarlo con un Trainer, pipeline o a mano

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(31002, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
# Ejemplo con Trainer
from transformers import Trainer

trainer = Trainer(
    model=model,
    tokenizer=tokenizer
    # Puedes agregar args, dataset, etc.
)


  trainer = Trainer(


In [44]:
# Lista de textos a clasificar
textos = [
    "Eres un idiota y no deberías estar aquí.",
    "Buenos días, ¿podrías ayudarme con esto?",
    "Eres un hijo de puta",
    "No estoy de acuerdo con eso, pero respeto tu opinión."
]

In [45]:
import torch.nn.functional as F
resultados = []
# Etiquetas
label_map = {
    0: "no agresivo",
    1: "neutro",
    2: "agresivo"
}

# Tokenizar todos los textos y mover a dispositivo
inputs = tokenizer(textos, return_tensors="pt", padding=True, truncation=True, max_length=128)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Realizar inferencia
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    probs = F.softmax(logits, dim=1).cpu().tolist()
    pred_indices = torch.argmax(logits, dim=1).cpu().tolist()
    # preds = outputs.logits.argmax(dim=1).tolist()

# Mostrar resultados
for texto, idx, prob in zip(textos, pred_indices, probs):
    resultados.append({
        "texto": texto,
        "etiqueta": label_map[idx],
        "indice": idx,
        "probabilidades": prob
    })


for r in resultados:
    print(f"Texto: {r['texto']}")
    print(f"→ Etiqueta: {r['etiqueta']} (índice {r['indice']})")
    print(f"→ Probabilidades: {r['probabilidades']}\n")

Texto: Eres un idiota y no deberías estar aquí.
→ Etiqueta: neutro (índice 1)
→ Probabilidades: [0.03878844156861305, 0.4972328841686249, 0.46397864818573]

Texto: Buenos días, ¿podrías ayudarme con esto?
→ Etiqueta: no agresivo (índice 0)
→ Probabilidades: [0.8399632573127747, 0.1335805058479309, 0.026456264778971672]

Texto: Eres un hijo de puta
→ Etiqueta: agresivo (índice 2)
→ Probabilidades: [0.04046919569373131, 0.4166504740715027, 0.5428802967071533]

Texto: No estoy de acuerdo con eso, pero respeto tu opinión.
→ Etiqueta: no agresivo (índice 0)
→ Probabilidades: [0.7541329860687256, 0.2074529230594635, 0.038414131850004196]

