In [None]:
# Show the GPU / driver status. The leading `!` hands the line to the system shell;
# without it Python parses `nvidia-smi` as the expression `nvidia - smi` and raises NameError.
!nvidia-smi

NameError: name 'nvidia' is not defined

In [None]:
# Install the Hugging Face stack and scikit-learn.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel;
# the extras spec is quoted so the shell does not try to glob the square brackets.
%pip install --quiet "transformers[torch]" datasets scikit-learn

# Make sure a recent huggingface_hub is present if needed, although it is
# normally installed together with transformers.
%pip install --quiet --upgrade huggingface_hub



In [None]:
import torch
# Report the PyTorch build, the GPU name (only when CUDA is usable) and the CUDA flag.
# Availability is checked first: asking for the device name on a machine without
# CUDA raises instead of printing anything useful.
print(torch.__version__)
cuda_available = torch.cuda.is_available()
if cuda_available:
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device detected; computations will run on CPU.")
print(cuda_available)

2.5.1
NVIDIA GeForce RTX 3050
True


In [None]:
import transformers
import huggingface_hub

# Print the installed version of each Hugging Face package, one per line
# (transformers first, then huggingface_hub).
for hf_package in (transformers, huggingface_hub):
    print(hf_package.__version__)


4.52.4
0.33.0


In [None]:
# Importación de librerías
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

In [None]:
# Load the dataset (swap the file name here to run a different task).
# The CSV must provide a `formalidad_nivel` column with integer-like values;
# it is renamed to `labels` right after reading.
df = (
    pd.read_csv("formalidad_123_bookcorpus_es.csv")
    .rename(columns={"formalidad_nivel": "labels"})
)

# Cast the labels to int and shift them down by one so they are zero-indexed.
df['labels'] = df['labels'].astype(int) - 1

# Sanity check: only warn (do not stop) when a label falls outside 0, 1, 2.
found_labels = df['labels'].unique()
print("Unique label values after subtraction:", found_labels)
if any(value not in (0, 1, 2) for value in found_labels):
    print("Warning: Labels are not in the expected range [0, 1, 2]. Please check your data.")

print(df.head())

Unique label values after subtraction: [0 1 2]
                                            sentence  labels  \
0  most of what you will want to see can be seen ...       0   
1         i don't want to leak anything huge on her.       0   
2                              'nothing has changed'       0   
3                   i'm running breezy on my laptop.       0   
4                                         thanks....       0   

                                            texto_es  
0  La mayor parte de lo que querrás ver se puede ...  
1             No quiero filtrar nada enorme en ella.  
2                                 "Nada ha cambiado"  
3               Estoy corriendo breezy en mi laptop.  
4                                         Gracias...  


In [None]:
# Stratified 80/20 train/test split with a fixed seed.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['labels'],
)

# Wrap both pandas frames as Hugging Face Datasets.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

# Load the BETO tokenizer.
model_name = "dccuchile/bert-base-spanish-wwm-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``texto_es`` field of a batch with the notebook-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: Mapping with a ``"texto_es"`` entry holding the text(s) to encode.

    Returns:
        Whatever ``tokenizer`` returns for that text.
    """
    spanish_texts = examples["texto_es"]
    return tokenizer(
        spanish_texts,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

# Tokenize both splits in batches.
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Expose only the model inputs and the labels, as PyTorch tensors.
torch_columns = ['input_ids', 'attention_mask', 'labels']
for split in (train_dataset, test_dataset):
    split.set_format(type='torch', columns=torch_columns)

# Load the model; set the number of classes for the task at hand (3 or 4).
num_labels = 3
model = BertForSequenceClassification.from_pretrained(
    model_name,
    use_safetensors=True,  # this fix avoids the error without requiring torch >= 2.6
    num_labels=num_labels,
)


Map:   0%|          | 0/4800 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Quick inspection: distinct label values, then the dtype of every column.
distinct_labels = df["labels"].unique()
column_dtypes = df.dtypes
print(distinct_labels)
print(column_dtypes)


[0 1 2]
sentence    object
labels       int64
texto_es    object
dtype: object


In [None]:
# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation hyper-parameters.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Build the Trainer.
# NOTE(review): eval_dataset is the same test split that is scored below. If
# training_args keeps load_best_model_at_end=True, checkpoint selection sees the
# test data — consider carving out a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model.
trainer.train()

# Final evaluation on the held-out split.
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
# Score against the labels returned by predict() itself, so predictions and
# references are aligned regardless of how test_df is indexed or ordered.
print(classification_report(predictions.label_ids, preds))


Epoch,Training Loss,Validation Loss
1,0.1678,0.257227
2,0.1849,0.252446
3,0.0828,0.305253


              precision    recall  f1-score   support

           0       0.91      0.93      0.92       400
           1       0.94      0.94      0.94       400
           2       0.98      0.96      0.97       400

    accuracy                           0.94      1200
   macro avg       0.94      0.94      0.94      1200
weighted avg       0.94      0.94      0.94      1200



In [None]:
# Save the fine-tuned model and its tokenizer.
# The model is already trained by the earlier trainer.train() call; training again
# here would fine-tune the same weights for additional epochs before saving, so
# this cell only persists the result.
output_dir = "modelo_guardado"

trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Modelo y tokenizer guardados en {output_dir}")


Epoch,Training Loss,Validation Loss
1,0.0365,0.29979
2,0.0318,0.370792
3,0.0005,0.424554


Modelo y tokenizer guardados en modelo_guardado


BLOQUE 2 – Cargar el modelo y tokenizer guardados (sin reentrenar)

In [None]:
# Imports are repeated here so this block can run on a fresh kernel without
# executing the training cells first.
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Device: use the GPU when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Path of the trained model: the directory the notebook saves the model and tokenizer to.
model_path = "modelo_guardado"

# Load tokenizer and model; chaining .to(device) into the assignment keeps the
# cell from dumping the full module tree as its output.
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path).to(device)

# Ahora puedes usarlo con un Trainer, pipeline o a mano

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Example: wrap the loaded model in a Trainer.
from transformers import Trainer

trainer = Trainer(
    model=model,
    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
    processing_class=tokenizer,
    # args, datasets, etc. can be added here.
)


  trainer = Trainer(


In [None]:
# Sample sentences to classify (one string per item).
textos = [
    "Los niños juegan en el parque con sus juguetes.",
    "Este libro es perfecto para adolescentes de secundaria.",
    "Las políticas de inversión a largo plazo deben evaluarse con criterio.",
]

In [None]:
import torch.nn.functional as F
# Human-readable name for each class index.
label_map = {0: "informal", 1: "neutro", 2: "formal"}

# Tokenize the whole batch at once, then move every tensor to the target device.
inputs = tokenizer(textos, return_tensors="pt", padding=True, truncation=True, max_length=128)
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# Forward pass in eval mode with gradient tracking switched off.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    probs = F.softmax(logits, dim=1).cpu().tolist()
    pred_indices = logits.argmax(dim=1).cpu().tolist()

# One record per input text: predicted label, its index and the class probabilities.
resultados = [
    {
        "texto": texto,
        "etiqueta": label_map[idx],
        "indice": idx,
        "probabilidades": prob,
    }
    for texto, idx, prob in zip(textos, pred_indices, probs)
]

# Print the records, each followed by a blank line.
for r in resultados:
    print(
        f"Texto: {r['texto']}",
        f"→ Etiqueta: {r['etiqueta']} (índice {r['indice']})",
        f"→ Probabilidades: {r['probabilidades']}\n",
        sep="\n",
    )

Texto: Los niños juegan en el parque con sus juguetes.
→ Etiqueta: informal (índice 0)
→ Probabilidades: [0.9039178490638733, 0.08085069805383682, 0.01523139514029026]

Texto: Este libro es perfecto para adolescentes de secundaria.
→ Etiqueta: informal (índice 0)
→ Probabilidades: [0.9993970394134521, 0.00029426312539726496, 0.00030862606945447624]

Texto: Las políticas de inversión a largo plazo deben evaluarse con criterio.
→ Etiqueta: formal (índice 2)
→ Probabilidades: [0.0030349406879395247, 0.0002135898103006184, 0.9967514276504517]

