In [1]:
from transformers import BertForSequenceClassification, AdamW, BertTokenizer
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [2]:
# Verifica si hay una GPU disponible y, en caso contrario, utiliza la CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Carga el modelo pre-entrenado y el tokenizador
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5).to(device)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [3]:
df = pd.read_json('dataTrain.json')
df_test = pd.read_json('dataTest.json')

In [4]:
df.columns

Index(['id', 'text', 'label'], dtype='object')

In [5]:
df = df[:200]

In [6]:
# Supongamos que `texts` es una lista de textos y `labels` es una lista de etiquetas
texts_train, texts_val, labels_train, labels_val = train_test_split(df.text, df.label, test_size=0.2)

# Codifica los textos
train_encodings = tokenizer(texts_train.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(texts_val.tolist(), truncation=True, padding=True)

# Convertir las representaciones tokenizadas a tensores
train_input_ids = torch.tensor(train_encodings['input_ids'])
train_attention_mask = torch.tensor(train_encodings['attention_mask'])

val_input_ids = torch.tensor(val_encodings['input_ids'])
val_attention_mask = torch.tensor(val_encodings['attention_mask'])

label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(labels_train)
val_labels_encoded = label_encoder.transform(labels_val)

# Convierte las etiquetas a tensores
train_labels = torch.tensor(train_labels_encoded)
val_labels = torch.tensor(val_labels_encoded)

# Crea los conjuntos de datos
train_dataset = torch.utils.data.TensorDataset(train_input_ids, train_attention_mask, train_labels)
val_dataset = torch.utils.data.TensorDataset(val_input_ids, val_attention_mask, val_labels)

print('datasets')


# Crea los dataloaders
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=1)
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=1)

print('dataloaders')

# Define el optimizador
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

print('opitmizer')

datasets
dataloaders
opitmizer


In [7]:
epochs = 3

# Entrena el modelo
for epoch in range(epochs):
    total_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        model.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

In [None]:

# Carga el modelo BERT preentrenado
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Inicializa la lista para las predicciones
predictions = []

# Cambia al modo de evaluación
model.eval()

# Itera sobre los datos de prueba
for item in df_test:
    # Tokeniza el texto
    inputs = tokenizer(item['text'], return_tensors='pt')
    
    # Realiza una pasada hacia adelante (predice)
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Obtiene las predicciones
    logits = outputs[0]
    predicted_class = torch.argmax(logits, dim=1)
    
    # Almacena las predicciones
    predictions.append(predicted_class.item())


In [8]:
from sklearn.metrics import accuracy_score, classification_report

# Cambia al modo de evaluación
model.eval()

# Inicializa las listas para las verdaderas y las predicciones
true_labels = []
pred_labels = []

# Itera sobre los lotes en el dataloader de validación
for batch in validation_dataloader:
    # Agrega el lote al dispositivo
    batch = tuple(t.to(device) for t in batch)
    
    # Desempaqueta los inputs del dataloader
    b_input_ids, b_input_mask, b_labels = batch
    
    # No calcula ni almacena los gradientes para ahorrar memoria y mejorar la velocidad
    with torch.no_grad():
        # Realiza una pasada hacia adelante (predice)
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
    # Obtiene las predicciones
    logits = outputs[0]

    # Mueve los logits y las etiquetas a la CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    # Almacena las verdaderas y las predicciones
    true_labels.append(label_ids)
    pred_labels.append(np.argmax(logits, axis=1))

# Aplana las listas de verdaderas y predicciones
true_labels = [item for sublist in true_labels for item in sublist]
pred_labels = [item for sublist in pred_labels for item in sublist]

# Calcula la precisión
acc = accuracy_score(true_labels, pred_labels)

print('La precisión del modelo en el conjunto de validación es: {:.3f}'.format(acc))
print('Reporte de clasificación:')
print(classification_report(true_labels, pred_labels))


La precisión del modelo en el conjunto de validación es: 1.000
Reporte de clasificación:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        40

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40

