In [3]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_scheduler
from torch.optim import AdamW
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Cargar el dataset
df = pd.read_csv("/disaster_preprocessed.csv")


# Verificar si GPU está disponible y configurar el dispositivo
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", device)


Device: cuda


In [4]:
# Mapear etiquetas de `Informativeness_label`
label_mapping = {
    'informative': 1,
    'not related or not informative': 0
}
df['Informativeness_label_mapped'] = df['Informativeness_label'].map(label_mapping)
df = df.dropna(subset=['Informativeness_label_mapped'])

In [5]:
# División inicial: Training (80%), Validation (5000), Test (5000)
train_df, temp_df = train_test_split(df, test_size=10000, random_state=42, stratify=df['Informativeness_label_mapped'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['Informativeness_label_mapped'])

print(f"Tamaño de training: {len(train_df)}, validación: {len(val_df)}, prueba: {len(test_df)}")

Tamaño de training: 64346, validación: 5000, prueba: 5000


In [7]:
from transformers import BertTokenizer

# Tokenizer preentrenado
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [8]:
from sklearn.preprocessing import LabelEncoder

# Codificación de etiquetas
train_labels = train_df['Informativeness_label_mapped'].values
val_labels = val_df['Informativeness_label_mapped'].values
test_labels = test_df['Informativeness_label_mapped'].values

In [9]:
# Tokenización
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Dataset compatible con PyTorch
class DisasterDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors='pt'
        )
        return {key: val.squeeze() for key, val in encoding.items()}, torch.tensor(self.labels[idx])

# Crear datasets y DataLoaders
train_dataset = DisasterDataset(list(train_df['ProcessedText']), train_labels, tokenizer)
val_dataset = DisasterDataset(list(val_df['ProcessedText']), val_labels, tokenizer)
test_dataset = DisasterDataset(list(test_df['ProcessedText']), test_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

In [10]:
print(train_df.columns)
print(train_df['Informativeness_label_mapped'].value_counts())


Index(['InformationType', 'event', 'TweetID', 'TweetText', 'location', 'year',
       'Informativeness_label', 'ProcessedText', 'ProcessedText_length',
       'Informativeness_label_mapped'],
      dtype='object')
Informativeness_label_mapped
1    37684
0    26662
Name: count, dtype: int64


In [12]:
# Función de evaluación
def evaluate(loader, model):
    model.eval()
    predictions, true_labels = [], []
    total_loss = 0
    loss_fn = torch.nn.CrossEntropyLoss()

    with torch.no_grad():
        for batch in loader:
            batch_inputs, batch_labels = batch
            batch_inputs = {key: val.to(device) for key, val in batch_inputs.items()}
            batch_labels = batch_labels.to(device)

            outputs = model(**batch_inputs)
            loss = loss_fn(outputs.logits, batch_labels)
            total_loss += loss.item()

            predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            true_labels.extend(batch_labels.cpu().numpy())

    return total_loss / len(loader), classification_report(true_labels, predictions, digits=3)

# Early Stopping y Guardar el mejor modelo
early_stopping_patience = 2
best_val_loss = float('inf')
early_stopping_counter = 0

# Cross-Validation con StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_results = []

for fold, (train_index, val_index) in enumerate(skf.split(train_df['ProcessedText'], train_df['Informativeness_label_mapped'])):
    train_texts = train_df.iloc[train_index]['ProcessedText'].values
    val_texts = train_df.iloc[val_index]['ProcessedText'].values
    train_labels_fold = train_df.iloc[train_index]['Informativeness_label_mapped'].values
    val_labels_fold = train_df.iloc[val_index]['Informativeness_label_mapped'].values

    train_dataset_fold = DisasterDataset(train_texts, train_labels_fold, tokenizer)
    val_dataset_fold = DisasterDataset(val_texts, val_labels_fold, tokenizer)

    train_loader_fold = DataLoader(train_dataset_fold, batch_size=32, shuffle=True)
    val_loader_fold = DataLoader(val_dataset_fold, batch_size=32)

    # Modelo BERT para este fold
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=2,
        hidden_dropout_prob=0.3
    )
    model.to(device)

    # Optimización y Scheduler
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    scheduler = get_scheduler(
        'linear',
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_loader_fold) * 3
    )
    loss_fn = torch.nn.CrossEntropyLoss()

    # Entrenamiento con Early Stopping
    for epoch in range(10):  # Hasta 10 épocas, pero con Early Stopping
        model.train()
        total_loss = 0
        for batch in train_loader_fold:
            batch_inputs, batch_labels = batch
            batch_inputs = {key: val.to(device) for key, val in batch_inputs.items()}
            batch_labels = batch_labels.to(device)

            optimizer.zero_grad()
            outputs = model(**batch_inputs, labels=batch_labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()
            scheduler.step()

        # Evaluación
        val_loss, val_report = evaluate(val_loader_fold, model)
        print(f"Epoch {epoch + 1} - Training Loss: {total_loss / len(train_loader_fold)}, Validation Loss: {val_loss}")

        # Guardar el mejor modelo
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), f"best_model_fold_{fold + 1}.pt")
            early_stopping_counter = 0
            print(" Mejor modelo guardado")
        else:
            early_stopping_counter += 1
            print(f"Early Stopping Counter: {early_stopping_counter}/{early_stopping_patience}")

        # Verificar Early Stopping
        if early_stopping_counter >= early_stopping_patience:
            print(" Early stopping activado.")
            break

    fold_results.append(best_val_loss)

# Promedio de validación cruzada
print(f"\n📊 Cross-validation average loss: {np.mean(fold_results):.4f} ± {np.std(fold_results):.4f}")

# Evaluación final en el conjunto de prueba
model.load_state_dict(torch.load("best_model_fold_1.pt"))
test_loss, test_report = evaluate(test_loader, model)
print(f"Test Loss: {test_loss}")
print(f"Test Classification Report:\n{test_report}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Training Loss: 0.4018289534070612, Validation Loss: 0.36914002468287205
 Mejor modelo guardado
Epoch 2 - Training Loss: 0.33523269796441846, Validation Loss: 0.36497097991166577
 Mejor modelo guardado
Epoch 3 - Training Loss: 0.30244950238890794, Validation Loss: 0.3686865176219798
Early Stopping Counter: 1/2
Epoch 4 - Training Loss: 0.291620808283667, Validation Loss: 0.3686865176219798
Early Stopping Counter: 2/2
 Early stopping activado.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Training Loss: 0.40335273499108015, Validation Loss: 0.3859610794199903
Early Stopping Counter: 3/2
 Early stopping activado.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Training Loss: 0.4003746763214404, Validation Loss: 0.390299836048861
Early Stopping Counter: 4/2
 Early stopping activado.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Training Loss: 0.40340957499350577, Validation Loss: 0.3810268862001949
Early Stopping Counter: 5/2
 Early stopping activado.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Training Loss: 0.4053020338192721, Validation Loss: 0.3429148486323155
 Mejor modelo guardado
Epoch 2 - Training Loss: 0.336424826056858, Validation Loss: 0.3513126565564063
Early Stopping Counter: 1/2
Epoch 3 - Training Loss: 0.30623411803029943, Validation Loss: 0.3669780609902762
Early Stopping Counter: 2/2
 Early stopping activado.

📊 Cross-validation average loss: 0.3606 ± 0.0088


  model.load_state_dict(torch.load("best_model_fold_1.pt"))


Test Loss: 0.3664361433523476
Test Classification Report:
              precision    recall  f1-score   support

           0      0.881     0.751     0.811      2072
           1      0.841     0.928     0.882      2928

    accuracy                          0.855      5000
   macro avg      0.861     0.840     0.847      5000
weighted avg      0.857     0.855     0.853      5000



In [18]:
# Modelo
model.save_pretrained("bert_informativeness_classifier")

# Tokenizer
tokenizer.save_pretrained("bert_informativeness_classifier")

print("Modelo y tokenizer guardados en 'bert_event_classifier'")

Modelo y tokenizer guardados en 'bert_event_classifier'


In [19]:
import joblib

# Guardar el diccionario como archivo pickle
joblib.dump(label_mapping, 'label_mapping.pkl')
print("Diccionario de mapeo guardado como label_mapping.pkl")

Diccionario de mapeo guardado como label_mapping.pkl


In [20]:
!zip -r bert_informativeness_classifier.zip bert_informativeness_classifier


  adding: bert_informativeness_classifier/ (stored 0%)
  adding: bert_informativeness_classifier/config.json (deflated 49%)
  adding: bert_informativeness_classifier/tokenizer_config.json (deflated 75%)
  adding: bert_informativeness_classifier/model.safetensors (deflated 7%)
  adding: bert_informativeness_classifier/special_tokens_map.json (deflated 42%)
  adding: bert_informativeness_classifier/vocab.txt (deflated 53%)
