# Event Classification Model

In [None]:
# Mount Google Drive into the Colab filesystem so that files under
# /content/drive can be read by the cells that follow.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Librerías & Data Load

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [None]:
# Load the preprocessed disaster CSV from the mounted Drive folder.
# NOTE(review): the rendered preview includes an `Unnamed: 0` column, which looks
# like a row index written out when the CSV was saved; passing `index_col=0`
# would absorb it — confirm before changing.
dataset = pd.read_csv('/content/drive/MyDrive/tfm_DataScience/dataset/disaster_preprocessed.csv', encoding = 'utf-8')

# Last expression of the cell: renders the first rows as the cell output.
dataset.head()

Unnamed: 0,InformationType,event,TweetID,TweetText,location,year,Informativeness_label,ProcessedText,ProcessedText_length
0,not related or not informative,landslides,'577702417766060032',HAPPY SAINT PATRIC'S DAAAAAAAAAAAAY ÃÂ _ÃÂ0...,Worldwide,unknown,not related or not informative,happy saint patric ' s daaaaaaaaaaaay,6
1,caution and advice,hurricane,7.60519e+16,@7News: New tornado warning for storm in Sprin...,Missouri,2011,informative,new tornado warning for storm in springfield /...,47
2,not related or not informative,landslides,'573782308551004160',"""@irishcraicc: ÃÂ __8ÃÂ¤ÃÂÃÂÃÂ¥ÃÂÃÂ...",Worldwide,unknown,not related or not informative,""" 8 quote and rt this for free follows a # kca...",20
3,donations and volunteering,hurricane,7.3445996344e+16,As organizations and first responders take adv...,Missouri,2011,informative,as organizations and first responders take adv...,40
4,not related or not informative,hurricane,'541686284302188546',PS 63:3-4 Ur love Lord s better than life my l...,Philipinnes,2014,not related or not informative,ps 63 : 3 - 4 ur love lord s better than life ...,42


## Training-Test split

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split, stratified on the `event` label at both stages:
#   1. hold out a fixed 10,000 rows; every remaining row becomes the training set
#      (an absolute row count, not an 80% fraction of the data);
#   2. halve the hold-out into validation and test (5,000 rows each).
# The same seed is used for both stages so the partition is reproducible.
split_seed = 42
holdout_rows = 10000

train_df, temp_df = train_test_split(
    dataset,
    test_size=holdout_rows,
    stratify=dataset['event'],
    random_state=split_seed,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['event'],
    random_state=split_seed,
)

# Report the resulting split sizes.
print(f"Tamaño de training: {len(train_df)}, validación: {len(val_df)}, prueba: {len(test_df)}")


Tamaño de training: 64346, validación: 5000, prueba: 5000


## Tokenización

In [None]:
from transformers import BertTokenizer

# Pretrained tokenizer matching the 'bert-base-uncased' checkpoint used for the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label encoding: learn the event -> integer id mapping from the training split only,
# then apply that same mapping to all three splits.
label_encoder = LabelEncoder().fit(train_df['event'])
train_labels, val_labels, test_labels = (
    label_encoder.transform(split_df['event'])
    for split_df in (train_df, val_df, test_df)
)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# PyTorch-compatible dataset
class DisasterDataset(Dataset):
    """Map-style dataset that tokenizes one text per access and pairs it with its label.

    Args:
        texts: Indexable sequence of input strings.
        labels: Indexable sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable tokenizer accepting ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors`` keyword arguments and returning
            a mapping of tensors that carry a leading batch axis of size 1.
        max_length: Length every example is padded/truncated to. Defaults to 128,
            the value that used to be hard-coded.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Fail early instead of hitting an IndexError mid-epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for example ``idx``.

        ``features`` maps each tokenizer output name to a tensor without the
        batch axis; ``label`` is a scalar ``torch.long`` tensor.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        # squeeze(0) removes only the batch axis added by return_tensors='pt';
        # a bare squeeze() would also drop the sequence axis when its length is 1.
        features = {key: val.squeeze(0) for key, val in encoding.items()}
        # Explicit dtype: class-index targets must be int64 for cross-entropy.
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return features, label

# Build one dataset per split from its processed text and encoded labels,
# all sharing the same tokenizer.
def _make_split_dataset(split_df, encoded_labels):
    """Wrap a split's `ProcessedText` column and its label array in a DisasterDataset."""
    return DisasterDataset(split_df['ProcessedText'].tolist(), encoded_labels, tokenizer)

train_dataset = _make_split_dataset(train_df, train_labels)
val_dataset = _make_split_dataset(val_df, val_labels)
test_dataset = _make_split_dataset(test_df, test_labels)

# Batches of 32 everywhere; only the training loader shuffles its examples.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

## Modelo BERT

In [None]:
from transformers import BertForSequenceClassification

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# BERT encoder plus a sequence-classification head with one output per event class.
# hidden_dropout_prob is set to 0.3 as regularisation against overfitting.
n_event_classes = len(label_encoder.classes_)
event_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    hidden_dropout_prob=0.3,
    num_labels=n_event_classes,
)

# Move the weights to the selected device. As the last expression of the cell,
# this also renders the module summary as the cell output.
event_model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.3, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Función de evaluación

from sklearn.metrics import classification_report

def evaluate(loader, model):
    """Evaluate ``model`` on every batch produced by ``loader``.

    The function uses the ``model`` it is given (not a global model) and computes
    cross-entropy itself, so it does not depend on a ``loss_fn`` defined in a
    later cell.

    Args:
        loader: Sized iterable yielding ``(inputs, labels)`` batches, where
            ``inputs`` is a dict of tensors passed to the model as keyword
            arguments and ``labels`` holds integer class ids.
        model: Classifier whose forward output exposes ``.logits``.

    Returns:
        Tuple ``(mean_loss, report)``: the mean of the per-batch cross-entropy
        losses and the text report from ``classification_report`` (3 digits),
        with rows named after ``label_encoder.classes_``.
    """
    # Send batches to wherever the model's weights already live.
    model_device = next(model.parameters()).device

    model.eval()
    predictions, true_labels = [], []
    total_loss = 0.0

    with torch.no_grad():
        for batch_inputs, batch_labels in loader:
            batch_inputs = {key: val.to(model_device) for key, val in batch_inputs.items()}
            batch_labels = batch_labels.to(model_device)

            logits = model(**batch_inputs).logits
            # Same criterion as torch.nn.CrossEntropyLoss() with default settings.
            total_loss += torch.nn.functional.cross_entropy(logits, batch_labels).item()

            predictions.extend(logits.argmax(dim=1).cpu().numpy())
            true_labels.extend(batch_labels.cpu().numpy())

    class_names = label_encoder.classes_
    report = classification_report(
        true_labels,
        predictions,
        # Pin the label set so the report still works when a class is absent
        # from this split (otherwise target_names would not line up).
        labels=list(range(len(class_names))),
        target_names=class_names,
        digits=3,
    )
    return total_loss / len(loader), report


## Entrenamiento

In [None]:
from sklearn.model_selection import StratifiedKFold
import numpy as np
# AdamW is taken from torch.optim below: the copy shipped in `transformers` is
# deprecated in favour of the PyTorch implementation.
from transformers import get_scheduler

# Cross-validation / fine-tuning settings (previously inline magic numbers).
N_SPLITS = 3
N_EPOCHS = 3          # used for both the epoch loop and the scheduler's step budget
BATCH_SIZE = 32
LEARNING_RATE = 2e-5

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
fold_results = []  # final-epoch validation loss of each fold

for fold, (train_index, val_index) in enumerate(skf.split(train_df['ProcessedText'], train_labels)):
    print(f"\n🔄 Fold {fold + 1}")

    # Slice this fold's training / validation portions out of the training split.
    train_texts = train_df.iloc[train_index]['ProcessedText']
    val_texts = train_df.iloc[val_index]['ProcessedText']
    train_labels_fold = train_labels[train_index]
    val_labels_fold = train_labels[val_index]

    # Datasets and dataloaders for this fold.
    train_dataset_fold = DisasterDataset(list(train_texts), train_labels_fold, tokenizer)
    val_dataset_fold = DisasterDataset(list(val_texts), val_labels_fold, tokenizer)

    train_loader_fold = DataLoader(train_dataset_fold, batch_size=BATCH_SIZE, shuffle=True)
    val_loader_fold = DataLoader(val_dataset_fold, batch_size=BATCH_SIZE)

    # Fresh model per fold, so no weights carry over between folds.
    event_model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=len(label_encoder.classes_),
        hidden_dropout_prob=0.3
    )
    event_model.to(device)

    # Optimizer, loss function and scheduler.
    # weight_decay=0.0 is explicit because torch.optim.AdamW defaults to 0.01,
    # whereas the transformers AdamW used previously defaulted to 0.0.
    optimizer = torch.optim.AdamW(
        event_model.parameters(), lr=LEARNING_RATE, eps=1e-8, weight_decay=0.0
    )
    # Not used by the training step (which takes the loss from the model output);
    # kept as a notebook-level name for evaluation code that looks up `loss_fn`.
    loss_fn = torch.nn.CrossEntropyLoss()
    scheduler = get_scheduler(
        'linear',
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_loader_fold) * N_EPOCHS
    )

    # Training on this fold.
    for epoch in range(N_EPOCHS):
        event_model.train()
        total_loss = 0

        for batch_inputs, batch_labels in train_loader_fold:
            batch_inputs = {key: val.to(device) for key, val in batch_inputs.items()}
            batch_labels = batch_labels.to(device)

            # Passing labels makes the model return its own classification loss.
            outputs = event_model(**batch_inputs, labels=batch_labels)
            loss = outputs.loss
            total_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()  # the schedule advances once per batch

        # Evaluate on this fold's validation portion after every epoch.
        val_loss, val_report = evaluate(val_loader_fold, event_model)
        print(f"Epoch {epoch + 1} - Training Loss: {total_loss / len(train_loader_fold)}, Validation Loss: {val_loss}")
        print(val_report)

    # Only the last epoch's validation loss is recorded for the fold.
    fold_results.append(val_loss)

# NOTE: `event_model` is rebound on every fold, so after this loop it refers to the
# model trained on the last fold's training portion only (not on all of train_df).

# Cross-validation average.
print(f"\n📊 Cross-validation average loss: {np.mean(fold_results):.4f} ± {np.std(fold_results):.4f}")



🔄 Fold 1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Training Loss: 0.3938213453144057, Validation Loss: 0.12750306761103242
                   precision    recall  f1-score   support

building collapse      0.978     0.960     0.968       272
       earthquake      0.941     0.984     0.962      5926
        explosion      0.923     0.874     0.898       262
           floods      0.982     0.926     0.953      2505
             haze      0.990     1.000     0.995       203
        hurricane      0.992     0.978     0.985      8903
       landslides      0.725     0.697     0.711       750
           meteor      0.981     0.989     0.985       264
        terrorism      0.910     0.979     0.943       571
    traffic crash      0.983     0.997     0.990       688
          volcano      0.979     0.754     0.852        61
        wildfires      0.983     0.986     0.984      1044

         accuracy                          0.963     21449
        macro avg      0.947     0.927     0.936     21449
     weighted avg      0.963   

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Training Loss: 0.40964670513050266, Validation Loss: 0.12629988017990273
                   precision    recall  f1-score   support

building collapse      0.964     0.993     0.978       273
       earthquake      0.952     0.977     0.964      5926
        explosion      0.887     0.840     0.863       262
           floods      0.975     0.942     0.958      2505
             haze      0.953     0.995     0.974       204
        hurricane      0.988     0.980     0.984      8903
       landslides      0.753     0.698     0.724       749
           meteor      0.923     1.000     0.960       264
        terrorism      0.922     0.970     0.945       570
    traffic crash      0.973     0.997     0.985       688
          volcano      0.977     0.689     0.808        61
        wildfires      0.969     0.973     0.971      1044

         accuracy                          0.963     21449
        macro avg      0.936     0.921     0.926     21449
     weighted avg      0.963  

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Training Loss: 0.40525660920529827, Validation Loss: 0.12256729161910207
                   precision    recall  f1-score   support

building collapse      0.989     0.982     0.985       273
       earthquake      0.951     0.975     0.963      5925
        explosion      0.879     0.920     0.899       261
           floods      0.977     0.936     0.956      2504
             haze      0.995     0.995     0.995       204
        hurricane      0.983     0.985     0.984      8903
       landslides      0.743     0.641     0.689       750
           meteor      0.964     1.000     0.981       264
        terrorism      0.955     0.974     0.964       570
    traffic crash      0.974     0.996     0.985       688
          volcano      1.000     0.836     0.911        61
        wildfires      0.978     0.982     0.980      1045

         accuracy                          0.963     21448
        macro avg      0.949     0.935     0.941     21448
     weighted avg      0.963  

## Evaluación en test

In [None]:
# Score the held-out test split. `event_model` is whatever that name was last bound
# to: the cross-validation loop rebinds it on every fold, so this evaluates the
# model produced by the final fold.
test_loss, test_report = evaluate(test_loader, event_model)
print(f"Test Loss: {test_loss}")
print(f"Test Classification Report:\n{test_report}")

Test Loss: 0.09278645935604811
Test Classification Report:
                   precision    recall  f1-score   support

building collapse      1.000     1.000     1.000        63
       earthquake      0.970     0.978     0.974      1381
        explosion      1.000     1.000     1.000        61
           floods      0.968     0.971     0.969       584
             haze      1.000     0.979     0.989        47
        hurricane      0.991     0.987     0.989      2076
       landslides      0.821     0.760     0.789       175
           meteor      0.984     0.984     0.984        61
        terrorism      0.957     0.992     0.974       133
    traffic crash      0.994     0.994     0.994       161
          volcano      1.000     1.000     1.000        14
        wildfires      0.980     0.992     0.986       244

         accuracy                          0.976      5000
        macro avg      0.972     0.970     0.971      5000
     weighted avg      0.975     0.976     0.975      

## Guardar modelo

In [None]:
# Model: written to the relative directory 'bert_event_classifier'
# (i.e. under the notebook's current working directory).
event_model.save_pretrained('bert_event_classifier')

# Tokenizer: saved into the same directory as the model.
tokenizer.save_pretrained('bert_event_classifier')

print("Modelo y tokenizer guardados en 'bert_event_classifier'")

Modelo y tokenizer guardados en 'bert_event_classifier'


In [None]:
# Label encoder: persist the fitted encoder that maps event names to class ids.
# It is written to 'label_encoder.pkl' in the working directory, i.e. next to —
# not inside — the 'bert_event_classifier' directory.
import joblib
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']

In [None]:
# Shell command: recursively archive the 'bert_event_classifier' directory into
# bert_event_classifier.zip. Only that directory is added to the archive.
!zip -r bert_event_classifier.zip bert_event_classifier

  adding: bert_event_classifier/ (stored 0%)
  adding: bert_event_classifier/special_tokens_map.json (deflated 42%)
  adding: bert_event_classifier/model.safetensors (deflated 7%)
  adding: bert_event_classifier/tokenizer_config.json (deflated 75%)
  adding: bert_event_classifier/config.json (deflated 58%)
  adding: bert_event_classifier/vocab.txt (deflated 53%)
