In [None]:
import os

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from kornia.losses import focal_loss
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, WeightedRandomSampler

from models.hybrid_CNN_BiLSTM import HybridCNNBiLSTM
from models.simple_biLSTM import BiLSTMClassifier
from src.dataset_factory import DatasetFactory
from src.nn_trainer import NNTrainer
from src.nn_utils import JournalDataset, build_vocab

# Seed the global NumPy and PyTorch RNGs before anything stochastic runs
# (model weight initialisation, DataLoader shuffling), so that a fresh
# "Restart Kernel -> Run All" produces comparable training runs.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Trabajando con: {device}")


# Project helper used by the data cell to build the experiment dataset.
factory = DatasetFactory()

Trabajando con: cpu


In [None]:
import pandas as pd

def perform_downsampling(df, target_column='label_idx', min_floor=300, random_state=42):
    """Cap every class at one common size by random downsampling.

    The cap is ``max(size of the rarest class, min_floor)``. Classes holding
    more rows than the cap are randomly sampled down to it; classes at or
    below the cap are kept whole. The concatenated result is shuffled and
    re-indexed from 0.

    Rows whose label is missing are not part of any class returned by
    ``value_counts`` and therefore do not appear in the output.

    Args:
        df: DataFrame containing ``target_column``.
        target_column: Name of the column holding the class label.
        min_floor: Lower bound for the per-class cap (default 300, the value
            that used to be hard-coded).
        random_state: Seed used both for per-class sampling and for the final
            shuffle (default 42, the value that used to be hard-coded).

    Returns:
        A new DataFrame with a fresh ``RangeIndex``. If ``df`` has no labelled
        rows, an empty DataFrame with the same columns is returned.
    """
    counts = df[target_column].value_counts()

    if counts.empty:
        # No labelled rows: pd.concat([]) below would raise ValueError, so
        # return an empty frame with the original columns instead.
        return df.iloc[0:0].reset_index(drop=True)

    target_size = max(counts.min(), min_floor)

    downsampled_parts = []
    # Iterate in value_counts order (most frequent first) so the row order fed
    # to the final shuffle - and hence the output - is deterministic.
    for class_index in counts.index:
        class_subset = df[df[target_column] == class_index]
        if len(class_subset) > target_size:
            class_subset = class_subset.sample(target_size, random_state=random_state)
        downsampled_parts.append(class_subset)

    return (
        pd.concat(downsampled_parts)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )


def perform_soft_downsampling(df, target_column='label_idx', ratio=2.0, min_floor=300,
                              random_state=42):
    """Downsample only the classes that exceed ``ratio`` times a base size.

    The base size is ``max(size of the rarest class, min_floor)`` and the
    per-class cap is ``int(base size * ratio)``. Classes larger than the cap
    are randomly sampled down to it; all others are kept whole. The
    concatenated result is shuffled and re-indexed from 0.

    Rows whose label is missing are not part of any class returned by
    ``value_counts`` and therefore do not appear in the output.

    Args:
        df: DataFrame containing ``target_column``.
        target_column: Name of the column holding the class label.
        ratio: Multiplier applied to the base size to obtain the cap.
        min_floor: Lower bound for the base size.
        random_state: Seed used both for per-class sampling and for the final
            shuffle (default 42, the value that used to be hard-coded).

    Returns:
        A new DataFrame with a fresh ``RangeIndex``. If ``df`` has no labelled
        rows, an empty DataFrame with the same columns is returned.
    """
    counts = df[target_column].value_counts()

    if counts.empty:
        # No labelled rows: counts.min() is NaN and int(NaN * ratio) would
        # raise, so return an empty frame with the original columns instead.
        return df.iloc[0:0].reset_index(drop=True)

    base_size = max(counts.min(), min_floor)
    max_allowed_size = int(base_size * ratio)

    downsampled_parts = []
    # Iterate in value_counts order (most frequent first) so the row order fed
    # to the final shuffle - and hence the output - is deterministic.
    for class_index in counts.index:
        class_subset = df[df[target_column] == class_index]
        if len(class_subset) > max_allowed_size:
            class_subset = class_subset.sample(max_allowed_size, random_state=random_state)
        downsampled_parts.append(class_subset)

    return (
        pd.concat(downsampled_parts)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

In [None]:
df = factory.create_dl_dataset('DL_Experiment_Base_aggresive')

# Split / vocabulary / loader settings, gathered in one place for tuning.
TARGET_TEST_SIZE = 0.15
TARGET_VAL_SIZE = 0.15
SPLIT_SEED = 42
MAX_VOCAB_FEATURES = 3000
BATCH_SIZE = 32

# First carve out the test split, stratified on the label.
train_val_df, test_df = train_test_split(
    df, test_size=TARGET_TEST_SIZE, stratify=df['label_idx'], random_state=SPLIT_SEED
)

# TARGET_VAL_SIZE is a fraction of the full dataset, so rescale it to a
# fraction of what remains after the test split was removed.
val_relative_size = TARGET_VAL_SIZE / (1 - TARGET_TEST_SIZE)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=val_relative_size,
    stratify=train_val_df['label_idx'],
    random_state=SPLIT_SEED,
)

# The three splits must partition the dataset exactly.
assert len(train_df) + len(val_df) + len(test_df) == len(df), "splits do not cover the dataset"

# No downsampling is applied in this cell, so report this as the plain
# class distribution of the training split.
print("Train split class distribution:\n", train_df['label_idx'].value_counts())

# The vocabulary is built from the training texts only, so validation and
# test text does not influence it.
vocab = build_vocab(train_df['processed_text'], max_features=MAX_VOCAB_FEATURES)

print(f"Total vocabulary size: {len(vocab)}")

print("Vocab sample:", list(vocab.items())[:10])


def make_loader(split_df, shuffle):
    """Wrap one split in a JournalDataset and return a DataLoader over it."""
    dataset = JournalDataset(split_df['processed_text'], split_df['label_idx'], vocab)
    return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=shuffle)


# Only the training loader is shuffled; evaluation loaders keep a fixed order.
train_loader = make_loader(train_df, shuffle=True)
val_loader = make_loader(val_df, shuffle=False)
test_loader = make_loader(test_df, shuffle=False)



Generando dataset DL para experimento: DL_Experiment_Base_aggresive...
Dataset DL y mapeo guardados en data/experiments/DL_Experiment_Base_aggresive
Distribución tras Downsampling:
 label_idx
2    7168
4    3162
1    1725
3     796
0     787
5     590
Name: count, dtype: int64
Tamaño total del vocabulario: 3002
Muestra del vocabulario: [('proposed', 2), ('model', 3), ('data', 4), ('method', 5), ('learning', 6), ('results', 7), ('performance', 8), ('methods', 9), ('network', 10), ('based', 11)]


In [None]:
def criterion(preds, targets):
    """Mean-reduced focal loss with alpha=0.5 and gamma=2.0.

    Thin wrapper that fixes the hyperparameters of kornia's ``focal_loss`` so
    it can be passed around as a two-argument ``(preds, targets)`` callable.
    """
    return focal_loss(preds, targets, alpha=0.5, gamma=2.0, reduction='mean')

In [None]:

# --- Experiment: plain BiLSTM classifier ------------------------------------
# Directory handed to the trainer as exp_path for this run.
exp_path_simple = os.path.join('data', 'experiments', 'BiLSTM_Simple_V11')
os.makedirs(exp_path_simple, exist_ok=True)

# One output unit per distinct label found in the full dataset.
model_simple = BiLSTMClassifier(
    output_dim=df['label_idx'].nunique(),
    hidden_dim=128,
    embed_dim=128,
    vocab_size=len(vocab),
)
model_simple = model_simple.to(device)

optimizer = optim.Adam(model_simple.parameters(), lr=0.001, weight_decay=1e-5)

# Plateau-scheduler settings. 'max' means the monitored metric should grow
# (presumably validation accuracy - confirm against NNTrainer).
mode, factor, patience = 'max', 0.1, 2
scheduler = ReduceLROnPlateau(optimizer, mode=mode, factor=factor, patience=patience)

trainer_simple = NNTrainer(
    model=model_simple,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    exp_path=exp_path_simple,
    scheduler=scheduler,
)
trainer_simple.fit(train_loader, val_loader, epochs=30)

# Score on the held-out test loader, then save results and plot the curves.
y_true, y_pred, acc = trainer_simple.evaluate(test_loader)
print(f"Test Accuracy: {acc:.3f}")

trainer_simple.save_results(y_true, y_pred, vocab)

trainer_simple.plot_learning_curves()

Iniciando entrenamiento en cpu...
Epoch 01/30 | Loss: 0.063 | Val Acc: 0.562 -> ¡Mejor modelo guardado!
Epoch 02/30 | Loss: 0.046 | Val Acc: 0.618 -> ¡Mejor modelo guardado!
Epoch 03/30 | Loss: 0.039 | Val Acc: 0.648 -> ¡Mejor modelo guardado!
Epoch 04/30 | Loss: 0.036 | Val Acc: 0.671 -> ¡Mejor modelo guardado!
Epoch 05/30 | Loss: 0.033 | Val Acc: 0.671
Epoch 06/30 | Loss: 0.030 | Val Acc: 0.676 -> ¡Mejor modelo guardado!
Epoch 07/30 | Loss: 0.028 | Val Acc: 0.684 -> ¡Mejor modelo guardado!
Epoch 08/30 | Loss: 0.027 | Val Acc: 0.681
Epoch 09/30 | Loss: 0.025 | Val Acc: 0.693 -> ¡Mejor modelo guardado!
Epoch 10/30 | Loss: 0.024 | Val Acc: 0.694 -> ¡Mejor modelo guardado!
Epoch 11/30 | Loss: 0.022 | Val Acc: 0.688
Epoch 12/30 | Loss: 0.020 | Val Acc: 0.691
Epoch 13/30 | Loss: 0.019 | Val Acc: 0.647
Epoch 14/30 | Loss: 0.015 | Val Acc: 0.696 -> ¡Mejor modelo guardado!
Epoch 15/30 | Loss: 0.014 | Val Acc: 0.691
Epoch 16/30 | Loss: 0.014 | Val Acc: 0.695
Epoch 17/30 | Loss: 0.013 | Val Acc

In [None]:

# --- Experiment: hybrid CNN + BiLSTM classifier -----------------------------
# Directory handed to the trainer as exp_path for this run.
exp_path_hybrid = os.path.join('data', 'experiments', 'Hybrid_CNN_BiLSTM_V7')
os.makedirs(exp_path_hybrid, exist_ok=True)

# One output unit per distinct label found in the full dataset.
model_hybrid = HybridCNNBiLSTM(
    output_dim=df['label_idx'].nunique(),
    vocab_size=len(vocab),
    embed_dim=128,
    n_filters=100,
    filter_sizes=[3, 4, 5],
    hidden_dim=32,
    dropout=0.6,
)
model_hybrid = model_hybrid.to(device)

optimizer_h = optim.Adam(model_hybrid.parameters(), lr=0.001, weight_decay=1e-5)

# Halve the learning rate after one epoch without improvement of the
# monitored metric ('max' mode; presumably validation accuracy - confirm
# against NNTrainer).
scheduler = ReduceLROnPlateau(optimizer_h, mode='max', factor=0.5, patience=1)

trainer_hybrid = NNTrainer(
    model=model_hybrid,
    criterion=criterion,
    optimizer=optimizer_h,
    device=device,
    exp_path=exp_path_hybrid,
    scheduler=scheduler,
)

# patience=3 here is the trainer's own argument (the captured log shows early
# stopping after 3 epochs without improvement), not the scheduler's patience.
trainer_hybrid.fit(train_loader, val_loader, epochs=50, patience=3)

Iniciando entrenamiento en cpu...


  return F.conv1d(


Epoch 01/50 | Loss: 0.070 | Val Acc: 0.531 -> ¡Mejor modelo guardado!
Epoch 02/50 | Loss: 0.058 | Val Acc: 0.595 -> ¡Mejor modelo guardado!
Epoch 03/50 | Loss: 0.050 | Val Acc: 0.613 -> ¡Mejor modelo guardado!
Epoch 04/50 | Loss: 0.043 | Val Acc: 0.637 -> ¡Mejor modelo guardado!
Epoch 05/50 | Loss: 0.040 | Val Acc: 0.654 -> ¡Mejor modelo guardado!
Epoch 06/50 | Loss: 0.037 | Val Acc: 0.657 -> ¡Mejor modelo guardado!
Epoch 07/50 | Loss: 0.034 | Val Acc: 0.674 -> ¡Mejor modelo guardado!
Epoch 08/50 | Loss: 0.032 | Val Acc: 0.679 -> ¡Mejor modelo guardado!
Epoch 09/50 | Loss: 0.030 | Val Acc: 0.681 -> ¡Mejor modelo guardado!
Epoch 10/50 | Loss: 0.027 | Val Acc: 0.683 -> ¡Mejor modelo guardado!
Epoch 11/50 | Loss: 0.027 | Val Acc: 0.672
Epoch 12/50 | Loss: 0.024 | Val Acc: 0.690 -> ¡Mejor modelo guardado!
Epoch 13/50 | Loss: 0.022 | Val Acc: 0.689
Epoch 14/50 | Loss: 0.021 | Val Acc: 0.678
Epoch 15/50 | Loss: 0.019 | Val Acc: 0.688

[Early Stopping] El modelo no ha mejorado en 3 épocas. De

In [None]:

# Score the hybrid model on the held-out test loader. evaluate() is unpacked
# into three values (presumably true labels, predictions and accuracy -
# confirm against NNTrainer).
y_true_h, y_pred_h, acc_h = trainer_hybrid.evaluate(test_loader)
# Hand labels, predictions and the vocabulary to the trainer for saving.
trainer_hybrid.save_results(y_true_h, y_pred_h, vocab)
print(f"Test Accuracy Hybrid: {acc_h:.3f}")
# Plot the learning curves for this run.
trainer_hybrid.plot_learning_curves()

Resultados guardados en data\experiments\Hybrid_CNN_BiLSTM_V6
Test Accuracy Hybrid: 0.691
