<a href="https://colab.research.google.com/github/UN-GCPDS/Curso-Corto-LLMs/blob/main/2.Entrenamiento_Tabnet/Entrenamiento_Tabnet_Parte_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

![Logo UNAL CHEC](https://github.com/UN-GCPDS/curso_IA_CHEC/blob/main/logo_unal_chec.jpg?raw=1)

# **Entrenamiento modelo Tabnet**

## **Descripción**

Entrenamiento de modelo Tabnet bajo diversas condiciones.

### **Profesor - Sesión 1:** Andrés Marino Álvarez Meza y Diego Armando Pérez Rosero

# Datos

**TabNet para criticidad en redes de media tensión — Planteamiento y datos (Regresión)**

Sea el conjunto de datos

$$
\mathbf{X}\in\mathbb{R}^{N\times M},\qquad
\mathbf{y}\in\mathbb{R}^{N}.
$$

Cada fila de $\mathbf{X}$ representa un evento o periodo entre 2019 y 2024 y contiene las características de los elementos asociados al equipo que operó. El vector $\mathbf{y}$ almacena el valor continuo del indicador a modelar (SAIDI o SAIFI) para ese mismo evento/periodo.

Definimos

$$
\mathcal{F}:\mathcal{X}\subseteq\mathbb{R}^{M}\to\mathbb{R},\qquad
\hat{y}=\mathcal{F}(\mathbf{x})
=
\bigl(\,\breve{f}_{L}\circ \breve{f}_{L-1}\circ \cdots \circ \breve{f}_{1}\,\bigr)(\mathbf{x}),
$$

donde $\breve{f}_{l}(\cdot)$ denota el $l$-ésimo bloque del modelo ($l\in\{1,\dots,L\}$) y $\circ$ es el operador de composición.

En caso multisalida para $(\text{SAIDI},\text{SAIFI})$, se toma $\mathcal{F}:\mathbb{R}^{M}\to\mathbb{R}^{2}$ y $\mathbf{y}\in\mathbb{R}^{N\times 2}$.
![Logo UNAL CHEC](https://raw.githubusercontent.com/Daprosero/Deep-Convolutional-Generative-Adversarial-Network/refs/heads/master/Mercados%20CHEC.png)

In [1]:
#@title Librerías
# Instalación de paquetes necesarios
!pip install -q gdown
!pip install openTSNE
!pip install pytorch-tabnet optuna
!pip install wget --quiet

# Importación de librerías necesarias
import optuna
import warnings
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.special import softmax
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.neighbors import NearestNeighbors
from pytorch_tabnet.tab_model import TabNetRegressor, TabNetClassifier
from pytorch_tabnet.augmentations import RegressionSMOTE
from google.colab import drive
import tensorflow as tf
import tensorflow_probability as tfp
import os
from pathlib import Path
import math
import wget
!gdown --id 1o_fZIhk6ErrtrM3eVZPF9s2qj8l4FoqS -O SuperEventos_Criticidad_AguasAbajo_CODEs.zip
!gdown --id 1lBrseLoEmr6-VwNSCHOp2zuc4sKKrkbQ -O model.zip
!gdown --id 16VIuHLgPGpX4J723Wd48UAPhHivLuUaH -O Data_CHEC.zip

import zipfile
import os

zip_path = "SuperEventos_Criticidad_AguasAbajo_CODEs.zip"
extract_dir = "CHEC"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

zip_path = "model.zip"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

zip_path = "Data_CHEC.zip"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# Supresión de warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="pandas")
warnings.filterwarnings("ignore", category=FutureWarning)

# Función auxiliar para etiquetas
def get_labels(x: pd.Series) -> pd.Series:
    labels, _ = pd.factorize(x)
    return pd.Series(labels, name=x.name, index=x.index)

# Definición de funciones personalizadas de pérdida
def my_mse_loss_fn(y_pred, y_true):
    mse_loss = (y_true - y_pred) ** 2
    return torch.mean(mse_loss)
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def plot_var_band(
    df,
    var_token,
    row_index=0,
    hours_back=24,
    col_patterns=None,
    display_name=None,
    units=None,
    event_label="evento reportado",
):
    """
    Grafica una variable climática en una franja de horas hacia atrás.

    Parámetros
    ----------
    df : pd.DataFrame
        Contiene columnas por hora para la variable elegida.
        Ejemplos de nombres soportados automáticamente:
        - 'h0-<var>', 'h1-<var>', ..., 'h24-<var>'
        - '<var>_h0', '<var>_h1', ...
        con separadores '_' o '-'.

    var_token : str
        Nombre base de la variable en los nombres de columna (p.ej. 'wind_gust_spd',
        'air_temp', 'precip'). Debe coincidir con lo que aparece en las columnas.

    row_index : int
        Fila (evento) a graficar.

    hours_back : int
        Cuántas horas hacia atrás mostrar.

    col_patterns : list[str] | None
        Lista de regex opcionales para detectar columnas por hora.
        Si None, se generan automáticamente a partir de var_token.

    display_name : str | None
        Etiqueta legible para el eje Y (p.ej. 'Ráfaga de viento').
        Si None, se usa var_token.

    units : str | None
        Unidades para concatenar en la etiqueta Y (p.ej. 'm/s', '°C', 'mm').

    event_label : str
        Texto para la flecha en la hora 0.
    """
    # --- 1) Preparar patrones de columnas ---
    if col_patterns is None:
        # Permitir '_' o '-' (o espacio) entre partes del var_token
        parts = re.split(r'[_\-\s]+', var_token.strip())
        # Construimos un regex que tolere '_' o '-' entre partes
        # ej: 'wind[_-]?gust[_-]?spd'
        var_regex = r'[_-]?'.join(map(re.escape, parts))

        col_patterns = [
            rf'^h(\d{{1,2}})[-_]?{var_regex}$',   # h0-<var>  o  h0_<var>
            rf'^{var_regex}[-_]?h(\d{{1,2}})$',   # <var>-h0  o  <var>_h0
        ]

    # --- 2) Detectar columnas y mapear a hora ---
    hour_to_col = {}
    for c in df.columns:
        for pat in col_patterns:
            m = re.match(pat, str(c), flags=re.IGNORECASE)
            if m:
                h = int(m.group(1))
                hour_to_col[h] = c
                break

    if not hour_to_col:
        raise ValueError(
            f"No se encontraron columnas con horas para la variable '{var_token}'.\n"
            f"Prueba ajustando 'var_token' o pasando 'col_patterns' personalizados."
        )

    # --- 3) Construir serie horas [0..hours_back] si existen, orden ascendente ---
    hours = [h for h in sorted(hour_to_col.keys()) if 0 <= h <= hours_back]
    vals = np.array(
        [pd.to_numeric(df.loc[df.index[row_index], hour_to_col[h]], errors='coerce') for h in hours],
        dtype=float
    )

    # --- 4) Graficar ---
    fig, ax = plt.subplots(figsize=(10, 5))

    # línea y puntos
    ax.plot(hours, vals, marker='o')

    # invertir eje X para que se vea 24 -> 0
    ax.set_xlim(hours_back, 0)

    # franja sombreada
    ymin = np.nanmin(vals) if np.isfinite(np.nanmin(vals)) else 0.0
    ymax = np.nanmax(vals) if np.isfinite(np.nanmax(vals)) else 1.0
    pad  = 0.05 * (ymax - ymin if ymax > ymin else 1.0)
    ax.set_ylim(ymin - pad, ymax + pad)
    ax.axvspan(0, hours_back, alpha=0.15)

    # flecha y etiqueta en hora 0
    y0 = vals[hours.index(0)] if 0 in hours else np.nan
    if not np.isfinite(y0):
        y0 = np.nanmedian(vals) if np.isfinite(np.nanmedian(vals)) else (ymin + ymax) / 2.0

    ax.annotate(
        event_label,
        xy=(0, y0),
        xytext=(max(2, min(4, hours_back*0.15)), y0 + (ymax - y0)*0.15),
        arrowprops=dict(arrowstyle="->", lw=1),
        ha='left', va='bottom'
    )

    # etiquetas
    ylab = display_name if display_name else var_token
    if units:
        ylab = f"{ylab} [{units}]"
    ax.set_xlabel("Horas antes del evento")
    ax.set_ylabel(ylab)
    ax.grid(True, alpha=0.3)

    # ticks principales (24, 18, 12, 6, 0) si corresponde
    xticks = [h for h in [hours_back, 18, 12, 6, 0] if 0 <= h <= hours_back]
    ax.set_xticks(xticks)

    plt.tight_layout()
    plt.show()


# --- Ejemplo de uso:
# plot_wind_gust_band(df=tu_dataframe, row_index=0, hours_back=24)

def my_rmse_loss_fn(y_pred, y_true):
    mse_loss = (y_true - y_pred) ** 2
    mean_mse_loss = torch.mean(mse_loss)
    rmse_loss = torch.sqrt(mean_mse_loss)
    return rmse_loss

def my_mae_loss_fn(y_pred, y_true):
    mae_loss = torch.abs(y_true - y_pred)
    return torch.mean(mae_loss)

def my_mape_loss_fn(y_pred, y_true):
    mape_loss = torch.abs((y_true - y_pred) / y_true) * 100
    return torch.mean(mape_loss)

def my_r2_score_fn(y_pred, y_true):
    total_variance = torch.var(y_true, unbiased=False)
    unexplained_variance = torch.mean((y_true - y_pred) ** 2)
    r2_score = 1 - (unexplained_variance / total_variance)
    return 1-r2_score

# Etapa 0: imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
# ==== Librerías ====
import numpy as np
import cupy as cp
import xgboost as xgb

from cuml.ensemble import RandomForestRegressor as cuRF
from cuml.metrics import r2_score as r2_gpu

# Si quieres comparar con CPU para sanity-check:
from sklearn.metrics import r2_score as r2_cpu
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import r2_score
# ==== Utilidades ====
def to_cpu(a):
    """Convierte CuPy -> NumPy si aplica."""
    try:
        if isinstance(a, cp.ndarray):
            return cp.asnumpy(a)
    except Exception:
        pass
    return a

def metrics_gpu(y_true_cp, y_pred_cp):
    """MAE, RMSE, R2 calculados en GPU (CuPy)."""
    y_true_cp = cp.asarray(y_true_cp)
    y_pred_cp = cp.asarray(y_pred_cp)
    mae  = float(cp.mean(cp.abs(y_true_cp - y_pred_cp)))
    rmse = float(cp.sqrt(cp.mean((y_true_cp - y_pred_cp)**2)))
    ssr  = float(cp.sum((y_true_cp - y_pred_cp)**2))
    sst  = float(cp.sum((y_true_cp - cp.mean(y_true_cp))**2))
    r2   = 1.0 - ssr / sst if sst > 0 else np.nan
    return mae, rmse, r2

def permutation_importance_rf_gpu(model, X_val_cp, y_val_cp, n_repeats=3, max_feats=None, random_state=42):
    """
    Permutation importance en GPU para RF cuML.
    Devuelve importancia por feature (drop medio de R2 en valid).
    Si max_feats no es None, calcula solo para las primeras max_feats columnas (para acelerar).
    """
    rs = cp.random.RandomState(random_state)
    X_val_cp = cp.asarray(X_val_cp)
    y_val_cp = cp.asarray(y_val_cp)

    # R2 base
    y_pred_base = model.predict(X_val_cp)
    _, _, r2_base = metrics_gpu(y_val_cp, y_pred_base)

    n, d = X_val_cp.shape
    d_eval = d if max_feats is None else int(min(max_feats, d))
    importances = cp.zeros(d, dtype=cp.float32)

    for j in range(d_eval):
        drops = []
        for _ in range(n_repeats):
            Xp = X_val_cp.copy()
            idx = rs.permutation(n)
            Xp[:, j] = Xp[idx, j]  # permutar solo la columna j
            y_pred_p = model.predict(Xp)
            _, _, r2_p = metrics_gpu(y_val_cp, y_pred_p)
            drops.append(r2_base - r2_p)
        importances[j] = cp.mean(cp.asarray(drops))

    return importances  # CuPy array
def regression_metrics(y_true, y_pred):
    mae  = float(np.mean(np.abs(y_true - y_pred)))
    rmse = float(np.sqrt(np.mean((y_true - y_pred)**2)))
    ss_res = float(np.sum((y_true - y_pred)**2))
    ss_tot = float(np.sum((y_true - np.mean(y_true))**2))
    r2 = 1 - ss_res/ss_tot if ss_tot > 0 else np.nan
    return mae, rmse, r2
class CustomTabNetRegressor(TabNetRegressor):
    def __init__(self, *args, **kwargs):
        super(CustomTabNetRegressor, self).__init__(*args, **kwargs)

    def forward(self, X):
        output, M_loss = self.network(X)
        output = torch.relu(output)
        return output, M_loss

    def predict(self, X):
        device = next(self.network.parameters()).device
        if not isinstance(X, torch.Tensor):
            X = torch.tensor(X, dtype=torch.float32)
        X = X.to(device)
        with torch.no_grad():
            output, _ = self.forward(X)
        return output.cpu().numpy()
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
from tqdm import tqdm
from ast import literal_eval
from pandas.api.types import is_numeric_dtype


import numpy as np
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
def make_strat_labels(y_vals, n_bins=3, min_per_class=2):
    """
    Genera etiquetas para estratificar a partir de un objetivo continuo.
    Reduce bins si no hay suficientes muestras por clase.
    """
    y1d = y_vals.reshape(-1)
    for bins in range(n_bins, 1, -1):
        pct = np.linspace(0, 100, bins + 1)[1:-1]
        cuts = np.percentile(y1d, pct)
        if np.any(np.diff(cuts) <= 0):
            continue
        labels = np.digitize(y1d, bins=cuts).astype(int)
        counts = Counter(labels)
        if all(c >= min_per_class for c in counts.values()) and len(counts) > 1:
            return labels
    return None

def stratify_from_df_or_y(df_labels, idx, y_subset, col='NIVEL_C'):
    """Intenta usar df[col] como etiqueta; si falla, usa percentiles en y_subset."""
    try:
        ycat_full = df_labels.loc[:, col].values.astype(int)
        ycat = ycat_full[idx]
        c10 = Counter(ycat)
        if all(v >= 2 for v in c10.values()) and len(c10) > 1:
            return ycat
    except Exception:
        pass
    return make_strat_labels(y_subset[:,0], n_bins=3, min_per_class=2)

def split_subset(X, y, df_labels=None, n_sub=1000, test_size=0.20, seed=42):
    """
    1) Toma un subset aleatorio de tamaño n_sub.
    2) Escala y (MinMax) sobre el subset.
    3) Split train/test con estratificación si es viable.
    4) Split train/valid (20% del train), con re-estratificación si es posible.
    """
    rng = np.random.RandomState(seed)
    n_total = X.shape[0]
    n_sub = min(n_sub, n_total)
    idx_sub = rng.choice(n_total, size=n_sub, replace=False)

    X_sub = X[idx_sub]
    y_sub = y[idx_sub]
    # etiquetas auxiliares para estratificación
    ycat_sub = stratify_from_df_or_y(df_labels, idx_sub, y_sub) if df_labels is not None else make_strat_labels(y_sub[:,0])
    # escalar objetivo en el subset
    scaler = MinMaxScaler()
    y_sub_scaled = scaler.fit_transform(y_sub)

    split_kwargs = dict(test_size=test_size, random_state=seed, shuffle=True)
    if ycat_sub is not None:
        X_tr, X_te, y_tr, y_te, ycat_tr, ycat_te = train_test_split(
            X_sub, y_sub_scaled, ycat_sub, stratify=ycat_sub, **split_kwargs
        )
    else:
        X_tr, X_te, y_tr, y_te = train_test_split(X_sub, y_sub_scaled, **split_kwargs)
        ycat_tr = ycat_te = None

    # Validación (20% del train)
    if ycat_tr is not None:
        y_tr_raw = y_tr[:,0]
        ycat_t = make_strat_labels(y_tr_raw, n_bins=3, min_per_class=2)
        if ycat_t is not None:
            X_tr, X_va, y_tr, y_va, ycat_tr, ycat_va = train_test_split(
                X_tr, y_tr, ycat_tr, test_size=0.20, random_state=seed, stratify=ycat_t
            )
        else:
            X_tr, X_va, y_tr, y_va = train_test_split(
                X_tr, y_tr, test_size=0.20, random_state=seed, shuffle=True
            )
            ycat_va = None
    else:
        X_tr, X_va, y_tr, y_va = train_test_split(
            X_tr, y_tr, test_size=0.20, random_state=seed, shuffle=True
        )
        ycat_va = None

    # Reporte rápido
    print("Originales (conservados):", X_orig.shape, y_orig.shape)
    print(f"Subset de {n_sub}:", X_sub.shape, y_sub.shape)
    print("Train/Valid/Test:", X_tr.shape, X_va.shape, X_te.shape)
    if ycat_sub is not None:
        print("Distribución clases subset:", Counter(ycat_sub))

    return {
        "idx_sub": idx_sub,
        "X_train": X_tr, "X_valid": X_va, "X_test": X_te,
        "y_train": y_tr, "y_valid": y_va, "y_test": y_te
    }
from copy import deepcopy
from sklearn.metrics import r2_score

def make_tabnet(cat_info, params):
    cat_idxs = [i for i, f in enumerate(features) if f in CATEGORICAL_COLUMNS]
    cat_dims = [categorical_dims[f] for f in features if f in CATEGORICAL_COLUMNS]
    cat_emb_dim = [min(params['emb'], max(4, (dim + 1)//2)) for dim in cat_dims]
    return cat_idxs, cat_dims, cat_emb_dim

def build_optimizer(optimizer_type, learning_rate, momentum, weight_decay):
    if optimizer_type == 'adam':
        return torch.optim.Adam, {'lr': float(min(max(learning_rate, 1e-4), 3e-3)), 'weight_decay': weight_decay}
    if optimizer_type == 'adamw':
        return torch.optim.AdamW, {'lr': float(min(max(learning_rate, 1e-4), 3e-3)), 'weight_decay': weight_decay}
    if optimizer_type == 'sgd':
        return torch.optim.SGD, {'lr': float(min(max(learning_rate, 1e-3), 1e-1)), 'momentum': momentum, 'weight_decay': weight_decay}
    if optimizer_type == 'rmsprop':
        return torch.optim.RMSprop, {'lr': float(min(max(learning_rate, 1e-4), 3e-3)), 'momentum': momentum, 'weight_decay': weight_decay}

def objective_regression(trial):
    # Capacidad TabNet
    n_d     = trial.suggest_int('n_d', 8, 32)
    n_a     = trial.suggest_int('n_a', 8, 32)
    n_steps = trial.suggest_int('n_steps', 3, 5)

    gamma         = trial.suggest_float('gamma', 1.0, 2.0)
    lambda_sparse = trial.suggest_float('lambda_sparse', 1e-6, 1e-3, log=True)

    batch_size  = trial.suggest_categorical('batch_size', [64, 128, 256])
    mask_type   = trial.suggest_categorical('mask_type', ['entmax', 'sparsemax'])
    emb         = trial.suggest_int('emb', 4, 24)

    momentum      = trial.suggest_float('momentum', 0.5, 0.95)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
    weight_decay  = trial.suggest_float('weight_decay', 1e-6, 1e-4, log=True)

    scheduler_gamma = trial.suggest_float('scheduler_gamma', 0.95, 0.995)
    step_size       = trial.suggest_int('step_size', 5, 15)

    virtual_batch_size = trial.suggest_categorical('virtual_batch_size', [32, 64])
    if isinstance(batch_size, int) and isinstance(virtual_batch_size, int) and virtual_batch_size > batch_size:
        virtual_batch_size = batch_size // 2 if batch_size >= 64 else batch_size

    optimizer_type = trial.suggest_categorical('optimizer_type', ['adam', 'adamw', 'sgd', 'rmsprop'])
    optimizer_fn, optimizer_params = build_optimizer(optimizer_type, learning_rate, momentum, weight_decay)

    p   = trial.suggest_float('p', 0.0, 0.30)
    aug = RegressionSMOTE(p=p)

    cat_idxs = [i for i, f in enumerate(features) if f in CATEGORICAL_COLUMNS]
    cat_dims = [categorical_dims[f] for f in features if f in CATEGORICAL_COLUMNS]
    cat_emb_dim = [min(emb, max(4, (dim + 1)//2)) for dim in cat_dims]

    model = CustomTabNetRegressor(
        cat_dims=cat_dims, cat_emb_dim=cat_emb_dim, cat_idxs=cat_idxs,
        n_d=n_d, n_a=n_a, n_steps=n_steps, gamma=gamma, lambda_sparse=lambda_sparse,
        mask_type=mask_type, optimizer_fn=optimizer_fn, optimizer_params=optimizer_params,
        scheduler_params={"gamma": scheduler_gamma, "step_size": step_size},
        scheduler_fn=torch.optim.lr_scheduler.StepLR, verbose=True
    )
    model.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_name=['train', 'valid'],
        eval_metric=['mae'],
        loss_fn=my_r2_score_fn,  # (conserva tu lógica)
        max_epochs=100, patience=40,
        batch_size=batch_size, virtual_batch_size=virtual_batch_size,
        num_workers=1, drop_last=False, augmentations=aug,
    )
    mae = model.history['loss'][-1]
    return mae

def eval_and_print(title, clf_model, X_test, y_test):
    """Evalúa R² en escala original (inverse_transform) y lo imprime."""
    y_pred = clf_model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    print(f"{title}: R2={r2:.4f}")
    return r2

def run_three_training_strategies(
    # modelos / kwargs
    clf_base,                   # modelo ya entrenado en la Fase 1 (con warm_start=True)
    model_init_kwargs,          # dict con los kwargs para construir un modelo nuevo idéntico (desde cero)
    # datos antiguos (Fase 1)
    X_train_old, y_train_old,   # típicamente (X_train, y_train[:,0:1]) de los 1000
    X_test_old, y_test_old,  # test y scaler usados en la Fase 1
    # datos nuevos (Fase 2)
    X_tr_new, y_tr_new,         # train de los 500
    X_va_new, y_va_new,         # valid de los 500 (para early stopping)
    X_te_new, y_te_new,  # test nuevo y su scaler
    # entrenamiento
    batch_size, virtual_batch_size, aug,
    max_epochs_ft_inc=200, patience_ft_inc=70,
    max_epochs_ft_new=200, patience_ft_new=70,
    max_epochs_scratch=200, patience_scratch=70,
    lower_lr_factor=0.1, min_lr=1e-5
):
    """
    Ejecuta:
      A) Fine-tuning incremental (old + new)
      B) Fine-tuning no incremental (solo new)
      C) Desde cero (old + new)
    y evalúa R² en test viejo y test nuevo (ambos en escala original).
    Devuelve un dict con los R².
    """
    results = {}

    # =============================
    # A) Fine-tuning incremental
    # =============================
    clf_ft_inc = deepcopy(clf_base)  # copia del clf ya entrenado
    # bajar LR para fine-tune (opcional, recomendado)
    if hasattr(clf_ft_inc, "_optimizer"):
        for g in clf_ft_inc._optimizer.param_groups:
            g["lr"] = max(g["lr"] * lower_lr_factor, min_lr)

    X_inc = np.concatenate([X_train_old, X_tr_new], axis=0)
    y_inc = np.concatenate([y_train_old, y_tr_new], axis=0)

    clf_ft_inc.fit(
        X_train=X_inc, y_train=y_inc,
        eval_set=[(X_inc, y_inc), (X_va_new, y_va_new)],
        eval_name=['train_inc', 'valid_new'],
        eval_metric=['mae'], loss_fn=my_r2_score_fn,
        max_epochs=max_epochs_ft_inc, patience=patience_ft_inc,
        batch_size=batch_size, virtual_batch_size=virtual_batch_size,
        num_workers=1, drop_last=False, augmentations=aug,
    )

    print("\n== Desempeño: Fine-tuning incremental ==")
    r2_old_inc = eval_and_print("Test viejo (FT incremental)", clf_ft_inc, X_test_old, y_test_old)
    r2_new_inc = eval_and_print("Test nuevo (FT incremental)", clf_ft_inc, X_te_new,  y_te_new)
    results["fine_tune_incremental"] = {"R2_old_test": r2_old_inc, "R2_new_test": r2_new_inc, "model": clf_ft_inc}

    # =============================
    # B) Fine-tuning no incremental (solo nuevos)
    # =============================
    clf_ft_new = deepcopy(clf_base)
    if hasattr(clf_ft_new, "_optimizer"):
        for g in clf_ft_new._optimizer.param_groups:
            g["lr"] = max(g["lr"] * lower_lr_factor, min_lr)

    clf_ft_new.fit(
        X_train=X_tr_new, y_train=y_tr_new,
        eval_set=[(X_tr_new, y_tr_new), (X_va_new, y_va_new)],
        eval_name=['train_new', 'valid_new'],
        eval_metric=['mae'], loss_fn=my_r2_score_fn,
        max_epochs=max_epochs_ft_new, patience=patience_ft_new,
        batch_size=batch_size, virtual_batch_size=virtual_batch_size,
        num_workers=1, drop_last=False, augmentations=aug,
    )

    print("\n== Desempeño: Fine-tuning NO incremental (solo nuevos) ==")
    r2_old_new = eval_and_print("Test viejo (FT no incremental)", clf_ft_new, X_test_old, y_test_old)
    r2_new_new = eval_and_print("Test nuevo (FT no incremental)", clf_ft_new, X_te_new,  y_te_new)
    results["fine_tune_only_new"] = {"R2_old_test": r2_old_new, "R2_new_test": r2_new_new, "model": clf_ft_new}

    # =============================
    # C) Desde cero (cumulative old+new)
    # =============================
    # model_init_kwargs debe contener todo lo necesario para reconstruir el TabNet
    clf_scratch = CustomTabNetRegressor(**model_init_kwargs)

    X_cum = np.concatenate([X_train_old, X_tr_new], axis=0)
    y_cum = np.concatenate([y_train_old, y_tr_new], axis=0)

    clf_scratch.fit(
        X_train=X_cum, y_train=y_cum,
        eval_set=[(X_cum, y_cum), (X_va_new, y_va_new)],
        eval_name=['train_cum', 'valid_new'],
        eval_metric=['mae'], loss_fn=my_r2_score_fn,
        max_epochs=max_epochs_scratch, patience=patience_scratch,
        batch_size=batch_size, virtual_batch_size=virtual_batch_size,
        num_workers=1, drop_last=False, augmentations=aug,
    )

    print("\n== Desempeño: Desde cero (old+new) ==")
    r2_old_sc = eval_and_print("Test viejo (desde cero)", clf_scratch, X_test_old, y_test_old)
    r2_new_sc = eval_and_print("Test nuevo (desde cero)", clf_scratch, X_te_new,  y_te_new)
    results["from_scratch"] = {"R2_old_test": r2_old_sc, "R2_new_test": r2_new_sc, "model": clf_scratch}
    return results
def pick_new_indices(n_new=500, seed=123):
    rng = np.random.RandomState(seed)
    universe = np.setdiff1d(np.arange(X.shape[0]), splits_1000["idx_sub"], assume_unique=True)
    n_new = min(n_new, universe.shape[0])
    return rng.choice(universe, size=n_new, replace=False)
Xdata = df = pd.read_pickle('/content/CHEC/SuperEventos_Criticidad_AguasAbajo_CODEs.pkl')
Xdata = Xdata[Xdata['duracion_h'] <= 100]
# ---------------------------------------------------------
# Etapa 1: seleccionar objetivo (SAIDI o SAIFI) con forma (N,1)
# Extraer variables objetivo
Dur_h = Xdata['duracion_h'].values
SAIDI = Xdata['SAIDI'].values
df1=Xdata.copy()
# Eliminar columnas no utilizadas
Xdata.drop(['inicio_evento', 'h0-solar_rad', 'h0-uv', 'h1-solar_rad', 'h1-uv', 'h2-solar_rad', 'h2-uv', 'h3-solar_rad', 'h3-uv',
            'h4-solar_rad', 'h4-uv', 'h5-solar_rad', 'h5-uv', 'h19-solar_rad', 'h19-uv', 'h20-solar_rad', 'h20-uv',
            'h21-solar_rad', 'h21-uv', 'h22-solar_rad', 'h22-uv', 'h23-solar_rad', 'h23-uv', 'evento', 'fin', 'inicio',
            'cnt_usus', 'DEP', 'MUN', 'FECHA', 'NIVEL_C', 'VALOR_C', 'TRAMOS_AGUAS_ABAJO', 'EQUIPOS_PUNTOS',
            'PUNTOS_POLIGONO', 'LONGITUD2', 'LATITUD2', 'FECHA_C','TRAMOS_AGUAS_ABAJO_CODES','ORDER_'],
           inplace=True, axis=1)

# Definir la variable objetivo y eliminarla del conjunto de características
target = ['SAIFI', 'SAIDI', 'duracion_h']
y1 = Xdata[target].values
Xdata.drop(target, axis=1, inplace=True)
y = y1[:, 0:1].astype('float32')

# Copia de trabajo de X
df = Xdata.copy()

# ---------------------------------------------------------
# Etapa 2: tipificar columnas
NUMERIC_COLUMNS = df.select_dtypes(include=['number']).columns.tolist()
CATEGORICAL_COLUMNS = df.select_dtypes(include=['object', 'category']).columns.tolist()

# ---------------------------------------------------------
# Etapa 3: imputación numérica
max_values = {}
for col in NUMERIC_COLUMNS:
    max_value = pd.to_numeric(df[col], errors='coerce').max()
    if pd.isna(max_value):
        max_value = 0.0
    max_values[col] = max_value
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(-10.0 * max_value)

# ---------------------------------------------------------
# Etapa 4: codificación categórica
label_encoders = {}
categorical_dims = {}
for col in CATEGORICAL_COLUMNS:
    enc = LabelEncoder()
    s = df[col].astype(str).fillna("no aplica")
    enc.fit(s)
    df[col] = enc.transform(s)
    label_encoders[col] = enc
    categorical_dims[col] = len(enc.classes_)

# ---------------------------------------------------------
# Etapa 5: construir matrices X, y
unused_feat = []
# Si Xdata NO incluye el target, basta con tomar todas las columnas
features = [c for c in df.columns if c not in unused_feat]
X = df[features].values.astype('float32')


Collecting openTSNE
  Downloading openTSNE-1.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.8 kB)
Downloading openTSNE-1.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openTSNE
Successfully installed openTSNE-1.0.2
Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 



# Incorporación de nuevas características

In [2]:
Xdata = df = pd.read_pickle('/content/CHEC/SuperEventos_Criticidad_AguasAbajo_CODEs.pkl')
Xdata = Xdata[Xdata['duracion_h'] <= 100]
Xdata = Xdata.iloc[:1000]
Rayos=pd.read_pickle('/content/CHEC/Data_CHEC/Rayos.pkl')
Rayos['FECHA'] = pd.to_datetime(Rayos['FECHA'])
Vegetacion=pd.read_pickle('/content/CHEC/Data_CHEC/Vegetacion.pkl')
Vegetacion['LATITUD'] = Vegetacion['LATITUD'].astype(float)
Vegetacion['LONGITUD'] = Vegetacion['LONGITUD'].astype(float)

In [3]:
Rayos.head()

Unnamed: 0,ID,LATITUD,LONGITUD,ALTITUD,TIPO,CORRIENTE,ERROR,CODE,FPARENT,DISTANCIA_A_NODO,FECHA,DEP,MUN
0,202009191522446,4.9487,-75.5995,124,2,-5.3,16,1657,CHA23L16,78.252413,2020-09-03 14:15:22,CALDAS,VILLAMARÍA
1,2020076142486,4.9488,-75.7033,0,1,-5.8,61,58412,INS23L13,24.710599,2020-07-10 01:01:42,RISARALDA,SANTA ROSA DE CABAL
2,20200784256395,4.9488,-75.6958,66,2,5.2,16,4244,INS23L13,35.126961,2020-07-04 03:42:56,RISARALDA,SANTA ROSA DE CABAL
3,202008193652495,4.6288,-75.6113,101,2,5.0,3,A186,ROS40L21,289.085969,2020-08-22 14:36:52,QUINDÍO,CIRCASIA
4,20200823350497,4.9488,-76.0051,0,1,-24.7,145,36021,BOA23L14,354.30504,2020-08-13 18:35:04,RISARALDA,BALBOA


In [4]:
Rayos.columns

Index(['ID', 'LATITUD', 'LONGITUD', 'ALTITUD', 'TIPO', 'CORRIENTE', 'ERROR',
       'CODE', 'FPARENT', 'DISTANCIA_A_NODO', 'FECHA', 'DEP', 'MUN'],
      dtype='object')

In [5]:
Vegetacion.head()

Unnamed: 0,NOM_COMUN,TIPO_VEGET,ESTADO_INICIAL,FECHA,LADO_RED,DAP_ESTIM,LONG_INTER,TIPO_INTER,NIVEL_RIES,CIRCUITO_TRAMO,NODO_1,NODO_2,LONGITUD,LATITUD,DEP,MUN
0,YARUMO,Bosque natural,Nuevo,2023-11-30,Debajo,22,6,Rocería,Medio,ESM40L27,A06084,A06085,-75.714305,5.01305,RISARALDA,MARSELLA
1,GUADUA,Guadual,Nuevo,2023-11-30,Debajo,11,6,Tala,Medio,ESM40L27,A06089,A06090,-75.726387,5.023843,RISARALDA,MARSELLA
2,MANDARINO,Frutales,Nuevo,2023-11-30,Debajo,28,6,Poda,Alto,ESM40L28,A06059,A06060,-75.645768,4.992025,CALDAS,CHINCHINÁ
3,GUAMO,Cafe/sombrio,Nuevo,2023-11-30,Debajo,45,6,Poda,Medio,ESM40L27,A06070,A06071,-75.674651,4.993295,CALDAS,CHINCHINÁ
4,GUADUA,Guadual,Nuevo,2023-11-30,Debajo,12,6,Tala,Medio,ESM40L27,A06087,A06088,-75.723988,5.018694,RISARALDA,MARSELLA


In [6]:
Vegetacion.columns

Index(['NOM_COMUN', 'TIPO_VEGET', 'ESTADO_INICIAL', 'FECHA', 'LADO_RED',
       'DAP_ESTIM', 'LONG_INTER', 'TIPO_INTER', 'NIVEL_RIES', 'CIRCUITO_TRAMO',
       'NODO_1', 'NODO_2', 'LONGITUD', 'LATITUD', 'DEP', 'MUN'],
      dtype='object')

In [7]:
Xdata.shape

(1000, 355)

In [8]:
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
from pandas.api.types import is_numeric_dtype
from ast import literal_eval
from tqdm import tqdm

def enriquecer_eventos_con_rayos_y_vegetacion(
    Eventos: pd.DataFrame,
    Rayos: pd.DataFrame,
    Vegetacion: pd.DataFrame,
    *,
    radio_rayos: float = 0.005,        # ~0.5 km en grados aprox.
    ventana_dias: int = 1,             # [inicio - ventana_dias, inicio]
    radio_vegetacion: float = 0.0003,  # ~30 m aprox. en grados
    veg_vars: list | None = None,      # variables de interés en Vegetación
    usar_tqdm: bool = True,
    col_lat: str = 'LATITUD',
    col_lon: str = 'LONGITUD'
) -> pd.DataFrame:
    """
    Para cada evento, toma TODAS las (LATITUD,LONGITUD) únicas de su municipio (MUN)
    como puntos de consulta y busca alrededor:
      - RAYOS: dentro de radio_rayos y en la ventana temporal [inicio - ventana_dias, inicio].
      - VEGETACIÓN: dentro de radio_vegetacion (solo espacial).

    Vegetación:
      * 'conteo_vegetacion'
      * Para cada var en veg_vars:
          - Si es numérica (o numérico-like): {mean, median, min, max, std} (std=0 si 1 dato)
          - Si es categórica: {mode}. Si var == 'NOM_COMUN' -> columna 'nombre_comun_mas_frecuente'
    """

    # ---------------------------
    # Normalización de insumos
    # ---------------------------
    Eventos = Eventos.copy()

    # Asegurar tipos numéricos de lat/lon y fecha en Rayos / Vegetación
    for df in (Rayos, Vegetacion, Eventos):
        for c in (col_lat, col_lon):
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors='coerce')

    if 'FECHA' in Rayos.columns:
        Rayos['FECHA'] = pd.to_datetime(Rayos['FECHA'], errors='coerce')
    if 'inicio' in Eventos.columns:
        Eventos['inicio'] = pd.to_datetime(Eventos['inicio'], errors='coerce')

    if 'MUN' not in Eventos.columns:
        raise ValueError("Eventos debe contener la columna 'MUN'.")

    # ---------------------------
    # Helpers
    # ---------------------------
    def _build_kd_by_group(df, key_col='MUN', lat=col_lat, lon=col_lon):
        """Construye KDTree por grupo, alineado a índices tras dropna."""
        trees = {}
        for k, g in df.groupby(key_col):
            mask = g[[lat, lon]].notna().all(axis=1)
            g_f = g.loc[mask]
            if not g_f.empty:
                coords = g_f[[lat, lon]].to_numpy()
                trees[k] = (cKDTree(coords), g_f)
        return trees

    def _query_indices_for_points(tree, points, r):
        """Une índices cercanos para múltiples puntos."""
        idxs = set()
        for (lat, lon) in points:
            try:
                lat = float(lat); lon = float(lon)
            except Exception:
                continue
            idxs.update(tree.query_ball_point([lat, lon], r=r))
        return idxs

    def _std_safe(s: pd.Series):
        s = pd.to_numeric(s, errors='coerce').dropna()
        return 0.0 if len(s) <= 1 else float(s.std())

    def _is_numeric_like(series: pd.Series) -> bool:
        """Decide si tratar una variable como numérica (dtype numérico o ≥60% convertible)."""
        if is_numeric_dtype(series):
            return True
        s_num = pd.to_numeric(series, errors='coerce')
        return s_num.notna().mean() >= 0.60

    def _veg_out_cols_for(var: str, kind: str) -> list[str]:
        """Define columnas de salida para cada variable."""
        if kind == 'numeric':
            return [f'{var}_mean', f'{var}_median', f'{var}_min', f'{var}_max', f'{var}_std']
        # categórica
        if var == 'NOM_COMUN':
            return ['nombre_comun_mas_frecuente']
        return [f'{var}_mode']

    def _veg_empty_vals_for(kind: str) -> list:
        if kind == 'numeric':
            return [np.nan, np.nan, np.nan, np.nan, 0.0]
        return [np.nan]  # categórica

    # ---------------------------
    # Puntos por municipio (cache)
    # ---------------------------
    pts_cache_mun: dict = {}
    if (col_lat in Eventos.columns) and (col_lon in Eventos.columns):
        tmp = (
            Eventos[['MUN', col_lat, col_lon]]
            .dropna()
            .drop_duplicates()
        )
        for k, g in tmp.groupby('MUN'):
            pts_cache_mun[k] = [ (float(a), float(b)) for a, b in g[[col_lat, col_lon]].to_numpy() ]
    else:
        # Si no hay LATITUD/LONGITUD en Eventos, no hay puntos municipio-latlon
        pts_cache_mun = {}

    # ---------------------------------------------------------------------
    # (1) RAYOS
    # ---------------------------------------------------------------------
    rayos_trees = _build_kd_by_group(Rayos, key_col='MUN', lat=col_lat, lon=col_lon)

    cols_rayos = [
        'ALTITUD_mean', 'ALTITUD_median', 'ALTITUD_min', 'ALTITUD_max', 'ALTITUD_std',
        'CORRIENTE_mean', 'CORRIENTE_median', 'CORRIENTE_min', 'CORRIENTE_max', 'CORRIENTE_std',
        'TIPO_1_count', 'TIPO_2_count'
    ]
    out_rayos = []

    iterator = Eventos.itertuples()
    pbar = tqdm(total=len(Eventos), desc='Rayos (por lat/lon de municipio)') if usar_tqdm else None
    for ev in iterator:
        mun = getattr(ev, 'MUN', None)
        inicio = pd.to_datetime(getattr(ev, 'inicio', pd.NaT), errors='coerce')
        puntos = pts_cache_mun.get(mun, [])

        if not mun or mun not in rayos_trees or not puntos or pd.isna(inicio):
            out_rayos.append([np.nan]*len(cols_rayos))
            if pbar: pbar.update(1)
            continue

        _, rayos_mun = rayos_trees[mun]
        rayos_mun = rayos_mun.dropna(subset=[col_lat, col_lon, 'FECHA'])
        if rayos_mun.empty:
            out_rayos.append([np.nan]*len(cols_rayos))
            if pbar: pbar.update(1)
            continue

        rayos_temp = rayos_mun[(rayos_mun['FECHA'] >= inicio - pd.Timedelta(days=ventana_dias)) &
                               (rayos_mun['FECHA'] <= inicio)]
        if rayos_temp.empty:
            out_rayos.append([np.nan]*len(cols_rayos))
            if pbar: pbar.update(1)
            continue

        tree_temp = cKDTree(rayos_temp[[col_lat, col_lon]].to_numpy())
        idxs = _query_indices_for_points(tree_temp, puntos, r=radio_rayos)
        if not idxs:
            out_rayos.append([np.nan]*len(cols_rayos))
            if pbar: pbar.update(1)
            continue

        sub = rayos_temp.iloc[list(idxs)].copy()
        if 'ALTITUD' in sub.columns:
            sub['ALTITUD'] = pd.to_numeric(sub['ALTITUD'], errors='coerce')
        else:
            sub['ALTITUD'] = np.nan

        if 'CORRIENTE' in sub.columns:
            sub['CORRIENTE'] = pd.to_numeric(sub['CORRIENTE'], errors='coerce').abs()
        else:
            sub['CORRIENTE'] = np.nan

        tipo_col = 'TIPO' if 'TIPO' in sub.columns else None
        out_rayos.append([
            sub['ALTITUD'].mean(), sub['ALTITUD'].median(), sub['ALTITUD'].min(), sub['ALTITUD'].max(), _std_safe(sub['ALTITUD']),
            sub['CORRIENTE'].mean(), sub['CORRIENTE'].median(), sub['CORRIENTE'].min(), sub['CORRIENTE'].max(), _std_safe(sub['CORRIENTE']),
            (sub[tipo_col] == 1).sum() if tipo_col else np.nan,
            (sub[tipo_col] == 2).sum() if tipo_col else np.nan,
        ])

        if pbar: pbar.update(1)
    if pbar: pbar.close()

    df_rayos = pd.DataFrame(out_rayos, columns=cols_rayos, index=Eventos.index)
    Eventos.loc[df_rayos.index, df_rayos.columns] = df_rayos

    # ---------------------------------------------------------------------
    # (2) VEGETACIÓN (dinámico por veg_vars)
    # ---------------------------------------------------------------------
    if veg_vars is None:
        veg_vars = ['NOM_COMUN'] if 'NOM_COMUN' in Vegetacion.columns else []

    # Clasificar variables (forzar 'NOM_COMUN' como categórica)
    veg_specs = []
    for var in veg_vars:
        if var not in Vegetacion.columns:
            veg_specs.append((var, 'missing'))
        else:
            if var == 'NOM_COMUN':
                kind = 'cat'
            else:
                kind = 'numeric' if _is_numeric_like(Vegetacion[var]) else 'cat'
            veg_specs.append((var, kind))

    # Armar columnas de salida
    cols_veg = ['conteo_vegetacion']
    for var, kind in veg_specs:
        cols_veg += _veg_out_cols_for(var, kind if kind != 'missing' else 'cat')

    veg_trees = _build_kd_by_group(Vegetacion, key_col='MUN', lat=col_lat, lon=col_lon)
    out_veg = []

    def _compute_veg_row(sub_df: pd.DataFrame) -> list:
        row_vals = [len(sub_df)]  # conteo_vegetacion
        for var, kind in veg_specs:
            if var not in sub_df.columns or sub_df.empty:
                row_vals += _veg_empty_vals_for(kind if kind != 'missing' else 'cat')
                continue

            if kind == 'numeric':
                s = pd.to_numeric(sub_df[var], errors='coerce')
                row_vals += [
                    s.mean(), s.median(), s.min(), s.max(),
                    (0.0 if s.dropna().shape[0] <= 1 else float(s.std()))
                ]
            else:  # categórica (incluye NOM_COMUN)
                s = sub_df[var].dropna()
                moda = s.mode()
                row_vals += [ (moda.iloc[0] if not moda.empty else np.nan) ]
        return row_vals

    iterator = Eventos.itertuples()
    pbar = tqdm(total=len(Eventos), desc='Vegetación (por lat/lon de municipio)') if usar_tqdm else None
    for ev in iterator:
        mun = getattr(ev, 'MUN', None)
        puntos = pts_cache_mun.get(mun, [])

        if not mun or mun not in veg_trees or not puntos:
            empty_vals = [0]
            for _, kind in veg_specs:
                empty_vals += _veg_empty_vals_for(kind if kind != 'missing' else 'cat')
            out_veg.append(empty_vals)
            if pbar: pbar.update(1)
            continue

        _, veg_mun = veg_trees[mun]
        veg_mun = veg_mun.dropna(subset=[col_lat, col_lon])
        if veg_mun.empty:
            empty_vals = [0]
            for _, kind in veg_specs:
                empty_vals += _veg_empty_vals_for(kind if kind != 'missing' else 'cat')
            out_veg.append(empty_vals)
            if pbar: pbar.update(1)
            continue

        tree_veg = cKDTree(veg_mun[[col_lat, col_lon]].to_numpy())
        idxs = _query_indices_for_points(tree_veg, puntos, r=radio_vegetacion)
        if not idxs:
            empty_vals = [0]
            for _, kind in veg_specs:
                empty_vals += _veg_empty_vals_for(kind if kind != 'missing' else 'cat')
            out_veg.append(empty_vals)
            if pbar: pbar.update(1)
            continue

        sub = veg_mun.iloc[list(idxs)]
        out_veg.append(_compute_veg_row(sub))
        if pbar: pbar.update(1)
    if pbar: pbar.close()

    df_veg = pd.DataFrame(out_veg, columns=cols_veg, index=Eventos.index)
    Eventos.loc[df_veg.index, df_veg.columns] = df_veg

    return Eventos


In [9]:
Xdata=enriquecer_eventos_con_rayos_y_vegetacion(Xdata, Rayos, Vegetacion,ventana_dias= 24,veg_vars=['NOM_COMUN','ESTADO_INICIAL','LADO_RED','DAP_ESTIM','LONG_INTER','TIPO_INTER', 'NIVEL_RIES'])
Xdata.to_pickle('SuperEventos_Criticidad_AguasAbajo_CODEs.pkl')
Xdata.shape

Rayos (por lat/lon de municipio): 100%|██████████| 1000/1000 [00:35<00:00, 28.54it/s]
Vegetación (por lat/lon de municipio): 100%|██████████| 1000/1000 [00:06<00:00, 163.73it/s]


(1000, 369)

In [10]:
Xdata.iloc[:,-14:]

Unnamed: 0,ESTADO_INICIAL_mode,LADO_RED_mode,DAP_ESTIM_mean,DAP_ESTIM_median,DAP_ESTIM_min,DAP_ESTIM_max,DAP_ESTIM_std,LONG_INTER_mean,LONG_INTER_median,LONG_INTER_min,LONG_INTER_max,LONG_INTER_std,TIPO_INTER_mode,NIVEL_RIES_mode
0,Ejecutado,Debajo,3.000000,0.0,0.0,9.0,5.196152,2.000000,0.0,0.0,6.0,3.464102,Poda,Alto
1,Ejecutado,Debajo,3.000000,0.0,0.0,9.0,5.196152,2.000000,0.0,0.0,6.0,3.464102,Poda,Alto
2,Ejecutado,Debajo,3.000000,0.0,0.0,9.0,5.196152,2.000000,0.0,0.0,6.0,3.464102,Poda,Alto
3,Ejecutado,Debajo,3.000000,0.0,0.0,9.0,5.196152,2.000000,0.0,0.0,6.0,3.464102,Poda,Alto
4,Ejecutado,Debajo,3.000000,0.0,0.0,9.0,5.196152,2.000000,0.0,0.0,6.0,3.464102,Poda,Alto
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Ejecutado,Debajo,31.500000,26.0,1.0,70.0,20.277619,7.666667,10.0,2.0,10.0,3.113996,Poda,Alto
996,Ejecutado,Debajo,21.333333,20.0,16.0,28.0,6.110101,2.666667,2.0,2.0,4.0,1.154701,Poda,Alto
997,Ejecutado,Debajo,20.555556,20.0,10.0,30.0,6.821127,4.222222,4.0,0.0,10.0,2.773886,Poda,Alto
998,Ejecutado,Debajo,20.555556,20.0,10.0,30.0,6.821127,4.222222,4.0,0.0,10.0,2.773886,Poda,Alto


In [11]:
Dur_h = Xdata['duracion_h'].values
SAIDI = Xdata['SAIDI'].values
df1=Xdata.copy()
# Eliminar columnas no utilizadas
Xdata.drop(['inicio_evento', 'h0-solar_rad', 'h0-uv', 'h1-solar_rad', 'h1-uv', 'h2-solar_rad', 'h2-uv', 'h3-solar_rad', 'h3-uv',
            'h4-solar_rad', 'h4-uv', 'h5-solar_rad', 'h5-uv', 'h19-solar_rad', 'h19-uv', 'h20-solar_rad', 'h20-uv',
            'h21-solar_rad', 'h21-uv', 'h22-solar_rad', 'h22-uv', 'h23-solar_rad', 'h23-uv', 'evento', 'fin', 'inicio',
            'cnt_usus', 'DEP', 'MUN', 'FECHA', 'NIVEL_C', 'VALOR_C', 'TRAMOS_AGUAS_ABAJO', 'EQUIPOS_PUNTOS',
            'PUNTOS_POLIGONO', 'LONGITUD2', 'LATITUD2', 'FECHA_C','TRAMOS_AGUAS_ABAJO_CODES','ORDER_'],
           inplace=True, axis=1)

# Definir la variable objetivo y eliminarla del conjunto de características
target = ['SAIFI', 'SAIDI', 'duracion_h']
y1 = Xdata[target].values
Xdata.drop(target, axis=1, inplace=True)
y = y1[:, 0:1].astype('float32')

# Copia de trabajo de X
df = Xdata.copy()

# ---------------------------------------------------------
# Etapa 2: tipificar columnas
NUMERIC_COLUMNS = df.select_dtypes(include=['number']).columns.tolist()
CATEGORICAL_COLUMNS = df.select_dtypes(include=['object', 'category']).columns.tolist()

# ---------------------------------------------------------
# Etapa 3: imputación numérica
max_values = {}
for col in NUMERIC_COLUMNS:
    max_value = pd.to_numeric(df[col], errors='coerce').max()
    if pd.isna(max_value):
        max_value = 0.0
    max_values[col] = max_value
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(-10.0 * max_value)

# ---------------------------------------------------------
# Etapa 4: codificación categórica
label_encoders = {}
categorical_dims = {}
for col in CATEGORICAL_COLUMNS:
    enc = LabelEncoder()
    s = df[col].astype(str).fillna("no aplica")
    enc.fit(s)
    df[col] = enc.transform(s)
    label_encoders[col] = enc
    categorical_dims[col] = len(enc.classes_)

# ---------------------------------------------------------
# Etapa 5: construir matrices X, y
unused_feat = []
# Si Xdata NO incluye el target, basta con tomar todas las columnas
features = [c for c in df.columns if c not in unused_feat]
X = df[features].values.astype('float32')
# Etapa 6: clases auxiliares para estratificación
try:
    # usar etiqueta externa si existe
    y_categorized = df1['NIVEL_C'].values.astype(int)
except Exception:
    # fallback: terciles del objetivo
    percentiles = np.percentile(y[:, 0], [33.33, 66.66])
    y_categorized = np.digitize(y[:, 0].flatten(), bins=percentiles).astype(int)

# ---------------------------------------------------------
# Etapa 7: escalar objetivo (regresión)
scaler = MinMaxScaler()
y_scaled = scaler.fit_transform(y)

# ---------------------------------------------------------
# Etapa 8: split train/test estratificado
X_train, X_test, y_train, y_test, ycat_train, ycat_test = train_test_split(
    X, y_scaled, y_categorized, test_size=0.20, random_state=42, stratify=y_categorized
)

# Etapa 8b: split train/valid estratificado por percentiles de y_train
percentiles_t = np.percentile(y_train[:, 0], [25, 50, 75])
y_categorized_t = np.digitize(y_train[:, 0].flatten(), bins=percentiles_t).astype(int)

X_train, X_valid, y_train, y_valid, ycat_train, ycat_valid = train_test_split(
    X_train, y_train, ycat_train, test_size=0.20, random_state=42, stratify=y_categorized_t
)

# Comprobaciones rápidas
print(X.shape, y.shape)
print("Train/Valid/Test:", X_train.shape, X_valid.shape, X_test.shape)

(1000, 326) (1000, 1)
Train/Valid/Test: (640, 326) (160, 326) (200, 326)


![Logo UNAL CHEC](https://miro.medium.com/v2/resize:fit:1100/format:webp/0*lu62RCEko0VYe-YZ)



In [None]:
import optuna
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler())

study.optimize(objective_regression, n_trials=15)

print("Best hyperparameters for regression: ", study.best_params)
print("Best mae: ", study.best_value)
par = study.best_params


[I 2025-09-05 05:44:49,489] A new study created in memory with name: no-name-07c23c89-d6ae-4a26-8cea-0e5d421f516a


epoch 0  | loss: 271.92625| train_mae: 25.25287| valid_mae: 24.57386|  0:00:01s
epoch 1  | loss: 116.51288| train_mae: 3.98531 | valid_mae: 3.92739 |  0:00:02s
epoch 2  | loss: 153.20446| train_mae: 5.04861 | valid_mae: 5.97843 |  0:00:03s
epoch 3  | loss: 31.68513| train_mae: 13.03509| valid_mae: 12.8926 |  0:00:04s
epoch 4  | loss: 37.5296 | train_mae: 10.24223| valid_mae: 10.5753 |  0:00:06s
epoch 5  | loss: 11.63218| train_mae: 1.03958 | valid_mae: 1.04903 |  0:00:07s
epoch 6  | loss: 4.20039 | train_mae: 1.26825 | valid_mae: 1.26902 |  0:00:08s
epoch 7  | loss: 1.4267  | train_mae: 0.4804  | valid_mae: 0.36659 |  0:00:09s
epoch 8  | loss: 2.69515 | train_mae: 0.84933 | valid_mae: 0.88982 |  0:00:10s
epoch 9  | loss: 3.28872 | train_mae: 0.2533  | valid_mae: 0.23891 |  0:00:11s
epoch 10 | loss: 1.31982 | train_mae: 0.08341 | valid_mae: 0.07679 |  0:00:11s
epoch 11 | loss: 1.12629 | train_mae: 0.06308 | valid_mae: 0.06426 |  0:00:12s
epoch 12 | loss: 1.00371 | train_mae: 0.05701 | v

[I 2025-09-05 05:46:51,285] Trial 0 finished with value: 0.16774559617042542 and parameters: {'n_d': 25, 'n_a': 11, 'n_steps': 3, 'gamma': 1.2767044583220173, 'lambda_sparse': 0.00021743503701894743, 'batch_size': 256, 'mask_type': 'entmax', 'emb': 9, 'momentum': 0.8378948181524486, 'learning_rate': 0.0026985781664567872, 'weight_decay': 4.039364356411261e-05, 'scheduler_gamma': 0.9852634307945352, 'step_size': 11, 'virtual_batch_size': 32, 'optimizer_type': 'rmsprop', 'p': 0.1794052271724728}. Best is trial 0 with value: 0.16774559617042542.


epoch 99 | loss: 0.16775 | train_mae: 0.01439 | valid_mae: 0.01586 |  0:01:49s
Stop training because you reached max_epochs = 100 with best_epoch = 89 and best_valid_mae = 0.00943




epoch 0  | loss: 14047.09344| train_mae: 91.82583| valid_mae: 89.55322|  0:00:01s
epoch 1  | loss: 6236.75177| train_mae: 85.91447| valid_mae: 82.48672|  0:00:03s
epoch 2  | loss: 8690.83115| train_mae: 38.12731| valid_mae: 37.9627 |  0:00:04s
epoch 3  | loss: 7207.35773| train_mae: 27.89284| valid_mae: 29.19356|  0:00:06s
epoch 4  | loss: 8996.16973| train_mae: 27.57042| valid_mae: 31.06845|  0:00:09s
epoch 5  | loss: 6347.51808| train_mae: 19.80003| valid_mae: 19.29023|  0:00:10s
epoch 6  | loss: 6281.75803| train_mae: 23.48881| valid_mae: 24.07481|  0:00:12s
epoch 7  | loss: 5184.08166| train_mae: 24.10659| valid_mae: 22.98389|  0:00:14s
epoch 8  | loss: 3866.64102| train_mae: 20.60244| valid_mae: 21.74703|  0:00:15s
epoch 9  | loss: 6184.68173| train_mae: 18.44676| valid_mae: 20.38288|  0:00:17s
epoch 10 | loss: 4558.98179| train_mae: 16.69636| valid_mae: 14.88144|  0:00:19s
epoch 11 | loss: 3518.85027| train_mae: 26.76334| valid_mae: 26.16837|  0:00:21s
epoch 12 | loss: 4442.47365

[I 2025-09-05 05:49:55,365] Trial 1 finished with value: 1356.7415557861327 and parameters: {'n_d': 18, 'n_a': 23, 'n_steps': 5, 'gamma': 1.2200593872830134, 'lambda_sparse': 2.1646129449895796e-06, 'batch_size': 64, 'mask_type': 'sparsemax', 'emb': 14, 'momentum': 0.7718995901460693, 'learning_rate': 0.00028328508652435814, 'weight_decay': 2.7419645555209715e-05, 'scheduler_gamma': 0.9709430293139852, 'step_size': 9, 'virtual_batch_size': 32, 'optimizer_type': 'adam', 'p': 0.023050692015054684}. Best is trial 0 with value: 0.16774559617042542.


epoch 0  | loss: 2484.19469| train_mae: 94.69408| valid_mae: 95.32561|  0:00:01s
epoch 1  | loss: 1104.06787| train_mae: 25.20789| valid_mae: 22.6221 |  0:00:03s
epoch 2  | loss: 749.13465| train_mae: 42.2212 | valid_mae: 44.70208|  0:00:05s
epoch 3  | loss: 2428.00749| train_mae: 36.31425| valid_mae: 35.06764|  0:00:07s
epoch 4  | loss: 1788.0575| train_mae: 37.61064| valid_mae: 36.10211|  0:00:09s
epoch 5  | loss: 524.56212| train_mae: 33.63172| valid_mae: 40.86635|  0:00:11s
epoch 6  | loss: 465.49125| train_mae: 30.54578| valid_mae: 36.15289|  0:00:13s
epoch 7  | loss: 387.22532| train_mae: 23.77735| valid_mae: 21.05051|  0:00:14s
epoch 8  | loss: 416.92058| train_mae: 6.76607 | valid_mae: 7.54987 |  0:00:16s
epoch 9  | loss: 260.27273| train_mae: 9.69735 | valid_mae: 11.27608|  0:00:18s
epoch 10 | loss: 265.13235| train_mae: 5.67813 | valid_mae: 7.36142 |  0:00:21s
epoch 11 | loss: 277.66488| train_mae: 4.77812 | valid_mae: 4.5058  |  0:00:22s
epoch 12 | loss: 179.79005| train_mae

[I 2025-09-05 05:53:29,742] Trial 2 finished with value: 1.0823955535888672 and parameters: {'n_d': 29, 'n_a': 16, 'n_steps': 5, 'gamma': 1.7388756312377902, 'lambda_sparse': 3.1534474240913024e-06, 'batch_size': 64, 'mask_type': 'entmax', 'emb': 15, 'momentum': 0.5142848494131601, 'learning_rate': 0.011046385775149133, 'weight_decay': 1.7536412970947171e-06, 'scheduler_gamma': 0.9689897207687074, 'step_size': 5, 'virtual_batch_size': 32, 'optimizer_type': 'adamw', 'p': 0.07198264673935191}. Best is trial 0 with value: 0.16774559617042542.


epoch 0  | loss: 1078.00094| train_mae: 15.37752| valid_mae: 16.66172|  0:00:01s
epoch 1  | loss: 310.02507| train_mae: 19.20212| valid_mae: 24.58351|  0:00:02s
epoch 2  | loss: 213.97578| train_mae: 139.44009| valid_mae: 139.16814|  0:00:04s
epoch 3  | loss: 120.05261| train_mae: 232.64221| valid_mae: 231.66821|  0:00:05s
epoch 4  | loss: 82.79048| train_mae: 60.74608| valid_mae: 61.25182|  0:00:07s
epoch 5  | loss: 54.43558| train_mae: 129.54793| valid_mae: 128.53545|  0:00:08s
epoch 6  | loss: 62.38545| train_mae: 41.90364| valid_mae: 42.27773|  0:00:09s
epoch 7  | loss: 29.40224| train_mae: 8.61691 | valid_mae: 9.24906 |  0:00:10s
epoch 8  | loss: 31.29958| train_mae: 8.46839 | valid_mae: 7.89872 |  0:00:11s
epoch 9  | loss: 22.10001| train_mae: 14.15624| valid_mae: 14.19948|  0:00:13s
epoch 10 | loss: 22.20976| train_mae: 5.01511 | valid_mae: 5.63505 |  0:00:14s
epoch 11 | loss: 9.65101 | train_mae: 7.63587 | valid_mae: 7.54035 |  0:00:16s
epoch 12 | loss: 8.13014 | train_mae: 24.

In [None]:
best_params = par

n_d = best_params['n_d']; n_a = best_params['n_a']; n_steps = best_params['n_steps']
gamma = best_params['gamma']; lambda_sparse = best_params['lambda_sparse']
mask_type = best_params['mask_type']; batch_size = best_params['batch_size']
emb = best_params['emb']; p = best_params['p']
momentum = best_params['momentum']; learning_rate = best_params['learning_rate']
weight_decay = best_params['weight_decay']
scheduler_gamma = best_params['scheduler_gamma']; step_size = best_params['step_size']
virtual_batch_size = best_params['virtual_batch_size']; optimizer_type = best_params['optimizer_type']

# Optimizer config (misma lógica)
optimizer_fn, optimizer_params = optimizer_fn, optimizer_params = build_optimizer(optimizer_type, learning_rate, momentum, weight_decay)


# Aumento y categóricas
aug = RegressionSMOTE(p=p)
cat_idxs = [i for i, f in enumerate(features) if f in CATEGORICAL_COLUMNS]
cat_dims = [categorical_dims[f] for f in features if f in CATEGORICAL_COLUMNS]
cat_emb_dim = [min(emb, (dim + 1)//2) for dim in cat_dims]

clf = CustomTabNetRegressor(
    cat_dims=cat_dims, cat_emb_dim=cat_emb_dim, cat_idxs=cat_idxs,
    n_d=n_d, n_a=n_a, n_steps=n_steps, gamma=gamma, lambda_sparse=lambda_sparse,
    mask_type=mask_type, optimizer_fn=optimizer_fn, optimizer_params=optimizer_params,
    scheduler_params={"gamma": scheduler_gamma, "step_size": step_size},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    momentum=momentum, verbose=True
)

clf.fit(
    X_train=X_train, y_train=y_train[:,0:1],
    eval_set=[(X_train, y_train[:,0:1]), (X_valid, y_valid[:,0:1])],
    eval_name=['train', 'valid'], eval_metric=['mae'],
    loss_fn=my_r2_score_fn,
    max_epochs=200, patience=70,
    batch_size=batch_size, virtual_batch_size=virtual_batch_size,
    num_workers=1, drop_last=False, augmentations=aug,
)




In [None]:
y_pred=clf.predict(X_test)
y_pred_ = scaler.inverse_transform(y_pred)
y_test_ = scaler.inverse_transform(y_test)
# Plot 2
r2_1 = r2_score(y_test, y_pred)
print(f" Test  -> R2={r2_1:.4f}")

In [None]:
torch.save(clf, "model.pth")

In [None]:
M_explain, masks = clf.explain(X)
M_explain, masks = clf.explain(X)
aux=[]
for i in masks:
    aux.append(np.array(masks[i]))
masks=np.array(aux)
mask=masks.sum(axis=0)
mask=mask-np.min(mask)
mask=mask/np.max(mask)
np.save('mask.npy',mask)

In [None]:
min_val = np.min(mask)
max_val = np.max(mask)

# Crear figura
fig, ax = plt.subplots(figsize=(10, 6))

# Gráfico de la máscara
im = ax.imshow(mask, aspect='auto', cmap='viridis', vmin=min_val, vmax=max_val)
ax.set_title("Relevancia", fontsize=14)
ax.set_xlabel("Características", fontsize=12)
ax.set_ylabel("Muestras", fontsize=12)

# Barra de color
cbar = fig.colorbar(im, ax=ax, orientation='vertical', fraction=0.046, pad=0.04)
cbar.set_label("Valores", fontsize=12)

# Mostrar la gráfica
plt.tight_layout()
plt.savefig('Mask.pdf')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.special import softmax

# Asume que `mask`, `y_categorized`, y `columns` ya están definidos
normalized_mask = softmax(mask, axis=1)
class_titles = ['Todas las muestras', 'Riesgo Bajo', 'Riesgo Medio', 'Riesgo Alto', 'Riesgo Muy Alto']
columns=df.columns
# Crear la figura con 1 fila y 5 columnas
fig, axes = plt.subplots(1, 5, figsize=(40, 15))

for selected_class, ax in enumerate(axes):
    if selected_class == 0:
        # Todas las muestras
        column_relevance = np.mean(normalized_mask, axis=0)
        top_20_indices = np.argsort(column_relevance)[-20:]
        top_20_columns = [columns[i] for i in top_20_indices]
        top_20_mask = normalized_mask[:, top_20_indices]
        sns.violinplot(data=top_20_mask, inner="box", cut=0, ax=ax)
        ax.set_title(class_titles[selected_class])
    else:
        # Filtrar por clase
        class_idx = selected_class - 1
        if np.any(y_categorized == class_idx):
            column_relevance = np.mean(normalized_mask[y_categorized == class_idx], axis=0)
            top_20_indices = np.argsort(column_relevance)[-20:]
            top_20_columns = [columns[i] for i in top_20_indices]
            top_20_mask = normalized_mask[y_categorized == class_idx][:, top_20_indices]
            sns.violinplot(data=top_20_mask, inner="box", cut=0, ax=ax)
            ax.set_title(class_titles[selected_class])
        else:
            ax.text(0.5, 0.5, f'No hay datos para\n{class_titles[selected_class]}',
                    fontsize=14, ha='center', va='center')
            ax.set_axis_off()

    ax.set_xticks(range(len(top_20_columns)))
    ax.set_xticklabels(top_20_columns, rotation=90, fontsize=8)
    ax.set_ylabel("Valores", fontsize=10)
    ax.grid(True)

plt.tight_layout()
plt.savefig('violin.pdf')
plt.show()


In [None]:
def process_dataframe(redmt, df, label_encoders, df1, ind,tip,s,scolumns):
    """
    Procesa un DataFrame `redmt` usando coordenadas y LabelEncoders, y devuelve un DataFrame resultante.

    Args:
        redmt (pd.DataFrame): DataFrame base con información de referencia.
        df (pd.DataFrame): DataFrame con coordenadas para encontrar vecinos cercanos.
        label_encoders (dict): Diccionario con `LabelEncoder` para columnas categóricas.
        df1 (pd.DataFrame): DataFrame con la fila de interés.
        ind (int): Índice de la fila de interés en `df1`.

    Returns:
        pd.DataFrame: DataFrame resultante con los datos procesados.
    """
    # Convertir las columnas de fecha
    df1['inicio'] = pd.to_datetime(df1['inicio'])
    df1['FECHA_C'] = df1['inicio'].dt.to_period('M')

    # Seleccionar la fila de interés de `df1` (por índice)
    row_of_interest = df1.loc[[ind]].copy()
    #print('1',row_of_interest[['LATITUD','LONGITUD']].values)
    #row_of_interest[scolumns]=np.nan
    # Vaciar los valores de las columnas en `scolumns`
    #for col in scolumns:
    #    if col in row_of_interest.columns:
    #        row_of_interest[col] = np.nan

    # Extraer las listas de coordenadas y equipos desde la fila de interés
    if s==0:
        #aux = eval(row_of_interest.loc[ind, 'TRAMOS_AGUAS_ABAJO'])
        aux=  list(eval(row_of_interest.loc[ind,'TRAMOS_AGUAS_ABAJO_CODES']))
    else:
        aux = eval(row_of_interest.loc[ind, 'EQUIPOS_PUNTOS'])


    # DataFrame para almacenar las nuevas filas
    new_rows = []
    # Iterar sobre cada elemento de `aux` para filtrar y duplicar
    for i in aux:
        # Filtrar `redmt` según las condiciones dadas
        if s==0:
            filtered_row = redmt[
              (redmt['FECHA_C'] == row_of_interest.loc[ind, 'FECHA_C']) &
              (redmt['equipo_ope']== i)
                ]

        else:
            filtered_row = redmt[
              (redmt['FECHA_C'] == row_of_interest.loc[ind, 'FECHA_C']) &
              (redmt['LATITUD'] == i[0]) &
              (redmt['LONGITUD'] == i[1])]


        #print('2',filtered_row[['LATITUD','LONGITUD']].values)
        # Si hay filas que cumplen la condición, reemplazar columnas en la fila de interés
        if not filtered_row.empty:
            for _, row in filtered_row.iterrows():
                #print(3,redmt.columns)
                # Crear una copia de la fila de interés y reemplazar las columnas correspondientes
                temp_row = row_of_interest.copy()
                temp_row[redmt.columns] = row.values  # Reemplaza las columnas de redmt
                #temp_row['LATITUD'] = np.float64(i[0])  # Asegura precisión en la asignación
                #temp_row['LONGITUD'] = np.float64(i[1])
                new_rows.append(temp_row)
    if not new_rows:
        # Retornar un DataFrame vacío con las columnas esperadas
        aux1=pd.DataFrame(columns=df1.columns)
        aux1.drop(['inicio_evento', 'h0-solar_rad', 'h0-uv', 'h1-solar_rad', 'h1-uv', 'h2-solar_rad', 'h2-uv', 'h3-solar_rad', 'h3-uv',
            'h4-solar_rad', 'h4-uv', 'h5-solar_rad', 'h5-uv', 'h19-solar_rad', 'h19-uv', 'h20-solar_rad', 'h20-uv',
            'h21-solar_rad', 'h21-uv', 'h22-solar_rad', 'h22-uv', 'h23-solar_rad', 'h23-uv', 'evento', 'fin', 'inicio',
            'cnt_usus', 'DEP', 'MUN', 'FECHA', 'NIVEL_C', 'VALOR_C', 'TRAMOS_AGUAS_ABAJO', 'EQUIPOS_PUNTOS',
            'PUNTOS_POLIGONO', 'LONGITUD2', 'LATITUD2', 'FECHA_C','TRAMOS_AGUAS_ABAJO_CODES','ORDER_'],
           inplace=True, axis=1)
        aux1.drop(target, axis=1, inplace=True)
        return pd.DataFrame(columns=scolumns).values,aux1
    # Concatenar todas las nuevas filas generadas
    result_df = pd.concat(new_rows, ignore_index=True)
    result_df=enriquecer_eventos_con_rayos_y_vegetacion(result_df, Rayos, Vegetacion,ventana_dias= 24,veg_vars=['NOM_COMUN','ESTADO_INICIAL','LADO_RED','DAP_ESTIM','LONG_INTER','TIPO_INTER', 'NIVEL_RIES'])

    bad_types = (list, dict, set)
    hashable_cols = [c for c in result_df.columns
                    if not result_df[c].apply(lambda v: isinstance(v, bad_types)).any()]
    result_df.drop_duplicates(subset=hashable_cols, inplace=True)

    result_df['LATITUD'] = result_df['LATITUD'].astype('float64')
    result_df['LONGITUD'] = result_df['LONGITUD'].astype('float64')

    result_df1 =result_df.copy()
    result_df1['LATITUD'] = result_df1['LATITUD'].astype('float64')
    result_df1['LONGITUD'] = result_df1['LONGITUD'].astype('float64')


    # Codificar las columnas categóricas usando los LabelEncoder definidos en `label_encoders`
    for col, le in label_encoders.items():
      if col in result_df.columns:  # Verificar que la columna exista en `result_df`
        if col in redmt.columns:  # Si la columna pertenece a redmt
            result_df[col] = result_df[col].apply(
                lambda x: le.transform([x])[0] if x in le.classes_ else np.nan
            )
        else:  # Si no pertenece a redmt
            result_df[col] = result_df[col].fillna("no aplica")  # Rellenar NaN con "no aplica"
            result_df[col] = result_df[col].apply(
                lambda x: le.transform([x])[0] if x in le.classes_ else 0
            )


    # Reemplazar valores NaN en columnas categóricas usando el valor más cercano
    categorical_columns = redmt.select_dtypes(include=['object', 'category']).columns

    # Preparar las coordenadas (LATITUD y LONGITUD) de `df`
    df_coords = df[['LATITUD', 'LONGITUD']].dropna()

    # Modelo de vecinos más cercanos
    nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto').fit(df_coords)

    # Recorrer las columnas categóricas de result_df
    for col in result_df.columns:
        if col in redmt.columns:  # Verificar si la columna pertenece a redmt
            nan_indices = result_df[result_df[col].isna()].index  # Índices con NaN en la columna
            for idx in nan_indices:
                # Coordenadas de la fila con NaN
                query_coords = result_df.loc[idx, ['LATITUD', 'LONGITUD']].values.reshape(1, -1)
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", message="X does not have valid feature names, but NearestNeighbors was fitted with feature names")
                    distance, neighbor_idx = nbrs.kneighbors(query_coords)
                closest_idx = df_coords.iloc[neighbor_idx[0][0]].name
                # Reemplazar el valor NaN con el valor del vecino más cercano
                result_df.at[idx, col] = df.at[closest_idx, col]
        #else:
            #result_df[col].astype(str).fillna("no aplica",inplace=True)
    for col in NUMERIC_COLUMNS:
        max_value = max_values[col]
        # Rellenar valores y ajustar el tipo de datos
        result_df[col] = result_df[col].fillna(-10 * max_value).astype('float64')

    result_df['tipo_equi_ope'] = tip
    result_df.drop(['inicio_evento', 'h0-solar_rad', 'h0-uv', 'h1-solar_rad', 'h1-uv', 'h2-solar_rad', 'h2-uv', 'h3-solar_rad', 'h3-uv',
            'h4-solar_rad', 'h4-uv', 'h5-solar_rad', 'h5-uv', 'h19-solar_rad', 'h19-uv', 'h20-solar_rad', 'h20-uv',
            'h21-solar_rad', 'h21-uv', 'h22-solar_rad', 'h22-uv', 'h23-solar_rad', 'h23-uv', 'evento', 'fin', 'inicio',
            'cnt_usus', 'DEP', 'MUN', 'FECHA', 'NIVEL_C', 'VALOR_C', 'TRAMOS_AGUAS_ABAJO', 'EQUIPOS_PUNTOS',
            'PUNTOS_POLIGONO', 'LONGITUD2', 'LATITUD2', 'FECHA_C','TRAMOS_AGUAS_ABAJO_CODES','ORDER_'],
           inplace=True, axis=1)
    result_df.drop(target, axis=1, inplace=True)
    result_df1.drop(['inicio_evento', 'h0-solar_rad', 'h0-uv', 'h1-solar_rad', 'h1-uv', 'h2-solar_rad', 'h2-uv', 'h3-solar_rad', 'h3-uv',
            'h4-solar_rad', 'h4-uv', 'h5-solar_rad', 'h5-uv', 'h19-solar_rad', 'h19-uv', 'h20-solar_rad', 'h20-uv',
            'h21-solar_rad', 'h21-uv', 'h22-solar_rad', 'h22-uv', 'h23-solar_rad', 'h23-uv', 'evento', 'fin', 'inicio',
            'cnt_usus', 'DEP', 'MUN', 'FECHA', 'NIVEL_C', 'VALOR_C', 'TRAMOS_AGUAS_ABAJO', 'EQUIPOS_PUNTOS',
            'PUNTOS_POLIGONO', 'LONGITUD2', 'LATITUD2', 'FECHA_C','TRAMOS_AGUAS_ABAJO_CODES','ORDER_'],
           inplace=True, axis=1)
    result_df1.drop(target, axis=1, inplace=True)
    result_df1.drop_duplicates(inplace=True)
    result_df.drop_duplicates(inplace=True)
    return result_df.values,result_df1

In [None]:

redmt = pd.read_pickle('/content/CHEC/Data_CHEC/REDMT_1.pkl')
redmt['FECHA']=pd.to_datetime(redmt['FECHA'])
redmt['FECHA_C']=redmt['FECHA'].dt.to_period('M')
redmt.rename(columns={'CODE':'equipo_ope'}, inplace=True)
apoyos = pd.read_pickle('/content/CHEC/Data_CHEC/APOYOS.pkl')
apoyos['FECHA']=pd.to_datetime(apoyos['FECHA'])
apoyos['FECHA_C']=apoyos['FECHA'].dt.to_period('M')
apoyos.rename(columns={'CODE':'equipo_ope'}, inplace=True)
switches = pd.read_pickle('/content/CHEC/Data_CHEC/SWITCHES.pkl')
switches['FECHA']=pd.to_datetime(switches['FECHA'])
switches['FECHA_C']=switches['FECHA'].dt.to_period('M')
trafos = pd.read_pickle('/content/CHEC/Data_CHEC/TRAFOS.pkl')
trafos['FECHA']=pd.to_datetime(trafos['FECHA'])
trafos['FECHA_C']=trafos['FECHA'].dt.to_period('M')
scolumns = list(
    set(redmt.columns)
    .union(set(apoyos.columns))
    .union(set(trafos.columns))
    .union(set(switches.columns))
)

In [None]:
ind=0
a1,a1_df = process_dataframe(trafos, df, label_encoders, df1, ind=ind,tip=2,s=1,scolumns=scolumns)
del trafos
a2,a2_df = process_dataframe(switches, df, label_encoders, df1, ind=ind,tip=0,s=1,scolumns=scolumns)
del switches
a3,a3_df = process_dataframe(redmt, df, label_encoders, df1, ind=ind,tip=1,s=0,scolumns=scolumns)
del redmt
a4,a4_df = process_dataframe(apoyos, df, label_encoders, df1, ind=ind,tip=2,s=1,scolumns=scolumns)
del apoyos
columns=df.columns
arrays_to_concatenate = [arr for arr in (a1, a2, a3, a4) if arr.size > 0]
a = np.concatenate(arrays_to_concatenate, axis=0)
y_e=clf.predict(a)
y_e=y_e.flatten()

In [None]:
pd.concat([a1_df,a2_df,a3_df,a4_df],axis=0)['LONG_INTER_median']