<a href="https://colab.research.google.com/github/Sxmuu/TG-Samuel-P/blob/main/Scripts/Python/notebooks/ML/Modelos_ML_version2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
# ========= 0) INSUMOS Y SUPUESTOS =========
import numpy as np
import pandas as pd

url = "https://raw.githubusercontent.com/Sxmuu/TG-Samuel-P/main/Databases/Contam/Final/df_final.xlsx"

df_raw = pd.read_excel(url, engine="openpyxl")  # instala openpyxl si hace falta

# Columnas esperadas
expected_cols = {
    "Estacion","Date","PM25","PM10","CO",
    "PM25_imputed","PM10_imputed","CO_imputed",
    "PM25_was_imputed","PM10_was_imputed","CO_was_imputed",
    "lat","lon","Altitud","Temp","Hum","WindSpeed","Precip","Pres",
    "Localidad"
}

missing = expected_cols - set(df_raw.columns)
if missing:
    raise ValueError(f"Faltan columnas en el dataset: {sorted(missing)}")

# Normalizar tipo de fecha (diaria)
df_raw["Date"] = pd.to_datetime(df_raw["Date"], errors="coerce").dt.normalize()
if df_raw["Date"].isna().any():
    raise ValueError("Hay fechas no parseables en 'Date'. Revisa el formato de entrada.")

# Orden lógico
df_raw = df_raw.sort_values(["Estacion","Date"]).reset_index(drop=True)

# Tipos sugeridos (no forzamos categóricos aún por compatibilidad con OneHotEncoder)
numeric_cols = ["PM25","PM10","CO","PM25_imputed","PM10_imputed","CO_imputed",
                "PM25_was_imputed","PM10_was_imputed","CO_was_imputed",
                "lat","lon","Altitud","Temp","Hum","WindSpeed","Precip","Pres"]
df_raw[numeric_cols] = df_raw[numeric_cols].apply(pd.to_numeric, errors="coerce")

# Chequeo rápido de nulidad y dtypes
display(df_raw.dtypes)
null_pct = df_raw.isna().mean().sort_values(ascending=False)*100
display(null_pct.head(20))

print(f"Rango de fechas: {df_raw['Date'].min().date()} → {df_raw['Date'].max().date()}")
print(f"Filas totales: {len(df_raw):,}  |  Estaciones: {df_raw['Estacion'].nunique()}  |  Localidades: {df_raw['Localidad'].nunique()}")


Unnamed: 0,0
Unnamed: 0,int64
Estacion,object
Date,datetime64[ns]
PM25,float64
PM10,float64
CO,float64
PM25_imputed,float64
PM10_imputed,float64
CO_imputed,float64
PM25_was_imputed,int64


Unnamed: 0,0
CO,10.255469
PM10,7.12746
PM25,4.75164
Date,0.0
Unnamed: 0,0.0
Estacion,0.0
PM25_imputed,0.0
PM10_imputed,0.0
CO_imputed,0.0
PM25_was_imputed,0.0


Rango de fechas: 2021-01-01 → 2024-12-30
Filas totales: 21,803  |  Estaciones: 15  |  Localidades: 11


In [15]:
# ---------- Sanity checks de rangos (no elimina filas, solo reporta) ----------
def flag_range(col, lo=None, hi=None):
    s = df_raw[col]
    mask = pd.Series(False, index=s.index)
    if lo is not None: mask |= s < lo
    if hi is not None: mask |= s > hi
    n = int(mask.sum())
    if n > 0:
        print(f"[WARN] {col}: {n} valores fuera de rango (~{100*n/len(s):.2f}%). Ejemplos:")
        display(df_raw.loc[mask, ["Estacion","Date",col]].head(5))

# RANGOS DE REFERENCIA (ajústalos si tienes especificaciones oficiales)
flag_range("PM25", lo=0, hi=500)        # µg/m3
flag_range("PM10", lo=0, hi=800)        # µg/m3
flag_range("CO",   lo=0, hi=50)         # ppm (ajusta si tu CO está en mg/m3)
flag_range("Temp", lo=-10, hi=40)       # °C Bogotá
flag_range("Hum",  lo=0, hi=100)        # %
flag_range("WindSpeed", lo=0, hi=20)    # m/s
flag_range("Precip", lo=0, hi=200)      # mm/día
flag_range("Pres", lo=75, hi=105)       # kPa (≈ 750–1050 hPa)


In [33]:
# ========= 1) CONSISTENCIA PARA MODELADO =========

# Target y columnas clave
TARGET = "PM25"  # el objetivo del modelo

# Variables categóricas espaciales (para One-Hot)
CAT_COLS = ["Localidad","Estacion"]

# Variables numéricas base (exógenas y estáticas)
NUM_BASE = ["Temp","Hum","WindSpeed","Precip","Pres","lat","lon","Altitud"]

# Copolutantes como features (si estarán disponibles en operación)
USE_COPOLLUTANTS = False  # Cámbialo a False si no los tendrás en producción
if USE_COPOLLUTANTS:
    NUM_BASE += ["PM10_imputed","CO_imputed"]   # usamos las imputadas como features robustas

# Verificación de presencia
for c in CAT_COLS + NUM_BASE + [TARGET]:
    if c not in df_raw.columns:
        raise ValueError(f"Falta la columna requerida para modelado: {c}")

print("Categóricas:", CAT_COLS)
print("Numéricas base:", NUM_BASE)
print("Target:", TARGET)


Categóricas: ['Localidad', 'Estacion']
Numéricas base: ['Temp', 'Hum', 'WindSpeed', 'Precip', 'Pres', 'lat', 'lon', 'Altitud']
Target: PM25


In [34]:
# ========= 2) TARGET OBSERVADO + SPLIT TEMPORAL =========

# Filtrar SOLO filas con PM2.5 observado (y) para entrenamiento/evaluación
df_model = df_raw[df_raw[TARGET].notna()].copy()

# Partición tiempo: Train=2021–2023, Test=2024
cut_date = pd.Timestamp("2024-01-01")
train_df = df_model[df_model["Date"] < cut_date].copy()
test_df  = df_model[df_model["Date"] >= cut_date].copy()

# Reporte
print(f"TRAIN: {train_df['Date'].min().date()} → {train_df['Date'].max().date()}  |  filas: {len(train_df):,}")
print(f"TEST : {test_df['Date'].min().date()} → {test_df['Date'].max().date()}  |  filas: {len(test_df):,}")

# Guardar vistas para siguientes pasos del notebook (features/targets se definirán luego)
X_train_base = train_df[CAT_COLS + NUM_BASE].copy()
y_train = train_df[TARGET].copy()
X_test_base  = test_df[CAT_COLS + NUM_BASE].copy()
y_test  = test_df[TARGET].copy()

display(X_train_base.head(3))
display(X_test_base.head(3))


TRAIN: 2021-01-01 → 2023-12-31  |  filas: 15,577
TEST : 2024-01-01 → 2024-12-30  |  filas: 5,190


Unnamed: 0,Localidad,Estacion,Temp,Hum,WindSpeed,Precip,Pres,lat,lon,Altitud
1,Barrios Unidos,Centro De Alto Rendimiento,19.64,77.8,0.75,3.9,82.28,4.65847,-74.08396,2552
2,Barrios Unidos,Centro De Alto Rendimiento,18.16,84.79,0.77,11.61,82.26,4.65847,-74.08396,2552
3,Barrios Unidos,Centro De Alto Rendimiento,18.37,81.29,1.17,12.97,82.18,4.65847,-74.08396,2552


Unnamed: 0,Localidad,Estacion,Temp,Hum,WindSpeed,Precip,Pres,lat,lon,Altitud
1096,Barrios Unidos,Centro De Alto Rendimiento,19.68,76.92,0.87,0.09,82.36,4.65847,-74.08396,2552
1097,Barrios Unidos,Centro De Alto Rendimiento,19.71,78.73,0.97,0.11,82.37,4.65847,-74.08396,2552
1098,Barrios Unidos,Centro De Alto Rendimiento,19.87,76.6,1.06,0.0,82.42,4.65847,-74.08396,2552


In [35]:
# ========= 3) INGENIERÍA TEMPORAL (CALENDARIO) =========
for df_ in (train_df, test_df):
    # componentes calendarios
    df_["year"] = df_["Date"].dt.year
    df_["month"] = df_["Date"].dt.month
    df_["day_of_year"] = df_["Date"].dt.dayofyear
    df_["dow"] = df_["Date"].dt.weekday          # 0=Lunes ... 6=Domingo
    df_["is_weekend"] = (df_["dow"] >= 5).astype(int)

    # estacionalidad continua anual
    df_["sin_doy"] = np.sin(2 * np.pi * df_["day_of_year"] / 365.25)
    df_["cos_doy"] = np.cos(2 * np.pi * df_["day_of_year"] / 365.25)

# actualizamos los conjuntos base con estas nuevas columnas
CAL_COLS = ["year","month","day_of_year","dow","is_weekend","sin_doy","cos_doy"]

X_train_base = pd.concat([X_train_base.reset_index(drop=True),
                          train_df[CAL_COLS].reset_index(drop=True)], axis=1)
X_test_base  = pd.concat([X_test_base.reset_index(drop=True),
                          test_df[CAL_COLS].reset_index(drop=True)], axis=1)

print("Añadidas columnas de calendario:", CAL_COLS)
display(X_train_base.head(3))


Añadidas columnas de calendario: ['year', 'month', 'day_of_year', 'dow', 'is_weekend', 'sin_doy', 'cos_doy']


Unnamed: 0,Localidad,Estacion,Temp,Hum,WindSpeed,Precip,Pres,lat,lon,Altitud,year,month,day_of_year,dow,is_weekend,sin_doy,cos_doy
0,Barrios Unidos,Centro De Alto Rendimiento,19.64,77.8,0.75,3.9,82.28,4.65847,-74.08396,2552,2021,1,2,5,1,0.034398,0.999408
1,Barrios Unidos,Centro De Alto Rendimiento,18.16,84.79,0.77,11.61,82.26,4.65847,-74.08396,2552,2021,1,3,6,1,0.051584,0.998669
2,Barrios Unidos,Centro De Alto Rendimiento,18.37,81.29,1.17,12.97,82.18,4.65847,-74.08396,2552,2021,1,4,0,0,0.068755,0.997634


In [19]:
# ========= 3) LAGS & ROLLINGS POR LOCALIDAD (SIN FUGA) =========
from typing import List, Tuple

USE_PM25_LAGS_AS_FEATURES = True  # pon False si no usarás lags del target en producción

LAG_COLS_NUM = ["Temp","Hum","WindSpeed","Precip","Pres"]
# Si vas a usar copolutantes en producción, puedes incluir sus versiones imputadas:
if "PM10_imputed" in X_train_base.columns: LAG_COLS_NUM += ["PM10_imputed"]
if "CO_imputed"   in X_train_base.columns: LAG_COLS_NUM += ["CO_imputed"]
if USE_PM25_LAGS_AS_FEATURES:
    LAG_COLS_NUM = ["PM25"] + LAG_COLS_NUM

LAGS = (1, 3, 7)
ROLL_WINS = (7, 14)

def add_group_lags_rolls(df_all: pd.DataFrame,
                         group_col: str,
                         date_col: str,
                         lag_cols: List[str],
                         lags: Tuple[int,...] = (1,3,7),
                         roll_wins: Tuple[int,...] = (7,14)) -> pd.DataFrame:
    df_all = df_all.sort_values([group_col, date_col]).copy()
    def _apply(g: pd.DataFrame) -> pd.DataFrame:
        g = g.copy()
        for c in lag_cols:
            for L in lags:
                g[f"{c}_lag{L}"] = g[c].shift(L)
        for c in lag_cols:
            for w in roll_wins:
                g[f"{c}_roll{w}"] = g[c].shift(1).rolling(window=w, min_periods=max(3, w//2)).mean()
        return g
    return df_all.groupby(group_col, group_keys=False).apply(_apply)

# Construir df combinado para que los lags del inicio de 2024 queden bien
BASE_COLS = ["Localidad","Estacion","Date"] + list(X_train_base.columns)
tmp_all = pd.concat([
    pd.concat([train_df[BASE_COLS], X_train_base], axis=1),
    pd.concat([test_df[BASE_COLS],  X_test_base],  axis=1)
], axis=0, ignore_index=True)
tmp_all = tmp_all.loc[:, ~tmp_all.columns.duplicated()].copy()

# Agregar lags/rollings por Localidad
tmp_all_lr = add_group_lags_rolls(
    df_all=tmp_all,
    group_col="Localidad",
    date_col="Date",
    lag_cols=[c for c in LAG_COLS_NUM if c in tmp_all.columns],
    lags=LAGS, roll_wins=ROLL_WINS
)

# Separar de nuevo en TRAIN / TEST
train_mask = tmp_all_lr["Date"] < pd.Timestamp("2024-01-01")
tmp_train_lr = tmp_all_lr.loc[train_mask].copy()
tmp_test_lr  = tmp_all_lr.loc[~train_mask].copy()

# Añadir solo las columnas nuevas a las matrices base
new_feat_cols = [c for c in tmp_all_lr.columns if any(s in c for s in ["_lag","_roll"])]
X_train = pd.concat([X_train_base.reset_index(drop=True),
                     tmp_train_lr[new_feat_cols].reset_index(drop=True)], axis=1)
X_test  = pd.concat([X_test_base.reset_index(drop=True),
                     tmp_test_lr[new_feat_cols].reset_index(drop=True)], axis=1)

print(f"Columnas nuevas (lags/rolls, por Localidad): {len(new_feat_cols)}")


Columnas nuevas (lags/rolls, por Localidad): 25


  return df_all.groupby(group_col, group_keys=False).apply(_apply)


In [20]:
# ========= 4) VALIDACIÓN ESPACIOTEMPORAL: LOLO + XGBOOST =========
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV, GroupKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import pandas as pd
import numpy as np

CAT_COLS = ["Localidad","Estacion"]
NUM_COLS = [c for c in X_train.columns if c not in CAT_COLS]

pre = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), NUM_COLS),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), CAT_COLS),
    ],
    remainder="drop"
)

xgb = XGBRegressor(
    tree_method="hist",
    n_estimators=1200, learning_rate=0.05,
    max_depth=6, subsample=0.8, colsample_bytree=0.8,
    reg_alpha=0.0, reg_lambda=1.0,
    random_state=42, n_jobs=-1
)

pipe = Pipeline([("prep", pre), ("xgb", xgb)])

# ---- LOLO: Leave-One-Locality-Out ----
GROUP_BY = "Localidad"
groups = train_df[GROUP_BY].values
n_loc = train_df[GROUP_BY].nunique()
cv = GroupKFold(n_splits=min(5, n_loc))

param_dist = {
    "xgb__max_depth": [4,6,8],
    "xgb__learning_rate": [0.03, 0.05, 0.1],
    "xgb__subsample": [0.7, 0.8, 1.0],
    "xgb__colsample_bytree": [0.7, 0.8, 1.0],
    "xgb__min_child_weight": [1, 3, 5],
    "xgb__reg_alpha": [0.0, 0.1, 1.0],
    "xgb__reg_lambda": [0.5, 1.0, 2.0],
}

search = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    n_iter=30,
    cv=cv.split(X_train, y_train, groups=groups),  # ← CV con bloqueo espacial por Localidad
    scoring="neg_mean_absolute_error",
    n_jobs=-1, verbose=1, random_state=42
)
search.fit(X_train, y_train)

print("Mejor MAE (CV LOLO):", -search.best_score_)
print("Mejores hiperparámetros:", search.best_params_)
best_model = search.best_estimator_

y_pred = best_model.predict(X_test)

# Aseguramos arrays 1D
y_true = np.asarray(y_test).ravel()
y_hat  = np.asarray(y_pred).ravel()

mae  = mean_absolute_error(y_true, y_hat)
rmse = float(np.sqrt(mean_squared_error(y_true, y_hat)))   # <- sin 'squared'
r2   = r2_score(y_true, y_hat)
print(f"[TEST 2024] MAE={mae:.3f}  RMSE={rmse:.3f}  R2={r2:.3f}")

# Métricas por Localidad (útiles para la tesis)
eval_df = test_df[["Date","Localidad","Estacion"]].copy()
eval_df["y_true"] = y_true
eval_df["y_pred"] = y_hat

def _agg(g):
    return pd.Series({
        "MAE": mean_absolute_error(g["y_true"], g["y_pred"]),
        "RMSE": float(np.sqrt(mean_squared_error(g["y_true"], g["y_pred"]))),
        "R2": r2_score(g["y_true"], g["y_pred"])
    })

loc_report = eval_df.groupby("Localidad").apply(_agg).sort_values("MAE")
display(loc_report)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Mejor MAE (CV LOLO): 3.86341651776963
Mejores hiperparámetros: {'xgb__subsample': 0.8, 'xgb__reg_lambda': 1.0, 'xgb__reg_alpha': 1.0, 'xgb__min_child_weight': 3, 'xgb__max_depth': 8, 'xgb__learning_rate': 0.03, 'xgb__colsample_bytree': 0.7}
[TEST 2024] MAE=6.138  RMSE=8.039  R2=0.339


  loc_report = eval_df.groupby("Localidad").apply(_agg).sort_values("MAE")


Unnamed: 0_level_0,MAE,RMSE,R2
Localidad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Suba,4.695722,6.552764,0.420524
San Cristobal,4.769532,6.185521,0.453654
Usme,4.905882,6.377767,0.452877
Barrios Unidos,5.252078,6.549811,0.464329
Engativa,5.318346,7.129455,0.426633
Santa Fe,5.344859,7.314371,0.378121
Kennedy,6.028985,7.207585,0.364775
Fontibon,6.116351,7.840207,0.375673
Ciudad Bolivar,6.502655,7.855347,0.401868
Tunjuelito,7.090837,8.630677,-0.868888


In [21]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --- GLOBAL ---
y_true = np.asarray(y_test).ravel()
y_hat  = np.asarray(y_pred).ravel()
n = y_true.shape[0]

# obtener p a partir del transformador
X_test_t = best_model.named_steps["prep"].transform(X_test)   # matriz numérica final
p = X_test_t.shape[1]

mae  = mean_absolute_error(y_true, y_hat)
rmse = float(np.sqrt(mean_squared_error(y_true, y_hat)))
r2   = r2_score(y_true, y_hat)
r2_adj = 1 - (1 - r2) * (n - 1) / max(1, (n - p - 1))  # evita división por cero

print(f"[TEST 2024] MAE={mae:.3f}  RMSE={rmse:.3f}  R2={r2:.3f}  R2_adj={r2_adj:.3f}")

# --- POR LOCALIDAD (sin DeprecationWarning) ---
eval_df = test_df[["Date","Localidad","Estacion"]].copy()
eval_df["y_true"] = y_true
eval_df["y_pred"] = y_hat

def _agg_local(g: pd.DataFrame) -> pd.Series:
    y = g["y_true"].to_numpy()
    yhat = g["y_pred"].to_numpy()
    n_loc = y.shape[0]
    # p es el mismo (mismo pipeline), aunque n cambia por localidad
    r2_loc = r2_score(y, yhat)
    r2_adj_loc = 1 - (1 - r2_loc) * (n_loc - 1) / max(1, (n_loc - p - 1))
    return pd.Series({
        "MAE": mean_absolute_error(y, yhat),
        "RMSE": float(np.sqrt(mean_squared_error(y, yhat))),
        "R2": r2_loc,
        "R2_adj": r2_adj_loc,
        "n": n_loc
    })

# usar .agg en lugar de .apply para evitar el DeprecationWarning
loc_report = eval_df.groupby("Localidad", group_keys=False).apply(_agg_local).sort_values("MAE")
display(loc_report)


[TEST 2024] MAE=6.138  RMSE=8.039  R2=0.339  R2_adj=0.331


  loc_report = eval_df.groupby("Localidad", group_keys=False).apply(_agg_local).sort_values("MAE")


Unnamed: 0_level_0,MAE,RMSE,R2,R2_adj,n
Localidad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Suba,4.695722,6.552764,0.420524,0.381258,1041.0
San Cristobal,4.769532,6.185521,0.453654,0.322053,341.0
Usme,4.905882,6.377767,0.452877,0.327929,356.0
Barrios Unidos,5.252078,6.549811,0.464329,0.331418,333.0
Engativa,5.318346,7.129455,0.426633,0.275867,318.0
Santa Fe,5.344859,7.314371,0.378121,0.232057,348.0
Kennedy,6.028985,7.207585,0.364775,0.209498,337.0
Fontibon,6.116351,7.840207,0.375673,0.310577,700.0
Ciudad Bolivar,6.502655,7.855347,0.401868,0.266674,359.0
Tunjuelito,7.090837,8.630677,-0.868888,-1.309411,347.0


In [22]:
# ================== COMBO GANADOR ==================
import numpy as np, pandas as pd
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV, GroupKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

# Columnas
CAT_COLS = ["Localidad","Estacion"]
NUM_COLS = [c for c in X_train.columns if c not in CAT_COLS]

# --- Compatibilidad scikit-learn: sparse_output (nuevo) vs sparse (antiguo) ---
try:
    # sklearn >= 1.2
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    # sklearn <= 1.1
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

pre = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), NUM_COLS),
        ("cat", ohe, CAT_COLS),
    ],
    remainder="drop"
)


# 2) Modelo base: XGBoost con regularización más fuerte (menos sobreajuste)
xgb = XGBRegressor(
    tree_method="hist",
    n_estimators=1600,              # más árboles, pero con regularización y lr bajo
    learning_rate=0.03,            # aprendizaje lento y estable
    max_depth=6,                   # menos profundo que 8 (mejor generalización)
    min_child_weight=6,            # evita hojas con pocos puntos
    subsample=0.8,
    colsample_bytree=0.7,
    reg_alpha=4.0,                 # L1 más fuerte
    reg_lambda=4.0,                # L2 más fuerte
    gamma=1.0,                     # penaliza splits pequeños
    random_state=42,
    n_jobs=-1
)

# 3) Pipeline + transformación de objetivo (log1p en train / expm1 en pred)
pipe = Pipeline([("prep", pre), ("xgb", xgb)])
model = TransformedTargetRegressor(
    regressor=pipe,
    func=np.log1p,      # y -> log1p(y)
    inverse_func=np.expm1
)

# 4) CV interna con bloqueo espacial por Localidad (LOLO)
GROUP_BY = "Localidad"
groups = train_df[GROUP_BY].values
cv = GroupKFold(n_splits=min(5, train_df[GROUP_BY].nunique()))

# 5) Espacio de búsqueda (enfatiza regularización)
param_dist = {
    "regressor__xgb__max_depth": [4, 5, 6],
    "regressor__xgb__learning_rate": [0.02, 0.03, 0.05],
    "regressor__xgb__min_child_weight": [5, 6, 8, 10],
    "regressor__xgb__gamma": [0.5, 1.0, 2.0],
    "regressor__xgb__subsample": [0.6, 0.8, 0.9],
    "regressor__xgb__colsample_bytree": [0.6, 0.7, 0.9],
    "regressor__xgb__reg_alpha": [2.0, 4.0, 8.0],
    "regressor__xgb__reg_lambda": [2.0, 4.0, 8.0],
    "regressor__xgb__n_estimators": [1000, 1400, 1800],
}

search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=35,
    cv=cv.split(X_train, y_train, groups=groups),
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
    verbose=1,
    random_state=42
)

search.fit(X_train, y_train)

print("Mejor MAE (CV LOLO):", -search.best_score_)
print("Mejores hiperparámetros:")
best_params = {k: search.best_params_[k] for k in sorted(search.best_params_.keys())}
for k,v in best_params.items():
    print(f"  {k}: {v}")

best_model = search.best_estimator_

# 6) Evaluación externa (TEST 2024) con R2 ajustado
y_pred = best_model.predict(X_test)
y_true = np.asarray(y_test).ravel()
y_hat  = np.asarray(y_pred).ravel()

mae  = mean_absolute_error(y_true, y_hat)
rmse = float(np.sqrt(mean_squared_error(y_true, y_hat)))
r2   = r2_score(y_true, y_hat)

# p = número de columnas efectivas tras el preprocesador (transforma X_test)
X_test_t = best_model.regressor_.named_steps["prep"].transform(X_test)
p = X_test_t.shape[1]
n = y_true.shape[0]
r2_adj = 1 - (1 - r2) * (n - 1) / max(1, (n - p - 1))

print(f"[TEST 2024] MAE={mae:.3f}  RMSE={rmse:.3f}  R2={r2:.3f}  R2_adj={r2_adj:.3f}")

# 7) Reporte por Localidad (con R2 ajustado por localidad)
eval_df = test_df[["Date","Localidad","Estacion"]].copy()
eval_df["y_true"] = y_true
eval_df["y_pred"] = y_hat

def _agg_local(g: pd.DataFrame) -> pd.Series:
    y = g["y_true"].to_numpy()
    yhat = g["y_pred"].to_numpy()
    n_loc = y.shape[0]
    r2_loc = r2_score(y, yhat)
    r2_adj_loc = 1 - (1 - r2_loc) * (n_loc - 1) / max(1, (n_loc - p - 1))
    return pd.Series({
        "MAE": mean_absolute_error(y, yhat),
        "RMSE": float(np.sqrt(mean_squared_error(y, yhat))),
        "R2": r2_loc,
        "R2_adj": r2_adj_loc,
        "n": n_loc
    })

loc_report = eval_df.groupby("Localidad", group_keys=False).apply(_agg_local).sort_values("MAE")
display(loc_report)
# ================== FIN COMBO ==================


Fitting 5 folds for each of 35 candidates, totalling 175 fits
Mejor MAE (CV LOLO): 4.069027875900761
Mejores hiperparámetros:
  regressor__xgb__colsample_bytree: 0.9
  regressor__xgb__gamma: 0.5
  regressor__xgb__learning_rate: 0.02
  regressor__xgb__max_depth: 6
  regressor__xgb__min_child_weight: 10
  regressor__xgb__n_estimators: 1400
  regressor__xgb__reg_alpha: 4.0
  regressor__xgb__reg_lambda: 8.0
  regressor__xgb__subsample: 0.9
[TEST 2024] MAE=5.962  RMSE=8.012  R2=0.344  R2_adj=0.335


  loc_report = eval_df.groupby("Localidad", group_keys=False).apply(_agg_local).sort_values("MAE")


Unnamed: 0_level_0,MAE,RMSE,R2,R2_adj,n
Localidad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
San Cristobal,4.614423,6.443849,0.407067,0.264244,341.0
Suba,4.895231,7.075732,0.324339,0.278555,1041.0
Usme,4.920375,6.841085,0.370498,0.226736,356.0
Barrios Unidos,5.179797,6.756169,0.430043,0.288626,333.0
Engativa,5.377444,7.594138,0.349455,0.178395,318.0
Kennedy,5.393406,6.649507,0.459337,0.327174,337.0
Santa Fe,5.495999,7.766068,0.298941,0.13428,348.0
Fontibon,5.984722,8.081509,0.336651,0.267487,700.0
Ciudad Bolivar,6.054284,7.703505,0.424768,0.29475,359.0
Tunjuelito,6.754817,8.212912,-0.692341,-1.09125,347.0


In [23]:
import xgboost
print(xgboost.__version__)


3.1.0


In [24]:
pip install --upgrade xgboost




In [27]:
# === POR LOCALIDAD (máscaras POSICIONALES + validación manual de n_estimators) ===
import numpy as np, pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

results = []
y_pred_all, y_true_all, loc_all = [], [], []

localidades = sorted(train_df["Localidad"].unique())

# OneHot compatible con versiones de sklearn
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)  # sklearn >=1.2
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)         # sklearn <=1.1

# Valores a validar para el tamaño del ensamble
N_LIST = [400, 800, 1200, 1600]

for loc in localidades:
    # --- Máscaras POSICIONALES (numpy) alineadas con X_train / X_test ---
    mask_tr_loc = (train_df["Localidad"].to_numpy() == loc)
    mask_te_loc = (test_df["Localidad"].to_numpy()  == loc)

    if mask_tr_loc.sum() < 120 or mask_te_loc.sum() < 60:
        continue  # pocos datos para una validación temporal fiable

    # Subsets POSICIONALES (sin depender de índices)
    Xtr_full = X_train.iloc[mask_tr_loc].drop(columns=["Localidad"], errors="ignore").reset_index(drop=True)
    ytr_full = y_train.iloc[mask_tr_loc].reset_index(drop=True)
    Xte_full = X_test.iloc[mask_te_loc].drop(columns=["Localidad"], errors="ignore").reset_index(drop=True)
    yte_full = y_test.iloc[mask_te_loc].reset_index(drop=True)

    # Vector de fechas POSICIONAL para el corte temporal interno (hold-out ~15% final)
    dates_tr = train_df.loc[mask_tr_loc, "Date"].reset_index(drop=True)
    cut = dates_tr.quantile(0.85)
    m_tr  = (dates_tr <  cut).to_numpy()
    m_val = (dates_tr >= cut).to_numpy()

    X_tr, y_tr = Xtr_full.iloc[m_tr].reset_index(drop=True),  ytr_full.iloc[m_tr].reset_index(drop=True)
    X_va, y_va = Xtr_full.iloc[m_val].reset_index(drop=True), ytr_full.iloc[m_val].reset_index(drop=True)

    # Definir columnas locales
    CAT = ["Estacion"] if "Estacion" in X_tr.columns else []
    NUM = [c for c in X_tr.columns if c not in CAT]

    pre = ColumnTransformer(
        transformers=[
            ("num", SimpleImputer(strategy="median"), NUM),
            ("cat", ohe, CAT),
        ],
        remainder="drop"
    )

    # Ajustar preprocesador SOLO en TRAIN local y transformar TR/VAL/TEST
    X_tr_t     = pre.fit_transform(X_tr)
    X_va_t     = pre.transform(X_va)
    X_all_tr_t = pre.transform(Xtr_full)
    X_te_t     = pre.transform(Xte_full)

    # Transformación del objetivo (log1p) para estabilizar varianza
    y_tr_log = np.log1p(y_tr.to_numpy())
    y_va_log = np.log1p(y_va.to_numpy())

    # Hiperparámetros base (regularización fuerte, robusto)
    base_params = dict(
        tree_method="hist",
        learning_rate=0.03,
        max_depth=4,
        min_child_weight=10,
        subsample=0.9,
        colsample_bytree=0.8,
        gamma=2.0,
        reg_alpha=8.0,
        reg_lambda=8.0,
        random_state=42,
        n_jobs=-1
    )

    # --- Validación MANUAL de n_estimators (sin early stopping / eval_set) ---
    best_n, best_rmse = None, np.inf
    for n_est in N_LIST:
        xgb = XGBRegressor(n_estimators=n_est, **base_params)
        xgb.fit(X_tr_t, y_tr_log)
        va_pred = np.expm1(xgb.predict(X_va_t))
        rmse_va = float(np.sqrt(mean_squared_error(y_va, va_pred)))
        if rmse_va < best_rmse:
            best_rmse = rmse_va
            best_n = n_est

    # --- Reentrenar con TODO el TRAIN local usando el mejor n_estimators ---
    xgb_best = XGBRegressor(n_estimators=best_n, **base_params)
    xgb_best.fit(X_all_tr_t, np.log1p(ytr_full.to_numpy()))

    # Predicción en TEST 2024 (y destransformar)
    yhat = np.expm1(xgb_best.predict(X_te_t))

    # Agregar para métricas globales
    y_pred_all.append(yhat)
    y_true_all.append(yte_full.to_numpy())
    loc_all += [loc]*len(yhat)

    # Métricas por localidad
    mae  = mean_absolute_error(yte_full, yhat)
    rmse = float(np.sqrt(mean_squared_error(yte_full, yhat)))
    r2   = r2_score(yte_full, yhat)
    # R2 ajustado local (p = dimensionalidad post-prepro)
    p = X_te_t.shape[1]; n = len(yte_full)
    r2_adj = 1 - (1 - r2) * (n - 1) / max(1, (n - p - 1))
    results.append([loc, mae, rmse, r2, r2_adj, n, best_n])

# Tabla por localidad
res_df = pd.DataFrame(results, columns=["Localidad","MAE","RMSE","R2","R2_adj","n","best_n_estimators"]).sort_values("MAE")
display(res_df)

# Métrica global combinada
if len(y_pred_all) > 0:
    y_true_all = np.concatenate(y_true_all)
    y_pred_all = np.concatenate(y_pred_all)
    mae_g  = mean_absolute_error(y_true_all, y_pred_all)
    rmse_g = float(np.sqrt(mean_squared_error(y_true_all, y_pred_all)))
    r2_g   = r2_score(y_true_all, y_pred_all)
    p_g = p; n_g = len(y_true_all)
    r2_adj_g = 1 - (1 - r2_g) * (n_g - 1) / max(1, (n_g - p_g - 1))
    print(f"[POR LOCALIDAD | TEST 2024] MAE={mae_g:.3f}  RMSE={rmse_g:.3f}  R2={r2_g:.3f}  R2_adj={r2_adj_g:.3f}")
else:
    print("No se entrenaron modelos (insuficientes datos por localidad).")


Unnamed: 0,Localidad,MAE,RMSE,R2,R2_adj,n,best_n_estimators
6,San Cristobal,5.074395,7.160406,0.267867,0.167474,341,1200
8,Suba,5.210885,7.495153,0.241864,0.209166,1041,1600
10,Usme,5.650508,7.980949,0.143245,0.031376,356,1200
0,Barrios Unidos,5.731725,7.746156,0.250773,0.145212,333,400
4,Kennedy,5.831786,7.160967,0.372965,0.285818,337,1200
2,Engativa,6.319233,8.769931,0.132414,0.003533,318,1200
7,Santa Fe,6.526271,9.296016,-0.00449,-0.139079,348,1600
9,Tunjuelito,6.566584,7.832364,-0.539144,-0.746045,347,1600
1,Ciudad Bolivar,6.611747,8.659235,0.273182,0.179178,359,800
3,Fontibon,6.676717,9.111291,0.156827,0.102925,700,400


[POR LOCALIDAD | TEST 2024] MAE=6.437  RMSE=8.635  R2=0.238  R2_adj=0.232


In [29]:
# ========= RUTA A (GLOBAL ROBUSTO) =========
# Sin Estacion, sin lags del target, log1p en y, regularización fuerte,
# validación MANUAL de n_estimators con hold-out temporal dentro de 2023.
import numpy as np, pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

# 1) Definir columnas: SOLO 'Localidad' como categórica; quitar 'Estacion'
CAT_COLS = ["Localidad"]
NUM_COLS = [c for c in X_train.columns if c not in (["Localidad","Estacion"])]

# 2) Corte temporal interno para validación (dentro del TRAIN: 2021–2023)
val_cut = pd.Timestamp("2023-10-01")
dates_tr = train_df["Date"].to_numpy()
mask_tr_core = dates_tr <  val_cut   # para entrenar
mask_va_core = dates_tr >= val_cut   # para validar

# Usar .iloc con máscaras POSICIONALES (evita desalineación)
X_tr_core = X_train.iloc[mask_tr_core].copy()
y_tr_core = y_train.iloc[mask_tr_core].copy()
X_va_core = X_train.iloc[mask_va_core].copy()
y_va_core = y_train.iloc[mask_va_core].copy()

# 3) Preprocesador (compatibilidad scikit-learn nueva/antigua en OneHot)
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

pre_core = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), NUM_COLS),
        ("cat", ohe, CAT_COLS),
    ],
    remainder="drop"
)

# Ajustar SOLO con el tramo de entrenamiento interno
X_tr_t = pre_core.fit_transform(X_tr_core)
X_va_t = pre_core.transform(X_va_core)

# 4) Regresión XGB con regularización fuerte (robusto)
base_params = dict(
    tree_method="hist",
    learning_rate=0.03,
    max_depth=5,
    min_child_weight=10,
    subsample=0.8,
    colsample_bytree=0.7,
    gamma=2.0,
    reg_alpha=8.0,
    reg_lambda=8.0,
    random_state=42,
    n_jobs=-1
)

# Validación MANUAL de n_estimators (sin early_stopping)
N_LIST = [400, 800, 1200, 1600]
best_n, best_rmse = None, np.inf

y_tr_log = np.log1p(y_tr_core.to_numpy())
y_va     = y_va_core.to_numpy()

for n_est in N_LIST:
    xgb = XGBRegressor(n_estimators=n_est, **base_params)
    xgb.fit(X_tr_t, y_tr_log)
    va_pred = np.expm1(xgb.predict(X_va_t))
    rmse_va = float(np.sqrt(mean_squared_error(y_va, va_pred)))
    if rmse_va < best_rmse:
        best_rmse = rmse_va
        best_n = n_est

print(f"[GLOBAL robusto] Mejor n_estimators validado en 2023Q4: {best_n}  (RMSE val={best_rmse:.3f})")

# 5) Reentrenar GLOBAL con TODO el TRAIN (2021–2023)
#    Reajustamos el preprocesador en TODO el TRAIN (para captar todas las categorías/valores)
pre_full = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), NUM_COLS),
        ("cat", ohe, CAT_COLS),
    ],
    remainder="drop"
)
X_tr_full_t = pre_full.fit_transform(X_train)
xgb_best = XGBRegressor(n_estimators=best_n, **base_params)
xgb_best.fit(X_tr_full_t, np.log1p(y_train.to_numpy()))

# 6) Evaluación en TEST 2024
X_te_t = pre_full.transform(X_test)
y_pred = np.expm1(xgb_best.predict(X_te_t))

y_true = y_test.to_numpy().ravel()
y_hat  = y_pred.ravel()

mae  = mean_absolute_error(y_true, y_hat)
rmse = float(np.sqrt(mean_squared_error(y_true, y_hat)))
r2   = r2_score(y_true, y_hat)

# R2 ajustado (p = # columnas tras preprocesado)
p = X_te_t.shape[1]; n = y_true.shape[0]
r2_adj = 1 - (1 - r2) * (n - 1) / max(1, (n - p - 1))
print(f"[GLOBAL robusto | TEST 2024] MAE={mae:.3f}  RMSE={rmse:.3f}  R2={r2:.3f}  R2_adj={r2_adj:.3f}")

# 7) Reporte por Localidad
eval_df = test_df[["Date","Localidad","Estacion"]].copy()
eval_df["y_true"] = y_true
eval_df["y_pred"] = y_hat

def _agg_local(g: pd.DataFrame) -> pd.Series:
    y = g["y_true"].to_numpy(); yhat = g["y_pred"].to_numpy()
    n_loc = len(y)
    r2_loc = r2_score(y, yhat)
    r2_adj_loc = 1 - (1 - r2_loc) * (n_loc - 1) / max(1, (n_loc - p - 1))
    return pd.Series({
        "MAE": mean_absolute_error(y, yhat),
        "RMSE": float(np.sqrt(mean_squared_error(y, yhat))),
        "R2": r2_loc,
        "R2_adj": r2_adj_loc,
        "n": n_loc
    })

loc_report = eval_df.groupby("Localidad", group_keys=False).apply(_agg_local).sort_values("MAE")
display(loc_report)
# ========= FIN RUTA A =========


[GLOBAL robusto] Mejor n_estimators validado en 2023Q4: 1600  (RMSE val=6.727)
[GLOBAL robusto | TEST 2024] MAE=6.114  RMSE=8.280  R2=0.299  R2_adj=0.292


  loc_report = eval_df.groupby("Localidad", group_keys=False).apply(_agg_local).sort_values("MAE")


Unnamed: 0_level_0,MAE,RMSE,R2,R2_adj,n
Localidad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
San Cristobal,4.754296,6.560102,0.38548,0.277035,341.0
Suba,4.993857,7.281119,0.284544,0.24765,1041.0
Usme,5.133606,7.158632,0.310701,0.195062,356.0
Barrios Unidos,5.239776,6.929912,0.400352,0.291519,333.0
Kennedy,5.339329,6.676529,0.454933,0.357395,337.0
Engativa,5.502352,7.873143,0.300776,0.166714,318.0
Santa Fe,5.637625,8.038958,0.248807,0.119379,348.0
Fontibon,6.180739,8.482987,0.269106,0.211581,700.0
Ciudad Bolivar,6.294661,8.088994,0.365757,0.260395,359.0
Tunjuelito,6.524977,7.912164,-0.570667,-0.842206,347.0


In [36]:
# ===== 1) FEATURES DE RÉGIMEN =====
import numpy as np
import pandas as pd

def make_regime_features(df, group_col="Localidad", date_col="Date"):
    d = df.copy()
    d = d.sort_values([group_col, date_col]).reset_index(drop=True)

    # Asegurar columnas base
    for c in ["Precip","WindSpeed","month","is_weekend"]:
        if c not in d.columns:
            raise ValueError(f"Falta columna requerida para régimen: {c}")

    # Rollings (causales) por localidad
    def _grp(g):
        g = g.copy()
        # rollings causales: shift(1) para no "ver el futuro"
        g["Precip_roll3"]     = g["Precip"].shift(1).rolling(3,  min_periods=2).mean()
        g["Precip_roll7"]     = g["Precip"].shift(1).rolling(7,  min_periods=3).mean()
        g["WindSpeed_roll3"]  = g["WindSpeed"].shift(1).rolling(3, min_periods=2).mean()

        # días desde la última lluvia (> 0.1 mm)
        rain = (g["Precip"] > 0.1).astype(int)
        dsr  = []
        cnt  = 0
        for r in rain.shift(1).fillna(0).astype(int):  # causal
            cnt = 0 if r==1 else cnt+1
            dsr.append(cnt)
        g["days_since_rain"] = dsr

        # racha húmeda (longitud de lluvia consecutiva hasta ayer)
        rstreak = []
        cur = 0
        for r in rain.shift(1).fillna(0).astype(int):
            if r==1: cur += 1
            else:    cur = 0
            rstreak.append(cur)
        g["rain_streak_len"] = rstreak

        return g

    d = d.groupby(group_col, group_keys=False).apply(_grp)

    # Temporadas lluviosas Bogotá aprox: Mar–May (3–5) y Sep–Nov (9–11)
    d["is_rainy_season"] = d["month"].isin([3,4,5,9,10,11]).astype(int)

    # Estancamiento: flag por percentil 20 de WindSpeed EN TRAIN (lo rellenamos abajo)
    d["wind_low"] = np.nan  # se llenará con umbrales por localidad usando TRAIN

    return d

# Construir df auxiliares con calendario mínimo (si no lo tenías ya añadido)
for _df in (train_df, test_df):
    _df["month"] = _df["Date"].dt.month
    _df["dow"]   = _df["Date"].dt.weekday
    _df["is_weekend"] = (_df["dow"] >= 5).astype(int)

train_reg = make_regime_features(train_df, group_col="Localidad", date_col="Date")
test_reg  = make_regime_features(test_df,  group_col="Localidad", date_col="Date")

# Umbral de estancamiento por localidad (percentil 20 del TRAIN)
wind_p20 = (
    train_reg.groupby("Localidad")["WindSpeed"].quantile(0.20)
             .reindex(train_reg["Localidad"].unique())
)
# Mapear a ambos
def _map_wind_low(df, ref):
    out = df.copy()
    thr = out["Localidad"].map(ref.to_dict())
    out["wind_low"] = (out["WindSpeed"] <= thr).astype(int)
    return out

train_reg = _map_wind_low(train_reg, wind_p20)
test_reg  = _map_wind_low(test_reg,  wind_p20)

# Selección final de variables de régimen
REGIME_COLS = ["Precip_roll3","Precip_roll7","WindSpeed_roll3",
               "days_since_rain","rain_streak_len","is_rainy_season","wind_low"]

# Alinear matrices con tus X_train / X_test actuales (por posición)
X_train_reg = pd.concat([X_train.reset_index(drop=True),
                         train_reg[REGIME_COLS].reset_index(drop=True)], axis=1)
X_test_reg  = pd.concat([X_test.reset_index(drop=True),
                         test_reg[REGIME_COLS].reset_index(drop=True)], axis=1)


  d = d.groupby(group_col, group_keys=False).apply(_grp)
  d = d.groupby(group_col, group_keys=False).apply(_grp)


In [38]:
# ========= PARCHE: quitar duplicados y evitar colisiones de nombres =========
import pandas as pd

# 0) Asegurar que X_train / X_test no tengan duplicados previos
X_train = X_train.loc[:, ~X_train.columns.duplicated()].copy()
X_test  = X_test.loc[:,  ~X_test.columns.duplicated()].copy()

# 1) Evitar colisiones al añadir features de régimen
#    REGIME_COLS viene del paso 1 (régimen). Filtramos sólo las nuevas que NO existan ya en X_train/X_test.
existing = set(X_train.columns)
REGIME_NEW = [c for c in REGIME_COLS if c not in existing]

# 2) Concatenar sólo columnas nuevas de régimen
X_train_reg = pd.concat([X_train.reset_index(drop=True),
                         train_reg[REGIME_NEW].reset_index(drop=True)], axis=1)
X_test_reg  = pd.concat([X_test.reset_index(drop=True),
                         test_reg[REGIME_NEW].reset_index(drop=True)], axis=1)

# 3) Como cinturón y tirantes, quitar cualquier duplicado residual
X_train_reg = X_train_reg.loc[:, ~X_train_reg.columns.duplicated()].copy()
X_test_reg  = X_test_reg.loc[:,  ~X_test_reg.columns.duplicated()].copy()

print("Se agregaron (régimen) sin colisión:", REGIME_NEW)
skipped = sorted(set(REGIME_COLS) - set(REGIME_NEW))
if skipped:
    print("Omitidas por estar ya presentes:", skipped)

# 4) (Opcional) Si prefieres mantener todas y renombrar las de régimen para distinguirlas:
# train_reg_ren = train_reg.rename(columns={c: f"rg_{c}" for c in REGIME_COLS})
# test_reg_ren  = test_reg.rename(columns={c: f"rg_{c}" for c in REGIME_COLS})
# X_train_reg = pd.concat([X_train, train_reg_ren[[f"rg_{c}" for c in REGIME_COLS]]], axis=1)
# X_test_reg  = pd.concat([X_test,  test_reg_ren [[f"rg_{c}" for c in REGIME_COLS]]], axis=1)


Se agregaron (régimen) sin colisión: ['Precip_roll3', 'WindSpeed_roll3', 'days_since_rain', 'rain_streak_len', 'is_rainy_season', 'wind_low']
Omitidas por estar ya presentes: ['Precip_roll7']


In [39]:
# ===== 2) GLOBAL: reentreno y OOF para residuales =====
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GroupKFold
from xgboost import XGBRegressor

# Usamos SOLO 'Localidad' como categórica, quitamos 'Estacion'
CAT_G = ["Localidad"]
NUM_G = [c for c in X_train_reg.columns if c not in (["Localidad","Estacion"])]

# OHE compatible
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

pre_g = ColumnTransformer(
    [("num", SimpleImputer(strategy="median"), NUM_G),
     ("cat", ohe, CAT_G)],
    remainder="drop"
)

# Hiperparámetros robustos (sin early stopping)
base_params = dict(
    tree_method="hist", learning_rate=0.03,
    max_depth=5, min_child_weight=10,
    subsample=0.8, colsample_bytree=0.7,
    gamma=2.0, reg_alpha=8.0, reg_lambda=8.0,
    random_state=42, n_jobs=-1
)

# 2A) Elegimos n_estimators con hold-out temporal (último trimestre 2023)
val_cut = pd.Timestamp("2023-10-01")
m_tr = (train_df["Date"].to_numpy() <  val_cut)
m_va = (train_df["Date"].to_numpy() >= val_cut)

X_tr_core = X_train_reg.iloc[m_tr]
y_tr_core = y_train.iloc[m_tr]
X_va_core = X_train_reg.iloc[m_va]
y_va_core = y_train.iloc[m_va]

X_tr_t = pre_g.fit_transform(X_tr_core)
X_va_t = pre_g.transform(X_va_core)

N_LIST = [600, 1000, 1400, 1800]
best_n, best_rmse = None, np.inf
y_tr_log = np.log1p(y_tr_core.to_numpy())
for n_est in N_LIST:
    xgb = XGBRegressor(n_estimators=n_est, **base_params)
    xgb.fit(X_tr_t, y_tr_log)
    va_pred = np.expm1(xgb.predict(X_va_t))
    rmse_va = float(np.sqrt(mean_squared_error(y_va_core.to_numpy(), va_pred)))
    if rmse_va < best_rmse:
        best_rmse, best_n = rmse_va, n_est
print(f"[GLOBAL] Mejor n_estimators en 2023Q4: {best_n}  (RMSE val={best_rmse:.3f})")

# 2B) OOF por LOLO (para residuales limpios)
groups = train_df["Localidad"].to_numpy()
cv = GroupKFold(n_splits=min(5, np.unique(groups).size))
oof_pred = np.full(len(train_df), np.nan)

for tr_idx, va_idx in cv.split(np.zeros(len(train_df)), y_train, groups):
    Xtr, ytr = X_train_reg.iloc[tr_idx], y_train.iloc[tr_idx]
    Xva       = X_train_reg.iloc[va_idx]

    Xt = pre_g.fit_transform(Xtr)
    xv = pre_g.transform(Xva)

    xgb = XGBRegressor(n_estimators=best_n, **base_params)
    xgb.fit(Xt, np.log1p(ytr.to_numpy()))
    oof_pred[va_idx] = np.expm1(xgb.predict(xv))

# Residuales de entrenamiento (limpios, sin fuga)
res_train = y_train.to_numpy() - oof_pred

# 2C) Reentreno GLOBAL final y predicción en TEST
Xtr_full_t = pre_g.fit_transform(X_train_reg)
xgb_full = XGBRegressor(n_estimators=best_n, **base_params)
xgb_full.fit(Xtr_full_t, np.log1p(y_train.to_numpy()))

Xte_t = pre_g.transform(X_test_reg)
yhat_test_global = np.expm1(xgb_full.predict(Xte_t))


[GLOBAL] Mejor n_estimators en 2023Q4: 1800  (RMSE val=6.804)


In [41]:
# ===== PASO 3: Corrección local de residuales (stacking por localidad) =====
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# -----------------------------------------------------------------------------------
# Supone disponibles desde pasos previos:
# - train_df, test_df  (con columnas: Date, Localidad, etc.)
# - X_train_reg, X_test_reg (features finales para GLOBAL + régimen, ya deduplicadas)
# - y_train, y_test
# - yhat_test_global (predicción GLOBAL sobre test 2024)
# - res_train (residual OOF del global en train: y_train - oof_global)
# - RES_FEATS (lista de features para el modelo de residuales)
# -----------------------------------------------------------------------------------

# Asegurar que RES_FEATS existen en matrices
RES_FEATS = [c for c in RES_FEATS if c in X_train_reg.columns]
if len(RES_FEATS) == 0:
    raise ValueError("RES_FEATS quedó vacío; verifica que las columnas de régimen existan en X_train_reg/X_test_reg.")

results = []
y_pred_all, y_true_all = [], []

localidades = sorted(train_df["Localidad"].unique())

for loc in localidades:
    # --- Máscaras posicionales alineadas ---
    mtr_loc = (train_df["Localidad"].to_numpy() == loc)
    mte_loc = (test_df["Localidad"].to_numpy()  == loc)

    if mtr_loc.sum() < 120 or mte_loc.sum() < 60:
        # Opcional: saltar localidades con pocos datos para validación estable
        continue

    # Subconjuntos locales (posición, sin depender de índices)
    Xtr_loc = X_train_reg.iloc[mtr_loc][RES_FEATS].reset_index(drop=True)
    rtr_loc = pd.Series(res_train[mtr_loc]).reset_index(drop=True)  # residual de TRAIN (OOF)
    Xte_loc = X_test_reg.iloc[mte_loc][RES_FEATS].reset_index(drop=True)
    yte_loc = y_test.iloc[mte_loc].reset_index(drop=True)

    # --- Split temporal interno (85/15) para elegir alpha sin fuga ---
    dates_loc = train_df.loc[mtr_loc, "Date"].reset_index(drop=True)
    cut = dates_loc.quantile(0.85)
    m_in  = (dates_loc <  cut).to_numpy()
    m_val = (dates_loc >= cut).to_numpy()

    # --- 1) Selección de columnas efectivas (descarta columnas totalmente NaN en la localidad) ---
    valid_cols = [c for c in Xtr_loc.columns if Xtr_loc[c].notna().any()]
    Xtr_loc = Xtr_loc[valid_cols].copy()
    Xte_loc = Xte_loc[valid_cols].copy()

    # --- 2) Imputación SIN FUGA (ajustar en TRAIN interno y aplicar a VALID/ALL/TEST) ---
    imp_in = SimpleImputer(strategy="median")
    X_in   = imp_in.fit_transform(Xtr_loc.iloc[m_in])   # solo TRAIN interno
    X_val  = imp_in.transform(Xtr_loc.iloc[m_val])      # VALID
    X_all  = imp_in.transform(Xtr_loc)                  # todo TRAIN local (para reentreno final)
    X_te   = imp_in.transform(Xte_loc)                  # TEST

    # Fallback si persisten NaN por columnas constantes
    if np.isnan(X_in).any() or np.isnan(X_val).any() or np.isnan(X_all).any() or np.isnan(X_te).any():
        imp0 = SimpleImputer(strategy="constant", fill_value=0.0)
        X_in  = imp0.fit_transform(X_in)
        X_val = imp0.transform(X_val)
        X_all = imp0.transform(X_all)
        X_te  = imp0.transform(X_te)

    # --- 3) Selección de alpha (Ridge) en VALID ---
    alphas = [0.1, 1.0, 3.0, 10.0, 30.0]
    best_a, best_rmse = None, np.inf
    y_in  = rtr_loc.iloc[m_in].to_numpy()
    y_val = rtr_loc.iloc[m_val].to_numpy()

    for a in alphas:
        rg = Ridge(alpha=a, random_state=42)
        rg.fit(X_in, y_in)
        pred_val = rg.predict(X_val)
        rmse_val = float(np.sqrt(mean_squared_error(y_val, pred_val)))
        if rmse_val < best_rmse:
            best_rmse, best_a = rmse_val, a

    # --- 4) Reentrenar con TODO el TRAIN local y predecir residual en TEST ---
    rg = Ridge(alpha=best_a, random_state=42)
    rg.fit(X_all, rtr_loc.to_numpy())
    res_pred_test = rg.predict(X_te)

    # --- 5) Pred final = Global + Residual local ---
    yh_glob_loc = yhat_test_global[mte_loc]
    yhat_final  = yh_glob_loc + res_pred_test

    # --- 6) Métricas por localidad ---
    mae  = mean_absolute_error(yte_loc, yhat_final)
    rmse = float(np.sqrt(mean_squared_error(yte_loc, yhat_final)))
    r2   = r2_score(yte_loc, yhat_final)
    p = X_te.shape[1]  # # de columnas efectivas tras imputación
    n = len(yte_loc)
    r2_adj = 1 - (1 - r2) * (n - 1) / max(1, (n - p - 1))

    results.append([loc, mae, rmse, r2, r2_adj, n, best_a])
    y_pred_all.append(yhat_final)
    y_true_all.append(yte_loc.to_numpy())

# ----- Tabla por localidad -----
res_df = pd.DataFrame(results, columns=["Localidad","MAE","RMSE","R2","R2_adj","n","alpha_ridge"]).sort_values("MAE")
try:
    display(res_df)
except NameError:
    print(res_df)

# ----- Métricas globales -----
if len(y_pred_all) > 0:
    y_true_all = np.concatenate(y_true_all)
    y_pred_all = np.concatenate(y_pred_all)
    mae_g  = mean_absolute_error(y_true_all, y_pred_all)
    rmse_g = float(np.sqrt(mean_squared_error(y_true_all, y_pred_all)))
    r2_g   = r2_score(y_true_all, y_pred_all)
    # aproximación de p global = # de features residuales tras imputación en la última localidad procesada
    p_g = p
    n_g = len(y_true_all)
    r2_adj_g = 1 - (1 - r2_g) * (n_g - 1) / max(1, (n_g - p_g - 1))
    print(f"[FINAL Stacking | TEST 2024] MAE={mae_g:.3f}  RMSE={rmse_g:.3f}  R2={r2_g:.3f}  R2_adj={r2_adj_g:.3f}")
else:
    print("No se generaron predicciones finales (datos insuficientes).")


Unnamed: 0,Localidad,MAE,RMSE,R2,R2_adj,n,alpha_ridge
6,San Cristobal,4.907668,6.795955,0.340498,0.30793,341,30.0
8,Suba,5.211366,7.60371,0.219743,0.207552,1041,30.0
0,Barrios Unidos,5.240879,6.98237,0.391239,0.360416,333,30.0
2,Engativa,5.624662,7.473705,0.369925,0.336433,318,0.1
7,Santa Fe,5.810224,8.152948,0.227353,0.190004,348,30.0
1,Ciudad Bolivar,6.132326,7.799124,0.410399,0.382816,359,30.0
10,Usme,6.196063,8.860117,-0.055909,-0.105745,356,30.0
4,Kennedy,6.212016,7.44595,0.322065,0.288168,337,0.1
3,Fontibon,6.595737,8.072957,0.338054,0.322548,700,0.1
9,Tunjuelito,8.297079,9.942839,-1.480357,-1.600617,347,30.0


[FINAL Stacking | TEST 2024] MAE=6.583  RMSE=8.764  R2=0.215  R2_adj=0.212


In [43]:
!pip -q install catboost

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [45]:
# ==================== CATBOOST ROBUSTO (FIX 'Date') ====================
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GroupKFold

# --- Configuración ---
TARGET   = "PM25_imputed"     # usar el target imputado (tu decisión)
CAT_COLS = ["Localidad", "Estacion"]

# NUM_COLS: todas las columnas numéricas de X_train (sin categóricas)
NUM_COLS = [c for c in X_train.columns if c not in CAT_COLS]

# Pesos: baja influencia de filas con target imputado (mitigar sesgo)
if "PM25_was_imputed" in train_df.columns:
    w_train = np.where(train_df["PM25_was_imputed"].to_numpy()==1, 0.5, 1.0)
else:
    w_train = np.ones(len(train_df), dtype=float)

# Fechas posicionales (para corte temporal)
dates_train = train_df["Date"].to_numpy()
dates_test  = test_df["Date"].to_numpy()   # no lo uso para split, pero puede servir para análisis

# --- Validación interna LOLO (espacial) + early stopping temporal (dentro del fold) ---
groups = train_df["Localidad"].to_numpy()
cv = GroupKFold(n_splits=min(5, np.unique(groups).size))

param_grid = [
    dict(depth=d, learning_rate=lr, l2_leaf_reg=l2, bagging_temperature=bt)
    for d  in [6, 8]
    for lr in [0.03, 0.06]
    for l2 in [3.0, 8.0]
    for bt in [0.5, 1.0]
]

best_params, best_cv_mae = None, np.inf
cut_ts = np.datetime64("2023-10-01")  # corte temporal para hold-out interno

for p in param_grid:
    fold_mae = []
    for tr_idx, va_idx in cv.split(X_train, y_train, groups):
        # Subsets posicionales
        Xtr, ytr = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
        Xva, yva = X_train.iloc[va_idx], y_train.iloc[va_idx]
        wtr      = w_train[tr_idx]
        # Fechas POSICIONALES del subset de entrenamiento del fold
        dates_tr_fold = dates_train[tr_idx]
        # Hold-out temporal dentro del fold: 85% / 15% aprox (Q4 2023)
        m_in  = dates_tr_fold <  cut_ts
        m_val = dates_tr_fold >= cut_ts

        # Pools CatBoost (con nombres de columnas categóricas)
        train_pool = Pool(
            data=Xtr.loc[m_in, CAT_COLS + NUM_COLS],
            label=ytr.loc[m_in].values,
            weight=wtr[m_in],
            cat_features=CAT_COLS
        )
        valid_pool = Pool(
            data=Xtr.loc[m_val, CAT_COLS + NUM_COLS],
            label=ytr.loc[m_val].values,
            cat_features=CAT_COLS
        )

        model = CatBoostRegressor(
            loss_function="RMSE",
            eval_metric="RMSE",
            iterations=5000,
            early_stopping_rounds=200,   # usa el hold-out temporal interno
            random_seed=42,
            verbose=False,
            depth=p["depth"],
            learning_rate=p["learning_rate"],
            l2_leaf_reg=p["l2_leaf_reg"],
            bagging_temperature=p["bagging_temperature"],
            subsample=0.8,
            rsm=0.8
        )
        model.fit(train_pool, eval_set=valid_pool)

        # Evalúa en la parte espacial dejada fuera (va_idx)
        va_pool = Pool(
            data=Xva[CAT_COLS + NUM_COLS],
            label=yva.values,
            cat_features=CAT_COLS
        )
        yhat_va = model.predict(va_pool)
        fold_mae.append(mean_absolute_error(yva.values, yhat_va))

    cv_mae = float(np.mean(fold_mae))
    if cv_mae < best_cv_mae:
        best_cv_mae = cv_mae
        best_params = p

print(f"[CatBoost LOLO] Mejor MAE CV: {best_cv_mae:.3f}  con params: {best_params}")

# --- Entrenamiento final (2021–2023) con early stopping temporal (Q4 2023 como validación) ---
m_in  = dates_train <  cut_ts
m_val = dates_train >= cut_ts

train_pool = Pool(
    data=X_train.loc[m_in,  CAT_COLS + NUM_COLS],
    label=y_train.loc[m_in].values,
    weight=w_train[m_in],
    cat_features=CAT_COLS
)
valid_pool = Pool(
    data=X_train.loc[m_val, CAT_COLS + NUM_COLS],
    label=y_train.loc[m_val].values,
    cat_features=CAT_COLS
)

final_model = CatBoostRegressor(
    loss_function="RMSE",
    eval_metric="RMSE",
    iterations=5000,
    early_stopping_rounds=200,
    random_seed=42,
    verbose=False,
    depth=best_params["depth"],
    learning_rate=best_params["learning_rate"],
    l2_leaf_reg=best_params["l2_leaf_reg"],
    bagging_temperature=best_params["bagging_temperature"],
    subsample=0.8,
    rsm=0.8
)
final_model.fit(train_pool, eval_set=valid_pool)

# --- Evaluación en TEST 2024 ---
test_pool = Pool(
    data=X_test[CAT_COLS + NUM_COLS],
    label=y_test.values,
    cat_features=CAT_COLS
)
y_pred = final_model.predict(test_pool)

mae  = mean_absolute_error(y_test.values, y_pred)
rmse = float(np.sqrt(mean_squared_error(y_test.values, y_pred)))
r2   = r2_score(y_test.values, y_pred)

# R2 ajustado (aprox p = #features totales que ve el modelo: num + cat)
p = len(NUM_COLS) + len(CAT_COLS)
n = len(y_test)
r2_adj = 1 - (1 - r2) * (n - 1) / max(1, (n - p - 1))

print(f"[CatBoost | TEST 2024] MAE={mae:.3f}  RMSE={rmse:.3f}  R2={r2:.3f}  R2_adj={r2_adj:.3f}")


[CatBoost LOLO] Mejor MAE CV: 4.573  con params: {'depth': 8, 'learning_rate': 0.06, 'l2_leaf_reg': 8.0, 'bagging_temperature': 0.5}
[CatBoost | TEST 2024] MAE=6.235  RMSE=8.207  R2=0.312  R2_adj=0.306
