<a href="https://colab.research.google.com/github/Sxmuu/TG-Samuel-P/blob/main/Scripts/Python/Notebooks/ML/Modelos_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Ingeniería de Variables**

In [None]:
# --- Importaciones ---
import pandas as pd
import numpy as np
# Raise pandas' display limit so up to 200 columns are shown when a frame is printed.
pd.set_option('display.max_columns', 200)

# Utility: reindex each station onto a gap-free daily calendar so rolling windows
# span an exact number of days.
def reindex_daily_per_station(df, station_col='Estacion', date_col='Date'):
    """Return `df` with one row per calendar day for every station.

    For each station the rows are placed on a daily ``pd.date_range`` running from
    that station's first to last valid date. Days that were absent become rows whose
    columns are all NaN except `date_col` and `station_col`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain `station_col` and `date_col`.
    station_col : str
        Column identifying the station (grouping key).
    date_col : str
        Datetime column used to build the daily index.

    Returns
    -------
    pandas.DataFrame
        Stations concatenated in first-appearance order, with `date_col` as the
        first column and a fresh 0..n-1 index.

    Notes
    -----
    Rows whose date is NaT are discarded (they cannot be placed on a calendar), and
    a station without any valid date is skipped instead of raising. When nothing is
    left (e.g. an empty input) an empty frame with the same columns is returned.
    """
    pieces = []
    for est, dfg in df.groupby(station_col, sort=False):
        # NaT dates would make date_range fail and cannot be matched by reindex.
        dfg = dfg.dropna(subset=[date_col]).sort_values(date_col)
        if dfg.empty:
            continue
        idx = pd.date_range(dfg[date_col].min(), dfg[date_col].max(), freq='D')
        dfg = dfg.set_index(date_col).reindex(idx).rename_axis(date_col).reset_index()
        # Rows inserted by reindex have no station label yet.
        dfg[station_col] = est
        pieces.append(dfg)

    if not pieces:
        # Mirror the column order of the non-empty result (date column first).
        cols = [date_col] + [c for c in df.columns if c != date_col]
        return df.iloc[0:0][cols].reset_index(drop=True)
    return pd.concat(pieces, ignore_index=True)


In [None]:
# Final consolidated workbook, read straight from the project's GitHub repository.
url = "https://raw.githubusercontent.com/Sxmuu/TG-Samuel-P/main/Databases/Contam/Final/df_final.xlsx"

# The openpyxl engine must be installed for .xlsx files.
df = pd.read_excel(io=url, engine="openpyxl")

In [None]:
# Columns the rest of the notebook reads; fail fast if any is absent.
expected_cols = [
    'Date', 'Estacion', 'Localidad', 'PM25', 'lat', 'lon', 'Altitud',
    'Pres', 'Precip', 'Hum', 'Temp', 'WindSpeed',
]
missing = [c for c in expected_cols if c not in df.columns]
if missing:
    raise ValueError(f"Faltan columnas esperadas: {missing}")

# Unparseable dates become NaT (errors='coerce').
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Order by station and date, keep the first row of each (station, date) pair.
df = df.sort_values(['Estacion', 'Date'])
df = df.drop_duplicates(subset=['Estacion', 'Date'])
df = df.reset_index(drop=True)

# Per-station date coverage: first day, last day, number of distinct days.
print(df[['Estacion', 'Date']].groupby('Estacion').agg(['min', 'max', 'nunique']).head())


In [None]:
# --- 2) (Optional) Reindex to a daily calendar per station ---
# Can be skipped when every station already has one observation per day.
# Otherwise this makes rolling windows cover an exact number of days
# (missing days are inserted as NaN rows).
df = reindex_daily_per_station(df, 'Estacion', 'Date')


In [None]:
# --- 2) Types and temporal order ---
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Global ordering by station/date, then drop repeated (station, date) pairs if any.
df = (
    df.sort_values(['Estacion', 'Date'])
      .reset_index(drop=True)
      .drop_duplicates(subset=['Estacion', 'Date'])
)
print(df[['Estacion', 'Date']].groupby('Estacion').agg(['min', 'max', 'nunique']).head())

In [None]:
# --- 3) Calendar and seasonality features ---
date_parts = df['Date'].dt
df['year'] = date_parts.year
df['month'] = date_parts.month
df['dayofyear'] = date_parts.dayofyear
df['dow'] = date_parts.dayofweek
# dayofweek is 0 (Monday) .. 6 (Sunday), so 5 and 6 are the weekend.
df['is_weekend'] = (df['dow'] >= 5).astype(int)

# Cyclical encoding of the day of year with a 365.25-day period.
doy_angle = 2 * np.pi * df['dayofyear'] / 365.25
df['sin_doy'] = np.sin(doy_angle)
df['cos_doy'] = np.cos(doy_angle)

In [None]:
# --- 4) Funciones de lags y rolling (sin fuga) ---
def add_lags(df, group_key, date_col, vars_to_lag, lags):
    """Return a sorted copy of `df` with per-group lagged columns added.

    The frame is ordered by (`group_key`, `date_col`); for every variable in
    `vars_to_lag` and every `k` in `lags` a column ``<var>_lag<k>`` is created by
    shifting the variable `k` rows within its group, so values never cross from
    one group into another. The input frame is not modified.
    """
    ordered = df.sort_values([group_key, date_col]).copy()
    for var in vars_to_lag:
        # One grouped view per variable, reused for all of its lags.
        grouped_var = ordered.groupby(group_key, sort=False)[var]
        for k in lags:
            ordered[f'{var}_lag{k}'] = grouped_var.shift(k)
    return ordered

def add_rolling_features(df, group_key, date_col, var, windows, stats=('mean',), shift_one=True):
    """Return a sorted copy of `df` with per-group rolling statistics of `var`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; not modified.
    group_key : str
        Column that defines the groups (windows never span two groups).
    date_col : str
        Column used, together with `group_key`, to order the rows.
    var : str
        Column the statistics are computed on.
    windows : iterable of int
        Window lengths, in rows.
    stats : iterable of str
        Any of 'mean', 'std', 'min', 'max'.
    shift_one : bool
        When True the series is shifted one row within each group first, so the
        window ending at a row only contains earlier rows (no target leakage).

    Returns
    -------
    pandas.DataFrame
        Sorted copy with one ``<var>_roll<stat><w>`` column per statistic/window.

    Notes
    -----
    The rolling windows are evaluated inside each group. Rolling over the flat
    column would mix the tail of one group into the head of the next whenever
    `shift_one` is False, and re-numbering the result index would misalign the
    values whenever the sorted frame's index is not already 0..n-1.
    """
    df = df.sort_values([group_key, date_col]).copy()
    if shift_one:
        source = df.groupby(group_key, sort=False)[var].shift(1)
    else:
        source = df[var]
    per_group = source.groupby(df[group_key], sort=False)

    for w in windows:
        roll = per_group.rolling(w)
        for stat in ('mean', 'std', 'min', 'max'):
            if stat not in stats:
                continue
            # Grouped rolling returns a (group, original label) MultiIndex; dropping
            # the group level restores the original labels for aligned assignment.
            df[f'{var}_roll{stat}{w}'] = getattr(roll, stat)().reset_index(level=0, drop=True)
    return df


In [None]:
# --- 5) PM25 lags and rolling means (key predictors for the 2026 forecast) ---
lags_pm25 = [1, 3, 7]
wins_pm25 = [3, 7]

# Lagged PM25 per station.
df_feat = add_lags(df, 'Estacion', 'Date', ['PM25'], lags_pm25)

# Rolling means of PM25 per station, shifted one row so the current value is excluded.
df_feat = add_rolling_features(
    df_feat, 'Estacion', 'Date', 'PM25', wins_pm25,
    stats=('mean',), shift_one=True,
)


In [None]:
# --- 6) (Optional) Meteorology lags (usable when 2026 meteorology or a climatology is available) ---
include_meteo_lags = True
meteo_vars = ['Temp', 'Hum', 'WindSpeed', 'Precip', 'Pres']
if include_meteo_lags:
    df_feat = add_lags(df_feat, 'Estacion', 'Date', meteo_vars, [1, 3])


In [None]:
# --- 8) Drop edge rows whose lag/rolling features are NaN ---
rows_before = df_feat.shape[0]
df_model = df_feat.dropna(subset=['PM25_lag1', 'PM25_rollmean3'])
df_model = df_model.reset_index(drop=True)
rows_after = df_model.shape[0]
print(f"Filas antes: {rows_before:,} | después de dropna: {rows_after:,} | perdidas: {rows_before - rows_after:,}")


In [None]:
# --- 9) Final columns for modelling (no training yet) ---
base_cols = [
    'Date', 'Estacion', 'Localidad', 'lat', 'lon', 'Altitud', 'PM25',
    'year', 'month', 'dayofyear', 'dow', 'is_weekend', 'sin_doy', 'cos_doy',
]
# PM25 lag and rolling-mean columns, in frame order.
lag_cols = [c for c in df_model.columns if c.startswith(('PM25_lag', 'PM25_rollmean'))]
# Lag columns of any meteorological variable, in frame order.
met_lag_cols = [c for c in df_model.columns
                if c.startswith(tuple(v + '_lag' for v in meteo_vars))]

cols_for_next_steps = [*base_cols, *lag_cols, *met_lag_cols]
df_ready = df_model[cols_for_next_steps].copy()

print("Columnas finales (primeras 25):")
print(df_ready.columns.tolist()[:25], '...')
df_ready.head()


In [None]:
# --- 10) Persist the feature dataset (index omitted) ---
df_ready.to_csv(path_or_buf='df_features_PM25_no_copollutants.csv', index=False)
print("✅ Guardado: df_features_PM25_no_copollutants.csv")


In [None]:
# --- Meteorological climatology per station and day of year (median) ---
years_hist = [2021, 2022, 2023, 2024]   # adjust if needed
meteo_vars = ['Temp', 'Hum', 'WindSpeed', 'Precip', 'Pres']

# Historical rows only, tagged with their day of year.
df_hist = df.loc[df['year'].isin(years_hist)].copy()
df_hist['doy'] = df_hist['Date'].dt.dayofyear

# Median of every meteorological variable per (station, day of year); columns get a _clim suffix.
clima = df_hist.groupby(['Estacion', 'doy'])[meteo_vars].median().reset_index()
clima = clima.rename(columns={v: f'{v}_clim' for v in meteo_vars})

# 2026 daily calendar with its day of year.
cal2026 = pd.date_range(start='2026-01-01', end='2026-12-31', freq='D')
cal = pd.DataFrame({'Date': cal2026})
cal['doy'] = cal['Date'].dt.dayofyear

# Repeat the calendar for every station (constant-key merge = cross join),
# then attach the climatology by station and day of year.
ests = df['Estacion'].dropna().unique()
clima2026 = cal.assign(key=1).merge(pd.DataFrame({'Estacion': ests, 'key': 1}), on='key')
clima2026 = clima2026.drop(columns='key')
clima2026 = clima2026.merge(clima, on=['Estacion', 'doy'], how='left')

clima2026.to_csv('climatologia_meteo_2026_por_estacion.csv', index=False)
print("✅ Guardado: climatologia_meteo_2026_por_estacion.csv (medianas por DOY y estación)")


## **Validación Cruzada**

In [None]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error



# Work on a copy of the feature table with Date guaranteed to be datetime.
df = df_ready.copy().assign(
    Date=lambda d: pd.to_datetime(d['Date'], errors='coerce')
)

# Quick sanity check:
print(df.shape)
print(df.loc[:, ['Date', 'Estacion', 'Localidad']].head(3))


In [None]:
# Base columns (adjust if the names were changed)
base_cols = [
    'Date', 'Localidad', 'lat', 'lon', 'Altitud', 'PM25',
    'year', 'month', 'dayofyear', 'dow', 'is_weekend', 'sin_doy', 'cos_doy',
]

# PM25 lags / rolling means built in the feature-engineering step
lag_cols = [c for c in df.columns if c.startswith(('PM25_lag', 'PM25_rollmean'))]
# Meteorological lags, when they were added in the feature-engineering step
met_vars = ['Temp', 'Hum', 'WindSpeed', 'Precip', 'Pres']
met_lag_cols = [c for c in df.columns
                if c.startswith(tuple(v + '_lag' for v in met_vars))]

# Model inputs (no co-pollutants):
feature_cols = [
    'Localidad', 'lat', 'lon', 'Altitud',
    'year', 'month', 'dayofyear', 'dow', 'is_weekend', 'sin_doy', 'cos_doy',
    *lag_cols,
    *met_lag_cols,
]

# Drop rows with NaN in any feature or in the target (edges created by the lags)
data = df.dropna(subset=feature_cols + ['PM25']).copy()

X = data[feature_cols].copy()
y = data['PM25'].values
dates = data['Date'].copy()
localities = data['Localidad'].copy()

# Preprocessing: one-hot encode the categorical column; numeric columns pass through
cat_features = ['Localidad']
num_features = [c for c in feature_cols if c not in cat_features]

pre = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_features),
    ('num', 'passthrough', num_features),
])


In [None]:
# Gradient-boosting regressor with built-in early stopping and a fixed seed.
model = HistGradientBoostingRegressor(
    random_state=42,
    early_stopping=True,
    max_iter=400,
    learning_rate=0.06,
    min_samples_leaf=25,
)

# Preprocessing followed by the regressor, fitted as one estimator.
pipe = Pipeline(steps=[('pre', pre), ('model', model)])


In [None]:
def build_time_folds(unique_dates, n_folds=4):
    """Build forward-chaining (expanding-window) folds over a set of dates.

    The sorted dates are cut into ``n_folds + 1`` blocks of equal size
    ``len(unique_dates) // (n_folds + 1)``. Fold ``k`` (1-based) trains on the first
    ``k`` blocks and validates on block ``k + 1``. When the number of dates is not
    a multiple of ``n_folds + 1`` the trailing remainder is not used by any fold.

    Parameters
    ----------
    unique_dates : iterable
        Distinct, sortable date values.
    n_folds : int
        Number of train/validation folds to produce.

    Returns
    -------
    list of dict
        One dict per fold with keys ``"train_dates"`` and ``"val_dates"``, each a
        NumPy array of date values (not boolean masks).

    Raises
    ------
    ValueError
        If `n_folds` is smaller than 1, or if there are fewer than ``n_folds + 1``
        dates (every fold would otherwise be empty and fail later, far from the
        real cause).
    """
    if n_folds < 1:
        raise ValueError(f"n_folds must be >= 1, got {n_folds}")

    unique_dates = np.array(sorted(unique_dates))
    val_block = len(unique_dates) // (n_folds + 1)
    if val_block == 0:
        raise ValueError(
            f"Need at least {n_folds + 1} unique dates to build {n_folds} folds, "
            f"got {len(unique_dates)}"
        )

    folds = []
    for k in range(1, n_folds + 1):
        train_end = k * val_block
        folds.append({
            "train_dates": unique_dates[:train_end],
            "val_dates": unique_dates[train_end:train_end + val_block],
        })
    return folds

# Distinct calendar days present in the modelling data, in ascending order.
unique_days = np.array(sorted(dates.dt.normalize().unique()))
folds = build_time_folds(unique_dates=unique_days, n_folds=4)
# Displayed in the notebook: (train start, train end, val start, val end) per fold.
[
    (f["train_dates"].min(), f["train_dates"].max(),
     f["val_dates"].min(), f["val_dates"].max())
    for f in folds
]


In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
import sklearn
# Report the installed scikit-learn version (the RMSE helper below depends on it).
print("scikit-learn versión:", sklearn.__version__)

def rmse_compat(y_true, y_pred):
    """Return the RMSE of `y_pred` against `y_true` on any scikit-learn version.

    The square root is taken explicitly instead of passing ``squared=False`` to
    ``mean_squared_error``: that keyword is deprecated in recent scikit-learn
    releases and later removed, so probing for it with try/except emits warnings
    on some versions and could hide an unrelated TypeError.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

def rmse_compat(y_true, y_pred):
    """Return the RMSE of `y_pred` against `y_true` on any scikit-learn version.

    Uses an explicit square root of ``mean_squared_error`` rather than the
    ``squared=False`` keyword, which is deprecated and later removed in recent
    scikit-learn releases.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Per-locality error report (grouped by 'Localidad').
def eval_by_locality(y_true, y_pred, localities):
    """Return MAE, RMSE and row count per locality, sorted by ascending RMSE.

    `y_true`, `y_pred` and `localities` are aligned, equally long sequences.
    The result has columns 'Localidad', 'MAE', 'RMSE' and 'n'.
    """
    frame = pd.DataFrame({"y": y_true, "yhat": y_pred, "Localidad": localities})
    report_rows = [
        {
            "Localidad": name,
            "MAE": mean_absolute_error(grp["y"], grp["yhat"]),
            "RMSE": rmse_compat(grp["y"], grp["yhat"]),
            "n": len(grp),
        }
        for name, grp in frame.groupby("Localidad", sort=False)
    ]
    return pd.DataFrame(report_rows).sort_values("RMSE")

# Temporal cross-validation: global metrics per fold plus a per-locality breakdown.
cv_rows = []
per_locality_reports = []

for i, f in enumerate(folds, start=1):
    # Rows whose calendar day belongs to this fold's train / validation dates.
    tr_mask = dates.dt.normalize().isin(f["train_dates"])
    va_mask = dates.dt.normalize().isin(f["val_dates"])

    X_tr, X_va = X[tr_mask], X[va_mask]
    y_tr, y_va = y[tr_mask], y[va_mask]
    locality_va = localities[va_mask].values

    # Refit the whole pipeline on this fold's training rows, then predict validation.
    pred_va = pipe.fit(X_tr, y_tr).predict(X_va)

    mae = mean_absolute_error(y_va, pred_va)
    rmse = rmse_compat(y_va, pred_va)

    cv_rows.append(dict(
        fold=i,
        train_start=str(dates[tr_mask].min().date()),
        train_end=str(dates[tr_mask].max().date()),
        val_start=str(dates[va_mask].min().date()),
        val_end=str(dates[va_mask].max().date()),
        n_train=int(tr_mask.sum()),
        n_val=int(va_mask.sum()),
        MAE=mae,
        RMSE=rmse,
    ))

    # Per-locality metrics for this fold, tagged with the fold number.
    rep = eval_by_locality(y_va, pred_va, locality_va)
    rep.insert(0, 'fold', i)
    per_locality_reports.append(rep)

cv_table = pd.DataFrame(cv_rows)
per_locality_table = pd.concat(per_locality_reports, ignore_index=True)

display(cv_table.round(3))
display(per_locality_table.round(3))

# Persist both tables
cv_table.to_csv("cv_temporal_global.csv", index=False)
per_locality_table.to_csv("cv_temporal_por_localidad.csv", index=False)
print("✅ Guardados: cv_temporal_global.csv, cv_temporal_por_localidad.csv")


In [None]:
# Hold-out split: train on every year up to 2023, test on 2024.
mask_train = dates.dt.year <= 2023
mask_test  = dates.dt.year == 2024

X_tr, X_te = X[mask_train], X[mask_test]
y_tr, y_te = y[mask_train], y[mask_test]
localities_te = localities[mask_test].values

pred_te = pipe.fit(X_tr, y_tr).predict(X_te)

mae_te = mean_absolute_error(y_te, pred_te)
rmse_te = rmse_compat(y_te, pred_te)
print(f"Hold-out 2024 → MAE: {mae_te:.3f} | RMSE: {rmse_te:.3f} | n_test: {mask_test.sum()}")

# Per-locality breakdown of the hold-out errors
rep_te = eval_by_locality(y_te, pred_te, localities_te).round(3)
display(rep_te)

# Persist the per-locality report
rep_te.to_csv("holdout2024_por_localidad.csv", index=False)
print("✅ Guardado: holdout2024_por_localidad.csv")

In [None]:
# Rebuild the 4th fold exactly as in the cross-validation loop
fold = folds[3]  # 4th fold (index 3)
tr_mask = dates.dt.normalize().isin(fold["train_dates"])
va_mask = dates.dt.normalize().isin(fold["val_dates"])

pred_va = pipe.fit(X[tr_mask], y[tr_mask]).predict(X[va_mask])

# Validation rows with their prediction and absolute error.
df_va = data.loc[va_mask, ['Date', 'Localidad', 'PM25']].copy()
df_va = df_va.assign(yhat=pred_va)
df_va = df_va.assign(abs_err=lambda d: (d['PM25'] - d['yhat']).abs())

# Group-level metrics helper (meant for groupby(...).apply on the validation frame).
def mae_rmse(g):
    """Return a Series with 'n', 'MAE' and 'RMSE' for one group of predictions.

    `g` must provide the observed values in column 'PM25' and the predictions in
    column 'yhat'. RMSE is computed as the explicit square root of the MSE so it
    does not depend on the ``squared`` keyword, which recent scikit-learn releases
    deprecate and later remove.
    """
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    import numpy as np
    rmse = np.sqrt(mean_squared_error(g['PM25'], g['yhat']))
    return pd.Series({
        'n': len(g),
        'MAE': mean_absolute_error(g['PM25'], g['yhat']),
        'RMSE': rmse
    })

# Metrics per locality on the 4th validation fold
print(df_va.groupby('Localidad').apply(mae_rmse).round(3))

# Error distribution for the Puente Aranda locality
usa = df_va.loc[df_va['Localidad'] == 'Puente Aranda']
print(usa[['abs_err']].describe(percentiles=[.5, .9, .95, .99]).round(3).transpose())
print("Fechas con mayor error en Puente Aranda:\n",
      usa.nlargest(n=5, columns='abs_err')[['Date', 'PM25', 'yhat', 'abs_err']])


## **CatBoost**

In [None]:
# ==== 0) Setup: instalar CatBoost (si hiciera falta) ====
!pip -q install catboost


In [None]:
# ==== 1) Cargar librerías y datos ====
import json, math, random
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool

# Compatibilidad de métricas (RMSE con y sin 'squared')
from sklearn.metrics import mean_absolute_error, mean_squared_error
def rmse_compat(y_true, y_pred):
    """Return the RMSE of `y_pred` against `y_true` on any scikit-learn version.

    Uses an explicit square root of ``mean_squared_error`` rather than the
    ``squared=False`` keyword, which is deprecated and later removed in recent
    scikit-learn releases.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Fresh copy of the feature table with Date as datetime.
df = df_ready.copy().assign(
    Date=lambda d: pd.to_datetime(d['Date'], errors='coerce')
)

# Safety net: make sure categorical columns are plain strings
for c in ('Localidad',):
    if c in df.columns:
        df[c] = df[c].astype(str)

print(df.shape)
df.head(2)


In [None]:
# ==== 2) Define features and target (no co-pollutants) ====
base_cols = [
    'Date', 'Localidad', 'lat', 'lon', 'Altitud',
    'year', 'month', 'dayofyear', 'dow', 'is_weekend', 'sin_doy', 'cos_doy',
]

# PM25 lag and rolling-mean columns, in frame order
lag_cols = [c for c in df.columns if c.startswith(('PM25_lag', 'PM25_rollmean'))]

# Lag columns of the meteorological variables, in frame order
met_vars = ['Temp', 'Hum', 'WindSpeed', 'Precip', 'Pres']
met_lag_cols = [c for c in df.columns
                if c.startswith(tuple(v + '_lag' for v in met_vars))]

# target
target_col = 'PM25'

# Final X columns (Date is not fed to the model)
feature_cols = [
    'Localidad', 'lat', 'lon', 'Altitud',
    'year', 'month', 'dayofyear', 'dow', 'is_weekend', 'sin_doy', 'cos_doy',
    *lag_cols,
    *met_lag_cols,
]

# Keep only rows that are complete in every feature and in the target.
data = df.dropna(subset=feature_cols + [target_col]).copy()
X = data[feature_cols].copy()
y = data[target_col].values
dates = data['Date'].copy()
localities = data['Localidad'].copy()

# Positions of the categorical columns inside X, as CatBoost expects them
cat_cols = ['Localidad']
cat_idx = [X.columns.get_loc(c) for c in cat_cols if c in X.columns]

len(feature_cols), feature_cols[:8], cat_idx


In [None]:
# ==== 3) Build the same temporal folds (expanding window) ====
def build_time_folds(unique_dates, n_folds=4):
    """Build forward-chaining (expanding-window) folds over a set of dates.

    The sorted dates are cut into ``n_folds + 1`` blocks of equal size
    ``len(unique_dates) // (n_folds + 1)``. Fold ``k`` (1-based) trains on the first
    ``k`` blocks and validates on block ``k + 1``; a trailing remainder of dates,
    if any, is not used by any fold.

    Returns a list of dicts with keys ``"train_dates"`` and ``"val_dates"``, each a
    NumPy array of date values.

    Raises
    ------
    ValueError
        If `n_folds` is smaller than 1, or if there are fewer than ``n_folds + 1``
        dates (every fold would otherwise be empty and fail later, far from the
        real cause).
    """
    if n_folds < 1:
        raise ValueError(f"n_folds must be >= 1, got {n_folds}")

    unique_dates = np.array(sorted(unique_dates))
    val_block = len(unique_dates) // (n_folds + 1)
    if val_block == 0:
        raise ValueError(
            f"Need at least {n_folds + 1} unique dates to build {n_folds} folds, "
            f"got {len(unique_dates)}"
        )

    folds = []
    for k in range(1, n_folds + 1):
        train_end = k * val_block
        folds.append({
            "train_dates": unique_dates[:train_end],
            "val_dates": unique_dates[train_end:train_end + val_block],
        })
    return folds

# Distinct calendar days present in the modelling data, in ascending order.
unique_days = np.array(sorted(dates.dt.normalize().unique()))
folds = build_time_folds(unique_dates=unique_days, n_folds=4)

# Displayed in the notebook: (train start, train end, val start, val end) per fold.
[
    (f["train_dates"].min(), f["train_dates"].max(),
     f["val_dates"].min(), f["val_dates"].max())
    for f in folds
]


In [None]:
# ==== 4) Evaluation function for one hyperparameter set ====
def eval_params(params, verbose=False):
    """Cross-validate a CatBoost regressor built from `params` on the temporal folds.

    Reads the notebook globals `folds`, `dates`, `X`, `y` and `cat_idx`. Keys
    missing from `params` fall back to the defaults written below.

    Returns a dict with 'mae_mean' and 'rmse_mean' (averages over the folds) and
    'folds' (list of per-fold dicts with 'fold', 'MAE', 'RMSE', 'n_val').

    Note: each validation fold is also passed as the early-stopping `eval_set`
    with `use_best_model=True`, so the fold metrics are measured on the same
    data that selected the best iteration.
    """
    model_kwargs = dict(
        loss_function='RMSE',
        iterations=params.get('iterations', 2000),
        depth=params.get('depth', 7),
        learning_rate=params.get('learning_rate', 0.06),
        l2_leaf_reg=params.get('l2_leaf_reg', 3.0),
        bootstrap_type=params.get('bootstrap_type', 'Bayesian'),
        bagging_temperature=params.get('bagging_temperature', 1.0),
        random_strength=params.get('random_strength', 0.0),
        early_stopping_rounds=params.get('early_stopping_rounds', 100),
        random_seed=42,
        verbose=False,
        allow_writing_files=False,
    )
    day_of_row = dates.dt.normalize()

    fold_results = []
    for i, f in enumerate(folds, start=1):
        tr_mask = day_of_row.isin(f["train_dates"])
        va_mask = day_of_row.isin(f["val_dates"])
        y_va = y[va_mask]

        train_pool = Pool(X[tr_mask], y[tr_mask], cat_features=cat_idx)
        valid_pool = Pool(X[va_mask], y_va, cat_features=cat_idx)

        model = CatBoostRegressor(**model_kwargs)
        model.fit(train_pool, eval_set=valid_pool, use_best_model=True, verbose=verbose)
        pred_va = model.predict(valid_pool)

        fold_results.append({
            "fold": i,
            "MAE": mean_absolute_error(y_va, pred_va),
            "RMSE": rmse_compat(y_va, pred_va),
            "n_val": int(va_mask.sum()),
        })

    return {
        "mae_mean": float(np.mean([r["MAE"] for r in fold_results])),
        "rmse_mean": float(np.mean([r["RMSE"] for r in fold_results])),
        "folds": fold_results,
    }


In [None]:
# ==== 5) Lightweight random search over hyperparameters ====
random.seed(42)  # fixed seed so the sampled configurations are reproducible
search_space = dict(
    depth=[5, 6, 7, 8, 9, 10],
    learning_rate=[0.02, 0.03, 0.04, 0.06, 0.08, 0.10],
    l2_leaf_reg=[1.0, 2.0, 3.0, 5.0, 7.0, 10.0],
    bagging_temperature=[0.0, 0.5, 1.0, 2.0, 3.0, 5.0],
    random_strength=[0.0, 0.1, 0.2, 0.5],
    iterations=[1500, 2000, 2500],
    bootstrap_type=['Bayesian'],
    early_stopping_rounds=[100],
)

def sample_params(space):
    """Draw one random hyperparameter configuration from `space`.

    One value is picked with ``random.choice`` for each tuned key, in the fixed
    order listed below (the order matters for reproducibility under a seed).
    'bootstrap_type' and 'early_stopping_rounds' are always 'Bayesian' and 100;
    the corresponding entries of `space` are not read.
    """
    tuned_keys = (
        "depth",
        "learning_rate",
        "l2_leaf_reg",
        "bagging_temperature",
        "random_strength",
        "iterations",
    )
    drawn = {name: random.choice(space[name]) for name in tuned_keys}
    drawn["bootstrap_type"] = 'Bayesian'
    drawn["early_stopping_rounds"] = 100
    return drawn

results = []
N_TRIALS = 50  # raise it if more compute time is available
for t in range(1, N_TRIALS + 1):
    params = sample_params(search_space)
    # Keep the sampled configuration next to its cross-validation scores.
    res = {**eval_params(params, verbose=False), "params": params}
    results.append(res)
    print(f"Trial {t}/{N_TRIALS} → RMSE_CV={res['rmse_mean']:.3f} | MAE_CV={res['mae_mean']:.3f} | {params}")

# Rank by mean CV RMSE (lower is better)
results_sorted = sorted(results, key=lambda r: r["rmse_mean"])
best = results_sorted[0]
print("\n=== MEJOR CONFIGURACIÓN (CV) ===")
print(json.dumps(best["params"], indent=2))
print("CV → RMSE promedio:", round(best["rmse_mean"], 3), " | MAE promedio:", round(best["mae_mean"], 3))
pd.DataFrame(best["folds"]).round(3)


Trial 1/50 → RMSE_CV=5.706 | MAE_CV=4.386 | {'depth': 10, 'learning_rate': 0.02, 'l2_leaf_reg': 1.0, 'bagging_temperature': 5.0, 'random_strength': 0.2, 'iterations': 1500, 'bootstrap_type': 'Bayesian', 'early_stopping_rounds': 100}
Trial 2/50 → RMSE_CV=5.473 | MAE_CV=4.198 | {'depth': 6, 'learning_rate': 0.03, 'l2_leaf_reg': 10.0, 'bagging_temperature': 0.0, 'random_strength': 0.0, 'iterations': 2500, 'bootstrap_type': 'Bayesian', 'early_stopping_rounds': 100}
Trial 3/50 → RMSE_CV=5.484 | MAE_CV=4.210 | {'depth': 8, 'learning_rate': 0.02, 'l2_leaf_reg': 1.0, 'bagging_temperature': 0.0, 'random_strength': 0.1, 'iterations': 1500, 'bootstrap_type': 'Bayesian', 'early_stopping_rounds': 100}
Trial 4/50 → RMSE_CV=5.586 | MAE_CV=4.290 | {'depth': 9, 'learning_rate': 0.08, 'l2_leaf_reg': 1.0, 'bagging_temperature': 3.0, 'random_strength': 0.1, 'iterations': 2500, 'bootstrap_type': 'Bayesian', 'early_stopping_rounds': 100}
Trial 5/50 → RMSE_CV=5.668 | MAE_CV=4.364 | {'depth': 10, 'learning_ra

In [None]:
# ==== 6) Retrain with the best configuration (years up to 2023) and evaluate on the 2024 hold-out ====
mask_train = (dates.dt.year <= 2023)
mask_test  = (dates.dt.year == 2024)

X_tr, X_te = X[mask_train], X[mask_test]
y_tr, y_te = y[mask_train], y[mask_test]

train_pool = Pool(X_tr, y_tr, cat_features=cat_idx)
test_pool  = Pool(X_te, y_te, cat_features=cat_idx)

best_params = best["params"].copy()
final_model = CatBoostRegressor(
    loss_function='RMSE',
    bootstrap_type='Bayesian',
    early_stopping_rounds=100,
    random_seed=42,
    verbose=False,
    allow_writing_files=False,
    # Tuned values taken from the random search.
    **{name: best_params[name] for name in (
        "iterations", "depth", "learning_rate",
        "l2_leaf_reg", "bagging_temperature", "random_strength",
    )},
)

# Early stopping uses a small validation slice: training rows from 2023-12-01 onwards.
# The model is therefore fitted on the training rows before that date.
cutoff = pd.Timestamp('2023-12-01')
tr_in  = dates[mask_train] < cutoff
tr_val = (dates[mask_train] >= cutoff)

fit_pool = Pool(X_tr[tr_in], y_tr[tr_in], cat_features=cat_idx)
stop_pool = Pool(X_tr[tr_val], y_tr[tr_val], cat_features=cat_idx)
final_model.fit(fit_pool, eval_set=stop_pool, use_best_model=True, verbose=False)

pred_te = final_model.predict(test_pool)
mae_te = mean_absolute_error(y_te, pred_te)
rmse_te = rmse_compat(y_te, pred_te)
print(f"Hold-out 2024 → MAE: {mae_te:.3f} | RMSE: {rmse_te:.3f} | n_test: {int(mask_test.sum())}")

# Save the 2024 predictions
out_te = data.loc[mask_test, ['Date', 'Localidad', target_col]].copy()
out_te['yhat'] = pred_te
out_te.to_csv('predicciones_holdout2024_catboost.csv', index=False)

# Per-locality metrics (2024), best RMSE first
by_locality = out_te.groupby('Localidad').apply(
    lambda g: pd.Series({
        'n': len(g),
        'MAE': mean_absolute_error(g[target_col], g['yhat']),
        'RMSE': rmse_compat(g[target_col], g['yhat'])
    })
)
by_locality = by_locality.reset_index().sort_values('RMSE')
by_locality.round(3)


In [None]:
# ==== 7) Feature importances and model persistence ====
# CatBoost importance of type 'FeatureImportance', computed with the training pool
fi = final_model.get_feature_importance(train_pool, type='FeatureImportance')
fi_df = pd.DataFrame(data={'feature': X.columns, 'importance': fi})
fi_df = fi_df.sort_values('importance', ascending=False)
fi_df.to_csv('feature_importance_catboost.csv', index=False)

# Save the model and the chosen hyperparameters
final_model.save_model('catboost_pm25_model.cbm')
with open('best_params_catboost.json', mode='w') as f:
    json.dump(best_params, f, indent=2)

print("✅ Guardados: catboost_pm25_model.cbm, best_params_catboost.json, feature_importance_catboost.csv, predicciones_holdout2024_catboost.csv")
fi_df.head(15)


In [None]:
from sklearn.metrics import r2_score

# --- 1) Explained variance (R²) helper used for the CV folds and the 2024 hold-out ---
def evaluate_r2(y_true, y_pred):
    """Return the R² score of `y_pred` against `y_true` over all supplied rows."""
    score = r2_score(y_true, y_pred)
    return score

# --- 2) R² on the temporal cross-validation folds ---
def eval_r2_params(params, verbose=False):
    """Cross-validate a CatBoost regressor built from `params` and report R².

    Reads the notebook globals `folds`, `dates`, `X`, `y` and `cat_idx`. Keys
    missing from `params` fall back to the defaults written below; the bootstrap
    type and early-stopping patience are fixed.

    Returns a dict with 'r2_mean' (average over folds) and 'folds' (list of
    per-fold dicts with 'fold', 'R²', 'n_val').

    Note: each validation fold is also the early-stopping `eval_set` with
    `use_best_model=True`, so R² is measured on the data that selected the
    best iteration.
    """
    model_kwargs = dict(
        loss_function='RMSE',
        iterations=params.get('iterations', 2000),
        depth=params.get('depth', 7),
        learning_rate=params.get('learning_rate', 0.06),
        l2_leaf_reg=params.get('l2_leaf_reg', 3.0),
        bootstrap_type='Bayesian',
        bagging_temperature=params.get('bagging_temperature', 1.0),
        random_strength=params.get('random_strength', 0.0),
        early_stopping_rounds=100,
        random_seed=42,
        verbose=False,
        allow_writing_files=False,
    )
    day_of_row = dates.dt.normalize()

    fold_results = []
    for i, f in enumerate(folds, start=1):
        tr_mask = day_of_row.isin(f["train_dates"])
        va_mask = day_of_row.isin(f["val_dates"])
        y_va = y[va_mask]

        train_pool = Pool(X[tr_mask], y[tr_mask], cat_features=cat_idx)
        valid_pool = Pool(X[va_mask], y_va, cat_features=cat_idx)

        model = CatBoostRegressor(**model_kwargs)
        model.fit(train_pool, eval_set=valid_pool, use_best_model=True, verbose=verbose)

        # R² of the validation predictions for this fold
        yhat_va = model.predict(valid_pool)
        fold_results.append({
            "fold": i,
            "R²": evaluate_r2(y_va, yhat_va),
            "n_val": int(va_mask.sum()),
        })

    return {
        "r2_mean": float(np.mean([r["R²"] for r in fold_results])),
        "folds": fold_results,
    }

# R² of the best configuration found by the random search
r2_result = eval_r2_params(params=best["params"])
print("CV (varianza explicada R²) → R² promedio:", round(r2_result["r2_mean"], 3))
pd.DataFrame(data=r2_result["folds"]).round(3)


In [None]:
# --- 3) R² on the 2024 hold-out ---
mask_train = (dates.dt.year <= 2023)
mask_test  = (dates.dt.year == 2024)

X_tr, y_tr = X[mask_train], y[mask_train]
X_te, y_te = X[mask_test],  y[mask_test]

train_pool = Pool(X_tr, y_tr, cat_features=cat_idx)
test_pool  = Pool(X_te, y_te, cat_features=cat_idx)

final_model = CatBoostRegressor(
    loss_function='RMSE',
    iterations=best["params"]["iterations"],
    depth=best["params"]["depth"],
    learning_rate=best["params"]["learning_rate"],
    l2_leaf_reg=best["params"]["l2_leaf_reg"],
    bootstrap_type='Bayesian',
    bagging_temperature=best["params"]["bagging_temperature"],
    random_strength=best["params"]["random_strength"],
    early_stopping_rounds=100,
    random_seed=42,
    verbose=False,
    allow_writing_files=False
)

# Early stopping must not look at the hold-out: passing test_pool as eval_set with
# use_best_model=True lets the 2024 data choose the number of trees and inflates R².
# The last weeks of the training period (from 2023-12-01) are used instead.
es_cutoff = pd.Timestamp('2023-12-01')
es_fit = (dates[mask_train] < es_cutoff).to_numpy()
es_val = ~es_fit

if es_fit.any() and es_val.any():
    final_model.fit(
        Pool(X_tr.loc[es_fit], y_tr[es_fit], cat_features=cat_idx),
        eval_set=Pool(X_tr.loc[es_val], y_tr[es_val], cat_features=cat_idx),
        use_best_model=True, verbose=False
    )
else:
    # No usable validation slice: train on all training rows without early stopping.
    final_model.fit(train_pool, verbose=False)

# Hold-out predictions
yhat_te = final_model.predict(test_pool)

# R² on the hold-out
r2_te = evaluate_r2(y_te, yhat_te)
print(f"Hold-out 2024 → R²: {r2_te:.3f} | n_test: {int(mask_test.sum())}")


In [None]:
# --- 4) R² per locality on the 2024 hold-out ---
def eval_r2_by_station(y_true, y_pred, localities):
    """Return R² and row count per locality, sorted by descending R².

    Despite the function name, rows are grouped by the 'Localidad' label given in
    `localities`. The result has columns 'Localidad', 'R²' and 'n'.
    """
    frame = pd.DataFrame({"y": y_true, "yhat": y_pred, "Localidad": localities})
    report_rows = [
        {"Localidad": name, "R²": evaluate_r2(grp["y"], grp["yhat"]), "n": len(grp)}
        for name, grp in frame.groupby("Localidad", sort=False)
    ]
    return pd.DataFrame(report_rows).sort_values("R²", ascending=False)

# Per-locality R² for the hold-out predictions, rounded for display.
r2_by_station = eval_r2_by_station(
    y_true=y_te, y_pred=yhat_te, localities=localities[mask_test]
)
r2_by_station.round(3)


In [None]:
# ===================== BLOQUE A2: Rolling-origin con reentrenamiento por origen =====================
from catboost import CatBoostRegressor, Pool
import numpy as np, pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def rmse_compat(y_true, y_pred):
    """Return the RMSE of `y_pred` against `y_true` on any scikit-learn version.

    Uses an explicit square root of ``mean_squared_error`` rather than the
    ``squared=False`` keyword, which is deprecated and later removed in recent
    scikit-learn releases.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Rebuild an aligned dataset: rows complete in every feature and in the target
data_bt = df_ready.dropna(subset=feature_cols + ['PM25']).copy()
X_bt     = data_bt[feature_cols].copy()
y_bt     = data_bt['PM25'].to_numpy()
dates_bt = pd.to_datetime(data_bt['Date'])
cat_features = ['Localidad']

# Forecast origins (first day of each month, January to October 2024) and horizons in days
origins  = pd.date_range(start="2024-01-01", end="2024-10-01", freq="MS")
HORIZONS = [1, 7, 14, 30]

# Settings shared by both hyperparameter sources
fixed_params = dict(
    loss_function='RMSE',
    bootstrap_type='Bayesian',
    early_stopping_rounds=100,
    random_seed=42,
    verbose=False,
    allow_writing_files=False,
)

# Hyperparameters: reuse the best random-search result when it exists
if "best" in globals() and isinstance(best, dict) and "params" in best:
    bp = best["params"]
    tuned_params = dict(
        iterations=bp.get("iterations", 2000),
        depth=bp.get("depth", 8),
        learning_rate=bp.get("learning_rate", 0.06),
        l2_leaf_reg=bp.get("l2_leaf_reg", 3.0),
        bagging_temperature=bp.get("bagging_temperature", 1.0),
        random_strength=bp.get("random_strength", 0.0),
    )
else:
    tuned_params = dict(
        iterations=2500,
        depth=8,
        learning_rate=0.06,
        l2_leaf_reg=8.0,
        bagging_temperature=1.0,
        random_strength=0.0,
    )
base_params = {**fixed_params, **tuned_params}

rows = []
for ori in origins:
    # Train on history strictly before the origin date.
    m_train = dates_bt < ori
    if m_train.sum() < 200:
        # Too little history to fit a model for this origin.
        continue

    X_tr, y_tr = X_bt.loc[m_train], y_bt[m_train.to_numpy()]
    # Early stopping on the last 60 days before the origin, unless the training
    # period spans fewer than 120 days (then no validation slice is held out).
    dt_tr = dates_bt.loc[m_train]
    if (dt_tr.max() - dt_tr.min()).days < 120:
        m_in  = np.ones(len(X_tr), dtype=bool)
        m_val = np.zeros(len(X_tr), dtype=bool)
    else:
        cutoff = dt_tr.max() - pd.Timedelta(days=60)
        m_in  = (dt_tr <  cutoff).to_numpy()
        m_val = (dt_tr >= cutoff).to_numpy()

    train_pool = Pool(X_tr.loc[m_in],  y_tr[m_in],  cat_features=cat_features)
    valid_pool = Pool(X_tr.loc[m_val], y_tr[m_val], cat_features=cat_features) if m_val.any() else None

    model = CatBoostRegressor(**base_params)
    if valid_pool is not None:
        model.fit(train_pool, eval_set=valid_pool, use_best_model=True, verbose=False)
    else:
        model.fit(train_pool, verbose=False)

    # Evaluation window: [ori, ori + H - 1], restricted to 2024.
    # NOTE(review): the lag/rolling features of the evaluated rows come from the
    # stored feature table, i.e. from observed values — confirm this matches the
    # intended meaning of a multi-day "horizon".
    for H in HORIZONS:
        endH = ori + pd.Timedelta(days=H-1)
        m_eval = (dates_bt >= ori) & (dates_bt <= endH) & (dates_bt.dt.year == 2024)
        if not m_eval.any():
            continue

        X_ev = X_bt.loc[m_eval]
        y_ev = y_bt[m_eval.to_numpy()]
        pool_ev = Pool(X_ev, label=y_ev, cat_features=cat_features)
        yhat = model.predict(pool_ev)

        mae  = mean_absolute_error(y_ev, yhat)
        # Shared helper instead of a second inline copy of the RMSE fallback.
        rmse = rmse_compat(y_ev, yhat)
        r2   = r2_score(y_ev, yhat)

        rows.append([ori.strftime("%Y-%m-%d"), H, int(m_eval.sum()), mae, rmse, r2])

bt2 = pd.DataFrame(rows, columns=["origin","horizon_days","n","MAE","RMSE","R2"])
print("Resumen por horizonte (promedio sobre orígenes):")
display(bt2.groupby("horizon_days")[["MAE","RMSE","R2","n"]].mean().round(3))
bt2.to_csv("backtest_rolling_origin_retrain_2024.csv", index=False)
print("✅ Guardado: backtest_rolling_origin_retrain_2024.csv")


In [None]:
# ===================== BLOQUE A2 (COMPLETO): Rolling-origin con reentrenamiento + skill + blend H=1 =====================
from catboost import CatBoostRegressor, Pool
import numpy as np, pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def rmse_compat(y_true, y_pred):
    """Return the root-mean-squared error between y_true and y_pred.

    First asks ``mean_squared_error`` for the RMSE directly via
    ``squared=False``; if that call raises ``TypeError`` (e.g. the keyword
    is not accepted), falls back to the square root of the plain MSE.
    """
    try:
        result = mean_squared_error(y_true, y_pred, squared=False)
    except TypeError:
        result = np.sqrt(mean_squared_error(y_true, y_pred))
    return result

# --- 0) Rebuild a consistent dataset from df_ready / feature_cols ---
# Keep only rows where every feature and the target are present.
_required_cols = feature_cols + ['PM25']
data_bt = df_ready.dropna(subset=_required_cols).copy()

# Aligned pieces: all of them share data_bt's rows, in the same order.
X_bt     = data_bt[feature_cols].copy()
y_bt     = data_bt['PM25'].to_numpy()
dates_bt = pd.to_datetime(data_bt['Date']).copy()
loc_bt   = data_bt['Localidad'].astype(str).copy()

# Categorical features referenced by NAME (robust to column reordering).
cat_features = ['Localidad']

# Boolean row mask for the 2024 test period (other cells read this name).
test_2024_idx = dates_bt.dt.year.eq(2024).to_numpy()

# --- 1) Forecast origins (month starts) and horizons (days) ---
origins  = pd.date_range(start="2024-01-01", end="2024-10-01", freq="MS")
HORIZONS = [1, 7, 14, 30]

# --- 2) Base hyper-parameters (taken from a previous search when available) ---
if "best" in globals() and isinstance(best, dict) and "params" in best:
    bp = best["params"]
    # Tuned values win; the second argument is the fallback for a missing key.
    _tunable = {
        "iterations": bp.get("iterations", 2000),
        "depth": bp.get("depth", 8),
        "learning_rate": bp.get("learning_rate", 0.06),
        "l2_leaf_reg": bp.get("l2_leaf_reg", 3.0),
        "bagging_temperature": bp.get("bagging_temperature", 1.0),
        "random_strength": bp.get("random_strength", 0.0),
    }
else:
    # No search results in the session: fixed defaults.
    _tunable = {
        "iterations": 2500,
        "depth": 8,
        "learning_rate": 0.06,
        "l2_leaf_reg": 8.0,
        "bagging_temperature": 1.0,
        "random_strength": 0.0,
    }

# Settings shared by both branches. The original note says the MAE loss is
# used because it improves robustness for H=1.
base_params = {
    "loss_function": 'MAE',
    "eval_metric": 'MAE',
    "bootstrap_type": 'Bayesian',
    "early_stopping_rounds": 100,
    "random_seed": 42,
    "verbose": False,
    "allow_writing_files": False,
}
base_params.update(_tunable)

# Weight of the persistence term in the H=1 blend (suggested range 0.2–0.6).
W_BLEND_H1 = 0.4

# Rolling-origin backtest: for every origin, retrain on the preceding year,
# then score the cumulative window [origin, origin + H - 1] for each horizon H.
# NOTE(review): evaluation rows take their features straight from X_bt. If
# those features include observed PM25 lags, horizons > 1 are not pure
# multi-step forecasts — confirm this is the intended protocol.
rows = []
for ori in origins:
    # --- 3) Training window: the 365 days preceding the origin ---
    m_train_all = (dates_bt < ori)
    if m_train_all.sum() < 120:
        # Too few historical rows before this origin; skip it.
        continue

    # Trim to the last 365 days of available history.
    last_day_train = dates_bt.loc[m_train_all].max()
    cut_start = last_day_train - pd.Timedelta(days=365)
    m_train = m_train_all & (dates_bt >= cut_start)

    X_tr, y_tr = X_bt.loc[m_train], y_bt[m_train.to_numpy()]
    dt_tr = dates_bt.loc[m_train]

    # --- 4) Early stopping: last 120 days of the training window as validation ---
    if (dt_tr.max() - dt_tr.min()).days < 180:
        # Window too short to hold out a validation tail: train on everything.
        m_in  = np.ones(len(X_tr), dtype=bool)
        m_val = np.zeros(len(X_tr), dtype=bool)
    else:
        cutoff = dt_tr.max() - pd.Timedelta(days=120)
        m_in  = (dt_tr <  cutoff).to_numpy()
        m_val = (dt_tr >= cutoff).to_numpy()

    train_pool = Pool(X_tr.loc[m_in],  y_tr[m_in],  cat_features=cat_features)
    valid_pool = Pool(X_tr.loc[m_val], y_tr[m_val], cat_features=cat_features) if m_val.any() else None

    model = CatBoostRegressor(**base_params)
    if valid_pool is not None:
        model.fit(train_pool, eval_set=valid_pool, use_best_model=True, verbose=False)
    else:
        model.fit(train_pool, verbose=False)

    # --- 5) Evaluation per horizon, restricted to 2024 ---
    for H in HORIZONS:
        endH = ori + pd.Timedelta(days=H-1)
        m_eval = (dates_bt >= ori) & (dates_bt <= endH) & (dates_bt.dt.year == 2024)
        if not m_eval.any():
            continue

        X_ev = X_bt.loc[m_eval]
        y_ev = y_bt[m_eval.to_numpy()]
        pool_ev = Pool(X_ev, label=y_ev, cat_features=cat_features)
        yhat = model.predict(pool_ev)

        # Naive lag-1 baseline; rows without a lag value are excluded from it.
        naive_arr = data_bt.loc[m_eval, 'PM25_lag1'].to_numpy() if 'PM25_lag1' in data_bt.columns else np.full_like(y_ev, np.nan)
        valid_mask = ~np.isnan(naive_arr)
        rmse_naive = rmse_compat(y_ev[valid_mask], naive_arr[valid_mask]) if valid_mask.any() else np.nan

        # Blend with persistence ONLY for H=1 (rows lacking lag-1 keep the raw prediction).
        if H == 1 and valid_mask.any():
            yhat_blend = yhat.copy()
            yhat_blend[valid_mask] = (1.0 - W_BLEND_H1) * yhat_blend[valid_mask] + W_BLEND_H1 * naive_arr[valid_mask]
            yhat = yhat_blend

        mae  = mean_absolute_error(y_ev, yhat)
        rmse = rmse_compat(y_ev, yhat)
        r2   = r2_score(y_ev, yhat)

        # Skill vs. naive must compare both RMSEs on the SAME rows: the naive
        # RMSE only covers rows with a valid lag-1, so the model RMSE used in
        # the ratio is restricted to those rows as well.
        skill = np.nan
        if valid_mask.any() and rmse_naive > 0:
            if valid_mask.all():
                rmse_same_rows = rmse
            else:
                rmse_same_rows = rmse_compat(y_ev[valid_mask], yhat[valid_mask])
            skill = 1.0 - (rmse_same_rows / rmse_naive)

        rows.append([ori.strftime("%Y-%m-%d"), H, int(m_eval.sum()), mae, rmse, r2, rmse_naive, skill])

bt2 = pd.DataFrame(rows, columns=["origin","horizon_days","n","MAE","RMSE","R2","RMSE_naive","Skill_vs_naive"])
# Per-horizon averages over all origins.
summary = bt2.groupby("horizon_days")[["MAE","RMSE","R2","RMSE_naive","Skill_vs_naive","n"]].mean().round(3)

print("Resumen por horizonte (promedio sobre orígenes):")
display(summary)
bt2.to_csv("backtest_rolling_origin_retrain_2024.csv", index=False)
summary.to_csv("backtest_rolling_origin_retrain_2024_summary.csv")
print("✅ Guardados: backtest_rolling_origin_retrain_2024.csv, backtest_rolling_origin_retrain_2024_summary.csv")


In [None]:
# === FUTURE 2025–2026 FROM THE CLIMATOLOGY ===
import pandas as pd
import numpy as np

# Raw GitHub URL of the 2025–2026 climatology forecast CSV.
url = "https://raw.githubusercontent.com/Sxmuu/TG-Samuel-P/main/Databases/Climat/Forecast/Forecast_2025-2026.csv"

# Load the forecast into df_clim; the projection cell below reads this name.
df_clim = pd.read_csv(url)

In [None]:
# ===================== PROYECCIÓN 2025–2026 DESDE CLIMATOLOGÍA =====================
import pandas as pd, numpy as np
from collections import deque
from catboost import Pool

# ---------- EXPECTED INPUTS ----------
# df_ready    : final modelling dataset (read here for station metadata)
# df_clim     : DataFrame with columns ['date','station','Temp','Hum','WindSpeed','Precip','Pres'] for 2025–2026
# final_model : trained CatBoost model (used further below in this cell)
# feature_cols: feature list defined earlier in the notebook
# cat_idx     : categorical-feature spec for Pool (original note: usually ['Localidad'])

# -------- 1) Normalize the climatology --------
future_exo = df_clim.rename(columns={"date":"Date","station":"Estacion"}).copy()
future_exo["Date"] = pd.to_datetime(future_exo["Date"], errors="coerce")

# One metadata row per station, taken from df_ready.
meta_cols = ["Estacion","Localidad","lat","lon","Altitud"]
st_meta = (df_ready[meta_cols]
           .dropna(subset=["Estacion"])
           .drop_duplicates("Estacion"))

# Warn about climatology stations with no usable metadata (absent from
# df_ready, or present with a missing Localidad).
_stations_with_meta = st_meta.loc[st_meta["Localidad"].notna(), "Estacion"]
miss = future_exo.loc[~future_exo["Estacion"].isin(_stations_with_meta), "Estacion"].unique()
if len(miss):
    print("⚠️ Estaciones sin metadata (no tendrán predicción a menos que completes Localidad/geo):", miss)

# -------- 2) Full date × station grid, so every station has every day --------
all_stations = st_meta["Estacion"].unique()
date_min, date_max = future_exo["Date"].min(), future_exo["Date"].max()
grid = pd.DataFrame({"Date": pd.date_range(date_min, date_max, freq="D")})
grid = grid.assign(key=1).merge(pd.DataFrame({"Estacion": all_stations, "key":1}), on="key").drop(columns="key")

# Join the (possibly incomplete) climatology onto the grid, then attach the
# station metadata to EVERY grid row. Doing this after the grid join matters:
# rows created by the grid have no climatology match, so metadata merged
# beforehand would be NaN for exactly the rows this step is meant to rescue.
future_exo = future_exo.drop(columns=meta_cols[1:], errors="ignore")
future_exo = grid.merge(future_exo, on=["Date","Estacion"], how="left")
future_exo = future_exo.merge(st_meta, on="Estacion", how="left")

# -------- 3) Calendar features (computed on the full grid for the same reason) --------
future_exo["year"]       = future_exo["Date"].dt.year
future_exo["month"]      = future_exo["Date"].dt.month
future_exo["dayofyear"]  = future_exo["Date"].dt.dayofyear
future_exo["dow"]        = future_exo["Date"].dt.dayofweek
future_exo["is_weekend"] = (future_exo["dow"] >= 5).astype(int)
future_exo["sin_doy"]    = np.sin(2*np.pi*future_exo["dayofyear"]/365.25)
future_exo["cos_doy"]    = np.cos(2*np.pi*future_exo["dayofyear"]/365.25)

# -------- 4) Fill missing meteorology from the climatology itself --------
future_exo["doy"] = future_exo["Date"].dt.dayofyear
met_vars = ["Temp","Hum","WindSpeed","Precip","Pres"]

# Medians per Localidad–DOY and per DOY over all stations ("city"), both
# computed from the values the climatology does provide.
clim_loc  = (future_exo.groupby(["Localidad","doy"])[met_vars].median()
                       .rename(columns={v:f"{v}_loc" for v in met_vars}).reset_index())
clim_city = (future_exo.groupby(["doy"])[met_vars].median()
                       .rename(columns={v:f"{v}_city" for v in met_vars}).reset_index())

future_exo = future_exo.merge(clim_loc,  on=["Localidad","doy"], how="left") \
                       .merge(clim_city, on=["doy"],       how="left")

glob_med = future_exo[met_vars].median(numeric_only=True)

for v in met_vars:
    # Priority: value from df_clim > Localidad–DOY median > city–DOY median > global median
    future_exo[v] = (future_exo[v]
                     .fillna(future_exo[f"{v}_loc"])
                     .fillna(future_exo[f"{v}_city"])
                     .fillna(glob_med[v]))

# -------- 4) Meteorological lags (lag1, lag3) per station --------
future_exo = future_exo.sort_values(["Estacion","Date"]).reset_index(drop=True)
for met_name in met_vars:
    for shift_days in (1, 3):
        shifted = future_exo.groupby("Estacion", sort=False)[met_name].shift(shift_days)
        future_exo[f"{met_name}_lag{shift_days}"] = shifted

# The first 1–3 days of each station have no lag value; they are filled from
# the same series (back-fill, then forward-fill) so those days are not lost.
# Station ids are cast to str and the frame is re-sorted before the fill.
future_exo["Estacion"] = future_exo["Estacion"].astype(str)
future_exo = future_exo.sort_values(["Estacion","Date"]).reset_index(drop=True)

_lag_columns = [f"{name}_lag{lag}"
                for name in ["Temp","Hum","WindSpeed","Precip","Pres"]
                for lag in [1, 3]]
for lag_col in _lag_columns:
    # Skip any lag column that was never created.
    if lag_col not in future_exo.columns:
        continue
    # transform (rather than apply) keeps the result aligned with the frame's index.
    by_station = future_exo.groupby("Estacion", sort=False)[lag_col]
    filled = by_station.transform(lambda s: s.bfill().ffill())
    # Homogenize the dtype to float64.
    future_exo[lag_col] = filled.astype("float64")
# -------- 5) Seed the PM25 lag buffers (last 7 rows per station in df_ready) --------
# Each station gets a fixed-length deque holding its 7 most recent PM25
# values, oldest first; shorter histories are left-padded with NaN.
# Keys are stored as str because future_exo["Estacion"] is cast to str in this
# cell, and the projection loop looks stations up by that value.
# NOTE(review): this assumes the last 7 rows per station are consecutive days
# ending right before the projection starts — confirm against df_ready.
hist_buffers = {}
for st, g in df_ready.sort_values(["Estacion","Date"]).groupby("Estacion", sort=False):
    tail = g["PM25"].tail(7).tolist()
    tail = [np.nan] * (7 - len(tail)) + tail
    hist_buffers[str(st)] = deque(tail, maxlen=7)

# -------- 6) Recursive projection 2025–2026 (using feature_cols) --------
def _mean_ignoring_nan(values):
    """Return the mean of the non-missing entries of *values*, or NaN if none."""
    present = [v for v in values if pd.notna(v)]
    return float(np.mean(present)) if present else np.nan


future_days = sorted(future_exo["Date"].unique())
pred_rows = []

# Walk the calendar day by day. Within a day each station depends only on its
# own buffer, so all stations are predicted in a single batch; the buffers are
# then advanced with the predictions so the next day's lags see them.
for d, day in future_exo.groupby("Date", sort=True):
    # One row per station (first occurrence), restricted to stations that have
    # a seeded buffer; stations unknown to df_ready are skipped.
    day = day.drop_duplicates("Estacion")
    day = day[day["Estacion"].isin(list(hist_buffers))].sort_values("Estacion")
    if day.empty:
        continue
    stations = day["Estacion"].tolist()

    # PM25 lags / rolling means derived from each station's buffer (oldest first).
    lag_records = []
    for st in stations:
        hist = list(hist_buffers[st])
        lag_records.append({
            "PM25_lag1": hist[-1] if len(hist) >= 1 else np.nan,
            "PM25_lag3": hist[-3] if len(hist) >= 3 else np.nan,
            "PM25_lag7": hist[0] if len(hist) >= 7 else np.nan,
            "PM25_rollmean3": _mean_ignoring_nan(hist[-3:]),
            "PM25_rollmean7": _mean_ignoring_nan(hist),
        })
    lag_df = pd.DataFrame(lag_records, index=day.index)

    # Build X with EXACTLY feature_cols (columns absent from the frame become NaN).
    X_day = day.assign(**{c: lag_df[c] for c in lag_df.columns}).reindex(columns=feature_cols)

    # Predict the whole day at once; ravel + tolist yields plain Python floats.
    yhat_day = np.asarray(final_model.predict(Pool(X_day, cat_features=cat_idx)), dtype=float).ravel()

    for st, loc, yhat in zip(stations, day["Localidad"].tolist(), yhat_day.tolist()):
        pred_rows.append({"Date": d, "Estacion": st, "Localidad": loc, "PM25_pred": yhat})
        # Advance the buffer for the next day.
        hist_buffers[st].append(yhat)

# Explicit columns keep the frame well-formed (and sortable) even when no
# station could be predicted.
pred_df = (pd.DataFrame(pred_rows, columns=["Date","Estacion","Localidad","PM25_pred"])
           .sort_values(["Estacion","Date"])
           .reset_index(drop=True))

# -------- 7) Outputs per station and per locality --------
# Locality-level series: median of the station predictions per date and Localidad.
_loc_median = pred_df.groupby(["Date","Localidad"])["PM25_pred"].median()
pred_loc = _loc_median.rename("PM25_pred_mediana").reset_index()

# Write both tables to CSV without the index column.
for _frame, _path in (
    (pred_df, "proyecciones_pm25_2025_2026_por_estacion.csv"),
    (pred_loc, "proyecciones_pm25_2025_2026_por_localidad.csv"),
):
    _frame.to_csv(_path, index=False)

print("✅ Guardados: proyecciones_pm25_2025_2026_por_estacion.csv, proyecciones_pm25_2025_2026_por_localidad.csv")
display(pred_df.head(), pred_loc.head())
