## **Ingeniería de Variables**

In [None]:
# --- Importaciones ---
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 200)

# Utilidad: reindexar por día dentro de cada estación para ventanas móviles estrictas
def reindex_daily_per_station(df, station_col='Estacion', date_col='Date'):
    """Put every station on a gap-free daily calendar.

    For each station the rows are re-indexed onto a daily ``pd.date_range``
    spanning that station's first to last date. Days that were absent become
    rows whose columns are NaN, except ``station_col`` (re-filled with the
    station name) and ``date_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``station_col`` and ``date_col``; dates must be unique
        within a station (``reindex`` rejects duplicate labels).
    station_col, date_col : str
        Names of the station and date columns.

    Returns
    -------
    pandas.DataFrame
        Stations concatenated in order of first appearance, with a fresh
        RangeIndex and ``date_col`` as the first column. An input without any
        usable row yields an empty frame with that same column layout.
    """
    # Rows without a valid date cannot be placed on the calendar; dropping them
    # up front also avoids duplicate NaT labels breaking the reindex below.
    df = df[df[date_col].notna()]

    out = []
    for est, dfg in df.groupby(station_col, sort=False):
        dfg = dfg.sort_values(date_col)
        idx = pd.date_range(dfg[date_col].min(), dfg[date_col].max(), freq='D')
        dfg = dfg.set_index(date_col).reindex(idx).rename_axis(date_col).reset_index()
        # The inserted days have no station name after the reindex
        dfg[station_col] = est
        out.append(dfg)

    if not out:
        # pd.concat([]) raises; return an empty frame shaped like a normal result
        ordered_cols = [date_col] + [c for c in df.columns if c != date_col]
        return df.reindex(columns=ordered_cols).reset_index(drop=True)
    return pd.concat(out, ignore_index=True)


In [None]:
# Raw-file URL of the Excel workbook hosted in the project's GitHub repository.
url = "https://raw.githubusercontent.com/Sxmuu/TG-Samuel-P/main/Databases/Contam/Final/df_final.xlsx"

# Read the workbook straight from the URL with the openpyxl engine.
df = pd.read_excel(url, engine="openpyxl")  # install openpyxl if it is missing

In [None]:
# --- 1) Validate and normalize the loaded frame ---
# Reuse the frame read from `url` above. It must not be re-read from
# `DATA_PATH`: that name is not defined anywhere in this notebook, so a
# fresh "Restart & Run All" would stop here with a NameError.
expected_cols = ['Date','Estacion','Localidad','PM25','lat','lon','Altitud',
                 'Rad','Precip','Hum','Temp','WindSpeed']
missing = [c for c in expected_cols if c not in df.columns]
if missing:
    raise ValueError(f"Faltan columnas esperadas: {missing}")

# Unparseable dates become NaT instead of raising
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
# One row per (station, day), in chronological order within each station
df = df.sort_values(['Estacion','Date']).drop_duplicates(subset=['Estacion','Date']).reset_index(drop=True)

# Coverage per station: first day, last day and number of distinct days
print(df[['Estacion','Date']].groupby('Estacion').agg(['min','max','nunique']).head())


                                 Date                   
                                  min        max nunique
Estacion                                                
Centro De Alto Rendimiento 2021-01-01 2024-12-30    1452
Ciudad Bolivar             2021-01-01 2024-12-30    1460
Fontibon                   2021-01-01 2024-12-30    1460
Jazmin                     2021-01-01 2024-12-30    1460
Kennedy                    2021-01-01 2024-12-30    1416


In [None]:
# --- 2) (Optional) Reindex to a daily grid per station ---
# If every station is already known to have one observation per day, this step can be skipped.
# Otherwise it guarantees rolling windows of exact length (missing days are inserted as NaN rows).
df = reindex_daily_per_station(df, station_col='Estacion', date_col='Date')


In [None]:
# --- 2b) Dtypes and chronological order ---
# Parse dates, sort by station and day, renumber the rows, then drop any
# repeated (station, day) pair that may still be present.
df = (
    df.assign(Date=pd.to_datetime(df['Date'], errors='coerce'))
      .sort_values(['Estacion', 'Date'])
      .reset_index(drop=True)
      .drop_duplicates(subset=['Estacion', 'Date'])
)

# Coverage per station after the daily reindex
station_spans = df[['Estacion', 'Date']].groupby('Estacion').agg(['min', 'max', 'nunique'])
print(station_spans.head())

                                 Date                   
                                  min        max nunique
Estacion                                                
Centro De Alto Rendimiento 2021-01-01 2024-12-30    1460
Ciudad Bolivar             2021-01-01 2024-12-30    1460
Fontibon                   2021-01-01 2024-12-30    1460
Jazmin                     2021-01-01 2024-12-30    1460
Kennedy                    2021-01-01 2024-12-30    1460


In [None]:
# --- 3) Calendar and seasonality ---
# Each entry is evaluated in order, so later ones can use columns created by
# earlier ones (dow -> is_weekend, dayofyear -> sin/cos encoding).
df = df.assign(
    year=lambda d: d['Date'].dt.year,
    month=lambda d: d['Date'].dt.month,
    dayofyear=lambda d: d['Date'].dt.dayofyear,
    dow=lambda d: d['Date'].dt.dayofweek,
    # dayofweek is 0=Monday ... 6=Sunday, so 5 and 6 are the weekend
    is_weekend=lambda d: (d['dow'] >= 5).astype(int),
    # Cyclic encoding of the day of year with a 365.25-day period
    sin_doy=lambda d: np.sin(2*np.pi*d['dayofyear']/365.25),
    cos_doy=lambda d: np.cos(2*np.pi*d['dayofyear']/365.25),
)

In [None]:
# --- 4) Funciones de lags y rolling (sin fuga) ---
def add_lags(df, group_key, date_col, vars_to_lag, lags):
    """Append ``<var>_lag<k>`` columns holding each variable shifted k rows back.

    The frame is copied and sorted by ``group_key`` then ``date_col``; shifts
    are applied inside each group, so a lag never reads another group's rows.
    A lag counts rows, not calendar days.

    Returns the sorted copy with the new columns; ``df`` is left untouched.
    """
    ordered = df.sort_values([group_key, date_col]).copy()
    group_labels = ordered[group_key]
    for name, step in [(v, k) for v in vars_to_lag for k in lags]:
        ordered[f'{name}_lag{step}'] = (
            ordered[name].groupby(group_labels, sort=False).shift(step)
        )
    return ordered

def add_rolling_features(df, group_key, date_col, var, windows, stats=('mean',), shift_one=True):
    """Append per-group rolling statistics of ``var``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied and sorted by ``group_key`` then ``date_col``.
    group_key, date_col, var : str
        Grouping column, ordering column and the column to summarise.
    windows : iterable of int
        Window lengths, counted in rows.
    stats : iterable of str
        Any of ``'mean'``, ``'std'``, ``'min'``, ``'max'``; other names are ignored.
    shift_one : bool
        If True the series is shifted one row back inside each group before
        rolling, so the window ends on the previous row and never contains
        the current value.

    Returns
    -------
    pandas.DataFrame
        The sorted copy with one ``<var>_roll<stat><w>`` column per requested
        statistic and window. The first rows of each group are NaN until a
        full window is available.

    Notes
    -----
    Shift and rolling both run inside each group via ``transform``: a window
    never spans two groups (also when ``shift_one=False``), and the result
    keeps the frame's own index, so it is assigned to the correct rows
    whatever index the caller's frame carries.
    """
    df = df.sort_values([group_key, date_col]).copy()
    grouped = df.groupby(group_key, sort=False)[var]
    for w in windows:
        # Fixed order keeps the output columns in mean, std, min, max order
        for stat in ('mean', 'std', 'min', 'max'):
            if stat not in stats:
                continue

            def _roll(s, w=w, stat=stat):
                src = s.shift(1) if shift_one else s
                return getattr(src.rolling(w), stat)()

            df[f'{var}_roll{stat}{w}'] = grouped.transform(_roll)
    return df


In [None]:
# --- 5) PM25 lags and rolling means (key for 2026) ---
lags_pm25 = [1, 3, 7]
wins_pm25 = [3, 7]

# Lags first, then rolling means computed on the series shifted by one row,
# so no feature contains the current day's PM25.
df_feat = (
    df.pipe(add_lags, group_key='Estacion', date_col='Date',
            vars_to_lag=['PM25'], lags=lags_pm25)
      .pipe(add_rolling_features, group_key='Estacion', date_col='Date',
            var='PM25', windows=wins_pm25, stats=('mean',), shift_one=True)
)


In [None]:
# --- 6) (Optional) Meteorology lags (usable if 2026 meteorology is available or climatology will be used)
# Toggle for adding the 1- and 3-row lags of every meteorological variable.
include_meteo_lags = True
meteo_vars = ['Temp','Hum','WindSpeed','Precip','Rad']
if include_meteo_lags:
    df_feat = add_lags(df_feat, group_key='Estacion', date_col='Date',
                       vars_to_lag=meteo_vars, lags=[1, 3])


In [None]:
# --- 8) Remove edge rows left as NaN by the lag/rolling features ---
# A row is kept only when both the 1-row lag and the 3-row rolling mean exist.
required_feats = ['PM25_lag1', 'PM25_rollmean3']
rows_before = df_feat.shape[0]
has_required = df_feat[required_feats].notna().all(axis=1)
df_model = df_feat[has_required].reset_index(drop=True)
rows_after = df_model.shape[0]
print(f"Filas antes: {rows_before:,} | después de dropna: {rows_after:,} | perdidas: {rows_before - rows_after:,}")


Filas antes: 17,520 | después de dropna: 17,374 | perdidas: 146


In [None]:
# --- 9) Final columns for modelling (no training yet) ---
base_cols = ['Date','Estacion','Localidad','lat','lon','Altitud','PM25',
             'year','month','dayofyear','dow','is_weekend','sin_doy','cos_doy']

# str.startswith accepts a tuple of prefixes: PM25 lags and rolling means
lag_cols = [c for c in df_model.columns if c.startswith(('PM25_lag', 'PM25_rollmean'))]
# Lag columns of every meteorological variable
meteo_lag_prefixes = tuple(v + '_lag' for v in meteo_vars)
met_lag_cols = [c for c in df_model.columns if c.startswith(meteo_lag_prefixes)]

cols_for_next_steps = [*base_cols, *lag_cols, *met_lag_cols]
df_ready = df_model.loc[:, cols_for_next_steps].copy()

print("Columnas finales (primeras 25):")
print(list(df_ready.columns[:25]), '...')
df_ready.head()


Columnas finales (primeras 25):
['Date', 'Estacion', 'Localidad', 'lat', 'lon', 'Altitud', 'PM25', 'year', 'month', 'dayofyear', 'dow', 'is_weekend', 'sin_doy', 'cos_doy', 'PM25_lag1', 'PM25_lag3', 'PM25_lag7', 'PM25_rollmean3', 'PM25_rollmean7', 'Temp_lag1', 'Temp_lag3', 'Hum_lag1', 'Hum_lag3', 'WindSpeed_lag1', 'WindSpeed_lag3'] ...


Unnamed: 0,Date,Estacion,Localidad,lat,lon,Altitud,PM25,year,month,dayofyear,dow,is_weekend,sin_doy,cos_doy,PM25_lag1,PM25_lag3,PM25_lag7,PM25_rollmean3,PM25_rollmean7,Temp_lag1,Temp_lag3,Hum_lag1,Hum_lag3,WindSpeed_lag1,WindSpeed_lag3,Precip_lag1,Precip_lag3,Rad_lag1,Rad_lag3
0,2021-01-04,Centro De Alto Rendimiento,Barrios Unidos,4.65847,-74.08396,2552.0,8.54,2021,1,4,0,0,0.068755,0.997634,6.62,12.62,,8.58,,18.16,19.17,84.79,79.44,0.77,0.83,11.61,4.03,15.49,14.7
1,2021-01-05,Centro De Alto Rendimiento,Barrios Unidos,4.65847,-74.08396,2552.0,13.88,2021,1,5,1,0,0.085906,0.996303,8.54,6.5,,7.22,,18.37,19.64,81.29,77.8,1.17,0.75,12.97,3.9,21.57,15.42
2,2021-01-06,Centro De Alto Rendimiento,Barrios Unidos,4.65847,-74.08396,2552.0,15.21,2021,1,6,2,0,0.103031,0.994678,13.88,6.62,,9.68,,17.87,18.16,78.17,84.79,0.99,0.77,2.28,11.61,20.3,15.49
3,2021-01-07,Centro De Alto Rendimiento,Barrios Unidos,4.65847,-74.08396,2552.0,8.58,2021,1,7,3,0,0.120126,0.992759,15.21,8.54,,12.543333,,17.92,18.37,77.14,81.29,0.68,1.17,1.89,12.97,19.02,21.57
4,2021-01-08,Centro De Alto Rendimiento,Barrios Unidos,4.65847,-74.08396,2552.0,12.44,2021,1,8,4,0,0.137185,0.990545,8.58,13.88,12.62,12.556667,10.278571,18.81,17.87,75.35,78.17,0.66,0.99,3.63,2.28,17.3,20.3


In [None]:
# --- 10) Save the feature dataset ---
# Written to the current working directory, without the row index.
df_ready.to_csv('df_features_PM25_no_copollutants.csv', index=False)
print("✅ Guardado: df_features_PM25_no_copollutants.csv")


✅ Guardado: df_features_PM25_no_copollutants.csv


In [None]:
# --- Meteorological climatology per station and day-of-year (median) ---
years_hist = [2021, 2022, 2023, 2024]   # adjust if needed
meteo_vars = ['Temp','Hum','WindSpeed','Precip','Rad']

# Historical rows, tagged with their day of year
df_hist = (df.loc[df['year'].isin(years_hist)]
             .assign(doy=lambda d: d['Date'].dt.dayofyear))

# Median of every meteorological variable per (station, day of year)
clima = (df_hist.groupby(['Estacion','doy'])[meteo_vars]
         .median()
         .add_suffix('_clim')
         .reset_index())

# 2026 calendar, tagged with the day of year used as the join key
cal2026 = pd.date_range('2026-01-01','2026-12-31',freq='D')
cal = pd.DataFrame({'Date': cal2026})
cal['doy'] = cal['Date'].dt.dayofyear

# Every 2026 day paired with every station (constant-key cross join), then the
# climatology attached by station and day of year
ests = df['Estacion'].dropna().unique()
station_frame = pd.DataFrame({'Estacion': ests, 'key': 1})
clima2026 = pd.merge(cal.assign(key=1), station_frame, on='key').drop(columns='key')
clima2026 = clima2026.merge(clima, on=['Estacion','doy'], how='left')

clima2026.to_csv('climatologia_meteo_2026_por_estacion.csv', index=False)
print("✅ Guardado: climatologia_meteo_2026_por_estacion.csv (medianas por DOY y estación)")


✅ Guardado: climatologia_meteo_2026_por_estacion.csv (medianas por DOY y estación)


## **Validación Cruzada**

In [None]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error



# Work on a fresh copy of the feature table, with Date guaranteed to be datetime
df = df_ready.assign(Date=lambda d: pd.to_datetime(d['Date'], errors='coerce'))

# Quick sanity check
print(df.shape)
print(df.loc[:, ['Date','Estacion','Localidad']].head(3))


(17374, 29)
        Date                    Estacion       Localidad
0 2021-01-04  Centro De Alto Rendimiento  Barrios Unidos
1 2021-01-05  Centro De Alto Rendimiento  Barrios Unidos
2 2021-01-06  Centro De Alto Rendimiento  Barrios Unidos


In [None]:
# Base columns (adjust if the names were changed)
base_cols = ['Date','Estacion','Localidad','lat','lon','Altitud','PM25',
             'year','month','dayofyear','dow','is_weekend','sin_doy','cos_doy']

# PM25 lags / rolling means built in step 1
lag_cols = [c for c in df.columns if c.startswith(('PM25_lag', 'PM25_rollmean'))]
# Meteorology lags, if they were added in step 1
met_vars = ['Temp','Hum','WindSpeed','Precip','Rad']
met_lag_prefixes = tuple(v + '_lag' for v in met_vars)
met_lag_cols = [c for c in df.columns if c.startswith(met_lag_prefixes)]

# Model inputs, without co-pollutants: categorical ids, static site
# attributes, calendar terms, then the lag features
cat_features = ['Estacion','Localidad']
site_and_calendar = ['lat','lon','Altitud',
                     'year','month','dayofyear','dow','is_weekend','sin_doy','cos_doy']
feature_cols = cat_features + site_and_calendar + lag_cols + met_lag_cols
num_features = [c for c in feature_cols if c not in cat_features]

# Keep only rows where every feature and the target are present (lag edges drop out)
data = df.dropna(subset=[*feature_cols, 'PM25']).copy()

X = data.loc[:, feature_cols].copy()
y = data['PM25'].values
dates = data['Date'].copy()
stations = data['Estacion'].copy()

# Preprocessing: one-hot encode the categoricals, pass numeric columns through
pre = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_features),
    ('num', 'passthrough', num_features),
])


In [None]:
# Gradient-boosting settings; the fixed seed keeps runs repeatable
hgb_settings = dict(
    learning_rate=0.06,
    max_iter=400,
    min_samples_leaf=25,
    early_stopping=True,
    random_state=42,
)
model = HistGradientBoostingRegressor(**hgb_settings)

# Preprocessing followed by the regressor, fitted as a single estimator
pipe = Pipeline(steps=[('pre', pre), ('model', model)])


In [None]:
def build_time_folds(unique_dates, n_folds=4):
    """Build forward-chaining (expanding-window) folds over a set of dates.

    The sorted dates are cut into ``n_folds + 1`` blocks of
    ``len(unique_dates) // (n_folds + 1)`` dates each. Fold ``k`` trains on
    the first ``k`` blocks and validates on block ``k + 1``. Dates left over
    after the last full block are not used by any fold.

    Parameters
    ----------
    unique_dates : iterable
        Distinct, sortable dates.
    n_folds : int
        Number of folds; must be at least 1.

    Returns
    -------
    list of dict
        One dict per fold with the arrays ``"train_dates"`` and ``"val_dates"``.

    Raises
    ------
    ValueError
        If ``n_folds < 1`` or there are fewer than ``n_folds + 1`` dates, which
        would otherwise produce empty folds that only fail later on.
    """
    if n_folds < 1:
        raise ValueError("n_folds must be >= 1")
    unique_dates = np.array(sorted(unique_dates))
    val_block = len(unique_dates) // (n_folds + 1)
    if val_block == 0:
        raise ValueError(
            f"Need at least {n_folds + 1} unique dates for {n_folds} folds, "
            f"got {len(unique_dates)}"
        )
    folds = []
    for k in range(1, n_folds + 1):
        train_end = k * val_block
        folds.append({
            "train_dates": unique_dates[:train_end],
            "val_dates": unique_dates[train_end:train_end + val_block],
        })
    return folds

# Distinct calendar days (time of day stripped) present in the modelling rows, ascending.
unique_days = np.array(sorted(dates.dt.normalize().unique()))
folds = build_time_folds(unique_days, n_folds=4)
# Cell output: (train start, train end, validation start, validation end) of every fold.
[(f["train_dates"].min(), f["train_dates"].max(), f["val_dates"].min(), f["val_dates"].max()) for f in folds]


[(Timestamp('2021-01-08 00:00:00'),
  Timestamp('2021-10-24 00:00:00'),
  Timestamp('2021-10-25 00:00:00'),
  Timestamp('2022-08-10 00:00:00')),
 (Timestamp('2021-01-08 00:00:00'),
  Timestamp('2022-08-10 00:00:00'),
  Timestamp('2022-08-11 00:00:00'),
  Timestamp('2023-05-27 00:00:00')),
 (Timestamp('2021-01-08 00:00:00'),
  Timestamp('2023-05-27 00:00:00'),
  Timestamp('2023-05-28 00:00:00'),
  Timestamp('2024-03-12 00:00:00')),
 (Timestamp('2021-01-08 00:00:00'),
  Timestamp('2024-03-12 00:00:00'),
  Timestamp('2024-03-13 00:00:00'),
  Timestamp('2024-12-27 00:00:00'))]

In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
import sklearn
print("scikit-learn versión:", sklearn.__version__)

def rmse_compat(y_true, y_pred):
    """Root-mean-squared error that behaves the same on every scikit-learn release.

    The square root is taken explicitly instead of probing
    ``mean_squared_error(..., squared=False)``: that keyword was deprecated in
    scikit-learn 1.4 and removed in 1.6, so probing it warns on 1.4/1.5 and
    always falls through to this computation on newer releases.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


scikit-learn versión: 1.6.1


In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

def rmse_compat(y_true, y_pred):
    """Root-mean-squared error that behaves the same on every scikit-learn release.

    The square root is taken explicitly instead of probing
    ``mean_squared_error(..., squared=False)``: that keyword was deprecated in
    scikit-learn 1.4 and removed in 1.6, so probing it warns on 1.4/1.5 and
    always falls through to this computation on newer releases.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


def eval_by_station(y_true, y_pred, stations):
    """Return MAE, RMSE and row count per station, sorted by ascending RMSE.

    The three arguments are aligned element by element: observed values,
    predictions and the station label of each row.
    """
    scored = pd.DataFrame({"y": y_true, "yhat": y_pred, "Estacion": stations})
    rows = [
        {
            "Estacion": name,
            "MAE": mean_absolute_error(part["y"], part["yhat"]),
            "RMSE": rmse_compat(part["y"], part["yhat"]),
            "n": len(part),
        }
        for name, part in scored.groupby("Estacion", sort=False)
    ]
    return pd.DataFrame(rows).sort_values("RMSE")



cv_rows = []
per_station_reports = []

# Calendar day of every modelling row, computed once for all folds
day_of_row = dates.dt.normalize()

for i, f in enumerate(folds, start=1):
    tr_mask = day_of_row.isin(f["train_dates"])
    va_mask = day_of_row.isin(f["val_dates"])

    X_tr, X_va = X[tr_mask], X[va_mask]
    y_tr, y_va = y[tr_mask], y[va_mask]
    st_va = stations[va_mask].values

    # Refit the whole pipeline on this fold's training window
    pipe.fit(X_tr, y_tr)
    pred_va = pipe.predict(X_va)

    mae = mean_absolute_error(y_va, pred_va)
    rmse = rmse_compat(y_va, pred_va)

    # Actual date span covered by the rows of each split
    train_span, val_span = dates[tr_mask], dates[va_mask]
    cv_rows.append({
        "fold": i,
        "train_start": str(train_span.min().date()),
        "train_end":   str(train_span.max().date()),
        "val_start":   str(val_span.min().date()),
        "val_end":     str(val_span.max().date()),
        "n_train": int(tr_mask.sum()),
        "n_val": int(va_mask.sum()),
        "MAE": mae,
        "RMSE": rmse
    })

    # Per-station breakdown, tagged with the fold number
    rep = eval_by_station(y_va, pred_va, st_va)
    rep.insert(0, 'fold', i)
    per_station_reports.append(rep)

cv_table = pd.DataFrame(cv_rows)
per_station_table = pd.concat(per_station_reports, ignore_index=True)

display(cv_table.round(3))
display(per_station_table.round(3))

cv_table.to_csv("cv_temporal_global.csv", index=False)
per_station_table.to_csv("cv_temporal_por_estacion.csv", index=False)
print("✅ Guardados: cv_temporal_global.csv, cv_temporal_por_estacion.csv")


Unnamed: 0,fold,train_start,train_end,val_start,val_end,n_train,n_val,MAE,RMSE
0,1,2021-01-08,2021-10-24,2021-10-25,2022-08-10,3440,3447,4.601,5.962
1,2,2021-01-08,2022-08-10,2022-08-11,2023-05-27,6887,3405,4.612,6.012
2,3,2021-01-08,2023-05-27,2023-05-28,2024-03-12,10292,3461,4.302,5.647
3,4,2021-01-08,2024-03-12,2024-03-13,2024-12-27,13753,3457,3.705,5.107


Unnamed: 0,fold,Estacion,MAE,RMSE,n
0,1,Usaquen,3.773,4.956,290
1,1,Suba,3.971,5.092,282
2,1,Centro De Alto Rendimiento,4.153,5.309,290
3,1,Las Ferias,4.176,5.425,265
4,1,San Cristobal,4.288,5.795,290
5,1,Fontibon,4.714,5.812,290
6,1,Kennedy,4.719,5.819,290
7,1,Puente Aranda,4.722,5.908,290
8,1,Jazmin,4.803,6.073,290
9,1,Tunal,5.046,6.567,290


✅ Guardados: cv_temporal_global.csv, cv_temporal_por_estacion.csv


In [None]:
# Temporal hold-out: fit on everything up to 2023, test on 2024
row_year = dates.dt.year
mask_train = row_year <= 2023
mask_test  = row_year == 2024

X_tr, X_te = X[mask_train], X[mask_test]
y_tr, y_te = y[mask_train], y[mask_test]
st_te = stations[mask_test].values

pipe.fit(X_tr, y_tr)
pred_te = pipe.predict(X_te)

mae_te, rmse_te = mean_absolute_error(y_te, pred_te), rmse_compat(y_te, pred_te)
print(f"Hold-out 2024 → MAE: {mae_te:.3f} | RMSE: {rmse_te:.3f} | n_test: {mask_test.sum()}")


# Per-station breakdown of the hold-out error
rep_te = eval_by_station(y_te, pred_te, st_te).round(3)
display(rep_te)

rep_te.to_csv("holdout2024_por_estacion.csv", index=False)
print("✅ Guardado: holdout2024_por_estacion.csv")

Hold-out 2024 → MAE: 4.496 | RMSE: 5.644 | n_test: 4348


Unnamed: 0,Estacion,MAE,RMSE,n
8,Suba,3.539,4.653,365
9,Tunal,3.773,4.7,365
7,San Cristobal,3.743,4.918,365
10,Usaquen,4.917,5.212,365
0,Centro De Alto Rendimiento,4.257,5.409,343
11,Usme,4.118,5.412,365
5,Las Ferias,4.42,5.719,355
3,Jazmin,4.682,5.909,365
2,Fontibon,4.665,6.029,365
6,Puente Aranda,5.532,6.035,365


✅ Guardado: holdout2024_por_estacion.csv


In [None]:
# Rebuild fold 4 exactly as in the cross-validation loop
fold = folds[3]  # 4th fold (index 3)
day_of_row = dates.dt.normalize()
tr_mask = day_of_row.isin(fold["train_dates"])
va_mask = day_of_row.isin(fold["val_dates"])

pipe.fit(X[tr_mask], y[tr_mask])
pred_va = pipe.predict(X[va_mask])

# Validation rows with their prediction and absolute error
df_va = (
    data.loc[va_mask, ['Date','Estacion','PM25']]
        .assign(yhat=pred_va)
        .assign(abs_err=lambda d: (d['PM25'] - d['yhat']).abs())
)

def mae_rmse(g):
    """Return row count, MAE and RMSE for a frame with ``PM25`` and ``yhat`` columns.

    Uses the notebook-level ``np``, ``pd`` and scikit-learn metric imports.
    RMSE is the explicit square root of the MSE, which works on every
    scikit-learn release (the ``squared`` keyword was removed in 1.6).
    """
    return pd.Series({
        'n': len(g),
        'MAE': mean_absolute_error(g['PM25'], g['yhat']),
        'RMSE': np.sqrt(mean_squared_error(g['PM25'], g['yhat'])),
    })

# Select the metric columns explicitly so the grouping column is not handed to
# `mae_rmse`: applying on the grouping column is deprecated in recent pandas
# and triggers a DeprecationWarning. The resulting table is unchanged.
print(df_va.groupby('Estacion')[['PM25', 'yhat']].apply(mae_rmse).round(3))

# Distribution of the absolute errors at Usaquen
usa = df_va[df_va['Estacion']=='Usaquen']
print(usa[['abs_err']].describe(percentiles=[.5,.9,.95,.99]).round(3).T)
print("Fechas con mayor error en Usaquen:\n", usa.nlargest(5,'abs_err')[['Date','PM25','yhat','abs_err']])


                                n    MAE   RMSE
Estacion                                       
Centro De Alto Rendimiento  277.0  4.131  5.327
Ciudad Bolivar              290.0  4.865  6.401
Fontibon                    290.0  4.489  5.853
Jazmin                      290.0  4.606  5.865
Kennedy                     290.0  4.919  6.366
Las Ferias                  280.0  4.118  5.457
Puente Aranda               290.0  1.575  2.998
San Cristobal               290.0  3.565  4.643
Suba                        290.0  3.386  4.534
Tunal                       290.0  3.835  4.782
Usaquen                     290.0  0.950  1.417
Usme                        290.0  4.059  5.377
         count  mean    std    min    50%    90%    95%    99%    max
abs_err  290.0  0.95  1.053  0.001  0.636  2.202  2.582  4.614  7.958
Fechas con mayor error en Usaquen:
             Date   PM25       yhat   abs_err
15912 2024-12-26   6.75  14.708379  7.958379
15913 2024-12-27   0.27   7.620862  7.350862
15910 2024-12-24 

  print(df_va.groupby('Estacion').apply(mae_rmse).round(3))


## **CatBoost**

In [None]:
# ==== 0) Setup: instalar CatBoost (si hiciera falta) ====
!pip -q install catboost


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# ==== 1) Cargar librerías y datos ====
import json, math, random
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool

# Compatibilidad de métricas (RMSE con y sin 'squared')
from sklearn.metrics import mean_absolute_error, mean_squared_error
def rmse_compat(y_true, y_pred):
    """Root-mean-squared error that behaves the same on every scikit-learn release.

    The square root is taken explicitly instead of probing
    ``mean_squared_error(..., squared=False)``: that keyword was deprecated in
    scikit-learn 1.4 and removed in 1.6, so probing it warns on 1.4/1.5 and
    always falls through to this computation on newer releases.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Categorical columns that are actually present are typed as plain strings
present_cats = [c for c in ('Estacion', 'Localidad') if c in df_ready.columns]
df = (
    df_ready.assign(Date=lambda d: pd.to_datetime(d['Date'], errors='coerce'))
            .astype({c: str for c in present_cats})
)

print(df.shape)
df.head(2)


(17374, 29)


Unnamed: 0,Date,Estacion,Localidad,lat,lon,Altitud,PM25,year,month,dayofyear,dow,is_weekend,sin_doy,cos_doy,PM25_lag1,PM25_lag3,PM25_lag7,PM25_rollmean3,PM25_rollmean7,Temp_lag1,Temp_lag3,Hum_lag1,Hum_lag3,WindSpeed_lag1,WindSpeed_lag3,Precip_lag1,Precip_lag3,Rad_lag1,Rad_lag3
0,2021-01-04,Centro De Alto Rendimiento,Barrios Unidos,4.65847,-74.08396,2552.0,8.54,2021,1,4,0,0,0.068755,0.997634,6.62,12.62,,8.58,,18.16,19.17,84.79,79.44,0.77,0.83,11.61,4.03,15.49,14.7
1,2021-01-05,Centro De Alto Rendimiento,Barrios Unidos,4.65847,-74.08396,2552.0,13.88,2021,1,5,1,0,0.085906,0.996303,8.54,6.5,,7.22,,18.37,19.64,81.29,77.8,1.17,0.75,12.97,3.9,21.57,15.42


In [None]:
# ==== 2) Define features and target (no co-pollutants) ====
base_cols = ['Date','Estacion','Localidad','lat','lon','Altitud',
             'year','month','dayofyear','dow','is_weekend','sin_doy','cos_doy']

# PM25 lags and rolling means
lag_cols = [c for c in df.columns if c.startswith(('PM25_lag', 'PM25_rollmean'))]

# Lags of the meteorological variables
met_vars = ['Temp','Hum','WindSpeed','Precip','Rad']
met_lag_prefixes = tuple(v + '_lag' for v in met_vars)
met_lag_cols = [c for c in df.columns if c.startswith(met_lag_prefixes)]

# Target
target_col = 'PM25'

# Final X columns (Date is not a model input)
cat_cols = ['Estacion','Localidad']
site_and_calendar = ['lat','lon','Altitud',
                     'year','month','dayofyear','dow','is_weekend','sin_doy','cos_doy']
feature_cols = cat_cols + site_and_calendar + lag_cols + met_lag_cols

# Rows with any missing feature or target are discarded
data = df.dropna(subset=[*feature_cols, target_col]).copy()
X = data.loc[:, feature_cols].copy()
y = data[target_col].values
dates = data['Date'].copy()
stations = data['Estacion'].copy()

# Positions of the categorical columns inside X, as CatBoost expects
cat_idx = [X.columns.get_loc(c) for c in cat_cols if c in X.columns]

len(feature_cols), feature_cols[:8], cat_idx


(27,
 ['Estacion',
  'Localidad',
  'lat',
  'lon',
  'Altitud',
  'year',
  'month',
  'dayofyear'],
 [0, 1])

In [None]:
# ==== 3) Construir los mismos folds temporales (ventana expansiva) ====
def build_time_folds(unique_dates, n_folds=4):
    """Build forward-chaining (expanding-window) folds over a set of dates.

    The sorted dates are cut into ``n_folds + 1`` blocks of
    ``len(unique_dates) // (n_folds + 1)`` dates each. Fold ``k`` trains on
    the first ``k`` blocks and validates on block ``k + 1``. Dates left over
    after the last full block are not used by any fold.

    Returns a list with one dict per fold holding the arrays
    ``"train_dates"`` and ``"val_dates"``.

    Raises ValueError if ``n_folds < 1`` or there are fewer than
    ``n_folds + 1`` dates, which would otherwise produce empty folds that only
    fail later on.
    """
    if n_folds < 1:
        raise ValueError("n_folds must be >= 1")
    unique_dates = np.array(sorted(unique_dates))
    val_block = len(unique_dates) // (n_folds + 1)
    if val_block == 0:
        raise ValueError(
            f"Need at least {n_folds + 1} unique dates for {n_folds} folds, "
            f"got {len(unique_dates)}"
        )
    folds = []
    for k in range(1, n_folds + 1):
        train_end = k * val_block
        folds.append({
            "train_dates": unique_dates[:train_end],
            "val_dates": unique_dates[train_end:train_end + val_block],
        })
    return folds

# Distinct calendar days (time of day stripped) present in the modelling rows, ascending.
unique_days = np.array(sorted(dates.dt.normalize().unique()))
folds = build_time_folds(unique_days, n_folds=4)

# Cell output: (train start, train end, validation start, validation end) of every fold.
[(f["train_dates"].min(), f["train_dates"].max(), f["val_dates"].min(), f["val_dates"].max()) for f in folds]


[(Timestamp('2021-01-08 00:00:00'),
  Timestamp('2021-10-24 00:00:00'),
  Timestamp('2021-10-25 00:00:00'),
  Timestamp('2022-08-10 00:00:00')),
 (Timestamp('2021-01-08 00:00:00'),
  Timestamp('2022-08-10 00:00:00'),
  Timestamp('2022-08-11 00:00:00'),
  Timestamp('2023-05-27 00:00:00')),
 (Timestamp('2021-01-08 00:00:00'),
  Timestamp('2023-05-27 00:00:00'),
  Timestamp('2023-05-28 00:00:00'),
  Timestamp('2024-03-12 00:00:00')),
 (Timestamp('2021-01-08 00:00:00'),
  Timestamp('2024-03-12 00:00:00'),
  Timestamp('2024-03-13 00:00:00'),
  Timestamp('2024-12-27 00:00:00'))]

In [None]:
# ==== 4) Función de evaluación para un set de hiperparámetros ====
def eval_params(params, verbose=False, early_stop_frac=0.1):
    """Score one CatBoost hyper-parameter set on the forward-chaining folds.

    Reads the notebook-level ``folds``, ``dates``, ``X``, ``y`` and ``cat_idx``.

    Early stopping and best-iteration selection use the most recent
    ``early_stop_frac`` share of each fold's *training* days. The validation
    block is only ever predicted, never shown to ``fit``; using it as
    ``eval_set`` would let the model pick its iteration count on the very rows
    it is scored on and make the CV metrics optimistic.

    Parameters
    ----------
    params : dict
        Hyper-parameters; missing keys fall back to the defaults below.
    verbose : bool or int
        Passed to ``CatBoostRegressor.fit``.
    early_stop_frac : float
        Share of the training days (taken from the end of the training
        window) held out for early stopping; strictly between 0 and 1.

    Returns
    -------
    dict
        ``{"mae_mean", "rmse_mean", "folds"}`` where ``"folds"`` lists the
        per-fold MAE, RMSE and validation row count.
    """
    if not 0 < early_stop_frac < 1:
        raise ValueError("early_stop_frac must be strictly between 0 and 1")

    day_of_row = dates.dt.normalize()
    fold_results = []
    for i, f in enumerate(folds, start=1):
        train_days = f["train_dates"]  # sorted ascending by build_time_folds
        if len(train_days) < 2:
            raise ValueError(f"Fold {i} has too few training days for early stopping")
        # At least one day for stopping, and at least one day left for fitting
        n_stop = min(max(1, int(round(len(train_days) * early_stop_frac))),
                     len(train_days) - 1)
        fit_mask = day_of_row.isin(train_days[:-n_stop])
        stop_mask = day_of_row.isin(train_days[-n_stop:])
        va_mask = day_of_row.isin(f["val_dates"])

        X_va, y_va = X[va_mask], y[va_mask]

        fit_pool = Pool(X[fit_mask], y[fit_mask], cat_features=cat_idx)
        stop_pool = Pool(X[stop_mask], y[stop_mask], cat_features=cat_idx)
        valid_pool = Pool(X_va, y_va, cat_features=cat_idx)

        model = CatBoostRegressor(
            loss_function='RMSE',
            iterations=params.get('iterations', 2000),
            depth=params.get('depth', 7),
            learning_rate=params.get('learning_rate', 0.06),
            l2_leaf_reg=params.get('l2_leaf_reg', 3.0),
            bootstrap_type=params.get('bootstrap_type', 'Bayesian'),
            bagging_temperature=params.get('bagging_temperature', 1.0),
            random_strength=params.get('random_strength', 0.0),
            early_stopping_rounds=params.get('early_stopping_rounds', 100),
            random_seed=42,
            verbose=False,
            allow_writing_files=False
        )

        model.fit(fit_pool, eval_set=stop_pool, use_best_model=True, verbose=verbose)
        pred_va = model.predict(valid_pool)

        mae = mean_absolute_error(y_va, pred_va)
        rmse = rmse_compat(y_va, pred_va)

        fold_results.append({"fold": i, "MAE": mae, "RMSE": rmse, "n_val": int(va_mask.sum())})

    mae_mean = float(np.mean([r["MAE"] for r in fold_results]))
    rmse_mean = float(np.mean([r["RMSE"] for r in fold_results]))
    return {"mae_mean": mae_mean, "rmse_mean": rmse_mean, "folds": fold_results}


In [None]:
# ==== 5) Lightweight random search over hyper-parameters ====
# Seed Python's RNG so the sampled configurations are repeatable.
random.seed(42)
# Candidate values per hyper-parameter; the last two entries offer a single option each.
search_space = {
    "depth":       [5,6,7,8,9,10],
    "learning_rate": [0.02, 0.03, 0.04, 0.06, 0.08, 0.10],
    "l2_leaf_reg":  [1.0, 2.0, 3.0, 5.0, 7.0, 10.0],
    "bagging_temperature": [0.0, 0.5, 1.0, 2.0, 3.0, 5.0],
    "random_strength": [0.0, 0.1, 0.2, 0.5],
    "iterations":  [1500, 2000, 2500],
    "bootstrap_type": ['Bayesian'],
    "early_stopping_rounds": [100]
}

def sample_params(space):
    """Draw one random hyper-parameter configuration from ``space``.

    ``space`` maps each hyper-parameter name to a list of candidates. The six
    tuned parameters are always drawn with ``random.choice``, in a fixed
    order. ``bootstrap_type`` and ``early_stopping_rounds`` are read from
    ``space`` as well (defaulting to ``'Bayesian'`` and ``100`` when absent),
    so widening those entries takes effect.
    """
    def _pick(key, default):
        options = space.get(key, [default])
        # A single-valued entry is returned without touching the RNG, so the
        # configurations drawn for a given seed are unchanged.
        return options[0] if len(options) == 1 else random.choice(options)

    return {
        "depth": random.choice(space["depth"]),
        "learning_rate": random.choice(space["learning_rate"]),
        "l2_leaf_reg": random.choice(space["l2_leaf_reg"]),
        "bagging_temperature": random.choice(space["bagging_temperature"]),
        "random_strength": random.choice(space["random_strength"]),
        "iterations": random.choice(space["iterations"]),
        "bootstrap_type": _pick("bootstrap_type", 'Bayesian'),
        "early_stopping_rounds": _pick("early_stopping_rounds", 100),
    }

# One entry per trial: the CV summary returned by eval_params plus the sampled params.
results = []
N_TRIALS = 24  # can be raised if compute time allows
for t in range(1, N_TRIALS+1):
    params = sample_params(search_space)
    res = eval_params(params, verbose=False)
    res["params"] = params
    results.append(res)
    print(f"Trial {t}/{N_TRIALS} → RMSE_CV={res['rmse_mean']:.3f} | MAE_CV={res['mae_mean']:.3f} | {params}")

# Sort by RMSE (lower is better)
results_sorted = sorted(results, key=lambda r: r["rmse_mean"])
best = results_sorted[0]
print("\n=== MEJOR CONFIGURACIÓN (CV) ===")
print(json.dumps(best["params"], indent=2))
print("CV → RMSE promedio:", round(best["rmse_mean"],3), " | MAE promedio:", round(best["mae_mean"],3))
# Cell output: per-fold metrics of the best configuration
pd.DataFrame(best["folds"]).round(3)


Trial 1/24 → RMSE_CV=5.803 | MAE_CV=4.485 | {'depth': 10, 'learning_rate': 0.02, 'l2_leaf_reg': 1.0, 'bagging_temperature': 5.0, 'random_strength': 0.2, 'iterations': 1500, 'bootstrap_type': 'Bayesian', 'early_stopping_rounds': 100}
Trial 2/24 → RMSE_CV=5.550 | MAE_CV=4.252 | {'depth': 6, 'learning_rate': 0.03, 'l2_leaf_reg': 10.0, 'bagging_temperature': 0.0, 'random_strength': 0.0, 'iterations': 2500, 'bootstrap_type': 'Bayesian', 'early_stopping_rounds': 100}
Trial 3/24 → RMSE_CV=5.613 | MAE_CV=4.304 | {'depth': 8, 'learning_rate': 0.02, 'l2_leaf_reg': 1.0, 'bagging_temperature': 0.0, 'random_strength': 0.1, 'iterations': 1500, 'bootstrap_type': 'Bayesian', 'early_stopping_rounds': 100}
Trial 4/24 → RMSE_CV=5.669 | MAE_CV=4.384 | {'depth': 9, 'learning_rate': 0.08, 'l2_leaf_reg': 1.0, 'bagging_temperature': 3.0, 'random_strength': 0.1, 'iterations': 2500, 'bootstrap_type': 'Bayesian', 'early_stopping_rounds': 100}
Trial 5/24 → RMSE_CV=5.805 | MAE_CV=4.469 | {'depth': 10, 'learning_ra

Unnamed: 0,fold,MAE,RMSE,n_val
0,1,4.198,5.448,3447
1,2,4.419,5.757,3405
2,3,4.208,5.502,3461
3,4,3.941,5.185,3457


In [None]:
# ==== 6) Retrain with the best configuration (2021–2023) and evaluate on the 2024 hold-out ====
mask_train = (dates.dt.year <= 2023)
mask_test  = (dates.dt.year == 2024)

X_tr, y_tr = X[mask_train], y[mask_train]
X_te, y_te = X[mask_test],  y[mask_test]

train_pool = Pool(X_tr, y_tr, cat_features=cat_idx)
test_pool  = Pool(X_te, y_te, cat_features=cat_idx)

best_params = best["params"].copy()
final_model = CatBoostRegressor(
    loss_function='RMSE',
    iterations=best_params["iterations"],
    depth=best_params["depth"],
    learning_rate=best_params["learning_rate"],
    l2_leaf_reg=best_params["l2_leaf_reg"],
    # Read from the tuned configuration (falling back to the values the search
    # currently fixes) so a widened search space is honoured here as well.
    bootstrap_type=best_params.get("bootstrap_type", 'Bayesian'),
    bagging_temperature=best_params["bagging_temperature"],
    random_strength=best_params["random_strength"],
    early_stopping_rounds=best_params.get("early_stopping_rounds", 100),
    random_seed=42,
    verbose=False,
    allow_writing_files=False
)

# A small validation set (the last month of 2023) drives early stopping of
# final_model; rows before the cutoff are the ones actually fitted.
cutoff = pd.Timestamp('2023-12-01')
tr_in  = dates[mask_train] < cutoff
tr_val = (dates[mask_train] >= cutoff)

final_model.fit(
    Pool(X_tr[tr_in],  y_tr[tr_in],  cat_features=cat_idx),
    eval_set=Pool(X_tr[tr_val], y_tr[tr_val], cat_features=cat_idx),
    use_best_model=True, verbose=False
)

pred_te = final_model.predict(test_pool)
mae_te = mean_absolute_error(y_te, pred_te)
rmse_te = rmse_compat(y_te, pred_te)
print(f"Hold-out 2024 → MAE: {mae_te:.3f} | RMSE: {rmse_te:.3f} | n_test: {int(mask_test.sum())}")

# Save the 2024 predictions
out_te = data.loc[mask_test, ['Date','Estacion','Localidad',target_col]].copy()
out_te['yhat'] = pred_te
out_te.to_csv('predicciones_holdout2024_catboost.csv', index=False)

# Per-station metrics (2024). The metric columns are selected explicitly so
# the grouping column is not handed to the lambda: applying on the grouping
# column is deprecated in recent pandas and triggers a DeprecationWarning.
by_station = out_te.groupby('Estacion')[[target_col, 'yhat']].apply(
    lambda g: pd.Series({
        'n': len(g),
        'MAE': mean_absolute_error(g[target_col], g['yhat']),
        'RMSE': rmse_compat(g[target_col], g['yhat'])
    })
).reset_index().sort_values('RMSE')
by_station.round(3)


Hold-out 2024 → MAE: 4.749 | RMSE: 5.971 | n_test: 4348


  by_station = out_te.groupby('Estacion').apply(


Unnamed: 0,Estacion,n,MAE,RMSE
9,Tunal,365.0,3.433,4.255
8,Suba,365.0,3.486,4.724
7,San Cristobal,365.0,3.728,4.83
11,Usme,365.0,4.061,5.302
0,Centro De Alto Rendimiento,343.0,4.137,5.32
5,Las Ferias,355.0,4.277,5.758
3,Jazmin,365.0,4.629,5.886
2,Fontibon,365.0,4.602,6.015
1,Ciudad Bolivar,365.0,5.069,6.626
4,Kennedy,365.0,4.972,6.649


In [None]:
# ==== 7) Feature importances and model persistence ====
# NOTE(review): the earlier comment called this "Gain"; CatBoost documents the
# 'FeatureImportance' type as PredictionValuesChange for non-ranking losses — confirm.
fi = final_model.get_feature_importance(train_pool, type='FeatureImportance')
fi_df = (
    pd.DataFrame({'feature': X.columns, 'importance': fi})
      .sort_values('importance', ascending=False)
)
fi_df.to_csv('feature_importance_catboost.csv', index=False)

# Persist the fitted model and the hyper-parameters it was built with
final_model.save_model('catboost_pm25_model.cbm')
with open('best_params_catboost.json','w') as f:
    f.write(json.dumps(best_params, indent=2))

print("✅ Guardados: catboost_pm25_model.cbm, best_params_catboost.json, feature_importance_catboost.csv, predicciones_holdout2024_catboost.csv")
fi_df.head(15)


✅ Guardados: catboost_pm25_model.cbm, best_params_catboost.json, feature_importance_catboost.csv, predicciones_holdout2024_catboost.csv


Unnamed: 0,feature,importance
12,PM25_lag1,36.368084
11,cos_doy,6.572275
8,dow,5.965072
23,Precip_lag1,5.716305
21,WindSpeed_lag1,4.722066
7,dayofyear,3.113445
16,PM25_rollmean7,3.099722
25,Rad_lag1,3.064718
10,sin_doy,3.029246
3,lon,2.989964
