In [9]:
#vamos a probar modelos de ML
#leamos x_train, y_train, x_test, y_test
import pandas as pd
x_train = pd.read_csv("../data/processed/splits/x_train.csv")
y_train = pd.read_csv("../data/processed/splits/y_train.csv")
x_test = pd.read_csv("../data/processed/splits/x_test.csv")
y_test = pd.read_csv("../data/processed/splits/y_test.csv")




In [10]:
# ==============================
# Optuna para RandomForestRegressor (split temporal) + "pruning" pasivo
# ==============================
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

SEED = 42
VAL_FRAC = 0.20    # último 20% del train como valid
N_TRIALS = 60      # subí/bajá según tiempo

# --------- Split temporal (train -> train/valid ; test queda intacto) ---------
X_train_all = x_train.copy()
y_train_all = y_train.copy()
X_test = x_test.copy()
y_test = y_test.copy()

n = len(X_train_all)
n_val = int(np.floor(n * VAL_FRAC))
n_tr = n - n_val
X_tr, y_tr = X_train_all.iloc[:n_tr], y_train_all[:n_tr]
X_val, y_val = X_train_all.iloc[n_tr:], y_train_all[n_tr:]

print(f"Train: {X_tr.shape}, Valid: {X_val.shape}, Test: {X_test.shape}")

# --------- Función objetivo ---------
def objective(trial: optuna.Trial) -> float:
    # Espacio de búsqueda (robusto para RF sklearn)
    max_depth_choice = trial.suggest_categorical("max_depth", [None, 6, 10, 16, 24, 32])
    max_features_choice = trial.suggest_categorical("max_features", ["sqrt", "log2", 0.5, 0.7, 1.0])
    bootstrap_choice = trial.suggest_categorical("bootstrap", [True, False])

    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 2000),
        "criterion": "squared_error",
        "max_depth": max_depth_choice,                 # None = sin límite
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
        "max_features": max_features_choice,           # fracción o estrategia
        "bootstrap": bootstrap_choice,
        "random_state": SEED,
        "n_jobs": -1,
    }

    # Si se usa bootstrap, probar muestreo parcial de filas
    if bootstrap_choice:
        params["max_samples"] = trial.suggest_float("max_samples", 0.5, 1.0)

    # Modelo
    model = RandomForestRegressor(**params)

    # Entrenar y evaluar en VALID (métrica a minimizar: RMSE)
    model.fit(X_tr, y_tr)
    y_pred_val = model.predict(X_val)
    rmse_val = mean_squared_error(y_val, y_pred_val)

    # Registrar también como “valor intermedio” (no habrá pruning real, pero queda logueado)
    trial.report(rmse_val, step=0)
    # if trial.should_prune():   # en RF no habrá pasos intermedios útiles
    #     raise optuna.exceptions.TrialPruned()

    return rmse_val

# --------- Estudio Optuna ---------
study = optuna.create_study(
    direction="minimize",
    sampler=TPESampler(seed=SEED),
    pruner=MedianPruner(n_warmup_steps=5)  # no tendrá efecto real aquí, pero se deja por consistencia
)
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

print("\n=== Mejores hiperparámetros (VALID) ===")
print(study.best_params)
print(f"Mejor RMSE valid: {study.best_value:.4f}")

# --------- Re-entrenar con los mejores params (train+valid) ---------
best_params = study.best_params.copy()
best_model = RandomForestRegressor(**best_params, random_state=SEED, n_jobs=-1)

best_model.fit(X_train_all, y_train_all)

# --------- Evaluación en TEST ---------
y_pred_test = best_model.predict(X_test)
mse  = mean_squared_error(y_test, y_pred_test)
rmse = mean_squared_error(y_test, y_pred_test)
mae  = mean_absolute_error(y_test, y_pred_test)
r2   = r2_score(y_test, y_pred_test)

print("\n=== Métricas en TEST ===")
print(f"MSE : {mse:,.2f}")
print(f"RMSE: {rmse:,.2f}")
print(f"MAE : {mae:,.2f}")
print(f"R²  : {r2:.4f}")


[I 2025-11-16 23:05:19,974] A new study created in memory with name: no-name-f71a71cb-be2d-4f91-9a10-1d2eb8ac7707


Train: (667, 172), Valid: (166, 172), Test: (356, 172)


Best trial: 0. Best value: 8.70984e+07:   2%|▏         | 1/60 [00:00<00:29,  2.03it/s]

[I 2025-11-16 23:05:20,466] Trial 0 finished with value: 87098364.32751626 and parameters: {'max_depth': 6, 'max_features': 'log2', 'bootstrap': True, 'n_estimators': 582, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_samples': 0.6521211214797689}. Best is trial 0 with value: 87098364.32751626.


Best trial: 1. Best value: 2.76884e+07:   3%|▎         | 2/60 [00:02<01:11,  1.23s/it]

[I 2025-11-16 23:05:22,217] Trial 1 finished with value: 27688384.39069908 and parameters: {'max_depth': 16, 'max_features': 0.5, 'bootstrap': True, 'n_estimators': 1294, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_samples': 0.9744427686266666}. Best is trial 1 with value: 27688384.39069908.


Best trial: 1. Best value: 2.76884e+07:   5%|▌         | 3/60 [00:03<01:23,  1.46s/it]

[I 2025-11-16 23:05:23,939] Trial 2 finished with value: 374038608.11040074 and parameters: {'max_depth': None, 'max_features': 0.7, 'bootstrap': True, 'n_estimators': 1136, 'min_samples_split': 12, 'min_samples_leaf': 4, 'max_samples': 0.9847923138822793}. Best is trial 1 with value: 27688384.39069908.


Best trial: 1. Best value: 2.76884e+07:   7%|▋         | 4/60 [00:04<01:08,  1.22s/it]

[I 2025-11-16 23:05:24,792] Trial 3 finished with value: 210478228.15190402 and parameters: {'max_depth': 6, 'max_features': 0.7, 'bootstrap': True, 'n_estimators': 705, 'min_samples_split': 12, 'min_samples_leaf': 3, 'max_samples': 0.9010984903770198}. Best is trial 1 with value: 27688384.39069908.


Best trial: 4. Best value: 2.16282e+07:   8%|▊         | 5/60 [00:06<01:18,  1.43s/it]

[I 2025-11-16 23:05:26,598] Trial 4 finished with value: 21628156.109393902 and parameters: {'max_depth': 6, 'max_features': 0.5, 'bootstrap': False, 'n_estimators': 1322, 'min_samples_split': 8, 'min_samples_leaf': 2}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  10%|█         | 6/60 [00:10<01:54,  2.11s/it]

[I 2025-11-16 23:05:30,035] Trial 5 finished with value: 39222150.82959805 and parameters: {'max_depth': 24, 'max_features': 1.0, 'bootstrap': False, 'n_estimators': 970, 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  12%|█▏        | 7/60 [00:10<01:31,  1.72s/it]

[I 2025-11-16 23:05:30,956] Trial 6 finished with value: 101936008.27898385 and parameters: {'max_depth': 24, 'max_features': 'log2', 'bootstrap': False, 'n_estimators': 1655, 'min_samples_split': 14, 'min_samples_leaf': 18}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  13%|█▎        | 8/60 [00:12<01:24,  1.63s/it]

[I 2025-11-16 23:05:32,374] Trial 7 finished with value: 438151873.44276255 and parameters: {'max_depth': 32, 'max_features': 1.0, 'bootstrap': True, 'n_estimators': 1119, 'min_samples_split': 9, 'min_samples_leaf': 5, 'max_samples': 0.5599326836668415}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  15%|█▌        | 9/60 [00:13<01:14,  1.46s/it]

[I 2025-11-16 23:05:33,483] Trial 8 finished with value: 64547523.94070315 and parameters: {'max_depth': 6, 'max_features': 'sqrt', 'bootstrap': True, 'n_estimators': 1297, 'min_samples_split': 11, 'min_samples_leaf': 2, 'max_samples': 0.6393232321183058}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  17%|█▋        | 10/60 [00:14<01:00,  1.21s/it]

[I 2025-11-16 23:05:34,132] Trial 9 finished with value: 95495936.06333876 and parameters: {'max_depth': 24, 'max_features': 'log2', 'bootstrap': False, 'n_estimators': 1164, 'min_samples_split': 3, 'min_samples_leaf': 17}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  18%|█▊        | 11/60 [00:14<00:46,  1.04it/s]

[I 2025-11-16 23:05:34,511] Trial 10 finished with value: 803405930.6537734 and parameters: {'max_depth': 10, 'max_features': 0.5, 'bootstrap': False, 'n_estimators': 232, 'min_samples_split': 19, 'min_samples_leaf': 10}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  20%|██        | 12/60 [00:17<01:10,  1.48s/it]

[I 2025-11-16 23:05:37,171] Trial 11 finished with value: 815491664.4058796 and parameters: {'max_depth': 16, 'max_features': 0.5, 'bootstrap': False, 'n_estimators': 1721, 'min_samples_split': 7, 'min_samples_leaf': 9}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  22%|██▏       | 13/60 [00:18<01:12,  1.54s/it]

[I 2025-11-16 23:05:38,856] Trial 12 finished with value: 276604966.30583125 and parameters: {'max_depth': 16, 'max_features': 0.5, 'bootstrap': True, 'n_estimators': 1554, 'min_samples_split': 6, 'min_samples_leaf': 7, 'max_samples': 0.8157762107920377}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  23%|██▎       | 14/60 [00:21<01:27,  1.90s/it]

[I 2025-11-16 23:05:41,607] Trial 13 finished with value: 562015883.2369487 and parameters: {'max_depth': 16, 'max_features': 0.5, 'bootstrap': False, 'n_estimators': 1973, 'min_samples_split': 8, 'min_samples_leaf': 13}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  25%|██▌       | 15/60 [00:23<01:20,  1.79s/it]

[I 2025-11-16 23:05:43,142] Trial 14 finished with value: 24080225.926096465 and parameters: {'max_depth': 10, 'max_features': 0.5, 'bootstrap': True, 'n_estimators': 1372, 'min_samples_split': 16, 'min_samples_leaf': 1, 'max_samples': 0.8030513372748733}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  27%|██▋       | 16/60 [00:24<01:08,  1.55s/it]

[I 2025-11-16 23:05:44,130] Trial 15 finished with value: 41883079.26451693 and parameters: {'max_depth': 10, 'max_features': 'sqrt', 'bootstrap': False, 'n_estimators': 1475, 'min_samples_split': 17, 'min_samples_leaf': 1}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  28%|██▊       | 17/60 [00:25<01:02,  1.45s/it]

[I 2025-11-16 23:05:45,348] Trial 16 finished with value: 560605697.6058446 and parameters: {'max_depth': 10, 'max_features': 0.5, 'bootstrap': False, 'n_estimators': 819, 'min_samples_split': 15, 'min_samples_leaf': 13}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  30%|███       | 18/60 [00:27<01:12,  1.72s/it]

[I 2025-11-16 23:05:47,679] Trial 17 finished with value: 340596150.53306985 and parameters: {'max_depth': None, 'max_features': 0.5, 'bootstrap': True, 'n_estimators': 1945, 'min_samples_split': 19, 'min_samples_leaf': 6, 'max_samples': 0.7698777899432435}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  32%|███▏      | 19/60 [00:29<01:17,  1.88s/it]

[I 2025-11-16 23:05:49,959] Trial 18 finished with value: 763561728.9388517 and parameters: {'max_depth': 32, 'max_features': 0.5, 'bootstrap': False, 'n_estimators': 1428, 'min_samples_split': 16, 'min_samples_leaf': 7}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  33%|███▎      | 20/60 [00:32<01:17,  1.95s/it]

[I 2025-11-16 23:05:52,057] Trial 19 finished with value: 189958762.62073272 and parameters: {'max_depth': 6, 'max_features': 0.7, 'bootstrap': True, 'n_estimators': 1813, 'min_samples_split': 10, 'min_samples_leaf': 13, 'max_samples': 0.8412617858914677}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  35%|███▌      | 21/60 [00:34<01:24,  2.17s/it]

[I 2025-11-16 23:05:54,730] Trial 20 finished with value: 1450676916.9785368 and parameters: {'max_depth': 10, 'max_features': 1.0, 'bootstrap': False, 'n_estimators': 936, 'min_samples_split': 13, 'min_samples_leaf': 8}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  37%|███▋      | 22/60 [00:36<01:20,  2.12s/it]

[I 2025-11-16 23:05:56,744] Trial 21 finished with value: 22280189.898544498 and parameters: {'max_depth': 16, 'max_features': 0.5, 'bootstrap': True, 'n_estimators': 1339, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_samples': 0.9829705066274331}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  38%|███▊      | 23/60 [00:38<01:12,  1.97s/it]

[I 2025-11-16 23:05:58,371] Trial 22 finished with value: 28276105.792765196 and parameters: {'max_depth': 10, 'max_features': 0.5, 'bootstrap': True, 'n_estimators': 1356, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_samples': 0.7004831188261248}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  40%|████      | 24/60 [00:40<01:09,  1.94s/it]

[I 2025-11-16 23:06:00,234] Trial 23 finished with value: 22668209.753500503 and parameters: {'max_depth': 6, 'max_features': 0.5, 'bootstrap': True, 'n_estimators': 1621, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_samples': 0.8976469851027725}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  42%|████▏     | 25/60 [00:42<01:06,  1.90s/it]

[I 2025-11-16 23:06:02,036] Trial 24 finished with value: 394033698.0316708 and parameters: {'max_depth': 6, 'max_features': 0.5, 'bootstrap': True, 'n_estimators': 1608, 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_samples': 0.9106412457318777}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  43%|████▎     | 26/60 [00:43<01:03,  1.88s/it]

[I 2025-11-16 23:06:03,877] Trial 25 finished with value: 97923330.12450229 and parameters: {'max_depth': 6, 'max_features': 'sqrt', 'bootstrap': True, 'n_estimators': 1779, 'min_samples_split': 3, 'min_samples_leaf': 20, 'max_samples': 0.8952128953015633}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  45%|████▌     | 27/60 [00:45<01:00,  1.83s/it]

[I 2025-11-16 23:06:05,597] Trial 26 finished with value: 108347475.46962352 and parameters: {'max_depth': 6, 'max_features': 0.5, 'bootstrap': True, 'n_estimators': 1523, 'min_samples_split': 7, 'min_samples_leaf': 3, 'max_samples': 0.9960572597563107}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  47%|████▋     | 28/60 [00:47<00:57,  1.81s/it]

[I 2025-11-16 23:06:07,360] Trial 27 finished with value: 21656901.02030865 and parameters: {'max_depth': 16, 'max_features': 0.5, 'bootstrap': True, 'n_estimators': 1206, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_samples': 0.9394554332606727}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  48%|████▊     | 29/60 [00:49<01:00,  1.94s/it]

[I 2025-11-16 23:06:09,597] Trial 28 finished with value: 466988435.0808019 and parameters: {'max_depth': 16, 'max_features': 0.5, 'bootstrap': False, 'n_estimators': 1239, 'min_samples_split': 9, 'min_samples_leaf': 5}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  50%|█████     | 30/60 [00:50<00:48,  1.61s/it]

[I 2025-11-16 23:06:10,444] Trial 29 finished with value: 118922411.30487005 and parameters: {'max_depth': 16, 'max_features': 'log2', 'bootstrap': True, 'n_estimators': 973, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_samples': 0.5158108276738461}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  52%|█████▏    | 31/60 [00:50<00:36,  1.28s/it]

[I 2025-11-16 23:06:10,934] Trial 30 finished with value: 109967347.14202848 and parameters: {'max_depth': 16, 'max_features': 'sqrt', 'bootstrap': True, 'n_estimators': 550, 'min_samples_split': 6, 'min_samples_leaf': 11, 'max_samples': 0.9435849383457131}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  53%|█████▎    | 32/60 [00:52<00:37,  1.35s/it]

[I 2025-11-16 23:06:12,449] Trial 31 finished with value: 22736080.741153717 and parameters: {'max_depth': 6, 'max_features': 0.5, 'bootstrap': True, 'n_estimators': 1419, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_samples': 0.8724659879810116}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  55%|█████▌    | 33/60 [00:54<00:38,  1.44s/it]

[I 2025-11-16 23:06:14,111] Trial 32 finished with value: 30351384.55239808 and parameters: {'max_depth': 16, 'max_features': 0.5, 'bootstrap': True, 'n_estimators': 1206, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_samples': 0.9423078954167953}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  57%|█████▋    | 34/60 [00:55<00:36,  1.41s/it]

[I 2025-11-16 23:06:15,435] Trial 33 finished with value: 130157956.32485959 and parameters: {'max_depth': None, 'max_features': 0.5, 'bootstrap': True, 'n_estimators': 1006, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_samples': 0.9384289189205327}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  58%|█████▊    | 35/60 [00:57<00:36,  1.47s/it]

[I 2025-11-16 23:06:17,051] Trial 34 finished with value: 37596466.79474969 and parameters: {'max_depth': 6, 'max_features': 0.7, 'bootstrap': True, 'n_estimators': 1267, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_samples': 0.8565122554129694}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  60%|██████    | 36/60 [00:58<00:33,  1.41s/it]

[I 2025-11-16 23:06:18,327] Trial 35 finished with value: 290313497.0579016 and parameters: {'max_depth': 16, 'max_features': 0.5, 'bootstrap': True, 'n_estimators': 1074, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_samples': 0.9514767630278114}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  62%|██████▏   | 37/60 [01:01<00:44,  1.93s/it]

[I 2025-11-16 23:06:21,474] Trial 36 finished with value: 74287575.01155432 and parameters: {'max_depth': 32, 'max_features': 1.0, 'bootstrap': True, 'n_estimators': 1672, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_samples': 0.7361846126691549}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  63%|██████▎   | 38/60 [01:03<00:44,  2.02s/it]

[I 2025-11-16 23:06:23,713] Trial 37 finished with value: 204445168.0806499 and parameters: {'max_depth': 6, 'max_features': 0.7, 'bootstrap': True, 'n_estimators': 1856, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_samples': 0.9094933355136832}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  65%|██████▌   | 39/60 [01:04<00:35,  1.69s/it]

[I 2025-11-16 23:06:24,624] Trial 38 finished with value: 85513272.53563541 and parameters: {'max_depth': 24, 'max_features': 'log2', 'bootstrap': False, 'n_estimators': 1590, 'min_samples_split': 7, 'min_samples_leaf': 6}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  67%|██████▋   | 40/60 [01:05<00:30,  1.53s/it]

[I 2025-11-16 23:06:25,771] Trial 39 finished with value: 23613070.63479337 and parameters: {'max_depth': None, 'max_features': 0.5, 'bootstrap': True, 'n_estimators': 863, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_samples': 0.827502401257227}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  68%|██████▊   | 41/60 [01:07<00:32,  1.70s/it]

[I 2025-11-16 23:06:27,889] Trial 40 finished with value: 553924227.7398015 and parameters: {'max_depth': 6, 'max_features': 1.0, 'bootstrap': True, 'n_estimators': 1329, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_samples': 0.9929074793381558}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  70%|███████   | 42/60 [01:09<00:29,  1.65s/it]

[I 2025-11-16 23:06:29,396] Trial 41 finished with value: 23155195.393185146 and parameters: {'max_depth': 6, 'max_features': 0.5, 'bootstrap': True, 'n_estimators': 1427, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_samples': 0.8687647636437872}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  72%|███████▏  | 43/60 [01:10<00:25,  1.51s/it]

[I 2025-11-16 23:06:30,593] Trial 42 finished with value: 33251507.07612553 and parameters: {'max_depth': 6, 'max_features': 0.5, 'bootstrap': True, 'n_estimators': 1147, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_samples': 0.8703484219312485}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  73%|███████▎  | 44/60 [01:12<00:24,  1.51s/it]

[I 2025-11-16 23:06:32,086] Trial 43 finished with value: 185230900.30690512 and parameters: {'max_depth': 6, 'max_features': 0.5, 'bootstrap': True, 'n_estimators': 1464, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_samples': 0.7833003135765791}. Best is trial 4 with value: 21628156.109393902.


Best trial: 4. Best value: 2.16282e+07:  75%|███████▌  | 45/60 [01:13<00:22,  1.53s/it]

[I 2025-11-16 23:06:33,671] Trial 44 finished with value: 22507300.535725538 and parameters: {'max_depth': 16, 'max_features': 0.5, 'bootstrap': True, 'n_estimators': 1066, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_samples': 0.8878199096806557}. Best is trial 4 with value: 21628156.109393902.


Best trial: 45. Best value: 2.14753e+07:  77%|███████▋  | 46/60 [01:15<00:24,  1.76s/it]

[I 2025-11-16 23:06:35,960] Trial 45 finished with value: 21475322.46381998 and parameters: {'max_depth': 16, 'max_features': 0.5, 'bootstrap': False, 'n_estimators': 1077, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 45 with value: 21475322.46381998.


Best trial: 45. Best value: 2.14753e+07:  78%|███████▊  | 47/60 [01:16<00:18,  1.44s/it]

[I 2025-11-16 23:06:36,662] Trial 46 finished with value: 56127684.127517186 and parameters: {'max_depth': 16, 'max_features': 'log2', 'bootstrap': False, 'n_estimators': 1083, 'min_samples_split': 11, 'min_samples_leaf': 2}. Best is trial 45 with value: 21475322.46381998.


Best trial: 47. Best value: 2.13676e+07:  80%|████████  | 48/60 [01:17<00:16,  1.40s/it]

[I 2025-11-16 23:06:37,967] Trial 47 finished with value: 21367557.522698008 and parameters: {'max_depth': 16, 'max_features': 0.5, 'bootstrap': False, 'n_estimators': 654, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 47 with value: 21367557.522698008.


Best trial: 47. Best value: 2.13676e+07:  82%|████████▏ | 49/60 [01:19<00:14,  1.28s/it]

[I 2025-11-16 23:06:38,979] Trial 48 finished with value: 492616290.3923698 and parameters: {'max_depth': 16, 'max_features': 0.5, 'bootstrap': False, 'n_estimators': 558, 'min_samples_split': 6, 'min_samples_leaf': 5}. Best is trial 47 with value: 21367557.522698008.


Best trial: 47. Best value: 2.13676e+07:  83%|████████▎ | 50/60 [01:19<00:10,  1.02s/it]

[I 2025-11-16 23:06:39,398] Trial 49 finished with value: 132441149.91905741 and parameters: {'max_depth': 16, 'max_features': 'sqrt', 'bootstrap': False, 'n_estimators': 688, 'min_samples_split': 8, 'min_samples_leaf': 15}. Best is trial 47 with value: 21367557.522698008.


Best trial: 47. Best value: 2.13676e+07:  85%|████████▌ | 51/60 [01:20<00:09,  1.11s/it]

[I 2025-11-16 23:06:40,709] Trial 50 finished with value: 21414747.43707615 and parameters: {'max_depth': 16, 'max_features': 0.5, 'bootstrap': False, 'n_estimators': 691, 'min_samples_split': 4, 'min_samples_leaf': 4}. Best is trial 47 with value: 21367557.522698008.


Best trial: 47. Best value: 2.13676e+07:  87%|████████▋ | 52/60 [01:21<00:08,  1.02s/it]

[I 2025-11-16 23:06:41,513] Trial 51 finished with value: 21506844.39052489 and parameters: {'max_depth': 16, 'max_features': 0.5, 'bootstrap': False, 'n_estimators': 412, 'min_samples_split': 4, 'min_samples_leaf': 4}. Best is trial 47 with value: 21367557.522698008.


Best trial: 47. Best value: 2.13676e+07:  88%|████████▊ | 53/60 [01:22<00:06,  1.07it/s]

[I 2025-11-16 23:06:42,258] Trial 52 finished with value: 692120809.8219405 and parameters: {'max_depth': 16, 'max_features': 0.5, 'bootstrap': False, 'n_estimators': 423, 'min_samples_split': 5, 'min_samples_leaf': 6}. Best is trial 47 with value: 21367557.522698008.


Best trial: 53. Best value: 2.105e+07:  90%|█████████ | 54/60 [01:22<00:05,  1.20it/s]  

[I 2025-11-16 23:06:42,858] Trial 53 finished with value: 21050032.750871297 and parameters: {'max_depth': 16, 'max_features': 0.5, 'bootstrap': False, 'n_estimators': 277, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 53 with value: 21050032.750871297.


Best trial: 53. Best value: 2.105e+07:  92%|█████████▏| 55/60 [01:23<00:03,  1.40it/s]

[I 2025-11-16 23:06:43,288] Trial 54 finished with value: 21647745.632779203 and parameters: {'max_depth': 16, 'max_features': 0.5, 'bootstrap': False, 'n_estimators': 207, 'min_samples_split': 5, 'min_samples_leaf': 4}. Best is trial 53 with value: 21050032.750871297.


Best trial: 53. Best value: 2.105e+07:  93%|█████████▎| 56/60 [01:24<00:02,  1.35it/s]

[I 2025-11-16 23:06:44,095] Trial 55 finished with value: 935260121.1763645 and parameters: {'max_depth': 24, 'max_features': 0.7, 'bootstrap': False, 'n_estimators': 367, 'min_samples_split': 7, 'min_samples_leaf': 8}. Best is trial 53 with value: 21050032.750871297.


Best trial: 53. Best value: 2.105e+07:  95%|█████████▌| 57/60 [01:24<00:02,  1.33it/s]

[I 2025-11-16 23:06:44,874] Trial 56 finished with value: 499308876.0168645 and parameters: {'max_depth': 16, 'max_features': 0.5, 'bootstrap': False, 'n_estimators': 399, 'min_samples_split': 9, 'min_samples_leaf': 5}. Best is trial 53 with value: 21050032.750871297.


Best trial: 53. Best value: 2.105e+07:  97%|█████████▋| 58/60 [01:26<00:01,  1.10it/s]

[I 2025-11-16 23:06:46,154] Trial 57 finished with value: 21351657.363803364 and parameters: {'max_depth': 32, 'max_features': 0.5, 'bootstrap': False, 'n_estimators': 640, 'min_samples_split': 6, 'min_samples_leaf': 3}. Best is trial 53 with value: 21050032.750871297.


Best trial: 53. Best value: 2.105e+07:  98%|█████████▊| 59/60 [01:27<00:01,  1.03s/it]

[I 2025-11-16 23:06:47,459] Trial 58 finished with value: 21450513.018034276 and parameters: {'max_depth': 32, 'max_features': 0.5, 'bootstrap': False, 'n_estimators': 651, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 53 with value: 21050032.750871297.


Best trial: 53. Best value: 2.105e+07: 100%|██████████| 60/60 [01:29<00:00,  1.50s/it]


[I 2025-11-16 23:06:49,867] Trial 59 finished with value: 39215678.85230417 and parameters: {'max_depth': 32, 'max_features': 1.0, 'bootstrap': False, 'n_estimators': 665, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 53 with value: 21050032.750871297.

=== Mejores hiperparámetros (VALID) ===
{'max_depth': 16, 'max_features': 0.5, 'bootstrap': False, 'n_estimators': 277, 'min_samples_split': 5, 'min_samples_leaf': 3}
Mejor RMSE valid: 21050032.7509

=== Métricas en TEST ===
MSE : 13,506,138.19
RMSE: 13,506,138.19
MAE : 2,828.49
R²  : 0.6180


In [11]:
"""El mejor modelo fue RFRegressor con los siguientes hiperparámetros:
    {'max_depth': 16, 'max_features': 1.0, 'bootstrap': True, 'n_estimators': 1183, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_samples': 0.8654367662002576}"""
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
#Entrenemos un modelo con esos hiperparámetros

best_model = RandomForestRegressor(
    n_estimators=1183,
    max_depth=16,
    max_features=1.0,
    bootstrap=True,
    min_samples_split=2,
    min_samples_leaf=2,
    max_samples=0.8654367662002576,
    random_state=42,
    n_jobs=-1
)
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer(method='yeo-johnson')
x_train_ready = pd.DataFrame(pt.fit_transform(x_train), columns=x_train.columns)
x_test_ready = pd.DataFrame(pt.transform(x_test), columns=x_test.columns)

best_model.fit(x_train_ready, y_train)
y_pred_test = best_model.predict(x_test_ready)
mse  = mean_squared_error(y_test, y_pred_test)
rmse = mean_squared_error(y_test, y_pred_test)
mae  = mean_absolute_error(y_test, y_pred_test)
r2   = r2_score(y_test, y_pred_test)
print("\n=== Métricas en TEST del mejor modelo entrenado manualmente ===")
print(f"MSE : {mse:,.2f}")
print(f"RMSE: {rmse:,.2f}")
print(f"MAE : {mae:,.2f}")
print(f"R²  : {r2:.4f}")



=== Métricas en TEST del mejor modelo entrenado manualmente ===
MSE : 11,877,246.10
RMSE: 11,877,246.10
MAE : 2,663.78
R²  : 0.6641


In [12]:
#gracias a este random forest vamos a definir nuestras variables más relevantes

feature_importances = best_model.feature_importances_
feature_names = x_train.columns
feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
feature_importances = feature_importances.sort_values(by='importance', ascending=False)
feature_importances = feature_importances[:20]
print("Feature importances:")
print(feature_importances)
x_train = x_train[feature_importances['feature']]
x_test = x_test[feature_importances['feature']]

Feature importances:
                    feature  importance
39                Frio (Kw)    0.532835
35            Sala Maq (Kw)    0.196138
166   Frio_roll_mean_7_lag1    0.118774
31            Envasado (Kw)    0.017530
168  Frio_roll_mean_14_lag1    0.014589
34           Servicios (Kw)    0.013320
170  Frio_roll_mean_28_lag1    0.004626
159                 mes_cos    0.003590
44           KW Gral Planta    0.003088
167    Frio_roll_std_7_lag1    0.002991
10             ET Bodega/Hl    0.002679
40     Pta Agua / Eflu (Kw)    0.002468
12        ET Servicios / Hl    0.002414
25            Hl Cerveza L4    0.002244
5          EE Sala Maq / Hl    0.002163
42          Resto Serv (Kw)    0.002061
171   Frio_roll_std_28_lag1    0.002054
41           Prod Agua (Kw)    0.001993
33             Linea 3 (Kw)    0.001952
47         Agua Bodega (Hl)    0.001837


In [13]:
print(feature_importances["feature"].tolist())

['Frio (Kw)', 'Sala Maq (Kw)', 'Frio_roll_mean_7_lag1', 'Envasado (Kw)', 'Frio_roll_mean_14_lag1', 'Servicios (Kw)', 'Frio_roll_mean_28_lag1', 'mes_cos', 'KW Gral Planta', 'Frio_roll_std_7_lag1', 'ET Bodega/Hl', 'Pta Agua / Eflu (Kw)', 'ET Servicios / Hl', 'Hl Cerveza L4', 'EE Sala Maq / Hl', 'Resto Serv (Kw)', 'Frio_roll_std_28_lag1', 'Prod Agua (Kw)', 'Linea 3 (Kw)', 'Agua Bodega (Hl)']


In [14]:
# ============================================
# Experiments: Tracking, Evaluation & Plots
# ============================================
import os, json, time, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime

# Modelos y utils
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb
from lightgbm import LGBMRegressor, early_stopping
import optuna

import matplotlib.pyplot as plt

# =========================
# Config
# =========================
SEED = 42
VAL_FRAC = 0.20
N_TRIALS = 50   # Optuna
RESULTS_DIR = Path("results")
RESULTS_DIR.mkdir(exist_ok=True, parents=True)
LOG_PATH = RESULTS_DIR / "experiment_logs.csv"
TS = datetime.now().strftime("%Y%m%d_%H%M%S")

# =========================
# Helpers
# =========================
def metrics_dict(y_true, y_pred):
    return {
        "mae":  mean_absolute_error(y_true, y_pred),
        "rmse": mean_squared_error(y_true, y_pred ),
        "mse":  mean_squared_error(y_true, y_pred),
        "r2":   r2_score(y_true, y_pred),
    }

def append_log(row: dict, log_path: Path = LOG_PATH):
    row = row.copy()
    row["timestamp"] = datetime.now().isoformat()
    df_row = pd.DataFrame([row])
    if not log_path.exists():
        df_row.to_csv(log_path, index=False)
    else:
        df_row.to_csv(log_path, index=False, mode="a", header=False)

def plot_pred_vs_true(y_true, y_pred, title, out_path):
    plt.figure()
    plt.scatter(y_true, y_pred, s=8)
    minv = np.min([y_true.min(), y_pred.min()])
    maxv = np.max([y_true.max(), y_pred.max()])
    plt.plot([minv, maxv], [minv, maxv])
    plt.xlabel("Valor real")
    plt.ylabel("Predicción")
    plt.title(title)
    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.close()

def plot_residuals(y_true, y_pred, title, out_path_scatter, out_path_hist):
    resid = y_pred - y_true
    # Residual vs Pred
    plt.figure()
    plt.scatter(y_pred, resid, s=8)
    plt.axhline(0)
    plt.xlabel("Predicción")
    plt.ylabel("Residuo")
    plt.title(title + " - Residual vs Pred")
    plt.tight_layout()
    plt.savefig(out_path_scatter, dpi=150)
    plt.close()
    # Histograma
    plt.figure()
    plt.hist(resid, bins=30)
    plt.xlabel("Residuo")
    plt.ylabel("Frecuencia")
    plt.title(title + " - Hist Residuales")
    plt.tight_layout()
    plt.savefig(out_path_hist, dpi=150)
    plt.close()

def save_feature_importance(model, feature_names, out_csv, out_png, top_k=30):
    # Tree models: feature_importances_; Lineales: coef_
    if hasattr(model, "feature_importances_"):
        importances = model.feature_importances_
        imp = pd.DataFrame({"feature": feature_names, "importance": importances})
    elif hasattr(model, "coef_"):
        coef = np.ravel(model.coef_)
        imp = pd.DataFrame({"feature": feature_names, "importance": np.abs(coef)})
    else:
        # XGB/LGBM wrappers suelen tener feature_importances_
        try:
            importances = model.get_booster().get_score(importance_type="gain")
            imp = pd.DataFrame({"feature": list(importances.keys()),
                                "importance": list(importances.values())})
        except Exception:
            imp = pd.DataFrame({"feature": feature_names, "importance": np.zeros(len(feature_names))})

    imp = imp.sort_values("importance", ascending=False)
    imp.to_csv(out_csv, index=False)

    top = imp.head(top_k)
    plt.figure(figsize=(8, max(3, 0.28*len(top))))
    plt.barh(top["feature"][::-1], top["importance"][::-1])
    plt.xlabel("Importancia")
    plt.title("Feature importance (top)")
    plt.tight_layout()
    plt.savefig(out_png, dpi=150)
    plt.close()

def refit_full_for_test(model, X_tr, y_tr, X_val=None, y_val=None):
    """
    Refit en todo el train usando mejor n_estimators si hay early stopping.
    """
    # XGB
    if isinstance(model, xgb.XGBRegressor):
        best_n = getattr(model, "best_iteration", None)
        params = model.get_params()
        if best_n is not None:
            params["n_estimators"] = best_n
        m = xgb.XGBRegressor(**params)
        m.fit(X_tr, y_tr)  # full train
        return m

    # LGBM
    if isinstance(model, LGBMRegressor):
        best_n = getattr(model, "best_iteration_", None)
        params = model.get_params()
        if best_n is not None and best_n > 0:
            params["n_estimators"] = best_n
        m = LGBMRegressor(**params)
        m.fit(X_tr, y_tr)
        return m

    # Otros: re-usa el modelo tal cual y refitea
    m = model.__class__(**model.get_params())
    m.fit(X_tr, y_tr)
    return m

def write_summary_md(log_path: Path, summary_path: Path):
    df = pd.read_csv(log_path)
    # Orden por RMSE de validación
    df = df.sort_values("val_rmse").reset_index(drop=True)
    lines = []
    lines.append(f"# Experiment Summary — {datetime.now().isoformat()}\n")
    lines.append("## Top 10 por RMSE de Validación\n")
    cols = ["timestamp","model_name","val_rmse","val_mae","val_r2","test_rmse","test_mae","test_r2"]
    lines.append(df[cols].head(10).to_markdown(index=False))
    lines.append("\n## Espacios de búsqueda declarados\n")
    for _, row in df.iterrows():
        if pd.notna(row.get("search_space_str", "")) and str(row.get("search_space_str", "")).strip():
            lines.append(f"- **{row['model_name']}**: `{row['search_space_str']}`")
    lines.append("\n## Justificación\n")
    lines.append("- Se elige el mejor modelo por **RMSE de validación** (menor es mejor).")
    lines.append("- Se reportan métricas en test del modelo **refiteado en todo el train**.")
    lines.append("- La elección queda respaldada por `results/experiment_logs.csv` y este resumen.")
    summary_path.write_text("\n".join(lines), encoding="utf-8")

# =========================
# 0) Se espera que tengas x_train, y_train, x_test, y_test en memoria
# =========================

# =========================
# 1) PowerTransformer como baseline
# =========================
pt = PowerTransformer(method='yeo-johnson')  # standardize=True por default
X_train_ready = pd.DataFrame(pt.fit_transform(x_train), columns=x_train.columns, index=x_train.index)
X_test_ready  = pd.DataFrame(pt.transform(x_test),      columns=x_test.columns,  index=x_test.index)

# Split temporal interno para validación
n = len(X_train_ready)
n_val = int(np.floor(n * VAL_FRAC))
n_tr = n - n_val
X_tr, y_tr = X_train_ready.iloc[:n_tr], np.ravel(y_train.iloc[:n_tr])
X_val, y_val = X_train_ready.iloc[n_tr:], np.ravel(y_train.iloc[n_tr:])
X_full, y_full = X_train_ready, np.ravel(y_train)

# Para logs
base_log_ctx = {
    "n_train": len(X_tr),
    "n_val": len(X_val),
    "n_full_train": len(X_full),
    "n_test": len(X_test_ready),
    "n_features": X_train_ready.shape[1],
}

# =========================
# 2) Modelos y búsquedas
# =========================
all_results = []

# (a) RandomForest — baseline
rf = RandomForestRegressor(
    n_estimators=1183,
    max_depth=16,
    max_features=1.0,
    bootstrap=True,
    min_samples_split=2,
    min_samples_leaf=2,
    max_samples=0.8654367662002576,
    random_state=SEED,
    n_jobs=-1
)
rf.fit(X_tr, y_tr)
y_val_pred = rf.predict(X_val)
val_metrics = metrics_dict(y_val, y_val_pred)

rf_refit = refit_full_for_test(rf, X_full, y_full)
y_test_pred = rf_refit.predict(X_test_ready)
test_metrics = metrics_dict(np.ravel(y_test), y_test_pred)

append_log({
    **base_log_ctx,
    "model_name": "RandomForest",
    "params_json": json.dumps(rf.get_params()),
    "search_space_str": "Fixed params (baseline)",
    "val_mae": val_metrics["mae"],
    "val_rmse": val_metrics["rmse"],
    "val_r2": val_metrics["r2"],
    "test_mae": test_metrics["mae"],
    "test_rmse": test_metrics["rmse"],
    "test_r2": test_metrics["r2"],
}, LOG_PATH)
all_results.append(("RandomForest", val_metrics["rmse"], rf, rf_refit, y_val_pred, y_test_pred))

# (b) XGBoost — early stopping en validación
xgb_model = xgb.XGBRegressor(
    n_estimators=5000, learning_rate=0.02, max_depth=8,
    subsample=0.8, colsample_bytree=0.8, reg_alpha=0.0, reg_lambda=1.0,
    min_child_weight=1.0, random_state=SEED, tree_method="hist", n_jobs=-1
)
xgb_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)])
y_val_pred = xgb_model.predict(X_val)
val_metrics = metrics_dict(y_val, y_val_pred)

xgb_refit = refit_full_for_test(xgb_model, X_full, y_full)
y_test_pred = xgb_refit.predict(X_test_ready)
test_metrics = metrics_dict(np.ravel(y_test), y_test_pred)

append_log({
    **base_log_ctx,
    "model_name": "XGBoost",
    "params_json": json.dumps(xgb_model.get_params()),
    "search_space_str": "Fixed params; early_stopping_rounds=200",
    "val_mae": val_metrics["mae"],
    "val_rmse": val_metrics["rmse"],
    "val_r2": val_metrics["r2"],
    "test_mae": test_metrics["mae"],
    "test_rmse": test_metrics["rmse"],
    "test_r2": test_metrics["r2"],
}, LOG_PATH)
all_results.append(("XGBoost", val_metrics["rmse"], xgb_model, xgb_refit, y_val_pred, y_test_pred))

# (c) LightGBM — early stopping en validación
lgbm_model = LGBMRegressor(
    n_estimators=5000, learning_rate=0.02, num_leaves=127, max_depth=-1,
    subsample=0.8, colsample_bytree=0.8, reg_alpha=0.0, reg_lambda=1.0,
    min_child_samples=20, random_state=SEED, n_jobs=-1
)
lgbm_model.fit(X_tr, y_tr,
               eval_set=[(X_val, y_val)],
               callbacks=[early_stopping(stopping_rounds=200, verbose=False)])
y_val_pred = lgbm_model.predict(X_val)
val_metrics = metrics_dict(y_val, y_val_pred)

lgbm_refit = refit_full_for_test(lgbm_model, X_full, y_full)
y_test_pred = lgbm_refit.predict(X_test_ready)
test_metrics = metrics_dict(np.ravel(y_test), y_test_pred)

append_log({
    **base_log_ctx,
    "model_name": "LightGBM",
    "params_json": json.dumps(lgbm_model.get_params()),
    "search_space_str": "Fixed params; early_stopping_rounds=200",
    "val_mae": val_metrics["mae"],
    "val_rmse": val_metrics["rmse"],
    "val_r2": val_metrics["r2"],
    "test_mae": test_metrics["mae"],
    "test_rmse": test_metrics["rmse"],
    "test_r2": test_metrics["r2"],
}, LOG_PATH)
all_results.append(("LightGBM", val_metrics["rmse"], lgbm_model, lgbm_refit, y_val_pred, y_test_pred))

# (d) Ridge — Optuna (CV TimeSeriesSplit) -> elegir alpha
tscv = TimeSeriesSplit(n_splits=5)
def ridge_objective(trial: optuna.Trial):
    alpha = trial.suggest_float("alpha", 1e-4, 1e3, log=True)
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("ridge", Ridge(alpha=alpha, random_state=SEED))
    ])
    scores = cross_val_score(pipe, X_full, y_full, cv=tscv,
                             scoring="neg_root_mean_squared_error", n_jobs=-1)
    return -scores.mean()

ridge_space = "alpha ~ loguniform[1e-4, 1e3]"
ridge_study = optuna.create_study(direction="minimize", study_name="ridge_opt")
ridge_study.optimize(ridge_objective, n_trials=N_TRIALS, show_progress_bar=False)
best_alpha_ridge = ridge_study.best_params["alpha"]

ridge_best = Pipeline([
    ("scaler", StandardScaler()),
    ("ridge", Ridge(alpha=best_alpha_ridge, random_state=SEED))
])
# Validación entrenando en X_tr
ridge_best.fit(X_tr, y_tr)
y_val_pred = ridge_best.predict(X_val)
val_metrics = metrics_dict(y_val, y_val_pred)

# Test refiteado en todo el train
ridge_refit = Pipeline([
    ("scaler", StandardScaler()),
    ("ridge", Ridge(alpha=best_alpha_ridge, random_state=SEED))
])
ridge_refit.fit(X_full, y_full)
y_test_pred = ridge_refit.predict(X_test_ready)
test_metrics = metrics_dict(np.ravel(y_test), y_test_pred)

append_log({
    **base_log_ctx,
    "model_name": "Ridge (Optuna)",
    "params_json": json.dumps({"alpha": float(best_alpha_ridge)}),
    "search_space_str": ridge_space + f"; n_trials={N_TRIALS}",
    "val_mae": val_metrics["mae"],
    "val_rmse": val_metrics["rmse"],
    "val_r2": val_metrics["r2"],
    "test_mae": test_metrics["mae"],
    "test_rmse": test_metrics["rmse"],
    "test_r2": test_metrics["r2"],
}, LOG_PATH)
all_results.append(("Ridge (Optuna)", val_metrics["rmse"], ridge_best, ridge_refit, y_val_pred, y_test_pred))

# (e) Lasso — Optuna
def lasso_objective(trial: optuna.Trial):
    alpha = trial.suggest_float("alpha", 1e-4, 1e2, log=True)
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("lasso", Lasso(alpha=alpha, random_state=SEED, max_iter=10000))
    ])
    scores = cross_val_score(pipe, X_full, y_full, cv=tscv,
                             scoring="neg_root_mean_squared_error", n_jobs=-1)
    return -scores.mean()

lasso_space = "alpha ~ loguniform[1e-4, 1e2]"
lasso_study = optuna.create_study(direction="minimize", study_name="lasso_opt")
lasso_study.optimize(lasso_objective, n_trials=N_TRIALS, show_progress_bar=False)
best_alpha_lasso = lasso_study.best_params["alpha"]

lasso_best = Pipeline([
    ("scaler", StandardScaler()),
    ("lasso", Lasso(alpha=best_alpha_lasso, random_state=SEED, max_iter=10000))
])
lasso_best.fit(X_tr, y_tr)
y_val_pred = lasso_best.predict(X_val)
val_metrics = metrics_dict(y_val, y_val_pred)

lasso_refit = Pipeline([
    ("scaler", StandardScaler()),
    ("lasso", Lasso(alpha=best_alpha_lasso, random_state=SEED, max_iter=10000))
])
lasso_refit.fit(X_full, y_full)
y_test_pred = lasso_refit.predict(X_test_ready)
test_metrics = metrics_dict(np.ravel(y_test), y_test_pred)

append_log({
    **base_log_ctx,
    "model_name": "Lasso (Optuna)",
    "params_json": json.dumps({"alpha": float(best_alpha_lasso)}),
    "search_space_str": lasso_space + f"; n_trials={N_TRIALS}",
    "val_mae": val_metrics["mae"],
    "val_rmse": val_metrics["rmse"],
    "val_r2": val_metrics["r2"],
    "test_mae": test_metrics["mae"],
    "test_rmse": test_metrics["rmse"],
    "test_r2": test_metrics["r2"],
}, LOG_PATH)
all_results.append(("Lasso (Optuna)", val_metrics["rmse"], lasso_best, lasso_refit, y_val_pred, y_test_pred))

# =========================
# 3) Selección por RMSE de Validación + gráficos y FI
# =========================
# all_results: (name, val_rmse, model_val_fitted, model_test_refit, y_val_pred, y_test_pred)
all_results.sort(key=lambda t: t[1])
best_name, best_val_rmse, best_model_val, best_model_full, best_yval_pred, best_ytest_pred = all_results[0]
print(f"\n>>> Mejor por RMSE de validación: {best_name} (val_RMSE={best_val_rmse:.4f})")

# Plots (validación y test) para el mejor
# Validación
plot_pred_vs_true(
    y_true=y_val, y_pred=best_yval_pred,
    title=f"{best_name} - Validación",
    out_path=RESULTS_DIR / f"{TS}_{best_name.replace(' ','_')}_val_pred_vs_true.png"
)
plot_residuals(
    y_true=y_val, y_pred=best_yval_pred,
    title=f"{best_name} - Validación",
    out_path_scatter=RESULTS_DIR / f"{TS}_{best_name.replace(' ','_')}_val_resid_scatter.png",
    out_path_hist=RESULTS_DIR / f"{TS}_{best_name.replace(' ','_')}_val_resid_hist.png"
)

# Test
plot_pred_vs_true(
    y_true=np.ravel(y_test), y_pred=best_ytest_pred,
    title=f"{best_name} - Test",
    out_path=RESULTS_DIR / f"{TS}_{best_name.replace(' ','_')}_test_pred_vs_true.png"
)
plot_residuals(
    y_true=np.ravel(y_test), y_pred=best_ytest_pred,
    title=f"{best_name} - Test",
    out_path_scatter=RESULTS_DIR / f"{TS}_{best_name.replace(' ','_')}_test_resid_scatter.png",
    out_path_hist=RESULTS_DIR / f"{TS}_{best_name.replace(' ','_')}_test_resid_hist.png"
)

# Feature importance del mejor (en el modelo refiteado full)
save_feature_importance(
    best_model_full,
    feature_names=X_train_ready.columns.tolist(),
    out_csv=RESULTS_DIR / f"{TS}_{best_name.replace(' ','_')}_feature_importance.csv",
    out_png=RESULTS_DIR / f"{TS}_{best_name.replace(' ','_')}_feature_importance.png",
    top_k=30
)

# =========================
# 4) Resumen de experimentos (MD)
# =========================
summary_md_path = RESULTS_DIR / f"experiment_summary_{TS}.md"
write_summary_md(LOG_PATH, summary_md_path)

print("\n==== Listo ====")
print(f"- Logs: {LOG_PATH}")
print(f"- Resumen: {summary_md_path}")
print(f"- Gráficos y FI en: {RESULTS_DIR}/")


[0]	validation_0-rmse:6580.94473
[1]	validation_0-rmse:6504.99236
[2]	validation_0-rmse:6447.49828
[3]	validation_0-rmse:6387.52801
[4]	validation_0-rmse:6339.26682
[5]	validation_0-rmse:6286.29978
[6]	validation_0-rmse:6230.75460
[7]	validation_0-rmse:6157.75520
[8]	validation_0-rmse:6111.92179
[9]	validation_0-rmse:6049.95832
[10]	validation_0-rmse:5992.76394
[11]	validation_0-rmse:5935.86998
[12]	validation_0-rmse:5886.97330
[13]	validation_0-rmse:5839.24383
[14]	validation_0-rmse:5812.80321
[15]	validation_0-rmse:5768.57515
[16]	validation_0-rmse:5716.49059
[17]	validation_0-rmse:5676.30627
[18]	validation_0-rmse:5639.85094
[19]	validation_0-rmse:5611.79489
[20]	validation_0-rmse:5593.01273
[21]	validation_0-rmse:5567.76699
[22]	validation_0-rmse:5536.92145
[23]	validation_0-rmse:5493.50084
[24]	validation_0-rmse:5464.21150
[25]	validation_0-rmse:5426.62832
[26]	validation_0-rmse:5393.02071
[27]	validation_0-rmse:5375.18007
[28]	validation_0-rmse:5339.86674
[29]	validation_0-rmse:5

[I 2025-11-16 23:07:12,336] A new study created in memory with name: ridge_opt


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000228 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4240
[LightGBM] [Info] Number of data points in the train set: 667, number of used features: 20
[LightGBM] [Info] Start training from score 28568.025112
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000222 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4856
[LightGBM] [Info] Number of data points in the train set: 833, number of used features: 20
[LightGBM] [Info] Start training from score 28296.922269


[I 2025-11-16 23:07:14,675] Trial 0 finished with value: 75231.70153651616 and parameters: {'alpha': 0.001123584592103265}. Best is trial 0 with value: 75231.70153651616.
[I 2025-11-16 23:07:16,331] Trial 1 finished with value: 70692.84123000434 and parameters: {'alpha': 2.3561265633649393}. Best is trial 1 with value: 70692.84123000434.
[I 2025-11-16 23:07:17,939] Trial 2 finished with value: 70837.59685865408 and parameters: {'alpha': 2.2250955099806218}. Best is trial 1 with value: 70692.84123000434.
[I 2025-11-16 23:07:19,355] Trial 3 finished with value: 67625.98692534714 and parameters: {'alpha': 7.056978568258645}. Best is trial 3 with value: 67625.98692534714.
[I 2025-11-16 23:07:19,369] Trial 4 finished with value: 75162.52890773879 and parameters: {'alpha': 0.01953353105919643}. Best is trial 3 with value: 67625.98692534714.
[I 2025-11-16 23:07:19,384] Trial 5 finished with value: 74547.72861989577 and parameters: {'alpha': 0.2018872258107501}. Best is trial 3 with value: 676


>>> Mejor por RMSE de validación: XGBoost (val_RMSE=23948432.7133)

==== Listo ====
- Logs: results\experiment_logs.csv
- Resumen: results\experiment_summary_20251116_230654.md
- Gráficos y FI en: results/
