In [1]:
# Import libraries
import wandb
from data import X_train, y_train, X_val, y_val, X_test, y_test
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
# Import W&B module for Sklearn
# from models.training_utils import SklearnTrainer
# Import preprocessing module
# from preprocessing_pipeline import pipeline_features
# Import evaluation module
from models.training_utils import evaluate_model

In [None]:
# Create the W&B public API client.
# The timeout is raised to 60 seconds to avoid a warning received otherwise.
WANDB_API_TIMEOUT = 60
api = wandb.Api(timeout=WANDB_API_TIMEOUT)

In [4]:
# Location of the sweep on W&B (change sweep_id as needed).
sweep_id = "c87zll3a"
sweep_path = "100496657-universidad-carlos-iii-de-madrid/DS-HPE/" + sweep_id

In [5]:
# Display the assembled sweep path as a sanity check before querying W&B.
print(sweep_path)

100496657-universidad-carlos-iii-de-madrid/DS-HPE/c87zll3a


#### Evaluate performance of best model recovered by sweep

In [None]:
# Recover sweep information and pick the run with the lowest validation RMSE.
sweep = api.sweep(sweep_path)
best_run = None
best_metric = float("inf")  # we want to minimize RMSE

for run in sweep.runs:
    # Runs that never logged the metric cannot be ranked.
    if "val_rmse_mean" not in run.summary:
        continue
    metric = run.summary["val_rmse_mean"]
    # Skip unusable values (non-numeric placeholders, NaN, inf): comparing them
    # would either raise a TypeError or silently never win.
    if not isinstance(metric, (int, float)) or not np.isfinite(metric):
        continue
    if metric < best_metric:
        best_metric = metric
        best_run = run

# Fail with a clear message instead of an AttributeError on `best_run.id` below.
if best_run is None:
    raise ValueError("No run in the sweep has a finite 'val_rmse_mean' in its summary")

print("Best run ID:", best_run.id)
print("Best metric:", best_metric)
print("Best config:", best_run.config)




Best run ID: blq0thkd
Best metric: 2005.1759502375764
Best config: {'max_depth': 10, 'subsample': 0.6051888233175461, 'n_estimators': 500, 'learning_rate': 0.011957603924852534, 'colsample_bytree': 0.7835834755878521, 'min_child_weight': 7}


In [None]:
# Recover the best run with the wandb API without the loop
best_run = sweep.best_run()  # uses the sweep's metric (val_rmse_mean) and goal (minimize)

# best_run() can come back as None (e.g. a sweep with no runs to rank); stop
# here with a clear message rather than failing on `.config` below.
if best_run is None:
    raise ValueError("sweep.best_run() returned None: no run to take hyperparameters from")

cfg = best_run.config

# Recover best hyperparameters values
BEST_N_ESTIMATORS     = cfg["n_estimators"]
BEST_MAX_DEPTH        = cfg["max_depth"]
BEST_LR               = cfg["learning_rate"]
BEST_SUBSAMPLE        = cfg["subsample"]
BEST_COLSAMPLE        = cfg["colsample_bytree"]
BEST_MIN_CHILD_WEIGHT = cfg["min_child_weight"]


[34m[1mwandb[0m: Sorting runs by +summary_metrics.val_rmse_mean


In [17]:
# Prepare data: stack the train and validation splits row-wise so the final
# model is fitted on both.
X_train_full = pd.concat([X_train, X_val], axis=0)
y_train_full = pd.concat([y_train, y_val], axis=0)

# NumPy versions of the fitting and test sets.
X_train_full_np, y_train_full_np = X_train_full.to_numpy(), y_train_full.to_numpy()
X_test_np, y_test_np = X_test.to_numpy(), y_test.to_numpy()


In [18]:
# Build the model from the best hyperparameters recovered from the sweep.
# NOTE(review): the model is fitted on NumPy arrays, so enable_categorical
# presumably has no effect here — confirm before relying on it.
xgb_params = dict(
    tree_method="hist",
    enable_categorical=True,
    n_estimators=BEST_N_ESTIMATORS,
    max_depth=BEST_MAX_DEPTH,
    learning_rate=BEST_LR,
    subsample=BEST_SUBSAMPLE,
    colsample_bytree=BEST_COLSAMPLE,
    min_child_weight=BEST_MIN_CHILD_WEIGHT,
    random_state=42,
)

# MultiOutputRegressor wraps the XGBoost regressor to handle the multi-column target.
best_model = MultiOutputRegressor(xgb.XGBRegressor(**xgb_params))

best_model.fit(X_train_full_np, y_train_full_np)

0,1,2
,estimator,"XGBRegressor(...ree=None, ...)"
,n_jobs,

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.7835834755878521
,device,
,early_stopping_rounds,
,enable_categorical,True


In [None]:
# Final evaluation of the fitted model on the test split.
metrics_test, y_pred_test = evaluate_model(
    best_model,
    X_test_np,
    y_test_np,
    model_type="sklearn",
)


{'test_rmse_mean_power': 1961.0076152243614, 'test_mae_mean_power': 411.9548391832555, 'test_r2_mean_power': 0.9519173003345748, 'test_mape_mean_power': 23.728127205863125, 'test_rmse_min_power': 1682.9888165081343, 'test_mae_min_power': 380.6175634539869, 'test_r2_min_power': 0.9717275045229045, 'test_mape_min_power': 18.286536680099285, 'test_rmse_max_power': 2137.994675461447, 'test_mae_max_power': 519.955699972344, 'test_r2_max_power': 0.9603550219737873, 'test_mape_max_power': 22.66288604074829, 'test_rmse_mean': 1927.3303690646474, 'test_mae_mean': 437.50936753652877, 'test_r2_mean': 0.9613332756104223, 'test_mape_mean': 21.559183308903567}


In [20]:
# Display the flat test-metrics dictionary returned by evaluate_model.
metrics_test

{'test_rmse_mean_power': 1961.0076152243614,
 'test_mae_mean_power': 411.9548391832555,
 'test_r2_mean_power': 0.9519173003345748,
 'test_mape_mean_power': 23.728127205863125,
 'test_rmse_min_power': 1682.9888165081343,
 'test_mae_min_power': 380.6175634539869,
 'test_r2_min_power': 0.9717275045229045,
 'test_mape_min_power': 18.286536680099285,
 'test_rmse_max_power': 2137.994675461447,
 'test_mae_max_power': 519.955699972344,
 'test_r2_max_power': 0.9603550219737873,
 'test_mape_max_power': 22.66288604074829,
 'test_rmse_mean': 1927.3303690646474,
 'test_mae_mean': 437.50936753652877,
 'test_r2_mean': 0.9613332756104223,
 'test_mape_mean': 21.559183308903567}

In [30]:
# Reorganize the flat metrics into {target: {column: value}} so that the
# DataFrame ends up with one row per target and one column per metric.
targets = ("mean_power", "min_power", "max_power")
# Displayed column label -> metric name used inside the metrics_test keys.
column_to_metric = {
    "rmse (w)": "rmse",
    "mae (w)": "mae",
    "r2": "r2",
    "mape %": "mape",
}

structured = {
    target: {
        column: metrics_test[f"test_{metric}_{target}"]
        for column, metric in column_to_metric.items()
    }
    for target in targets
}

# Transpose: targets become rows, metrics become columns.
df_metrics = pd.DataFrame(structured).T



In [31]:
# Show the per-target test metrics table.
df_metrics

Unnamed: 0,rmse (w),mae (w),r2,mape %
mean_power,1961.007615,411.954839,0.951917,23.728127
min_power,1682.988817,380.617563,0.971728,18.286537
max_power,2137.994675,519.9557,0.960355,22.662886


- R²: good result, as the model explains around 0.95 or more of the variance for every target.



#### Refine the sweep with finer hyperparameters

In [36]:
# Location of the refined sweep on W&B (change sweep_id as needed).
sweep_id = "4qg5wv04"
sweep_path = "100496657-universidad-carlos-iii-de-madrid/DS-HPE/" + sweep_id

In [40]:
# Fetch the sweep at sweep_path (expected to hold the refined sweep's path at this point).
sweep_2 = api.sweep(sweep_path)

In [41]:
# Recover the best run of the refined sweep
best_run = sweep_2.best_run()  # uses the sweep's metric (val_rmse_mean) and goal (minimize)

# best_run() can come back as None (e.g. a sweep with no runs to rank); stop
# here with a clear message rather than failing on `.config` below.
if best_run is None:
    raise ValueError("sweep_2.best_run() returned None: no run to take hyperparameters from")

cfg = best_run.config

# Recover best hyperparameters values
BEST_N_ESTIMATORS     = cfg["n_estimators"]
BEST_MAX_DEPTH        = cfg["max_depth"]
BEST_LR               = cfg["learning_rate"]
BEST_SUBSAMPLE        = cfg["subsample"]
BEST_COLSAMPLE        = cfg["colsample_bytree"]
BEST_MIN_CHILD_WEIGHT = cfg["min_child_weight"]

[34m[1mwandb[0m: Sorting runs by +summary_metrics.val_rmse_mean


In [43]:
# Print the recovered hyperparameters, in order: n_estimators, max_depth,
# learning_rate, subsample, colsample_bytree, min_child_weight.
print(BEST_N_ESTIMATORS, BEST_MAX_DEPTH, BEST_LR, BEST_SUBSAMPLE, BEST_COLSAMPLE, BEST_MIN_CHILD_WEIGHT)

500 11 0.012 0.6 0.8 7


In [44]:
# Prepare data: stack train and validation row-wise, for features and targets alike.
X_train_full, y_train_full = (
    pd.concat([fit_part, val_part], axis=0)
    for fit_part, val_part in ((X_train, X_val), (y_train, y_val))
)

# NumPy versions of the fitting and test sets.
X_train_full_np = X_train_full.to_numpy()
y_train_full_np = y_train_full.to_numpy()
X_test_np, y_test_np = X_test.to_numpy(), y_test.to_numpy()


In [45]:
# Build the refined model from the currently recovered best hyperparameters.
# NOTE(review): the model is fitted on NumPy arrays, so enable_categorical
# presumably has no effect here — confirm before relying on it.
refined_xgb_params = {
    "tree_method": "hist",
    "enable_categorical": True,
    "n_estimators": BEST_N_ESTIMATORS,
    "max_depth": BEST_MAX_DEPTH,
    "learning_rate": BEST_LR,
    "subsample": BEST_SUBSAMPLE,
    "colsample_bytree": BEST_COLSAMPLE,
    "min_child_weight": BEST_MIN_CHILD_WEIGHT,
    "random_state": 42,
}

# MultiOutputRegressor wraps the XGBoost regressor to handle the multi-column target.
refined_base_regressor = xgb.XGBRegressor(**refined_xgb_params)
best_model_refined = MultiOutputRegressor(refined_base_regressor)

best_model_refined.fit(X_train_full_np, y_train_full_np)

0,1,2
,estimator,"XGBRegressor(...ree=None, ...)"
,n_jobs,

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,True


In [None]:
# Final evaluation of the refined model on the test split.
metrics_test, y_pred_test = evaluate_model(
    best_model_refined,
    X_test_np,
    y_test_np,
    model_type="sklearn",
)


In [47]:
# Reorganize the flat metrics into {target: {column: value}} so that the
# DataFrame ends up with one row per target and one column per metric.
structured = {}
for target in ("mean_power", "min_power", "max_power"):
    # Each lookup key has the form test_<metric>_<target>.
    structured[target] = {
        "rmse (w)": metrics_test[f"test_rmse_{target}"],
        "mae (w)": metrics_test[f"test_mae_{target}"],
        "r2": metrics_test[f"test_r2_{target}"],
        "mape %": metrics_test[f"test_mape_{target}"],
    }

# Transpose: targets become rows, metrics become columns.
df_metrics = pd.DataFrame(structured).T



In [48]:
# Show the per-target test metrics table.
df_metrics

Unnamed: 0,rmse (w),mae (w),r2,mape %
mean_power,1953.490168,405.260674,0.952285,23.095143
min_power,1658.562758,371.849153,0.972542,17.849615
max_power,2105.551365,508.578924,0.961549,22.168387
