### 1. Import libraries and instantiate W&B api

In [1]:
# Import libraries
import wandb
from data import X_train, y_train, X_val, y_val, X_test, y_test
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
# Import evaluation module
from models.training_utils import evaluate_model

In [2]:
# Instantiate the W&B api
# api = wandb.Api()
api = wandb.Api(timeout=60) # Set timeout to 60 seconds to avoid a warning received.

### 2. Configure current sweep information to recover best run

The current sweep configuration is stored on file: sweep.yaml

The sweep is run directly from the anaconda terminal. <br>
We recover the sweep_id to construct the path where it is stored:

In [3]:
# Project path (change sweep_id as needed)
sweep_id = "c87zll3a"
sweep_path = f"100496657-universidad-carlos-iii-de-madrid/DS-HPE/{sweep_id}"

In [4]:
print(sweep_path)

100496657-universidad-carlos-iii-de-madrid/DS-HPE/c87zll3a


Once the sweep has finished, we recover the information concerning the best hyperparameter set.

In [5]:
# Recover sweep information
sweep = api.sweep(sweep_path)

# This code goes over all runs to find the best one, very slow
""" best_run = None
best_metric = float("inf")  # we want to minimize RMSE

for run in sweep.runs:
    if "val_rmse_mean" in run.summary:
        metric = run.summary["val_rmse_mean"]
        if metric < best_metric:
            best_metric = metric
            best_run = run

print("Best run ID:", best_run.id)
print("Best metric:", best_metric)
print("Best config:", best_run.config)"""


' best_run = None\nbest_metric = float("inf")  # we want to minimize RMSE\n\nfor run in sweep.runs:\n    if "val_rmse_mean" in run.summary:\n        metric = run.summary["val_rmse_mean"]\n        if metric < best_metric:\n            best_metric = metric\n            best_run = run\n\nprint("Best run ID:", best_run.id)\nprint("Best metric:", best_metric)\nprint("Best config:", best_run.config)'

In [6]:
# Recover the best run with the wandb API without the loop
# Recover best hyperparameters values
best_run = sweep.best_run()  # uses the sweep's metric (val_rmse_mean) and goal (minimize)
cfg = best_run.config

BEST_N_ESTIMATORS     = cfg["n_estimators"]
BEST_MAX_DEPTH        = cfg["max_depth"]
BEST_LR               = cfg["learning_rate"]
BEST_SUBSAMPLE        = cfg["subsample"]
BEST_COLSAMPLE        = cfg["colsample_bytree"]
BEST_MIN_CHILD_WEIGHT = cfg["min_child_weight"]


[34m[1mwandb[0m: Sorting runs by +summary_metrics.val_rmse_mean


In [9]:
print("Best run ID:", best_run.id)

for elem in cfg:
    print(f"{elem}: {cfg[elem]}")


Best run ID: blq0thkd
max_depth: 10
subsample: 0.6051888233175461
n_estimators: 500
learning_rate: 0.011957603924852534
colsample_bytree: 0.7835834755878521
min_child_weight: 7


### 3. Evaluate performance of best model recovered by sweep

Once we have the set of best hyperparameters as determined by W&B sweep, we run the model with them.

In [17]:
# Prepare data
X_train_full = pd.concat([X_train, X_val], axis=0)
y_train_full = pd.concat([y_train, y_val], axis=0)

X_train_full_np = X_train_full.to_numpy()
y_train_full_np = y_train_full.to_numpy()
X_test_np = X_test.to_numpy()
y_test_np = y_test.to_numpy()


In [18]:
# Build model with chosen best hyperparameters
best_model = MultiOutputRegressor(
    xgb.XGBRegressor(
        tree_method="hist",
        enable_categorical=True,
        n_estimators=BEST_N_ESTIMATORS,
        max_depth=BEST_MAX_DEPTH,
        learning_rate=BEST_LR,
        subsample=BEST_SUBSAMPLE,
        colsample_bytree=BEST_COLSAMPLE,
        min_child_weight=BEST_MIN_CHILD_WEIGHT,
        random_state=42,
    )
)

best_model.fit(X_train_full_np, y_train_full_np)

0,1,2
,estimator,"XGBRegressor(...ree=None, ...)"
,n_jobs,

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.7835834755878521
,device,
,early_stopping_rounds,
,enable_categorical,True


To evaluate the performance, use the evaluate_model() method contained in '\models\training_utils.py' over the test set.

In [None]:
# Final test evaluation
metrics_test, y_pred_test = evaluate_model(best_model, X_test_np, y_test_np, model_type="sklearn")


{'test_rmse_mean_power': 1961.0076152243614, 'test_mae_mean_power': 411.9548391832555, 'test_r2_mean_power': 0.9519173003345748, 'test_mape_mean_power': 23.728127205863125, 'test_rmse_min_power': 1682.9888165081343, 'test_mae_min_power': 380.6175634539869, 'test_r2_min_power': 0.9717275045229045, 'test_mape_min_power': 18.286536680099285, 'test_rmse_max_power': 2137.994675461447, 'test_mae_max_power': 519.955699972344, 'test_r2_max_power': 0.9603550219737873, 'test_mape_max_power': 22.66288604074829, 'test_rmse_mean': 1927.3303690646474, 'test_mae_mean': 437.50936753652877, 'test_r2_mean': 0.9613332756104223, 'test_mape_mean': 21.559183308903567}


In [30]:
# Reorganize into a nested dict, to get the DF as we want it
structured = {
    "mean_power": {
        "rmse (w)": metrics_test["test_rmse_mean_power"],
        "mae (w)":  metrics_test["test_mae_mean_power"],
        "r2":   metrics_test["test_r2_mean_power"],
        "mape %": metrics_test["test_mape_mean_power"],
    },
    "min_power": {
        "rmse (w)": metrics_test["test_rmse_min_power"],
        "mae (w)":  metrics_test["test_mae_min_power"],
        "r2":   metrics_test["test_r2_min_power"],
        "mape %": metrics_test["test_mape_min_power"],
    },
    "max_power": {
        "rmse (w)": metrics_test["test_rmse_max_power"],
        "mae (w)":  metrics_test["test_mae_max_power"],
        "r2":   metrics_test["test_r2_max_power"],
        "mape %": metrics_test["test_mape_max_power"],
    }
}

df_metrics = pd.DataFrame(structured).T



In [31]:
df_metrics

Unnamed: 0,rmse (w),mae (w),r2,mape %
mean_power,1961.007615,411.954839,0.951917,23.728127
min_power,1682.988817,380.617563,0.971728,18.286537
max_power,2137.994675,519.9557,0.960355,22.662886


- r_square: good result, as explain around/more that 0.95 of the variance.
- rmse ~ 1.9 kW
- mae ~ 430 w
- mape ~ 18-24%


### 4. Refine the sweep with finer hyperparameters

The current sweep configuration is stored on file: sweep_2.yaml

The sweep is run directly from the anaconda terminal. <br>
We recover the sweep_id to construct the path where it is stored:

In [10]:
# Project path (change sweep_id as needed)
sweep_id = "4qg5wv04"
sweep_path = f"100496657-universidad-carlos-iii-de-madrid/DS-HPE/{sweep_id}"

In [11]:
sweep_2 = api.sweep(sweep_path)

In [12]:
# Recover best hyperparameters values

best_run = sweep_2.best_run()  # uses the sweep's metric (val_rmse_mean) and goal (minimize)
cfg = best_run.config

BEST_N_ESTIMATORS     = cfg["n_estimators"]
BEST_MAX_DEPTH        = cfg["max_depth"]
BEST_LR               = cfg["learning_rate"]
BEST_SUBSAMPLE        = cfg["subsample"]
BEST_COLSAMPLE        = cfg["colsample_bytree"]
BEST_MIN_CHILD_WEIGHT = cfg["min_child_weight"]

[34m[1mwandb[0m: Sorting runs by +summary_metrics.val_rmse_mean


In [13]:
print("Best run ID:", best_run.id)

for elem in cfg:
    print(f"{elem}: {cfg[elem]}")

Best run ID: pfmqvqhl
max_depth: 11
subsample: 0.6
n_estimators: 500
learning_rate: 0.012
colsample_bytree: 0.8
min_child_weight: 7


In [43]:
print(BEST_N_ESTIMATORS, BEST_MAX_DEPTH, BEST_LR, BEST_SUBSAMPLE, BEST_COLSAMPLE, BEST_MIN_CHILD_WEIGHT)

500 11 0.012 0.6 0.8 7


In [None]:
# Prepare data
X_train_full = pd.concat([X_train, X_val], axis=0)
y_train_full = pd.concat([y_train, y_val], axis=0)

X_train_full_np = X_train_full.to_numpy()
y_train_full_np = y_train_full.to_numpy()
X_test_np = X_test.to_numpy()
y_test_np = y_test.to_numpy()

In [45]:
# Build model with chosen best hyperparameters
best_model_refined = MultiOutputRegressor(
    xgb.XGBRegressor(
        tree_method="hist",
        enable_categorical=True,
        n_estimators=BEST_N_ESTIMATORS,
        max_depth=BEST_MAX_DEPTH,
        learning_rate=BEST_LR,
        subsample=BEST_SUBSAMPLE,
        colsample_bytree=BEST_COLSAMPLE,
        min_child_weight=BEST_MIN_CHILD_WEIGHT,
        random_state=42,
    )
)

best_model_refined.fit(X_train_full_np, y_train_full_np)

0,1,2
,estimator,"XGBRegressor(...ree=None, ...)"
,n_jobs,

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,True


In [None]:
# Final test evaluation
metrics_test, y_pred_test = evaluate_model(best_model_refined, X_test_np, y_test_np, model_type="sklearn")


In [47]:
# Reorganize into a nested dict, to get the DF as we want it
structured = {
    "mean_power": {
        "rmse (w)": metrics_test["test_rmse_mean_power"],
        "mae (w)":  metrics_test["test_mae_mean_power"],
        "r2":   metrics_test["test_r2_mean_power"],
        "mape %": metrics_test["test_mape_mean_power"],
    },
    "min_power": {
        "rmse (w)": metrics_test["test_rmse_min_power"],
        "mae (w)":  metrics_test["test_mae_min_power"],
        "r2":   metrics_test["test_r2_min_power"],
        "mape %": metrics_test["test_mape_min_power"],
    },
    "max_power": {
        "rmse (w)": metrics_test["test_rmse_max_power"],
        "mae (w)":  metrics_test["test_mae_max_power"],
        "r2":   metrics_test["test_r2_max_power"],
        "mape %": metrics_test["test_mape_max_power"],
    }
}

df_metrics = pd.DataFrame(structured).T



In [48]:
df_metrics

Unnamed: 0,rmse (w),mae (w),r2,mape %
mean_power,1953.490168,405.260674,0.952285,23.095143
min_power,1658.562758,371.849153,0.972542,17.849615
max_power,2105.551365,508.578924,0.961549,22.168387


Marginal gains in performance. Likely XGBoost has reached a limit with the current features.



#### Best refined hyperparameters with the AutoML datasets

In [36]:
# Para modelos de árbol no es necesario escalado
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model_train_val(model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)

    # Predicciones
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    # Métricas de entrenamiento
    train_mse = mean_squared_error(y_train, y_train_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    train_rmse = np.sqrt(train_mse)

    # Métricas de validación
    val_mse = mean_squared_error(y_val, y_val_pred)
    val_mae = mean_absolute_error(y_val, y_val_pred)
    val_r2 = r2_score(y_val, y_val_pred)
    val_rmse = np.sqrt(val_mse)

    return {
        "train_mse": train_mse,
        "train_mae": train_mae,
        "train_rmse": train_rmse,
        "train_r2": train_r2,
        "val_mse": val_mse,
        "val_mae": val_mae,
        "val_rmse": val_rmse,
        "val_r2": val_r2,
        "predictions": y_val_pred
    }

In [37]:
# df1
# df2
df=pd.read_csv(r'C:\Users\adeli\Documents 4-Q1\Data Project\GitHub HPE Code\DS-HPE\data\my_dataframe2.csv')

In [38]:
df.head()

Unnamed: 0,group,num_tasks_final,num_tasks_missing_or_inconsistent,submit_time,time_limit_cat,time_limit_scaled,num_nodes_req,has_req_nodes,num_cores_req,cores_per_task,...,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23,node_power_min,node_power_mean,node_power_max
0,main,64,False,2020-05-31 22:09:29+00:00,medium (1–5h),-0.007838,16,0,256,4,...,False,False,False,False,False,True,False,7440,8318.888889,8500
1,main,64,False,2020-05-31 22:22:08+00:00,medium (1–5h),-0.007838,16,0,256,4,...,False,False,False,False,False,True,False,5820,8164.827586,8530
2,main,64,False,2020-05-31 22:41:38+00:00,medium (1–5h),-0.007838,16,0,256,4,...,False,False,False,False,False,True,False,5800,8193.111111,8510
3,main,0,True,2020-05-31 23:26:23+00:00,short (<1h),-1.243395,1,0,32,32,...,False,False,False,False,False,False,True,520,630.377358,860
4,main,0,True,2020-05-31 23:08:01+00:00,short (<1h),-1.243395,1,0,32,32,...,False,False,False,False,False,False,True,860,860.208333,870


In [39]:
df["dow"] = pd.to_datetime(df["submit_time"]).dt.dayofweek 
df["dom"] = pd.to_datetime(df["submit_time"]).dt.day                  # 1–31
df["hour"] = pd.to_datetime(df["submit_time"]).dt.hour
df["is_weekend"] = df["dow"].isin([5,6]).astype(int)
df["month"] = pd.to_datetime(df["submit_time"]).dt.month
df["is_night"] = ((df["hour"] < 7) | (df["hour"] >= 22)).astype(int)
df["is_peak"] = df["hour"].between(9, 18).astype(int)


# Once we have extracted all time-based features, we can drop submit_time
df.drop(columns=["submit_time", ], inplace=True)

# We can now drop the OHE time-based features
cols_to_drop = (
    df.filter(like='dow_').columns.tolist() +
    df.filter(like='dom_').columns.tolist() +
    df.filter(like='hour_').columns.tolist()
)
df = df.drop(columns=cols_to_drop)


In [40]:
df.columns

Index(['group', 'num_tasks_final', 'num_tasks_missing_or_inconsistent',
       'time_limit_cat', 'time_limit_scaled', 'num_nodes_req', 'has_req_nodes',
       'num_cores_req', 'cores_per_task', 'num_gpus_req', 'mem_req',
       'has_req_threads_per_core', 'is_shared_job', 'partition_final',
       'qos_final', 'node_power_min', 'node_power_mean', 'node_power_max',
       'dow', 'dom', 'hour', 'is_weekend', 'month', 'is_night', 'is_peak'],
      dtype='object')

In [41]:
# Convert categorical columns to category codes
categorical_cols = ["group", "time_limit_cat"]

for col in categorical_cols:
    df[col] = df[col].astype("category").cat.codes

In [42]:
# Separate features and target columns
target_cols = [
    "node_power_min",
    "node_power_mean",
    "node_power_max"
]

feature_cols = [
    'group',
    'num_tasks_final',
    'num_tasks_missing_or_inconsistent',
    'time_limit_scaled',
    'time_limit_cat',
    'num_nodes_req',
    'has_req_nodes',
    'num_cores_req',
    'cores_per_task',
    'num_gpus_req',
    'mem_req',
    'has_req_threads_per_core',
    'is_shared_job',
    'dow', 'dom', 'hour', 'is_weekend', 'month', 'is_night', 'is_peak'
]

# Split df into X and Y
X = df[feature_cols].copy()
Y = df[target_cols].copy()

# Split the data into training, validation, and test sets
# Choose: 70% training, 15% validation, 15% test
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(
    X, Y, test_size=0.30, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42
)




In [43]:
# Prepare data
X_train_full = pd.concat([X_train, X_val], axis=0)
y_train_full = pd.concat([y_train, y_val], axis=0)

X_train_full_np = X_train_full.to_numpy()
y_train_full_np = y_train_full.to_numpy()
X_test_np = X_test.to_numpy()
y_test_np = y_test.to_numpy()

from sklearn.multioutput import MultiOutputRegressor
# Build model with chosen best hyperparameters
best_model_refined = MultiOutputRegressor(
    xgb.XGBRegressor(
        tree_method="hist",
        enable_categorical=True,
        n_estimators=500,
        max_depth=10,
        learning_rate=0.011957603924852534,
        subsample=0.60515888233175461,
        colsample_bytree=0.7835834755878521,
        min_child_weight=7,
        random_state=42,
    )
)

best_model_refined.fit(X_train_full_np, y_train_full_np)

from models.training_utils import evaluate_model
# Final test evaluation
metrics_test, y_pred_test = evaluate_model(best_model_refined, X_test_np, y_test_np, model_type="sklearn")
# Reorganize into a nested dict, to get the DF as we want it
structured = {
    "mean_power": {
        "rmse (w)": metrics_test["test_rmse_mean_power"],
        "mae (w)":  metrics_test["test_mae_mean_power"],
        "r2":   metrics_test["test_r2_mean_power"],
        "mape %": metrics_test["test_mape_mean_power"],
    },
    "min_power": {
        "rmse (w)": metrics_test["test_rmse_min_power"],
        "mae (w)":  metrics_test["test_mae_min_power"],
        "r2":   metrics_test["test_r2_min_power"],
        "mape %": metrics_test["test_mape_min_power"],
    },
    "max_power": {
        "rmse (w)": metrics_test["test_rmse_max_power"],
        "mae (w)":  metrics_test["test_mae_max_power"],
        "r2":   metrics_test["test_r2_max_power"],
        "mape %": metrics_test["test_mape_max_power"],
    }
}

df_metrics = pd.DataFrame(structured).T



In [35]:
df_metrics1=df_metrics
df_metrics1

Unnamed: 0,rmse (w),mae (w),r2,mape %
mean_power,1955.41869,506.581277,0.841551,25.185728
min_power,1246.683433,386.94655,0.970222,15.644508
max_power,2130.42803,590.260103,0.942625,18.832413


In [44]:
df_metrics2=df_metrics
df_metrics2

Unnamed: 0,rmse (w),mae (w),r2,mape %
mean_power,1473.284997,403.236158,0.960002,21.27593
min_power,1158.471818,353.683116,0.982494,17.281759
max_power,1838.357498,509.844223,0.965598,21.34832


In [None]:
df_metrics2=df_metrics
df_metrics2