# XGBoost Model Development

**XGBoost** is an optimized distributed gradient boosting library designed to be highly efficient, flexible, and portable. It implements machine learning algorithms under the Gradient Boosting framework. XGBoost provides a parallel tree boosting (also known as GBDT, GBM) that solves many data science problems in a fast and accurate way.

### 1. Load the Datasets

In [1]:
import pandas as pd
from pathlib import Path

# Directory where the processed data is stored
data_path = Path("../processed_data")

# Load the training and validation datasets
X_train, X_val, y_train, y_val = (
    pd.read_csv(data_path / "X_train.csv"),
    pd.read_csv(data_path / "X_val.csv"),
    pd.read_csv(data_path / "y_train.csv"),
    pd.read_csv(data_path / "y_val.csv")
)

# Combine train and validation sets for robust K-Fold tuning
features = pd.concat([X_train, X_val], ignore_index=True)
targets = pd.concat([y_train, y_val], ignore_index=True)

# Display the shapes of the datasets
print(f"features shape: {features.shape}")
print(f"targets shape: {targets.shape}")

features shape: (2000, 157)
targets shape: (2000, 10)


### 2. Hyperparameter Tuning & Model Training

In [2]:
import optuna
import warnings
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# Define the objective function for Optuna
def objective(trial: optuna.Trial, X: pd.DataFrame, y: pd.Series):
    """
    Objective function for Optuna to minimize Mean Absolute Percentage Error (MAPE)
    using an XGBoost Regressor with K-Fold cross-validation.

    Parameters:
      trial (optuna.Trial): An Optuna trial object that suggests hyperparameters.
      X (pd.DataFrame): Feature matrix for training.
      y (pd.Series): Target variable for training.

    Returns:
      float: The mean absolute percentage error (MAPE) of the model on the validation set during cross-validation.
    """
    # Define the hyperparameter search space for XGBoost
    param = {
        'objective': 'reg:squarederror',
        'eval_metric': 'mape',
        'early_stopping_round': 10,
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'random_state': 42,
        'n_jobs': -1
    }

    # Use K-Fold cross-validation to get a robust estimate of the model's performance
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mape_scores = []
    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        model = xgb.XGBRegressor(**param)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
        preds = model.predict(X_val)
        mape_scores.append(mean_absolute_percentage_error(y_val, preds))

    return np.mean(mape_scores)

In [3]:
import joblib

# Define the directory for saving models and Optuna studies
model_dir = Path("../models/xgboost")
model_dir.mkdir(parents=True, exist_ok=True)

optuna_dir = Path("../optuna_db")
optuna_dir.mkdir(parents=True, exist_ok=True)
storage_name = f"sqlite:///{optuna_dir}/xgboost_studies.db"

# Dictionary to store the best models
best_models = {}

# Iterate over each target property to tune and train a model
for target in targets.columns:
    print(f"\n--- Tuning and Training for {target} ---\n")
    y = targets[target]

    # Create an Optuna study to find the best hyperparameters
    study = optuna.create_study(direction='minimize',
                                study_name='lightgbm-tuning-' + target,
                                storage=storage_name)
    study.optimize(lambda trial: objective(trial, features, y), n_trials=50)

    # Get the best hyperparameters
    best_params = study.best_params
    print(f"Best MAPE for {target}: {study.best_value}")
    print(f"Best hyperparameters for {target}: {best_params}")

    # Train the final model with the best hyperparameters on the entire training set
    final_model = xgb.XGBRegressor(**best_params, random_state=42, n_jobs=-1)
    final_model.fit(features, y)

    # Save the trained model to a file
    joblib.dump(final_model, f'{model_dir}/{target}_model.joblib')
    print(f"Saved best model for {target}")

    # Store the best model with its MAPE score in the dictionary
    best_models[target] = (final_model, study.best_value)


--- Tuning and Training for BlendProperty1 ---



[I 2025-07-18 18:02:10,912] A new study created in RDB with name: lightgbm-tuning-BlendProperty1
[I 2025-07-18 18:02:18,871] Trial 0 finished with value: 4.141636731702515 and parameters: {'n_estimators': 371, 'learning_rate': 0.12336954551514612, 'max_depth': 12, 'min_child_weight': 1, 'subsample': 0.6645881996196517, 'colsample_bytree': 0.9274202984276753, 'gamma': 0.9768356863092365}. Best is trial 0 with value: 4.141636731702515.
[I 2025-07-18 18:02:27,523] Trial 1 finished with value: 1.8327686129433811 and parameters: {'n_estimators': 599, 'learning_rate': 0.2710510703021149, 'max_depth': 4, 'min_child_weight': 4, 'subsample': 0.6027453482840737, 'colsample_bytree': 0.9304553850344833, 'gamma': 0.4868454381737741}. Best is trial 1 with value: 1.8327686129433811.
[I 2025-07-18 18:02:51,336] Trial 2 finished with value: 3.4797449080854412 and parameters: {'n_estimators': 995, 'learning_rate': 0.06564972684491026, 'max_depth': 5, 'min_child_weight': 3, 'subsample': 0.853074398113560

Best MAPE for BlendProperty1: 1.1241262852486753
Best hyperparameters for BlendProperty1: {'n_estimators': 880, 'learning_rate': 0.22507387950939334, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.6978817111166078, 'colsample_bytree': 0.8463349997475814, 'gamma': 2.8690698326696795}


[I 2025-07-18 18:11:36,715] A new study created in RDB with name: lightgbm-tuning-BlendProperty2


Saved best model for BlendProperty1

--- Tuning and Training for BlendProperty2 ---



[I 2025-07-18 18:11:46,485] Trial 0 finished with value: 1.1002017132746689 and parameters: {'n_estimators': 607, 'learning_rate': 0.2614929109044249, 'max_depth': 11, 'min_child_weight': 10, 'subsample': 0.8782122807079289, 'colsample_bytree': 0.7227047112202676, 'gamma': 1.4303406413260182}. Best is trial 0 with value: 1.1002017132746689.
[I 2025-07-18 18:11:59,704] Trial 1 finished with value: 0.927929783607872 and parameters: {'n_estimators': 941, 'learning_rate': 0.07495811255311499, 'max_depth': 4, 'min_child_weight': 5, 'subsample': 0.9968521136229886, 'colsample_bytree': 0.9710798062461475, 'gamma': 4.2862014678802325}. Best is trial 1 with value: 0.927929783607872.
[I 2025-07-18 18:12:07,655] Trial 2 finished with value: 1.3170733076058927 and parameters: {'n_estimators': 515, 'learning_rate': 0.25272536336898854, 'max_depth': 12, 'min_child_weight': 10, 'subsample': 0.6101684621395093, 'colsample_bytree': 0.9586262544975117, 'gamma': 2.024317046038116}. Best is trial 1 with v

Best MAPE for BlendProperty2: 0.7361791339023062
Best hyperparameters for BlendProperty2: {'n_estimators': 852, 'learning_rate': 0.028984637397741222, 'max_depth': 10, 'min_child_weight': 4, 'subsample': 0.6426589058512316, 'colsample_bytree': 0.9565959429780223, 'gamma': 0.24319246921625806}


[I 2025-07-18 18:29:14,217] A new study created in RDB with name: lightgbm-tuning-BlendProperty3


Saved best model for BlendProperty2

--- Tuning and Training for BlendProperty3 ---



[I 2025-07-18 18:29:24,113] Trial 0 finished with value: 1.4558833952232137 and parameters: {'n_estimators': 617, 'learning_rate': 0.13667540152134555, 'max_depth': 6, 'min_child_weight': 10, 'subsample': 0.6792724623395565, 'colsample_bytree': 0.9545057481869889, 'gamma': 1.5341544697736187}. Best is trial 0 with value: 1.4558833952232137.
[I 2025-07-18 18:29:36,688] Trial 1 finished with value: 1.4987255175071268 and parameters: {'n_estimators': 573, 'learning_rate': 0.2887397111147945, 'max_depth': 9, 'min_child_weight': 2, 'subsample': 0.8864004651817182, 'colsample_bytree': 0.6975810592889595, 'gamma': 1.9344185199672903}. Best is trial 0 with value: 1.4558833952232137.
[I 2025-07-18 18:29:47,074] Trial 2 finished with value: 1.390595459062029 and parameters: {'n_estimators': 638, 'learning_rate': 0.18851098577814468, 'max_depth': 12, 'min_child_weight': 5, 'subsample': 0.8505385476681101, 'colsample_bytree': 0.7345127761722261, 'gamma': 0.9257970736993737}. Best is trial 2 with v

Best MAPE for BlendProperty3: 1.0321010452934707
Best hyperparameters for BlendProperty3: {'n_estimators': 476, 'learning_rate': 0.08162178316962118, 'max_depth': 5, 'min_child_weight': 3, 'subsample': 0.7236777642323883, 'colsample_bytree': 0.8748667005043861, 'gamma': 0.10049373241987958}


[I 2025-07-18 18:41:59,869] A new study created in RDB with name: lightgbm-tuning-BlendProperty4


Saved best model for BlendProperty3

--- Tuning and Training for BlendProperty4 ---



[I 2025-07-18 18:42:13,167] Trial 0 finished with value: 1.4249944884593087 and parameters: {'n_estimators': 932, 'learning_rate': 0.22503118985612516, 'max_depth': 7, 'min_child_weight': 4, 'subsample': 0.625044857398063, 'colsample_bytree': 0.6582378024384704, 'gamma': 3.0597875614158445}. Best is trial 0 with value: 1.4249944884593087.
[I 2025-07-18 18:42:25,496] Trial 1 finished with value: 1.0979762088058047 and parameters: {'n_estimators': 780, 'learning_rate': 0.07206505978940098, 'max_depth': 4, 'min_child_weight': 10, 'subsample': 0.842583416133943, 'colsample_bytree': 0.9615551768012877, 'gamma': 4.131173788032324}. Best is trial 1 with value: 1.0979762088058047.
[I 2025-07-18 18:42:37,340] Trial 2 finished with value: 1.0374999856306395 and parameters: {'n_estimators': 387, 'learning_rate': 0.07884229749794658, 'max_depth': 5, 'min_child_weight': 9, 'subsample': 0.8800221404390621, 'colsample_bytree': 0.6370824449038793, 'gamma': 0.6788476862770204}. Best is trial 2 with val

Best MAPE for BlendProperty4: 0.7858338673851748
Best hyperparameters for BlendProperty4: {'n_estimators': 425, 'learning_rate': 0.09193583839271054, 'max_depth': 4, 'min_child_weight': 3, 'subsample': 0.6499636125937205, 'colsample_bytree': 0.8006761219334343, 'gamma': 0.22350016984563037}


[I 2025-07-18 18:49:19,196] A new study created in RDB with name: lightgbm-tuning-BlendProperty5


Saved best model for BlendProperty4

--- Tuning and Training for BlendProperty5 ---



[I 2025-07-18 18:49:28,952] Trial 0 finished with value: 0.9623853200672192 and parameters: {'n_estimators': 695, 'learning_rate': 0.20625559147348096, 'max_depth': 8, 'min_child_weight': 6, 'subsample': 0.8700798091326055, 'colsample_bytree': 0.7510621930917555, 'gamma': 2.7267592782835592}. Best is trial 0 with value: 0.9623853200672192.
[I 2025-07-18 18:49:41,693] Trial 1 finished with value: 1.6856865863171258 and parameters: {'n_estimators': 601, 'learning_rate': 0.2959176346106126, 'max_depth': 5, 'min_child_weight': 8, 'subsample': 0.8308012051280824, 'colsample_bytree': 0.6783304847384662, 'gamma': 2.7488217137948228}. Best is trial 0 with value: 0.9623853200672192.
[I 2025-07-18 18:49:48,145] Trial 2 finished with value: 0.5396950464969622 and parameters: {'n_estimators': 453, 'learning_rate': 0.20573240099860873, 'max_depth': 10, 'min_child_weight': 6, 'subsample': 0.9141982220877876, 'colsample_bytree': 0.768142740492366, 'gamma': 4.330467169523913}. Best is trial 2 with val

Best MAPE for BlendProperty5: 0.10843618885685638
Best hyperparameters for BlendProperty5: {'n_estimators': 387, 'learning_rate': 0.033077575590718146, 'max_depth': 9, 'min_child_weight': 5, 'subsample': 0.6583575663383667, 'colsample_bytree': 0.9786326642441552, 'gamma': 0.1720418681846134}


[I 2025-07-18 18:55:53,423] A new study created in RDB with name: lightgbm-tuning-BlendProperty6


Saved best model for BlendProperty5

--- Tuning and Training for BlendProperty6 ---



[I 2025-07-18 18:56:05,600] Trial 0 finished with value: 1.132631092944124 and parameters: {'n_estimators': 891, 'learning_rate': 0.2587131414398806, 'max_depth': 9, 'min_child_weight': 5, 'subsample': 0.899785789862459, 'colsample_bytree': 0.8145764896401169, 'gamma': 2.849186617569761}. Best is trial 0 with value: 1.132631092944124.
[I 2025-07-18 18:56:14,481] Trial 1 finished with value: 0.9319498690445771 and parameters: {'n_estimators': 559, 'learning_rate': 0.2459008164537578, 'max_depth': 3, 'min_child_weight': 8, 'subsample': 0.7965032103713777, 'colsample_bytree': 0.8575406197909252, 'gamma': 1.3096497673964314}. Best is trial 1 with value: 0.9319498690445771.
[I 2025-07-18 18:56:21,196] Trial 2 finished with value: 0.9745259157283659 and parameters: {'n_estimators': 289, 'learning_rate': 0.052518716381281284, 'max_depth': 9, 'min_child_weight': 5, 'subsample': 0.6400597357587372, 'colsample_bytree': 0.9670698525671745, 'gamma': 4.382351869348925}. Best is trial 1 with value: 

Best MAPE for BlendProperty6: 0.5083318467118708
Best hyperparameters for BlendProperty6: {'n_estimators': 786, 'learning_rate': 0.022152427802757575, 'max_depth': 3, 'min_child_weight': 7, 'subsample': 0.8899104751708139, 'colsample_bytree': 0.6155627494231547, 'gamma': 0.0033095618007862315}


[I 2025-07-18 19:07:51,271] A new study created in RDB with name: lightgbm-tuning-BlendProperty7


Saved best model for BlendProperty6

--- Tuning and Training for BlendProperty7 ---



[I 2025-07-18 19:08:03,876] Trial 0 finished with value: 2.3454663335950663 and parameters: {'n_estimators': 749, 'learning_rate': 0.0691062753679637, 'max_depth': 9, 'min_child_weight': 1, 'subsample': 0.7715136649096801, 'colsample_bytree': 0.7388235819064919, 'gamma': 1.6494609106113278}. Best is trial 0 with value: 2.3454663335950663.
[I 2025-07-18 19:08:10,117] Trial 1 finished with value: 2.366803781999487 and parameters: {'n_estimators': 347, 'learning_rate': 0.08160969889864614, 'max_depth': 6, 'min_child_weight': 9, 'subsample': 0.6356772804352856, 'colsample_bytree': 0.7332828007251767, 'gamma': 2.938199552712975}. Best is trial 0 with value: 2.3454663335950663.
[I 2025-07-18 19:08:23,866] Trial 2 finished with value: 1.8318456412043436 and parameters: {'n_estimators': 871, 'learning_rate': 0.07077475290040229, 'max_depth': 12, 'min_child_weight': 10, 'subsample': 0.7868458067764973, 'colsample_bytree': 0.8190258935914303, 'gamma': 1.1967127814362817}. Best is trial 2 with va

Best MAPE for BlendProperty7: 1.2475005259880885
Best hyperparameters for BlendProperty7: {'n_estimators': 659, 'learning_rate': 0.20672246147027495, 'max_depth': 6, 'min_child_weight': 5, 'subsample': 0.817080911470997, 'colsample_bytree': 0.8489057118340431, 'gamma': 0.08389820949239107}


[I 2025-07-18 19:14:42,510] A new study created in RDB with name: lightgbm-tuning-BlendProperty8


Saved best model for BlendProperty7

--- Tuning and Training for BlendProperty8 ---



[I 2025-07-18 19:14:48,858] Trial 0 finished with value: 1.3611075666581953 and parameters: {'n_estimators': 441, 'learning_rate': 0.2508202292027521, 'max_depth': 12, 'min_child_weight': 10, 'subsample': 0.675719777536427, 'colsample_bytree': 0.6146623527254323, 'gamma': 2.47946197781575}. Best is trial 0 with value: 1.3611075666581953.
[I 2025-07-18 19:15:06,242] Trial 1 finished with value: 1.2574931932156894 and parameters: {'n_estimators': 824, 'learning_rate': 0.11067646938664769, 'max_depth': 11, 'min_child_weight': 9, 'subsample': 0.9781735350640823, 'colsample_bytree': 0.861807175077381, 'gamma': 0.3017772788188966}. Best is trial 1 with value: 1.2574931932156894.
[I 2025-07-18 19:15:18,820] Trial 2 finished with value: 0.9129056077104775 and parameters: {'n_estimators': 768, 'learning_rate': 0.06482933871294348, 'max_depth': 4, 'min_child_weight': 2, 'subsample': 0.9213327984781736, 'colsample_bytree': 0.8528441252560182, 'gamma': 0.5057652297433962}. Best is trial 2 with val

Best MAPE for BlendProperty8: 0.8355401512919418
Best hyperparameters for BlendProperty8: {'n_estimators': 816, 'learning_rate': 0.022224695560252732, 'max_depth': 4, 'min_child_weight': 10, 'subsample': 0.7397469357992161, 'colsample_bytree': 0.8322767171232157, 'gamma': 0.20365762188005826}


[I 2025-07-18 19:26:44,423] A new study created in RDB with name: lightgbm-tuning-BlendProperty9


Saved best model for BlendProperty8

--- Tuning and Training for BlendProperty9 ---



[I 2025-07-18 19:26:58,242] Trial 0 finished with value: 2.0755718994294425 and parameters: {'n_estimators': 939, 'learning_rate': 0.10249483868084532, 'max_depth': 9, 'min_child_weight': 6, 'subsample': 0.7897771856631932, 'colsample_bytree': 0.8974205942031239, 'gamma': 3.9464387873835642}. Best is trial 0 with value: 2.0755718994294425.
[I 2025-07-18 19:27:03,826] Trial 1 finished with value: 2.137714736414493 and parameters: {'n_estimators': 213, 'learning_rate': 0.05107447644515432, 'max_depth': 5, 'min_child_weight': 4, 'subsample': 0.9645202502229324, 'colsample_bytree': 0.6646571672609154, 'gamma': 4.430006086454757}. Best is trial 0 with value: 2.0755718994294425.
[I 2025-07-18 19:27:16,354] Trial 2 finished with value: 1.9865888913560923 and parameters: {'n_estimators': 575, 'learning_rate': 0.05904563804357888, 'max_depth': 11, 'min_child_weight': 3, 'subsample': 0.8264601922516867, 'colsample_bytree': 0.7805863440859031, 'gamma': 1.8850793405979838}. Best is trial 2 with va

Best MAPE for BlendProperty9: 1.2869225268241347
Best hyperparameters for BlendProperty9: {'n_estimators': 774, 'learning_rate': 0.14749442928467016, 'max_depth': 8, 'min_child_weight': 7, 'subsample': 0.7304992131867065, 'colsample_bytree': 0.7151321380366973, 'gamma': 0.7974542821606588}


[I 2025-07-18 19:41:01,315] A new study created in RDB with name: lightgbm-tuning-BlendProperty10


Saved best model for BlendProperty9

--- Tuning and Training for BlendProperty10 ---



[I 2025-07-18 19:41:14,215] Trial 0 finished with value: 0.9983601362868069 and parameters: {'n_estimators': 910, 'learning_rate': 0.23283198153222062, 'max_depth': 8, 'min_child_weight': 6, 'subsample': 0.6618314725628516, 'colsample_bytree': 0.9336520233255374, 'gamma': 1.7412321783481004}. Best is trial 0 with value: 0.9983601362868069.
[I 2025-07-18 19:41:25,109] Trial 1 finished with value: 1.3377526006046 and parameters: {'n_estimators': 773, 'learning_rate': 0.1975905326060107, 'max_depth': 8, 'min_child_weight': 4, 'subsample': 0.7333482346004172, 'colsample_bytree': 0.6427291964715055, 'gamma': 4.982229312136605}. Best is trial 0 with value: 0.9983601362868069.
[I 2025-07-18 19:41:40,271] Trial 2 finished with value: 0.8624816962546552 and parameters: {'n_estimators': 726, 'learning_rate': 0.09207896776720402, 'max_depth': 6, 'min_child_weight': 9, 'subsample': 0.7482231921331831, 'colsample_bytree': 0.662025698895869, 'gamma': 1.5594917071310883}. Best is trial 2 with value: 

Best MAPE for BlendProperty10: 0.597802886377014
Best hyperparameters for BlendProperty10: {'n_estimators': 140, 'learning_rate': 0.07885605063192688, 'max_depth': 3, 'min_child_weight': 5, 'subsample': 0.9812543671296039, 'colsample_bytree': 0.6765990082833855, 'gamma': 0.05835828780964476}
Saved best model for BlendProperty10


### 3. List the Best Models with MAPE Scores

In [4]:
print("--- BEST MODELS AND MAPE SCORES ---\n")
for target, (model, mape) in best_models.items():
    print(f"{target}: MAPE = {mape:.4f}, Model = {model}")
print("--- ALL MODELS SAVED TO DISK ---\n")
print(f"Models are saved in: {model_dir}")
print(f"Optuna studies are saved in: {optuna_dir}")

--- BEST MODELS AND MAPE SCORES ---

BlendProperty1: MAPE = 1.1241, Model = XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8463349997475814, device=None,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, feature_weights=None,
             gamma=2.8690698326696795, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.22507387950939334,
             max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=5, max_leaves=None,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=880, n_jobs=-1,
             num_parallel_tree=None, ...)
BlendProperty2: MAPE = 0.7362, Model = XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylev