### **LightGBM Model with Hyperparameter Tuning**

LightGBM is a gradient boosting framework that uses tree-based algorithms and is designed for efficiency and low memory usage.

In [2]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import os
import joblib
import time
import optuna
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import time
import matplotlib.pyplot as plt
from sklearn.multioutput import MultiOutputRegressor
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
X = pd.read_csv('X_processed.csv')
X_test = pd.read_csv('X_test_processed.csv')
y = pd.read_csv('y_processed.csv')

# Drop if still in the data
if 'PID' in X.columns:
    X = X.drop(columns=['PID'])
if 'site' in X.columns:
    X = X.drop(columns=['site'])

if 'PID' in X_test.columns:
    X_test = X_test.drop(columns=['PID'])
if 'site' in X_test.columns:
    X_test = X_test.drop(columns=['site'])


#split the data into training and validation sets 
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

X_train shape: (6195, 31)
y_train shape: (6195, 11)


In [4]:
#Function to  evalute  model constently

def evaluate_model(model,X_train,y_train,X_val,y_val,model_name,):
      #tracking training time 
    StartTime = time.time()
    
    #fit the model
    model.fit(X_train,y_train)

    #trainin the time
    trainTime = time.time()-StartTime

    #prediction
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)


    #check the errors
    train_mae = mean_absolute_error(y_train,y_pred_train)
    val_mae = mean_absolute_error(y_val,y_pred_val)


    #check the RMSE
    train_rmse = np.sqrt(mean_squared_error(y_train,y_pred_train))
    val_rmse = np.sqrt(mean_squared_error(y_val,y_pred_val))
   

     # Print results
    print(f"\n{model_name} Results:")
    print(f"Training Time: {trainTime:.2f} seconds")
    print(f"Training MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}")
    print(f"Validation MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}")

    # Return the results
    return {
        'model': model,
        'name': model_name,
        'train_mae': train_mae,
        'val_mae': val_mae,
        'train_rmse': train_rmse,
        'val_rmse': val_rmse,
        'train_time': trainTime
    }





In [5]:
# Define objective function for LightGBM hyperparameter tuning
def objective_lgb(trial):
    # Define hyperparameters to tune
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', -1, 15),  # -1 means no limit
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'random_state': 42
    }
    
    # Create LightGBM MultiOutputRegressor
    lgb_model = MultiOutputRegressor(lgb.LGBMRegressor(**params))
    
    # Train the model
    lgb_model.fit(X_train, y_train)

   
    
    
    # Make predictions
    y_pred = lgb_model.predict(X_val)
    
    # Calculate MAE
    mae = mean_absolute_error(y_val, y_pred)
    
    return mae


In [6]:

# Run the hyperparameter optimization
print("Tuning LightGBM hyperparameters...")
study_lgb = optuna.create_study(direction='minimize')
study_lgb.optimize(objective_lgb, n_trials=10)  # Adjust n_trials as needed

print("Best LightGBM Parameters:", study_lgb.best_params)
print("Best LightGBM MAE:", study_lgb.best_value)

# Create the optimized LightGBM model
best_lgb_model = MultiOutputRegressor(lgb.LGBMRegressor(**study_lgb.best_params, random_state=42))

# Evaluate LightGBM model
lgb_results = evaluate_model(best_lgb_model, X_train, y_train, X_val, y_val, "LightGBM")

[I 2025-07-28 01:19:27,846] A new study created in memory with name: no-name-9d5413c1-b4c0-49c1-a3ad-b986466eb571


Tuning LightGBM hyperparameters...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001604 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 1659.143341
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001972 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 15.498505
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001439 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used feat

[I 2025-07-28 01:19:40,600] Trial 0 finished with value: 160.64048188641925 and parameters: {'n_estimators': 368, 'max_depth': 8, 'learning_rate': 0.11096012833290714, 'num_leaves': 100, 'subsample': 0.7899342443398943, 'colsample_bytree': 0.6814171889347739, 'min_child_samples': 76, 'reg_alpha': 3.232991211934121, 'reg_lambda': 5.87413986192187}. Best is trial 0 with value: 160.64048188641925.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001795 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 1659.143341
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002407 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 15.498505
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002158 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start tr

[I 2025-07-28 01:19:43,705] Trial 1 finished with value: 178.81106520977696 and parameters: {'n_estimators': 297, 'max_depth': 2, 'learning_rate': 0.060611805072987336, 'num_leaves': 45, 'subsample': 0.7714845618870332, 'colsample_bytree': 0.785477648409876, 'min_child_samples': 27, 'reg_alpha': 0.391005515035886, 'reg_lambda': 3.2811186281065585}. Best is trial 0 with value: 160.64048188641925.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004007 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 1659.143341
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002660 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 15.498505
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001621 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start tr

[I 2025-07-28 01:19:46,546] Trial 2 finished with value: 173.57121126715683 and parameters: {'n_estimators': 211, 'max_depth': 2, 'learning_rate': 0.17082058260580366, 'num_leaves': 111, 'subsample': 0.8561743213631976, 'colsample_bytree': 0.937722040365868, 'min_child_samples': 59, 'reg_alpha': 9.576113188432455, 'reg_lambda': 1.125166113772108}. Best is trial 0 with value: 160.64048188641925.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001891 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 1659.143341
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001826 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 15.498505
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002379 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start tr

[I 2025-07-28 01:19:56,875] Trial 3 finished with value: 163.86707315950522 and parameters: {'n_estimators': 114, 'max_depth': 15, 'learning_rate': 0.21369775269616942, 'num_leaves': 120, 'subsample': 0.8054533316333506, 'colsample_bytree': 0.7789173043873641, 'min_child_samples': 25, 'reg_alpha': 2.8497024910176885, 'reg_lambda': 0.7681732062782065}. Best is trial 0 with value: 160.64048188641925.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001811 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 1659.143341
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002242 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 15.498505
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002344 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start tr

[I 2025-07-28 01:20:02,010] Trial 4 finished with value: 166.44788178686358 and parameters: {'n_estimators': 448, 'max_depth': 3, 'learning_rate': 0.24949606155035828, 'num_leaves': 53, 'subsample': 0.6902496742614069, 'colsample_bytree': 0.8798761729471603, 'min_child_samples': 43, 'reg_alpha': 7.543082640862995, 'reg_lambda': 5.762163295770749}. Best is trial 0 with value: 160.64048188641925.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001818 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 1659.143341
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002394 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 15.498505
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003720 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start tr

[I 2025-07-28 01:20:23,266] Trial 5 finished with value: 168.90183264707554 and parameters: {'n_estimators': 442, 'max_depth': 12, 'learning_rate': 0.19759242066785818, 'num_leaves': 106, 'subsample': 0.9891058026167565, 'colsample_bytree': 0.8682147081049165, 'min_child_samples': 100, 'reg_alpha': 8.06209761696913, 'reg_lambda': 0.9305942361936981}. Best is trial 0 with value: 160.64048188641925.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001938 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 1659.143341
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002292 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 15.498505
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002630 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start tr

[I 2025-07-28 01:20:27,482] Trial 6 finished with value: 171.6737465541922 and parameters: {'n_estimators': 357, 'max_depth': 2, 'learning_rate': 0.11313995016968012, 'num_leaves': 52, 'subsample': 0.6290927025160837, 'colsample_bytree': 0.8633480377401888, 'min_child_samples': 70, 'reg_alpha': 2.537320323569374, 'reg_lambda': 0.8239818818570988}. Best is trial 0 with value: 160.64048188641925.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001796 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 1659.143341
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002008 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 15.498505
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001845 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start tr

[I 2025-07-28 01:20:37,842] Trial 7 finished with value: 162.39995051814634 and parameters: {'n_estimators': 459, 'max_depth': 5, 'learning_rate': 0.10886536259263507, 'num_leaves': 115, 'subsample': 0.7266822646887711, 'colsample_bytree': 0.6810652689257735, 'min_child_samples': 49, 'reg_alpha': 1.3066979257521572, 'reg_lambda': 4.155209733088027}. Best is trial 0 with value: 160.64048188641925.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002188 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 1659.143341
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002559 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 15.498505
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001645 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start tr

[I 2025-07-28 01:20:51,693] Trial 8 finished with value: 158.60456579026547 and parameters: {'n_estimators': 291, 'max_depth': 0, 'learning_rate': 0.07167055518766671, 'num_leaves': 33, 'subsample': 0.9225858314439385, 'colsample_bytree': 0.9152274285931559, 'min_child_samples': 63, 'reg_alpha': 1.2678505744945012, 'reg_lambda': 6.678507516347635}. Best is trial 8 with value: 158.60456579026547.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012496 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 1659.143341
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003568 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 15.498505
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001777 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start tr

[I 2025-07-28 01:21:09,362] Trial 9 finished with value: 155.459391416349 and parameters: {'n_estimators': 204, 'max_depth': -1, 'learning_rate': 0.042021659277242655, 'num_leaves': 74, 'subsample': 0.7605786230708458, 'colsample_bytree': 0.8594614489133334, 'min_child_samples': 23, 'reg_alpha': 8.801949165323022, 'reg_lambda': 0.9570375630933903}. Best is trial 9 with value: 155.459391416349.


Best LightGBM Parameters: {'n_estimators': 204, 'max_depth': -1, 'learning_rate': 0.042021659277242655, 'num_leaves': 74, 'subsample': 0.7605786230708458, 'colsample_bytree': 0.8594614489133334, 'min_child_samples': 23, 'reg_alpha': 8.801949165323022, 'reg_lambda': 0.9570375630933903}
Best LightGBM MAE: 155.459391416349
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004178 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 1659.143341
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001883 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6693
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 31
[LightGBM] [Info] Start training from score 15

In [8]:
import pickle

with open("lgb_results.pkl", "wb") as f:
    pickle.dump(lgb_results, f)