XGBOOST IS KNOWN FOR SPEED AND PERFORMANCE

### **Next Steps**
- Experiment with different models such as XGBoost.
- Tune hyperparameters to improve performance.
- Try feature engineering to enhance the dataset.


In [1]:
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import time
import optuna

import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


LET'S EXPLORE THE DATA AT AN ADVANCED LEVEL

In [2]:
# Read the preprocessed training features, test features and training targets.
X = pd.read_csv('X_processed.csv')
X_test = pd.read_csv('X_test_processed.csv')
y = pd.read_csv('y_processed.csv')

# Remove the PID and site columns from both feature frames when they are still
# present; errors='ignore' makes the drop a no-op for columns that are absent.
X = X.drop(columns=['PID', 'site'], errors='ignore')
X_test = X_test.drop(columns=['PID', 'site'], errors='ignore')

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)


X_train shape: (6195, 31)
y_train shape: (6195, 11)


LET'S CREATE A FRAMEWORK TO EVALUATE A MODEL

In [3]:
# Function to evaluate a model consistently

def evaluate_model(model, X_train, y_train, X_val, y_val, model_name):
    """Fit ``model`` and report its MAE and RMSE on the training and validation data.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place on ``X_train`` / ``y_train``.
    X_train, y_train
        Features and targets used to fit the model.
    X_val, y_val
        Held-out features and targets used for the validation metrics.
    model_name
        Label used in the printed summary and stored under ``'name'``.

    Returns
    -------
    dict
        Keys ``'model'`` (the fitted object), ``'name'``, ``'train_mae'``,
        ``'val_mae'``, ``'train_rmse'``, ``'val_rmse'`` and ``'train_time'``
        (seconds spent inside ``fit``).
    """
    # perf_counter is a monotonic interval timer; time.time() follows the
    # wall clock and can produce a wrong (even negative) duration if the
    # system clock is adjusted while the model is fitting.
    fit_start = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - fit_start

    # Predict on both splits so train and validation errors can be compared.
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)

    # Mean absolute error on each split.
    train_mae = mean_absolute_error(y_train, y_pred_train)
    val_mae = mean_absolute_error(y_val, y_pred_val)

    # RMSE is the square root of the value returned by mean_squared_error.
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

    # Print results
    print(f"\n{model_name} Results:")
    print(f"Training Time: {train_time:.2f} seconds")
    print(f"Training MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}")
    print(f"Validation MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}")

    # Return the fitted model together with every metric that was printed.
    return {
        'model': model,
        'name': model_name,
        'train_mae': train_mae,
        'val_mae': val_mae,
        'train_rmse': train_rmse,
        'val_rmse': val_rmse,
        'train_time': train_time
    }





XGBOOST MODEL WITH HYPERPARAMETER TUNING
HYPERPARAMETERS ARE SETTINGS THAT CONTROL THE BEHAVIOR OF THE MODEL

In [4]:
# XGBoost model tuned with the Optuna framework
# Define the objective function for Optuna

def get_objective_xgb(X_train, y_train, X_val, y_val):
    """Build an Optuna objective that scores XGBoost settings by validation MAE.

    The returned callable closes over the given training and validation data
    and takes a single ``trial`` argument.
    """

    def objective_xgb(trial):
        """Sample one hyperparameter set, fit on the training data and return the validation MAE."""
        # Suggestions are requested in a fixed order; the names and ranges
        # below define the search space.
        n_estimators = trial.suggest_int('n_estimators', 50, 500)  # how many trees to use
        max_depth = trial.suggest_int('max_depth', 3, 10)  # how deep each tree can go
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)  # how fast the model learns
        subsample = trial.suggest_float('subsample', 0.6, 1.0)  # row sampling: adds randomness, reduces overfitting
        colsample_bytree = trial.suggest_float('colsample_bytree', 0.6, 1.0)  # fraction of features used per tree
        min_child_weight = trial.suggest_int('min_child_weight', 1, 10)  # minimum sum of instance weight
        reg_alpha = trial.suggest_float('reg_alpha', 0.0, 10.0)  # L1 regularization term
        reg_lambda = trial.suggest_float('reg_lambda', 0.0, 10.0)  # L2 regularization term

        candidate = {
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'learning_rate': learning_rate,
            'subsample': subsample,
            'colsample_bytree': colsample_bytree,
            'min_child_weight': min_child_weight,
            'reg_alpha': reg_alpha,
            'reg_lambda': reg_lambda,
            'random_state': 42,  # for reproducibility
        }

        # The regressor is wrapped in MultiOutputRegressor so that several
        # target columns can be predicted.
        base_regressor = xgb.XGBRegressor(**candidate)
        multi_target_model = MultiOutputRegressor(base_regressor)
        multi_target_model.fit(X_train, y_train)

        # The score handed back to Optuna is the MAE on the validation data.
        val_predictions = multi_target_model.predict(X_val)
        return mean_absolute_error(y_val, val_predictions)

    return objective_xgb

# Run the Optuna search.

print('tunning xgboost hyperparameter')
# Seed the sampler so the sequence of suggested hyperparameters is the same on
# every Restart & Run All. Without a seed, each run explores different trials
# and reports different best parameters even though the model itself uses
# random_state=42.
study_xgb = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
objective = get_objective_xgb(X_train, y_train, X_val, y_val)
study_xgb.optimize(objective, n_trials=10)  # adjust the number of trials as needed


print("Best XGBoost Parameters:", study_xgb.best_params)
print("Best XGBoost MAE:", study_xgb.best_value)

# Rebuild the model from the best parameters found. best_params holds only the
# suggested values, so random_state is passed again explicitly.
best_xgb_model = MultiOutputRegressor(xgb.XGBRegressor(**study_xgb.best_params, random_state=42))

# Evaluate the tuned model.
# NOTE: the validation split used here is the same one the search optimised
# against, so the validation score is an optimistic estimate; confirm it on
# data the search never saw before relying on it.
xgb_results = evaluate_model(best_xgb_model, X_train, y_train, X_val, y_val, "XGBoost")

[I 2025-07-07 02:05:40,030] A new study created in memory with name: no-name-48dbdf05-00dd-4720-a7f9-ca757e509db1


tunning xgboost hyperparameter


[I 2025-07-07 02:05:51,516] Trial 0 finished with value: 163.81642150878906 and parameters: {'n_estimators': 392, 'max_depth': 4, 'learning_rate': 0.1740155897487656, 'subsample': 0.8464387587994547, 'colsample_bytree': 0.9802973048705034, 'min_child_weight': 4, 'reg_alpha': 3.2583388217501295, 'reg_lambda': 9.6778674183621}. Best is trial 0 with value: 163.81642150878906.
[I 2025-07-07 02:06:17,250] Trial 1 finished with value: 159.65391540527344 and parameters: {'n_estimators': 342, 'max_depth': 6, 'learning_rate': 0.11614053644429538, 'subsample': 0.828164696839107, 'colsample_bytree': 0.9898418028542564, 'min_child_weight': 1, 'reg_alpha': 0.14943952574635877, 'reg_lambda': 7.120749296186429}. Best is trial 1 with value: 159.65391540527344.
[I 2025-07-07 02:06:33,689] Trial 2 finished with value: 171.03424072265625 and parameters: {'n_estimators': 471, 'max_depth': 4, 'learning_rate': 0.2834714986785285, 'subsample': 0.8110985601169758, 'colsample_bytree': 0.8947036805588438, 'min_

Best XGBoost Parameters: {'n_estimators': 489, 'max_depth': 7, 'learning_rate': 0.07634824088301091, 'subsample': 0.7081509898915024, 'colsample_bytree': 0.9042825182383529, 'min_child_weight': 8, 'reg_alpha': 1.4844375869510362, 'reg_lambda': 9.717618159757942}
Best XGBoost MAE: 157.08587646484375

XGBoost Results:
Training Time: 45.51 seconds
Training MAE: 38.5811, RMSE: 115.9903
Validation MAE: 157.0859, RMSE: 476.6529
