### **CatBoost Model with Hyperparameter Tuning**

CatBoost is especially good at handling categorical features with minimal preprocessing, and often achieves high performance out-of-the-box.

In [8]:

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import catboost as cb
import time
import optuna
import joblib # for saving the model
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


LETS EXPLORE THE DATA TO AN ADVANCE LEVEL

In [9]:
X = pd.read_csv('X_processed.csv')
X_test = pd.read_csv('X_test_processed.csv')
y = pd.read_csv('y_processed.csv')

# Drop if still in the data
if 'PID' in X.columns:
    X = X.drop(columns=['PID'])
if 'site' in X.columns:
    X = X.drop(columns=['site'])

if 'PID' in X_test.columns:
    X_test = X_test.drop(columns=['PID'])
if 'site' in X_test.columns:
    X_test = X_test.drop(columns=['site'])


#split the data into training and validation sets 
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)


X_train shape: (6195, 31)
y_train shape: (6195, 11)


In [12]:
def evaluate_model(model,X_train,y_train,X_val,y_val,model_name,):
      #tracking training time 
    StartTime = time.time()
    
    #fit the model
    model.fit(X_train,y_train)

    #trainin the time
    trainTime = time.time()-StartTime

    #prediction
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)


    #check the errors
    train_mae = mean_absolute_error(y_train,y_pred_train)
    val_mae = mean_absolute_error(y_val,y_pred_val)


    #check the RMSE
    train_rmse = np.sqrt(mean_squared_error(y_train,y_pred_train))
    val_rmse = np.sqrt(mean_squared_error(y_val,y_pred_val))
   
 
     # Print results
    print(f"\n{model_name} Results:")
    print(f"Training Time: {trainTime:.2f} seconds")
    print(f"Training MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}")
    print(f"Validation MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}")

    # Return the results
    return {
        'model': model,
        'name': model_name,
        'train_mae': train_mae,
        'val_mae': val_mae,
        'train_rmse': train_rmse,
        'val_rmse': val_rmse,
        'train_time': trainTime
    }





In [14]:
# Define objective function for CatBoost hyperparameter tuning

def get_objective_cb(X_train, y_train, X_val, y_val):
  def objective_cb(trial):
    # Define hyperparameters to tune
    params = {
        'iterations': trial.suggest_int('iterations', 50, 500),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'random_strength': trial.suggest_float('random_strength', 0.1, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_seed': 42,
        'verbose': False
    }
    
    # Create CatBoost MultiOutputRegressor
    cb_model = MultiOutputRegressor(cb.CatBoostRegressor(**params))
    
    # Train the model
    cb_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = cb_model.predict(X_val)
    
    # Calculate MAE
    mae = mean_absolute_error(y_val, y_pred)
    
    return mae
  return objective_cb


In [15]:

# Run the hyperparameter optimization
print("Tuning CatBoost hyperparameters...")

study_cb = optuna.create_study(direction='minimize')
objective = get_objective_cb(X_train, y_train, X_val, y_val)
study_cb.optimize(objective, n_trials=10)  # Adjust n_trials as needed

print("Best CatBoost Parameters:", study_cb.best_params)
print("Best CatBoost MAE:", study_cb.best_value)

# Create the optimized CatBoost model
best_cb_model = MultiOutputRegressor(cb.CatBoostRegressor(**study_cb.best_params, random_seed=42, verbose=False))

# Evaluate CatBoost model
cb_results = evaluate_model(best_cb_model, X_train, y_train, X_val, y_val, "CatBoost")

[I 2025-07-28 01:30:36,090] A new study created in memory with name: no-name-db75eba3-da13-429f-b5a2-55ceddfc4d33


Tuning CatBoost hyperparameters...


[I 2025-07-28 01:31:32,226] Trial 0 finished with value: 196.09216792231825 and parameters: {'iterations': 228, 'depth': 7, 'learning_rate': 0.013592308776644446, 'l2_leaf_reg': 9.107069390683748, 'random_strength': 6.08636489689598, 'bagging_temperature': 8.619302345109665, 'border_count': 166}. Best is trial 0 with value: 196.09216792231825.
[I 2025-07-28 01:31:51,869] Trial 1 finished with value: 164.27686954645102 and parameters: {'iterations': 181, 'depth': 5, 'learning_rate': 0.26352481047899995, 'l2_leaf_reg': 3.4421764403593578, 'random_strength': 0.685174831713475, 'bagging_temperature': 2.325972510658554, 'border_count': 81}. Best is trial 1 with value: 164.27686954645102.
[I 2025-07-28 01:32:36,146] Trial 2 finished with value: 163.36552804243647 and parameters: {'iterations': 304, 'depth': 7, 'learning_rate': 0.21245097440488545, 'l2_leaf_reg': 4.2024436282738, 'random_strength': 6.373694506837399, 'bagging_temperature': 7.3978419762545595, 'border_count': 85}. Best is tria

Best CatBoost Parameters: {'iterations': 201, 'depth': 10, 'learning_rate': 0.13951721250716237, 'l2_leaf_reg': 6.903741322341244, 'random_strength': 9.16915561060852, 'bagging_temperature': 4.422016354727651, 'border_count': 102}
Best CatBoost MAE: 159.01367684587993

CatBoost Results:
Training Time: 109.33 seconds
Training MAE: 86.2613, RMSE: 230.5522
Validation MAE: 159.0137, RMSE: 480.1490


In [16]:
import pickle

with open("cb_results.pkl", "wb") as f:
    pickle.dump(cb_results, f)
