XGBOOST IS KNOWN FOR SPEED AND PERFORMANCE

### **Next Steps**
- Experiment with different models such as XGBoost
- Tune hyperparameters to improve performance.
- Try feature engineering to enhance the dataset.


In [2]:
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import time
import optuna
import joblib # for saving the model

import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


LET'S EXPLORE THE DATA AT AN ADVANCED LEVEL

In [3]:
# Load the preprocessed feature tables and the target table from CSV.
X = pd.read_csv('X_processed.csv')
X_test = pd.read_csv('X_test_processed.csv')
y = pd.read_csv('y_processed.csv')

# Remove 'PID' and 'site' from both feature frames if they are still there;
# errors='ignore' skips whichever of the two is already absent.
X = X.drop(columns=['PID', 'site'], errors='ignore')
X_test = X_test.drop(columns=['PID', 'site'], errors='ignore')

# Hold out 20% of the rows as a validation set (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)


X_train shape: (6195, 31)
y_train shape: (6195, 11)


LET'S CREATE A FRAMEWORK TO EVALUATE A MODEL

In [4]:
# Function to evaluate a model consistently

def evaluate_model(model, X_train, y_train, X_val, y_val, model_name):
    """Fit ``model`` and report MAE / RMSE on the training and validation data.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place on the training data.
    X_train, y_train
        Features and targets the model is fitted on.
    X_val, y_val
        Held-out features and targets, used only for scoring.
    model_name : str
        Label used in the printed report and stored under ``'name'``.

    Returns
    -------
    dict
        ``'model'`` (the fitted estimator), ``'name'``, ``'train_mae'``,
        ``'val_mae'``, ``'train_rmse'``, ``'val_rmse'`` and ``'train_time'``
        (seconds spent inside ``model.fit``).
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by a system clock adjustment that happens while the model is fitting.
    fit_started = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - fit_started

    # Predict on both splits so training and validation errors can be compared.
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)

    # Mean absolute error
    train_mae = mean_absolute_error(y_train, y_pred_train)
    val_mae = mean_absolute_error(y_val, y_pred_val)

    # Root mean squared error: square root of the value mean_squared_error returns
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

    # Print results
    print(f"\n{model_name} Results:")
    print(f"Training Time: {train_time:.2f} seconds")
    print(f"Training MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}")
    print(f"Validation MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}")

    # Return the results
    return {
        'model': model,
        'name': model_name,
        'train_mae': train_mae,
        'val_mae': val_mae,
        'train_rmse': train_rmse,
        'val_rmse': val_rmse,
        'train_time': train_time
    }





XGBOOST MODEL WITH HYPERPARAMETER TUNING
HYPERPARAMETERS ARE THE SETTINGS THAT CONTROL THE BEHAVIOR OF THE MODEL

In [5]:
# XGBoost model tuned with the Optuna framework
# Define the objective function for Optuna

def get_objective_xgb(X_train, y_train, X_val, y_val):
    """Build an Optuna objective bound to the given train/validation split.

    The returned callable takes an Optuna ``trial``, samples a set of XGBoost
    hyperparameters from it, fits a multi-output XGBoost regressor on
    ``X_train`` / ``y_train`` and returns the mean absolute error of its
    predictions on ``X_val`` / ``y_val``.
    """

    def objective_xgb(trial):
        """Train one candidate configuration and return its validation MAE."""
        # Hyperparameters sampled for this trial
        search_space = dict(
            # how many trees to use
            n_estimators=trial.suggest_int('n_estimators', 50, 500),
            # how deep each tree can go
            max_depth=trial.suggest_int('max_depth', 3, 10),
            # how fast the model learns
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
            # row subsampling: adds randomness and reduces overfitting
            subsample=trial.suggest_float('subsample', 0.6, 1.0),
            # fraction of features used for each tree
            colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
            # minimum sum of instance weight
            min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
            # L1 regularization term
            reg_alpha=trial.suggest_float('reg_alpha', 0.0, 10.0),
            # L2 regularization term
            reg_lambda=trial.suggest_float('reg_lambda', 0.0, 10.0),
            # fixed (not tuned) for reproducibility
            random_state=42,
        )

        # XGBRegressor is wrapped in MultiOutputRegressor so that multiple
        # targets can be predicted.
        candidate = MultiOutputRegressor(xgb.XGBRegressor(**search_space))
        candidate.fit(X_train, y_train)

        # Score the candidate on the validation split.
        val_predictions = candidate.predict(X_val)
        return mean_absolute_error(y_val, val_predictions)

    return objective_xgb



In [6]:
import os

# Show the kernel's current working directory.
current_dir = os.getcwd()
print('saved to:', current_dir)

saved to: c:\Users\creed\OneDrive\Documents\GitHub\Amini-Soil-prediction\notebooks


In [7]:
# Run the hyperparameter optimization
print("Tuning XGBoost hyperparameters...")

# Seed the sampler so that re-running the notebook proposes the same sequence
# of trials; without a seed the best parameters change from run to run.
study_xgb = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
objective = get_objective_xgb(X_train, y_train, X_val, y_val)
study_xgb.optimize(objective, n_trials=10)  # Adjust n_trials as needed

print("Best XGBoost Parameters:", study_xgb.best_params)
print("Best XGBoost MAE:", study_xgb.best_value)

# Create the optimized XGBoost model.
# best_params holds only the values suggested through the trial, so the fixed
# random_state used during tuning is passed again explicitly.
best_xgb_model = MultiOutputRegressor(xgb.XGBRegressor(**study_xgb.best_params, random_state=42))

# Evaluate XGBoost model.
# NOTE(review): the validation split scored here is the same one the trials
# were selected on, so the validation MAE reported below equals the best trial
# value and is an optimistic estimate of performance on unseen data.
xgb_results = evaluate_model(best_xgb_model, X_train, y_train, X_val, y_val, "XGBoost")

[I 2025-07-28 01:02:38,771] A new study created in memory with name: no-name-2033c442-401f-4c60-88f7-3af4721a6c69


Tuning XGBoost hyperparameters...


[I 2025-07-28 01:02:57,329] Trial 0 finished with value: 160.5139923095703 and parameters: {'n_estimators': 154, 'max_depth': 6, 'learning_rate': 0.1498710211879789, 'subsample': 0.8914060114387905, 'colsample_bytree': 0.8938603790324549, 'min_child_weight': 1, 'reg_alpha': 8.385903128181834, 'reg_lambda': 1.9211224628380341}. Best is trial 0 with value: 160.5139923095703.
[I 2025-07-28 01:05:34,309] Trial 1 finished with value: 157.76116943359375 and parameters: {'n_estimators': 317, 'max_depth': 10, 'learning_rate': 0.06209089413840654, 'subsample': 0.9731408637767737, 'colsample_bytree': 0.8001844858002015, 'min_child_weight': 2, 'reg_alpha': 0.9310955068919124, 'reg_lambda': 1.390805275432394}. Best is trial 1 with value: 157.76116943359375.
[I 2025-07-28 01:06:40,702] Trial 2 finished with value: 166.58709716796875 and parameters: {'n_estimators': 275, 'max_depth': 9, 'learning_rate': 0.27322732678213646, 'subsample': 0.9049023414050906, 'colsample_bytree': 0.7069752601632435, 'mi

Best XGBoost Parameters: {'n_estimators': 421, 'max_depth': 7, 'learning_rate': 0.019363899570497556, 'subsample': 0.7622718977762688, 'colsample_bytree': 0.9635535700444304, 'min_child_weight': 1, 'reg_alpha': 1.7301739338630806, 'reg_lambda': 6.514324152089582}
Best XGBoost MAE: 156.6277313232422

XGBoost Results:
Training Time: 82.08 seconds
Training MAE: 92.5507, RMSE: 269.8127
Validation MAE: 156.6277, RMSE: 474.1683


In [9]:
import pickle

# Persist the evaluation results (including the fitted model they contain)
# to a pickle file in the current working directory.
results_path = "xgb_results.pkl"
with open(results_path, mode="wb") as results_file:
    pickle.dump(xgb_results, results_file)