## Finding optimal hyperparameters for XGBoost

In [4]:
import os
import sys
import pandas as pd
import numpy as np  
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn import metrics   # Additional sklearn functions
from sklearn.model_selection import GridSearchCV
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
import random
from decimal import Decimal

sys.path.append('..')
from Workflow import Workflow

In [2]:
# Instantiate the project's Workflow helper and unpack the six objects its
# load_data() call returns. Judging by the names these are feature/target
# pairs for train, validation and test splits — TODO confirm against Workflow.
wf = Workflow()
(
    Xtrain, ytrain,
    Xvalid, yvalid,
    Xtest, ytest,
) = wf.load_data()

In [None]:
def xgbfit(alg, Xtrain, ytrain, cv_folds=5, early_stopping_rounds=50):
    """
    Tune the number of boosting rounds by cross-validation, refit, and report.

    The model's ``n_estimators`` is used as the upper bound on boosting rounds
    for ``xgb.cv`` (RMSE metric, early stopping enabled). ``alg`` is then
    updated in place with the number of rounds CV retained, refitted on the
    full training data, and a short report is printed: RMSE and R2 computed on
    the *training* data (so they are optimistic, not a generalisation
    estimate). Finally a bar chart of 'weight' feature importances is plotted.

    Parameters:
    alg: The XGBoost model to be trained (should be an instance of
        XGBRegressor). It is mutated: ``n_estimators`` is overwritten and the
        model is fitted.
    Xtrain: Training features (input dataset).
    ytrain: Training target variable (output labels).
    cv_folds: Number of folds for cross-validation (default is 5).
    early_stopping_rounds: Number of rounds without improvement after which
        cross-validation stops adding rounds (default is 50).

    Returns:
    None. Results are communicated via the mutated ``alg``, printed text and
    the plot.
    """

    # Booster-level parameters of the sklearn wrapper, as expected by xgb.cv
    booster_params = alg.get_xgb_params()

    # Upper bound on boosting rounds. Newer XGBoost releases may leave
    # n_estimators unset (None) on the wrapper; passing None to xgb.cv would
    # fail, so fall back to the round count the wrapper itself would use.
    max_rounds = alg.get_params()['n_estimators']
    if max_rounds is None:
        max_rounds = alg.get_num_boosting_rounds()

    # Convert the training data to the DMatrix format required by xgb.cv
    dtrain = xgb.DMatrix(Xtrain, label=ytrain)

    # Cross-validate with early stopping to find how many rounds are useful
    cv_history = xgb.cv(
        booster_params,
        dtrain,
        num_boost_round=max_rounds,  # Evaluate at most this many rounds
        nfold=cv_folds,
        metrics='rmse',
        early_stopping_rounds=early_stopping_rounds,
    )

    # The CV history has one row per retained boosting round, so its length
    # is the tuned n_estimators.
    best_rounds = cv_history.shape[0]
    alg.set_params(n_estimators=best_rounds)
    print(f'Optimal n_estimators: {best_rounds}')

    # Refit on the entire training set with the tuned round count
    alg.fit(Xtrain, ytrain)
    train_pred = alg.predict(Xtrain)

    # Training-set fit report
    rmse = np.sqrt(metrics.mean_squared_error(ytrain, train_pred))
    r2 = metrics.r2_score(ytrain, train_pred)
    print("\nModel Report")
    print(f"RMSE : {rmse:.4g}")
    print(f"R2 Score : {r2:.4g}")

    # 'weight' importance = how often each feature is used to split.
    # Features never used in a split are absent from the dict; if no feature
    # was used at all the dict is empty and a bar plot would raise, so bail
    # out with a message instead of crashing after training has succeeded.
    importance = alg.get_booster().get_score(importance_type='weight')
    if not importance:
        print("No feature importances to plot (the fitted trees contain no splits).")
        return

    feat_imp = pd.Series(importance).sort_values(ascending=False)

    # Label the axes returned by the plot call rather than relying on
    # matplotlib's global "current axes" state.
    ax = feat_imp.plot(kind='bar', title='Feature Importances')
    ax.set_ylabel('Feature Importance Score')

In [None]:
# Baseline regressor. n_estimators is set high on purpose: xgbfit() uses it
# only as an upper bound and replaces it with the cross-validated round count.
xgb1 = XGBRegressor(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',  # squared-error regression objective
    nthread=4,
    seed=42,
)

# Training table read from a CSV in the current working directory.
train_data_file = 'train_data.csv'
train_data = pd.read_csv(train_data_file)

# Predictor columns and the target column selected from the table.
train_params = [
    'Rain mm/y',
    'rainfall_seasonality',
    'PET mm/y',
    'elevation_mahd',
    'distance_to_coast_km',
    'ndvi_avg',
    'clay_perc',
    'soil_class',
]
y_var = 'Recharge RC 50% mm/y'

# NOTE(review): this rebinds Xtrain/ytrain, replacing whatever the earlier
# wf.load_data() cell assigned to those names — confirm that is intended.
Xtrain = train_data[train_params]
ytrain = train_data[y_var]

# Tune n_estimators, fit, and print/plot the training report.
xgbfit(xgb1, Xtrain, ytrain)