In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV




# 1. Abstract
In this notebook we will consider 3 ways of hyperparameters optimization:

- by Grid Search
- by Random Search
- by Optuna optimization module

For testing purposes we will use the Diabetes prediction dataset from Kaggle (https://www.kaggle.com/datasets/iammustafatz/diabetes-prediction-dataset).

About the Dataset

The diabetes_prediction_dataset.csv file contains medical and demographic data of patients along with their diabetes status, whether positive or negative. It consists of various features such as age, gender, body mass index (BMI), hypertension, heart disease, smoking history, HbA1c level, and blood glucose level. The Dataset can be utilized to construct machine learning models that can predict the likelihood of diabetes in patients based on their medical history and demographic details.

In [58]:
data = pd.read_csv('diabetes_prediction_dataset.csv')
data

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [59]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [40]:
X = data.drop('diabetes',axis=1)
y = data['diabetes']

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y,train_size=0.7)

In [49]:
X_train

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
32510,Female,80.0,0,0,No Info,27.32,4.8,80
62071,Male,74.0,1,0,never,27.32,8.8,200
96665,Female,20.0,0,0,never,20.37,6.6,90
43848,Male,27.0,0,0,No Info,20.93,5.8,200
47377,Female,30.0,0,0,never,27.82,5.0,80
...,...,...,...,...,...,...,...,...
92401,Female,74.0,0,0,never,28.12,8.8,240
1080,Male,3.0,0,0,No Info,14.31,6.6,80
13032,Female,63.0,0,0,former,33.90,8.2,240
46709,Male,17.0,0,0,No Info,27.86,6.0,85


In [50]:
categorical_features = ['gender','hypertension','heart_disease','smoking_history']

numeric_features = [i for i in X_train.columns if i not in categorical_features]


# numeric data pipeline
pipe_num = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('power_tr', PowerTransformer()),
    ('scaler', StandardScaler()) ])

# cat data pipeline
pipe_cat = Pipeline([
    ('imputer', SimpleImputer(strategy='constant',fill_value='unknown')),
    ('encoding', OneHotEncoder(sparse_output=False, handle_unknown='ignore')) ])

# column transformer
ct = ColumnTransformer([
    ('pipe_num', pipe_num, numeric_features),
    ('pipe_cat', pipe_cat, categorical_features) ])

#full pipeline
pipe = Pipeline([
    ('column_transformer', ct),
    ('model', xgb.XGBRegressor()),  # It denotes the fraction of observations to be randomly samples for each tree. 
])



# 2.Hyperparameters of XG_Boost

#### 1. eta

eta [default=0.3, alias: learning_rate]

It is analogous to learning rate in GBM.
It is the step size shrinkage used in update to prevent overfitting.
After each boosting step, we can directly get the weights of new features, and eta shrinks the feature weights to make the boosting process more conservative.
It makes the model more robust by shrinking the weights on each step.
range : [0,1]
Typical final values : 0.01-0.2.

#### 2.  gamma

gamma [default=0, alias: min_split_loss]

A node is split only when the resulting split gives a positive reduction in the loss function.
Gamma specifies the minimum loss reduction required to make a split.
It makes the algorithm conservative. The values can vary depending on the loss function and should be tuned.
The larger gamma is, the more conservative the algorithm will be.
Range: [0,∞]



#### 3. max_depth

max_depth [default=6]

The maximum depth of a tree, same as GBM.
It is used to control over-fitting as higher depth will allow model to learn relations very specific to a particular sample.
Increasing this value will make the model more complex and more likely to overfit.
The value 0 is only accepted in lossguided growing policy when tree_method is set as hist and it indicates no limit on depth.
We should be careful when setting large value of max_depth because XGBoost aggressively consumes memory when training a deep tree.
range: [0,∞] (0 is only accepted in lossguided growing policy when tree_method is set as hist.
Should be tuned using CV.
Typical values: 3-10

#### 4. min_child_weight

min_child_weight [default=1]

It defines the minimum sum of weights of all observations required in a child.
This is similar to min_child_leaf in GBM but not exactly. This refers to min “sum of weights” of observations while GBM has min “number of observations”.
It is used to control over-fitting.
Higher values prevent a model from learning relations which might be highly specific to the particular sample selected for a tree.
    Too high values can lead to under-fitting.
Hence, it should be tuned using CV.
The larger min_child_weight is, the more conservative the algorithm will be.
range: [0,∞]

#### 5. max_delta_step

max_delta_step [default=0]

In maximum delta step we allow each tree’s weight estimation to be.
If the value is set to 0, it means there is no constraint.
If it is set to a positive value, it can help making the update step more conservative.
Usually this parameter is not needed, but it might help in logistic regression when class is extremely imbalanced.
Set it to value of 1-10 might help control the update.
range: [0,∞]

#### 6. subsample

subsample [default=1]

It denotes the fraction of observations to be randomly samples for each tree.
    Subsample ratio of the training instances.
Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing trees. - This will prevent overfitting.
Subsampling will occur once in every boosting iteration.
Lower values make the algorithm more conservative and prevents overfitting but too small values might lead to under-fitting.
Typical values: 0.5-1
range: (0,1]


#### 7. colsample_bytree, colsample_bylevel, colsample_bynode
Table of Contents

colsample_bytree, colsample_bylevel, colsample_bynode [default=1]

This is a family of parameters for subsampling of columns.

All colsample_by parameters have a range of (0, 1], the default value of 1, and specify the fraction of columns to be subsampled.

colsample_bytree is the subsample ratio of columns when constructing each tree. Subsampling occurs once for every tree constructed.

colsample_bylevel is the subsample ratio of columns for each level. Subsampling occurs once for every new depth level reached in a tree. Columns are subsampled from the set of columns chosen for the current tree.

colsample_bynode is the subsample ratio of columns for each node (split). Subsampling occurs once every time a new split is evaluated. Columns are subsampled from the set of columns chosen for the current level.

colsample_by* parameters work cumulatively. For instance, the combination {'colsample_bytree':0.5, 'colsample_bylevel':0.5, 'colsample_bynode':0.5} with 64 features will leave 8 features to choose from at each split.

#### 8. lambda

        lambda [default=1, alias: reg_lambda]

L2 regularization term on weights (analogous to Ridge regression).
This is used to handle the regularization part of XGBoost.
Increasing this value will make model more conservative.


#### 9. alpha

alpha [default=0, alias: reg_alpha]

L1 regularization term on weights (analogous to Lasso regression).
It can be used in case of very high dimensionality so that the algorithm runs faster when implemented.
Increasing this value will make model more conservative.


#### 10. tree_method

tree_method string [default= auto]

The tree construction algorithm used in XGBoost.

XGBoost supports approx, hist and gpu_hist for distributed training. Experimental support for external memory is available for approx and gpu_hist.

Choices: auto, exact, approx, hist, gpu_hist

auto: Use heuristic to choose the fastest method.

For small to medium dataset, exact greedy (exact) will be used.

For very large dataset, approximate algorithm (approx) will be chosen.

Because old behavior is always use exact greedy in single machine, user will get a message when approximate algorithm is chosen to notify this choice.

exact: Exact greedy algorithm.

approx: Approximate greedy algorithm using quantile sketch and gradient histogram.

hist: Fast histogram optimized approximate greedy algorithm. It uses some performance improvements such as bins caching.

gpu_hist: GPU implementation of hist algorithm.

#### 11. scale_pos_weight

scale_pos_weight [default=1]

It controls the balance of positive and negative weights,
It is useful for imbalanced classes.
    A value greater than 0 should be used in case of high class imbalance as it helps in faster convergence.
A typical value to consider: sum(negative instances) / sum(positive instances).


#### 12. max_leaves

max_leaves [default=0]

Maximum number of nodes to be added.
Only relevant when grow_policy=lossguide is set.
There are other hyperparameters like sketch_eps,updater, refresh_leaf, process_type, grow_policy, max_bin, predictor and num_parallel_tree.

(source: https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning#2.-XGBoost-hyperparameters- )


In [54]:
params = { 'model__eta': (0.01, 0.2),
           'model__gamma': (0, 5),
           'model__max_depth': (3, 10),
           'model__min_child_weight': (1, 10),
           'model__max_delta_step': (0, 10),
           'model__subsample': (0.1,1),
           'model__colsample_bytree': (0.5, 1),
           'model__colsample_bylevel': (0.5, 1),
           'model__colsample_bynode': (0.5, 1),
           'model__lambda': (0, 5),
           'model__alpha': (0, 5),
           'model__scale_pos_weight': (1, 10),
           'model__max_leaves': (0, 100)
           }

In [55]:
random_search = RandomizedSearchCV(pipe, param_distributions=params, n_iter=10, cv=5)
random_search.fit(X_train, y_train)

In [57]:
random_search.best_params_

{'model__subsample': 1,
 'model__scale_pos_weight': 10,
 'model__min_child_weight': 1,
 'model__max_leaves': 0,
 'model__max_depth': 10,
 'model__max_delta_step': 10,
 'model__lambda': 5,
 'model__gamma': 0,
 'model__eta': 0.2,
 'model__colsample_bytree': 0.5,
 'model__colsample_bynode': 1,
 'model__colsample_bylevel': 0.5,
 'model__alpha': 5}