In [1]:
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd 
from sklearn.model_selection import GridSearchCV
import Build_Evaluate_Model as bem
from sklearn.metrics import roc_auc_score
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

In [2]:
# Fetch the train/test features and labels from the project helper module.
# Scaling is switched off here (scale=False, no scaler object supplied).
X_train, y_train, X_test, y_test = bem.get_xy_traintest(
    scale=False,
    scaler=None,
)

### BASIC XGBOOST MODEL

In [3]:
# Baseline: let the project helper build and score an XGBoost model selected
# by name; the result it returns becomes the running score table `xg_score`.
xg_score = bem.build_basic_model(
    X_train,
    y_train,
    X_test,
    y_test,
    classifier='XGBOOST',
)





In [4]:
# Display the score table; at this point it holds only the baseline model.
xg_score

Unnamed: 0,MODEL,PARAMS,y_train_prob,y_test_prob,TRAIN SCORE,TEST SCORE,DIFFERENCE
0,Basic,"XGBClassifier(base_score=0.5, booster='gbtree'...","[0.10565758, 0.04975895, 0.16611354, 0.5044843...","[0.0322351, 0.18366416, 0.17870252, 0.5164707,...",0.952056,0.763209,0.188847


- Our XGBoost model is clearly overfitting (train score 0.952 vs. test score 0.763). Let us see if we can reduce the overfitting.

**WAYS TO CONTROL OVERFIT** 
-  max_depth (default=6) - shallower trees are less likely to overfit
-  min_child_weight - the higher the value, the more conservative the model
-  gamma - minimum reduction in loss required to split a node

In [5]:
# Hand-tuned, more conservative classifier: very shallow trees, a high
# minimum loss reduction per split, and only a fraction of columns per tree.
xg_model_1 = xgb.XGBClassifier(
    max_depth=2,
    gamma=10,
    colsample_bytree=0.4,
)

In [6]:
# Evaluate xg_model_1 through the project helper, passing in the current
# score table and rebinding `xg_score` to the table it returns.
# NOTE(review): re-running this cell presumably adds a duplicate row - confirm.
xg_score = bem.build_model(
    X_train,
    y_train,
    X_test,
    y_test,
    classifier=xg_model_1,
    score_df=xg_score,
    classifier_name='xg_model_1',
)





In [7]:
# Display the score table to compare xg_model_1 against the baseline row.
xg_score

Unnamed: 0,MODEL,PARAMS,y_train_prob,y_test_prob,TRAIN SCORE,TEST SCORE,DIFFERENCE
0,Basic,"XGBClassifier(base_score=0.5, booster='gbtree'...","[0.10565758, 0.04975895, 0.16611354, 0.5044843...","[0.0322351, 0.18366416, 0.17870252, 0.5164707,...",0.952056,0.763209,0.188847
1,xg_model_1,"XGBClassifier(base_score=0.5, booster='gbtree'...","[0.14725907, 0.22128628, 0.2087531, 0.6687061,...","[0.20298962, 0.38469675, 0.22370285, 0.5019234...",0.81029,0.773939,0.03635


### HYPER-PARAMETER TUNING

In [8]:
# Randomized search over XGBoost hyper-parameters that influence overfitting.
# The base estimator is seeded so each candidate fit is repeatable.
classifier = xgb.XGBClassifier(random_state=0)

# Discrete candidate values: 3 options for each of 4 parameters (81 combos).
param_grid = {
    'colsample_bytree': [0.4, 0.5, 0.8],
    'n_estimators': [100, 200, 300],
    'gamma': [5, 10, 15],
    'max_depth': [1, 2, 3],
}

# n_iter and cv are spelled out (matching the logged "5 folds for each of
# 10 candidates") so the search does not depend on library defaults.
# random_state fixes which 10 combinations get sampled; without it the
# selected best_params_ can differ on every Restart & Run All.
model = RandomizedSearchCV(
    estimator=classifier,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    scoring='roc_auc',
    random_state=0,
    verbose=10,
    n_jobs=-1,
)
model.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   25.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   34.0s
[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed:   38.5s remaining:    8.4s
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed:   40.9s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   41.0s finished




RandomizedSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100, n_jobs=None,
                                           num_parallel_tree=None,
                                           random_state=0, reg_alpha=None,
                                           reg_lambda=

In [9]:
# Parameter combination with the best mean cross-validated ROC AUC in the search.
model.best_params_

{'n_estimators': 200, 'max_depth': 3, 'gamma': 5, 'colsample_bytree': 0.5}

In [10]:
# Build the "best" classifier directly from the search result instead of
# hand-copying the printed values, so it cannot drift out of sync when the
# search is re-run. random_state=0 mirrors the estimator used in the search.
best_xgb = xgb.XGBClassifier(random_state=0, **model.best_params_)

In [11]:
# Evaluate the tuned classifier through the project helper, passing in the
# current score table and rebinding `xg_score` to the table it returns.
# NOTE(review): re-running this cell presumably adds a duplicate row - confirm.
xg_score = bem.build_model(
    X_train,
    y_train,
    X_test,
    y_test,
    classifier=best_xgb,
    score_df=xg_score,
    classifier_name='best_xgb',
)



In [12]:
# Display the score table, now including the tuned best_xgb row.
xg_score

Unnamed: 0,MODEL,PARAMS,y_train_prob,y_test_prob,TRAIN SCORE,TEST SCORE,DIFFERENCE
0,Basic,"XGBClassifier(base_score=0.5, booster='gbtree'...","[0.10565758, 0.04975895, 0.16611354, 0.5044843...","[0.0322351, 0.18366416, 0.17870252, 0.5164707,...",0.952056,0.763209,0.188847
1,xg_model_1,"XGBClassifier(base_score=0.5, booster='gbtree'...","[0.14725907, 0.22128628, 0.2087531, 0.6687061,...","[0.20298962, 0.38469675, 0.22370285, 0.5019234...",0.81029,0.773939,0.03635
2,best_xgb,"XGBClassifier(base_score=0.5, booster='gbtree'...","[0.09483935, 0.12683137, 0.21216293, 0.6220972...","[0.041104645, 0.34025073, 0.23019384, 0.375729...",0.860262,0.775857,0.084405


In [13]:
# Let's try to regularize the best XGBoost model a bit more: keep its depth
# and column sampling, but use fewer trees and a higher gamma.
best_xgb_reg = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=3,
    gamma=15,
    colsample_bytree=0.5,
)

In [14]:
# Evaluate the extra-regularized variant through the project helper, passing
# in the current score table and rebinding `xg_score` to the returned table.
# NOTE(review): re-running this cell presumably adds a duplicate row - confirm.
xg_score = bem.build_model(
    X_train,
    y_train,
    X_test,
    y_test,
    classifier=best_xgb_reg,
    score_df=xg_score,
    classifier_name='best_xgb_reg',
)





In [15]:
# Rank the models by train/test gap, smallest (least overfit) first.
xg_score.sort_values(by='DIFFERENCE', ascending=True)

Unnamed: 0,MODEL,PARAMS,y_train_prob,y_test_prob,TRAIN SCORE,TEST SCORE,DIFFERENCE
1,xg_model_1,"XGBClassifier(base_score=0.5, booster='gbtree'...","[0.14725907, 0.22128628, 0.2087531, 0.6687061,...","[0.20298962, 0.38469675, 0.22370285, 0.5019234...",0.81029,0.773939,0.03635
3,best_xgb_reg,"XGBClassifier(base_score=0.5, booster='gbtree'...","[0.1516564, 0.2382438, 0.21785408, 0.6372192, ...","[0.13614623, 0.3620115, 0.27364364, 0.4818658,...",0.817803,0.775549,0.042254
2,best_xgb,"XGBClassifier(base_score=0.5, booster='gbtree'...","[0.09483935, 0.12683137, 0.21216293, 0.6220972...","[0.041104645, 0.34025073, 0.23019384, 0.375729...",0.860262,0.775857,0.084405
0,Basic,"XGBClassifier(base_score=0.5, booster='gbtree'...","[0.10565758, 0.04975895, 0.16611354, 0.5044843...","[0.0322351, 0.18366416, 0.17870252, 0.5164707,...",0.952056,0.763209,0.188847


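`Best Model`: xg_model_1 - it has the smallest train/test gap (0.036), and its test score (0.774) is within 0.002 of the highest test score in the table.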
