In [1]:
# Importing Required Python Packages
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

In [2]:
# Importing Cross_val_score & Stratified K fold 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [3]:
# Importing the Sklearn's roc_auc_score module
from sklearn.metrics import roc_auc_score

In [4]:
# Loading both versions of Training sets
X_train_red = pd.read_csv('X_train_final.csv')
X_train = pd.read_csv('X_train_full_final.csv')
y_train = pd.read_csv('y_train.final.csv')

In [5]:
# Loading both versions of Test sets
X_test_red = pd.read_csv('X_test_final.csv')
X_test = pd.read_csv('X_test_full_final.csv')
y_test = pd.read_csv('y_test.final.csv')

## Model_6: Light Gbm classifier with Tuned Hyperparameters using Optuna.

In [6]:
# Importing Optuna and lightgbm 
from lightgbm import LGBMClassifier
import optuna

In [7]:
# Defining the appropriate objective function for the Light GBM classifier

def objective_wrappper_lgbm(X_tr, y_tr, cls=None, cv=None):
    '''
    Builds the Optuna objective used to tune a LightGBM classifier.

    Parameters
    ----------
    X_tr, y_tr : training features and labels, forwarded unchanged to cross_val_score.
    cls : classifier exposing set_params (the LGBMClassifier in this notebook).
          Required in practice; it is re-configured in place on every trial.
    cv : optional number of stratified folds. None keeps the 10 folds that were
         previously hard-coded.

    Returns
    -------
    objective : callable taking an Optuna trial and returning the mean
                cross-validated roc_auc for the parameters sampled by that trial.
    '''
    # Stratified, unshuffled folds. random_state is deliberately not passed: without
    # shuffle=True it has no effect on the splits, and recent scikit-learn releases
    # raise a ValueError for that combination.
    cv_strat = StratifiedKFold(n_splits=10 if cv is None else cv)

    # Candidate class weights: sklearn's 'balanced' heuristic plus explicit
    # positive-class weights from 9 to 13.
    # NOTE(review): Optuna is expected to warn about dict-valued categorical choices
    # (they are not persistable in a database-backed storage); kept as-is because
    # callers feed study.best_params straight into set_params.
    cl_weight = ['balanced',{0:1.0,1:9.0},{0:1.0,1:10},{0:1.0,1:11},{0:1.0,1:12},{0:1.0,1:13}]

    def objective(trial):
        # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform.
        params = {
        'class_weight': trial.suggest_categorical('class_weight',cl_weight),
        'n_estimators': trial.suggest_int('n_estimators', 10, 1500),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 250),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 100),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True)
        }
        # Re-configure the shared estimator with this trial's parameters.
        cls.set_params(**params)

        # Score of the trial = mean roc_auc over the stratified folds.
        return np.mean(cross_val_score(cls, X_tr, y_tr, cv=cv_strat, n_jobs=5, scoring='roc_auc'))
    return objective

In [10]:
# Defining the evaluation function for study's best parameters
def train_test_roc_auc(X_tr, y_tr, cls, obj_func, n_trials=100):
    ''' Runs an Optuna study that maximizes the objective built by obj_func(X_tr, y_tr, cls)
    for n_trials trials and returns the tuple (best score, best classifier parameters)'''
    tuning_study = optuna.create_study(direction='maximize')
    trial_objective = obj_func(X_tr, y_tr, cls)
    tuning_study.optimize(trial_objective, n_trials)
    return (tuning_study.best_value, tuning_study.best_params)


In [12]:
# Initializing the LightGBM Classifier
lgb_s = LGBMClassifier(random_state=42, objective='binary', n_jobs=5)

#### Computing the best hyperparameters for the LightGBM classifier using Full Feature Training Set, _just for trial sake_

In [15]:
# Extracting the best model parameters and best study score
best_study_score,best_study_params = train_test_roc_auc(X_train, y_train, lgb_s, objective_wrappper_lgbm, n_trials=100)

[I 2020-10-02 18:18:28,585] A new study created in memory with name: no-name-90059758-1e42-4f9e-82e5-4435a7d5d2b1
[I 2020-10-02 18:18:30,562] Trial 0 finished with value: 0.7931826199313016 and parameters: {'class_weight': {0: 1.0, 1: 11}, 'n_estimators': 14, 'min_child_samples': 210, 'reg_alpha': 0.29789473063238314, 'reg_lambda': 1.022524311863069, 'num_leaves': 137, 'max_depth': 36, 'colsample_bytree': 0.8150244526430145, 'learning_rate': 0.018875110532311307}. Best is trial 0 with value: 0.7931826199313016.
[I 2020-10-02 18:18:36,226] Trial 1 finished with value: 0.799762357724733 and parameters: {'class_weight': 'balanced', 'n_estimators': 424, 'min_child_samples': 55, 'reg_alpha': 4.929175913721871, 'reg_lambda': 2.184344914001952, 'num_leaves': 169, 'max_depth': 34, 'colsample_bytree': 0.7903282549344903, 'learning_rate': 0.0023304554592018406}. Best is trial 1 with value: 0.799762357724733.
[I 2020-10-02 18:18:53,396] Trial 2 finished with value: 0.7969981274740675 and paramete

[I 2020-10-02 18:23:14,600] Trial 40 finished with value: 0.796946885135078 and parameters: {'class_weight': {0: 1.0, 1: 11}, 'n_estimators': 1488, 'min_child_samples': 93, 'reg_alpha': 4.334460878896855, 'reg_lambda': 2.871891705954342, 'num_leaves': 150, 'max_depth': 37, 'colsample_bytree': 0.6018011468225847, 'learning_rate': 0.0018512700536502129}. Best is trial 36 with value: 0.8002499540869625.
[I 2020-10-02 18:23:20,082] Trial 41 finished with value: 0.8001476998480668 and parameters: {'class_weight': {0: 1.0, 1: 9.0}, 'n_estimators': 379, 'min_child_samples': 22, 'reg_alpha': 5.613227333625672, 'reg_lambda': 0.4135949043451545, 'num_leaves': 66, 'max_depth': 47, 'colsample_bytree': 0.7140521794435508, 'learning_rate': 0.0011618907936117584}. Best is trial 36 with value: 0.8002499540869625.
[I 2020-10-02 18:23:25,249] Trial 42 finished with value: 0.8002947411161203 and parameters: {'class_weight': {0: 1.0, 1: 9.0}, 'n_estimators': 381, 'min_child_samples': 25, 'reg_alpha': 5.60

[I 2020-10-02 18:28:10,131] Trial 80 finished with value: 0.801117156515082 and parameters: {'class_weight': {0: 1.0, 1: 12}, 'n_estimators': 996, 'min_child_samples': 48, 'reg_alpha': 0.275939660334905, 'reg_lambda': 5.203659641844332, 'num_leaves': 24, 'max_depth': 100, 'colsample_bytree': 0.9740066880900202, 'learning_rate': 0.0036062271908801524}. Best is trial 60 with value: 0.8011979215060248.
[I 2020-10-02 18:28:19,174] Trial 81 finished with value: 0.8007577297046578 and parameters: {'class_weight': {0: 1.0, 1: 12}, 'n_estimators': 1114, 'min_child_samples': 48, 'reg_alpha': 0.26755003723392934, 'reg_lambda': 5.069041602328622, 'num_leaves': 24, 'max_depth': 99, 'colsample_bytree': 0.9763648712232151, 'learning_rate': 0.003693183358873974}. Best is trial 60 with value: 0.8011979215060248.
[I 2020-10-02 18:28:27,333] Trial 82 finished with value: 0.8009022677733585 and parameters: {'class_weight': {0: 1.0, 1: 12}, 'n_estimators': 1000, 'min_child_samples': 64, 'reg_alpha': 0.298

In [16]:
print('The best roc_auc_score for the study is: ',best_study_score)

The best roc_auc_score for the study is:  0.8011979215060248


In [17]:
print(('The best study parameters for the classifier are: ', best_study_params))

('The best study parameters for the classifier are: ', {'class_weight': {0: 1.0, 1: 9.0}, 'n_estimators': 1119, 'min_child_samples': 71, 'reg_alpha': 1.8342063129544979, 'reg_lambda': 1.2659254886608724, 'num_leaves': 15, 'max_depth': 96, 'colsample_bytree': 0.9599513442760467, 'learning_rate': 0.0037006968151344836})


#### Computing the  full feature roc_auc score for the test data using the best study Parameters

In [18]:
# Obtaining the best full feature Light GBM model by setting best study parameters.
# set_params returns the estimator itself, so clone first: otherwise lgb_f would be the
# very same object as lgb_s and would be silently re-configured by any later tuning run.
lgb_f = clone(lgb_s).set_params(**best_study_params)

In [19]:
# fitting the best lightgbm model on the full feature training set
lgb_f.fit(X_train, y_train)

LGBMClassifier(class_weight={0: 1.0, 1: 9.0},
               colsample_bytree=0.9599513442760467,
               learning_rate=0.0037006968151344836, max_depth=96,
               min_child_samples=71, n_estimators=1119, n_jobs=5, num_leaves=15,
               objective='binary', random_state=42,
               reg_alpha=1.8342063129544979, reg_lambda=1.2659254886608724)

In [21]:
# Defining the function to calculate the roc_auc score for the feature sets
def cal_roc_auc(X, y, cls, f_set, t_set, model_name):
    ''' Prints the roc auc score of the fitted classifier cls on the data X, y
        f_set : String: which feature set is scored, e.g. 'full feature', 'Reduced feature'
        t_set: String: which split is scored, e.g. 'training', 'test'
        model_name: String: name of the model shown in the printed message '''

    # Column 1 of predict_proba holds the scores passed to roc_auc_score.
    class_probabilities = cls.predict_proba(X)
    positive_scores = class_probabilities[:,1]
    auc_value = roc_auc_score(y, positive_scores)
    message = 'The roc_auc_score for the {} {} set using the best {} classifier is '.format(f_set,t_set,model_name)
    print(message, auc_value)

In [22]:
# Calculating the full feature training set roc_auc score using the best study parameters
cal_roc_auc(X_train, y_train, lgb_f, 'full feature', 'training', 'Light Gbm')

The roc_auc_score for the full feature training set using the best Light Gbm classifier is  0.8206971119967283


In [23]:
# Calculating the full feature test set roc_auc score using the best study parameters
cal_roc_auc(X_test, y_test, lgb_f, 'full feature', 'test', 'Light Gbm')

The roc_auc_score for the full feature test set using the best Light Gbm classifier is  0.8173005654747865


In [24]:
#Saving the Full feature set best Light Gbm Classifier 
import joblib
joblib.dump(lgb_f,'Light_Gbm_Full.joblib')

['Light_Gbm_Full.joblib']

#### Computing the best hyperparameters for the  Light Gbm Classifier using Reduced feature Training Set.

In [25]:
# Extracting the best model parameters and best study score
best_study_score,best_study_params = train_test_roc_auc(X_train_red, y_train, lgb_s, objective_wrappper_lgbm, n_trials=100)

[I 2020-10-02 18:43:27,981] A new study created in memory with name: no-name-5fcadd1e-ad20-4ac8-83c8-8ad1e33ed1c1
[I 2020-10-02 18:43:37,695] Trial 0 finished with value: 0.7831095984167584 and parameters: {'class_weight': {0: 1.0, 1: 13}, 'n_estimators': 638, 'min_child_samples': 56, 'reg_alpha': 1.971632847035213, 'reg_lambda': 5.803933849120054, 'num_leaves': 88, 'max_depth': 58, 'colsample_bytree': 0.7257517594866142, 'learning_rate': 0.035036128591893874}. Best is trial 0 with value: 0.7831095984167584.
[I 2020-10-02 18:43:39,569] Trial 1 finished with value: 0.7930558431305778 and parameters: {'class_weight': 'balanced', 'n_estimators': 89, 'min_child_samples': 30, 'reg_alpha': 3.082076721326441, 'reg_lambda': 3.94674074510855, 'num_leaves': 248, 'max_depth': 84, 'colsample_bytree': 0.993092131436921, 'learning_rate': 0.004630008930039161}. Best is trial 1 with value: 0.7930558431305778.
[I 2020-10-02 18:43:46,500] Trial 2 finished with value: 0.7973216857788675 and parameters: {

[I 2020-10-02 18:49:14,257] Trial 40 finished with value: 0.7979202258315542 and parameters: {'class_weight': {0: 1.0, 1: 13}, 'n_estimators': 1494, 'min_child_samples': 67, 'reg_alpha': 9.227301237362003, 'reg_lambda': 1.419689517309532, 'num_leaves': 156, 'max_depth': 88, 'colsample_bytree': 0.6484088801735026, 'learning_rate': 0.0014400383150777719}. Best is trial 39 with value: 0.8005817150984029.
[I 2020-10-02 18:49:18,350] Trial 41 finished with value: 0.7816136590530037 and parameters: {'class_weight': {0: 1.0, 1: 13}, 'n_estimators': 1415, 'min_child_samples': 39, 'reg_alpha': 8.072868347518188, 'reg_lambda': 3.6031088455585794, 'num_leaves': 2, 'max_depth': 75, 'colsample_bytree': 0.7802908543303481, 'learning_rate': 0.004007539095821941}. Best is trial 39 with value: 0.8005817150984029.
[I 2020-10-02 18:49:28,139] Trial 42 finished with value: 0.8008745862065698 and parameters: {'class_weight': {0: 1.0, 1: 13}, 'n_estimators': 1365, 'min_child_samples': 21, 'reg_alpha': 7.513

[I 2020-10-02 18:57:22,993] Trial 80 finished with value: 0.8008398624797476 and parameters: {'class_weight': {0: 1.0, 1: 13}, 'n_estimators': 1173, 'min_child_samples': 25, 'reg_alpha': 5.420252265590087, 'reg_lambda': 4.901013555162159, 'num_leaves': 29, 'max_depth': 70, 'colsample_bytree': 0.7024361774210798, 'learning_rate': 0.0030392233587817545}. Best is trial 66 with value: 0.8011102893824955.
[I 2020-10-02 18:57:33,168] Trial 81 finished with value: 0.8008303233987608 and parameters: {'class_weight': {0: 1.0, 1: 13}, 'n_estimators': 1163, 'min_child_samples': 23, 'reg_alpha': 5.450163952297743, 'reg_lambda': 4.37384048070583, 'num_leaves': 30, 'max_depth': 71, 'colsample_bytree': 0.7042574216136199, 'learning_rate': 0.0030824182876324404}. Best is trial 66 with value: 0.8011102893824955.
[I 2020-10-02 18:57:39,492] Trial 82 finished with value: 0.7956387720679319 and parameters: {'class_weight': {0: 1.0, 1: 13}, 'n_estimators': 1170, 'min_child_samples': 16, 'reg_alpha': 5.4734

In [26]:
print('The best roc_auc_score for the study is: ',best_study_score)

The best roc_auc_score for the study is:  0.8011102893824955


In [27]:
print(('The best study parameters for the classifier are: ', best_study_params))

('The best study parameters for the classifier are: ', {'class_weight': {0: 1.0, 1: 11}, 'n_estimators': 1083, 'min_child_samples': 16, 'reg_alpha': 6.964534534541776, 'reg_lambda': 5.143496762382144, 'num_leaves': 26, 'max_depth': 85, 'colsample_bytree': 0.6688983257726496, 'learning_rate': 0.0030081514412958507})


In [28]:
# Obtaining the best Reduced feature LightGbm model by setting best study parameters.
# Clone before set_params so lgb_red is its own estimator; without the clone it is the
# same object as lgb_s (and as any model previously derived from lgb_s the same way).
lgb_red = clone(lgb_s).set_params(**best_study_params)

In [29]:
# fitting the best lightgbm model on the Reduced feature training set
lgb_red.fit(X_train_red, y_train)

LGBMClassifier(class_weight={0: 1.0, 1: 11},
               colsample_bytree=0.6688983257726496,
               learning_rate=0.0030081514412958507, max_depth=85,
               min_child_samples=16, n_estimators=1083, n_jobs=5, num_leaves=26,
               objective='binary', random_state=42, reg_alpha=6.964534534541776,
               reg_lambda=5.143496762382144)

In [30]:
# Calculating the Reduced feature training set roc_auc score using the best study parameters
cal_roc_auc(X_train_red, y_train, lgb_red, 'Reduced feature', 'training', 'Light Gbm')

The roc_auc_score for the Reduced feature training set using the best Light Gbm classifier is  0.8252509438369647


In [31]:
# Calculating the Reduced feature test set roc_auc score using the best study parameters
cal_roc_auc(X_test_red, y_test, lgb_red, 'Reduced feature', 'test','Light Gbm')

The roc_auc_score for the Reduced feature test set using the best Light Gbm classifier is  0.817919919571678


In [32]:
# Saving the Reduced feature set best Light Gbm Classifier 
joblib.dump(lgb_red,'Light_Gbm_Reduced.joblib')

['Light_Gbm_Reduced.joblib']

### Calculating R_R ratio for best Light GBM Classifier trained on reduced feature training set.

In [7]:
# Loading the best Light GBM  Classifier model
import joblib
lgb_red = joblib.load('Light_Gbm_Reduced.joblib')

In [8]:
# Instantiating the Stratified fold object (10 unshuffled folds).
# random_state is omitted: it has no effect without shuffle=True and recent
# scikit-learn releases raise a ValueError when it is set on unshuffled folds.
cv_strat = StratifiedKFold(n_splits=10)

In [9]:
# Computing the CV scores using sklearn's cross_val_score
score_Light_Gbm = cross_val_score(lgb_red, X_train_red, y_train, cv=cv_strat, n_jobs=5, scoring='roc_auc')

In [10]:
print('The reward associated with the best Light GMB Classifier using roc_auc metric is: ',np.mean(score_Light_Gbm))

The reward associated with the best Light GMB Classifier using roc_auc metric is:  0.8011102893824955


In [11]:
print('The risk associated with the best Light GMB Classifier using roc_auc metric is: ',np.std(score_Light_Gbm))

The risk associated with the best Light GMB Classifier using roc_auc metric is:  0.017550265534215327


In [12]:
R_R_Ratio_Light_Gbm = np.mean(score_Light_Gbm)/np.std(score_Light_Gbm)

In [13]:
print('The reward risk ratio for the best Light Gbm Classifier using roc_auc metric is: ',R_R_Ratio_Light_Gbm)

The reward risk ratio for the best Light Gbm Classifier using roc_auc metric is:  45.64661929591216


#### The R_R Ratio for the best Light Gbm classifier, trained on reduced feature training set, using roc_auc score is:  45.64661929591216

## Observations:
### 1) From the above analysis we can see that the reduced-feature test set roc_auc score (0.8179) is slightly higher than the full-feature test set roc_auc score (0.8173), supporting the presence of noisy predictors in the feature set, _as has been observed before._
### 2) The Light Gbm classifier has fitted the dataset well with no indications of overfitting and, of all the classifiers tested so far, has given the best test set roc_auc score. Maybe XGBoost, tested next, will beat that.
### 3) The Light Gbm has a poorer R_R ratio than the tree-based bagging classifiers, indicating too much roc_auc variability across CV folds.
### 4) Surprisingly, the R_R ratio for the Light GBM model is on the lower side of the spectrum.


## Model_7: XGboost Classifier with Tuned Hyperparameters using Optuna.

In [6]:
# Importing Xgboost Classifier
import xgboost as xgb

In [7]:
# Computing the ratio of -ve to +ve classes, used below as XGBoost's scale_pos_weight.
# y_train is a single-column DataFrame, so it is flattened first; the counts are then
# plain scalars and no float() call on a one-element Series (deprecated in pandas) is needed.
class_weight = float(np.sum(np.ravel(y_train) == 0) / np.sum(np.ravel(y_train) == 1))

In [8]:
# Defining the appropriate objective function for the XGboost classifier

def objective_wrappper_xgb(X_tr, y_tr, cls=None, cv=None):
    '''
    Builds the Optuna objective used to tune an XGBoost classifier.

    Parameters
    ----------
    X_tr, y_tr : training features and labels, forwarded unchanged to cross_val_score.
    cls : classifier exposing set_params (the XGBClassifier in this notebook).
          Required in practice; it is re-configured in place on every trial.
    cv : optional number of stratified folds. None falls back to 10 folds instead
         of being handed to StratifiedKFold, which cannot accept None.

    Returns
    -------
    objective : callable taking an Optuna trial and returning the mean
                cross-validated roc_auc for the parameters sampled by that trial.
    '''

    # Stratified, unshuffled folds. random_state is deliberately not passed: without
    # shuffle=True it has no effect on the splits, and recent scikit-learn releases
    # raise a ValueError for that combination.
    cv_strat = StratifiedKFold(n_splits=10 if cv is None else cv)

    def objective(trial):
        # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform.
        params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1500),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
        'max_depth': trial.suggest_int('max_depth', 2, 100),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True)
        }
        # Re-configure the shared estimator with this trial's parameters.
        cls.set_params(**params)

        # Score of the trial = mean roc_auc over the stratified folds.
        return np.mean(cross_val_score(cls, X_tr, y_tr, cv=cv_strat, n_jobs=5, scoring='roc_auc'))
    return objective

In [9]:
# Defining the evaluation function for study's best parameters
def train_test_roc_auc(X_tr, y_tr, cls, obj_func, cv=None, n_trials=100):
    ''' Runs an Optuna study that maximizes the objective built by obj_func and returns
    the tuple (best score, best classifier parameters).

    X_tr, y_tr : training features and labels handed to obj_func.
    cls : classifier instance handed to obj_func.
    obj_func : factory returning the objective; called as obj_func(X_tr, y_tr, cls)
               when cv is None, otherwise as obj_func(X_tr, y_tr, cls, cv).
    cv : optional fold setting forwarded to obj_func only when given, so factories
         that take just three arguments keep working with this definition.
    n_trials : number of Optuna trials to run.'''
    study = optuna.create_study(direction='maximize')
    if cv is None:
        objective = obj_func(X_tr, y_tr, cls)
    else:
        objective = obj_func(X_tr, y_tr, cls, cv)
    study.optimize(objective, n_trials)
    best_score = study.best_value
    best_params = study.best_params
    return (best_score,best_params)


In [10]:
# Instantiating the xgboost classifier  
xgb_s = xgb.XGBClassifier(random_state=42,n_jobs=5,objective='binary:logistic',scale_pos_weight=class_weight)

#### Computing the best hyperparameters for the  XGboost Classifier using Reduced feature Training Set.

In [12]:
# Importing Optuna 
import optuna

In [13]:
# Extracting the best model parameters and best study score
best_study_score,best_study_params = train_test_roc_auc(X_train_red, y_train, xgb_s, objective_wrappper_xgb, cv=10)

[I 2020-10-02 19:58:32,453] A new study created in memory with name: no-name-448d6adc-a682-4bf6-8b50-1a6883a9bf0a
[I 2020-10-02 19:58:45,569] Trial 0 finished with value: 0.7906373117466294 and parameters: {'n_estimators': 89, 'reg_alpha': 4.869778689168298, 'reg_lambda': 1.366968329714442, 'max_delta_step': 10, 'max_depth': 79, 'colsample_bytree': 0.6727252252434838, 'learning_rate': 0.022210275426501884}. Best is trial 0 with value: 0.7906373117466294.
[I 2020-10-02 19:58:51,032] Trial 1 finished with value: 0.7918439806633935 and parameters: {'n_estimators': 41, 'reg_alpha': 3.6629355626333195, 'reg_lambda': 6.391123981313559, 'max_delta_step': 5, 'max_depth': 76, 'colsample_bytree': 0.6559805066904831, 'learning_rate': 0.03339776941165402}. Best is trial 1 with value: 0.7918439806633935.
[I 2020-10-02 19:59:21,955] Trial 2 finished with value: 0.7923639516372817 and parameters: {'n_estimators': 1382, 'reg_alpha': 5.982587624038166, 'reg_lambda': 0.9553581315736037, 'max_delta_step'

[I 2020-10-02 20:42:01,225] Trial 46 finished with value: 0.7953670010112106 and parameters: {'n_estimators': 66, 'reg_alpha': 2.967692133740577, 'reg_lambda': 5.565006043237665, 'max_delta_step': 3, 'max_depth': 23, 'colsample_bytree': 0.6162402308788631, 'learning_rate': 0.0026670178556532746}. Best is trial 27 with value: 0.7983431640351506.
[I 2020-10-02 20:43:31,005] Trial 47 finished with value: 0.7616160536604778 and parameters: {'n_estimators': 1494, 'reg_alpha': 7.833218141299979, 'reg_lambda': 4.226511967562574, 'max_delta_step': 2, 'max_depth': 15, 'colsample_bytree': 0.661941266703873, 'learning_rate': 0.08553840743113823}. Best is trial 27 with value: 0.7983431640351506.
[I 2020-10-02 20:43:40,581] Trial 48 finished with value: 0.7986154934016285 and parameters: {'n_estimators': 157, 'reg_alpha': 8.206957603661671, 'reg_lambda': 5.07345131563363, 'max_delta_step': 3, 'max_depth': 9, 'colsample_bytree': 0.6281009852485614, 'learning_rate': 0.004637770285642403}. Best is tri

[I 2020-10-02 21:00:02,508] Trial 92 finished with value: 0.7987912468168675 and parameters: {'n_estimators': 499, 'reg_alpha': 8.604906386470105, 'reg_lambda': 1.5399992925098314, 'max_delta_step': 4, 'max_depth': 11, 'colsample_bytree': 0.6071724786283873, 'learning_rate': 0.0010144473698035811}. Best is trial 88 with value: 0.7988260240775197.
[I 2020-10-02 21:00:36,464] Trial 93 finished with value: 0.7988621489195012 and parameters: {'n_estimators': 501, 'reg_alpha': 8.41525434498203, 'reg_lambda': 0.25416519161264484, 'max_delta_step': 4, 'max_depth': 11, 'colsample_bytree': 0.6084241872329753, 'learning_rate': 0.0010220093439135254}. Best is trial 93 with value: 0.7988621489195012.
[I 2020-10-02 21:01:12,042] Trial 94 finished with value: 0.7986862161715107 and parameters: {'n_estimators': 509, 'reg_alpha': 6.262994437238417, 'reg_lambda': 1.4525246703231158, 'max_delta_step': 4, 'max_depth': 11, 'colsample_bytree': 0.6000859259663488, 'learning_rate': 0.0010385300680889728}. Be

In [14]:
print('The best roc_auc_score for the study is: ',best_study_score)

The best roc_auc_score for the study is:  0.7988621489195012


In [15]:
print(('The best study parameters for the classifier are: ',best_study_params))

('The best study parameters for the classifier are: ', {'n_estimators': 501, 'reg_alpha': 8.41525434498203, 'reg_lambda': 0.25416519161264484, 'max_delta_step': 4, 'max_depth': 11, 'colsample_bytree': 0.6084241872329753, 'learning_rate': 0.0010220093439135254})


In [16]:
# Obtaining the best Reduced feature Xgboost model by setting best study parameters.
# Clone before set_params so xgb_red is an independent estimator rather than another
# name for xgb_s, which any further tuning run would re-configure in place.
xgb_red = clone(xgb_s).set_params(**best_study_params)

In [17]:
# fitting the best Xgboost model on the Reduced feature training set
xgb_red.fit(X_train_red, y_train)

XGBClassifier(colsample_bytree=0.6084241872329753,
              learning_rate=0.0010220093439135254, max_delta_step=4,
              max_depth=11, n_estimators=501, n_jobs=5, random_state=42,
              reg_alpha=8.41525434498203, reg_lambda=0.25416519161264484,
              scale_pos_weight=7.876616379310345)

In [20]:
# Calculating the Reduced feature training set roc_auc score using the best study parameters
cal_roc_auc(X_train_red, y_train, xgb_red, 'Reduced feature', 'training', 'Xgboost')

The roc_auc_score for the Reduced feature training set using the best Xgboost classifier is  0.8384608606006355


In [21]:
# Calculating the Reduced feature test set roc_auc score using the best study parameters
cal_roc_auc(X_test_red, y_test, xgb_red, 'Reduced feature', 'test', 'Xgboost')

The roc_auc_score for the Reduced feature test set using the best Xgboost classifier is  0.8171690734232746


In [22]:
# Saving the Reduced feature set best Xgboost Classifier 
import joblib
joblib.dump(xgb_red, 'XGB_Reduced.joblib')

['XGB_Reduced.joblib']

### Calculating R_R ratio for best Xgboost Classifier.

In [14]:
# Loading the best Xgboost  Classifier model
xgb_red = joblib.load('XGB_Reduced.joblib')

In [15]:
# Computing the CV scores using sklearn's cross_val_score
score_xgb = cross_val_score(xgb_red, X_train_red, y_train, cv=cv_strat, n_jobs=5, scoring='roc_auc')

In [16]:
print('The reward associated with the best Xgboost Classifier using roc_auc metric is: ',np.mean(score_xgb))

The reward associated with the best Xgboost Classifier using roc_auc metric is:  0.7988621489195012


In [17]:
print('The risk associated with the best Xgboost Classifier using roc_auc metric is: ',np.std(score_xgb))

The risk associated with the best Xgboost Classifier using roc_auc metric is:  0.016493541570703495


In [18]:
R_R_Ratio_Xgboost = np.mean(score_xgb)/np.std(score_xgb)

In [19]:
print('The reward risk ratio for the best Xgboost Classifier using roc_auc metric is: ',R_R_Ratio_Xgboost)

The reward risk ratio for the best Xgboost Classifier using roc_auc metric is:  48.43484617872931


#### The R_R Ratio for the best XgBoost classifier using roc_auc score is: 48.43484617872931

## Observations:
### 1) The Xgboost model, conceptually similar to Light GBM, fitted this dataset well, with no apparent signs of overfitting. But surprisingly, the test set roc_auc score for the tuned Xgboost is lower than that of Light GBM.
### 2) Xgboost has a higher R_R ratio than Light GBM and hence should be the preferred model based on the Reward Risk ratio metric, but it has a higher computational cost than Light Gbm.
### 3) Both boosting models have been outshone by Logistic Regression and tree-based bagging models on the R_R ratio metric, because those models have a comparatively low roc_auc score standard deviation.

### R_R Ratio for the best Tree based boosting classifier using roc_auc score is:  48.43484617872931 , corresponding to tuned XgBoost classifier.

### _Thus, taking everything into account (including the computational costs), for this dataset the best tree-based boosting classifier is the tuned Light Gbm classifier._