In [1]:
# Importing Required Python Packages
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import clone
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

In [2]:
# Loading both versions of Training sets
X_train_red = pd.read_csv('X_train_final.csv')
X_train = pd.read_csv('X_train_full_final.csv')
y_train = pd.read_csv('y_train.final.csv')

In [3]:
# Loading both versions of Test sets
X_test_red = pd.read_csv('X_test_final.csv')
X_test = pd.read_csv('X_test_full_final.csv')
y_test = pd.read_csv('y_test.final.csv')

In [4]:
# Importing the Sklearn's roc_auc_score module
from sklearn.metrics import roc_auc_score

In [5]:
# Importing the hyperparameter tuning optimizer Optuna
import optuna

In [6]:
# Importing required Libraries
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [7]:
# Instantiating the Stratified K fold object.
# random_state is deliberately not passed: it only has an effect when
# shuffle=True, and recent scikit-learn releases raise a ValueError when it is
# combined with the default shuffle=False. The unshuffled folds used here are
# already deterministic, so the splits are unchanged.
cv_strat = StratifiedKFold(n_splits=10)

 ## Model_2: Random Forest Classifier with Tuned Hyperparameters using Optuna.

In [10]:
# Defining the class weights: candidate values for the classifier's
# `class_weight` parameter, offered to Optuna as categorical choices by the
# objective functions defined later in this notebook. Besides None and
# 'balanced', the dict entries keep class 0 at weight 1.0 and give class 1 a
# weight of 9, 10, 11 or 12.
cl_weight = [None,'balanced',{0:1.0,1:9.0},{0:1.0,1:10},{0:1.0,1:11},{0:1.0,1:12}]

In [11]:
# Defining the appropriate objective function for the Random Forest classifier
def objective_wrappper_rf(X_tr, y_tr, cls=None, cv_strat=None):
    '''
    Builds an Optuna objective that tunes a Random Forest classifier.

    Parameters
    ----------
    X_tr, y_tr : training features and labels, forwarded to cross_val_score.
    cls : estimator used as a template. A fresh clone is configured for every
        trial, so the object passed in is never modified.
    cv_strat : cross-validation splitter (or any other value accepted by the
        `cv` argument of cross_val_score).

    Returns
    -------
    objective : callable taking an Optuna trial and returning the mean
        cross-validated roc_auc of the sampled configuration.

    Raises
    ------
    ValueError : if no classifier is supplied.

    The class-weight candidates are read from the module-level `cl_weight` list.
    '''
    if cls is None:
        raise ValueError('objective_wrappper_rf requires a classifier instance (cls)')

    def objective(trial):
        params = {
            'max_depth': trial.suggest_categorical('max_depth', list(range(2, 50)) + [None]),
            # step must be passed by keyword: it is keyword-only in current Optuna
            'n_estimators': trial.suggest_int('n_estimators', 100, 2000, step=10),
            'class_weight': trial.suggest_categorical('class_weight', cl_weight),
            # suggest_float(log=True) is the supported spelling of the
            # deprecated suggest_loguniform and samples the same distribution
            'min_samples_leaf': trial.suggest_float('min_samples_leaf', .00001, .1, log=True),
        }

        # Configure a copy so trials do not leak state into the shared estimator
        model = clone(cls).set_params(**params)

        return np.mean(cross_val_score(model, X_tr, y_tr, cv=cv_strat, n_jobs=5, scoring='roc_auc'))
    return objective

In [12]:
# Defining the evaluation function for study's best parameters
def train_test_roc_auc(X_tr, y_tr, cls, obj_func, cv_strat, n_trials=100, seed=None):
    '''
    Runs an Optuna study that maximizes the objective built by obj_func and
    returns the study's best score and the best classifier parameters.

    Parameters
    ----------
    X_tr, y_tr : training features and labels handed to obj_func.
    cls : classifier instance handed to obj_func.
    obj_func : factory called as obj_func(X_tr, y_tr, cls, cv_strat); it must
        return a callable that takes an Optuna trial and returns a score.
    cv_strat : cross-validation object handed to obj_func.
    n_trials : number of trials to run.
    seed : optional seed for the TPE sampler. With the default of None the
        sampler is unseeded (the previous behaviour) and two runs will
        generally explore different configurations; pass an integer to make
        the search repeatable.

    Returns
    -------
    (best_score, best_params) : the best objective value found and the
        parameter dict of the trial that produced it.
    '''
    # TPE is the sampler create_study uses by default; building it explicitly
    # is only needed to be able to seed it.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(obj_func(X_tr, y_tr, cls, cv_strat), n_trials=n_trials)
    return (study.best_value, study.best_params)


In [13]:
# Importing Random Forest Classifier from  Sklearn
from sklearn.ensemble import RandomForestClassifier

In [15]:
# Instantiating the Random forest classifier
rf_s = RandomForestClassifier(n_jobs=5, random_state=42)

#### Computing the best hyperparameters for the Random Forest classifier using full feature Training Set.

In [17]:
# Extracting the best model parameters and best study score
best_study_score,best_study_params = train_test_roc_auc(X_train, y_train, rf_s, objective_wrappper_rf, cv_strat, n_trials=100)

[I 2020-10-01 16:31:09,362] A new study created in memory with name: no-name-640f0a3c-2079-4610-acc1-902dcdb68319
[I 2020-10-01 16:31:27,775] Trial 0 finished with value: 0.7957409511038811 and parameters: {'max_depth': 49, 'n_estimators': 530, 'class_weight': {0: 1.0, 1: 10}, 'min_samples_leaf': 0.0008448882384428245}. Best is trial 0 with value: 0.7957409511038811.
[I 2020-10-01 16:32:21,273] Trial 1 finished with value: 0.7746277219650395 and parameters: {'max_depth': 37, 'n_estimators': 1140, 'class_weight': None, 'min_samples_leaf': 2.9859292298747096e-05}. Best is trial 0 with value: 0.7957409511038811.
[I 2020-10-01 16:33:03,370] Trial 2 finished with value: 0.793058803897732 and parameters: {'max_depth': 26, 'n_estimators': 1090, 'class_weight': {0: 1.0, 1: 12}, 'min_samples_leaf': 0.00021087908935617076}. Best is trial 0 with value: 0.7957409511038811.
[I 2020-10-01 16:33:58,355] Trial 3 finished with value: 0.7745638307167447 and parameters: {'max_depth': 39, 'n_estimators': 

[I 2020-10-01 17:06:11,477] Trial 62 finished with value: 0.7981852960163683 and parameters: {'max_depth': 10, 'n_estimators': 1460, 'class_weight': {0: 1.0, 1: 10}, 'min_samples_leaf': 0.0002524951535183719}. Best is trial 51 with value: 0.7982052682591212.
[I 2020-10-01 17:06:53,854] Trial 63 finished with value: 0.7980735637376393 and parameters: {'max_depth': 11, 'n_estimators': 1470, 'class_weight': {0: 1.0, 1: 10}, 'min_samples_leaf': 0.00045582918364844047}. Best is trial 51 with value: 0.7982052682591212.
[I 2020-10-01 17:07:39,354] Trial 64 finished with value: 0.7984379763070117 and parameters: {'max_depth': 11, 'n_estimators': 1580, 'class_weight': {0: 1.0, 1: 10}, 'min_samples_leaf': 0.00047772643132783726}. Best is trial 64 with value: 0.7984379763070117.
[I 2020-10-01 17:08:25,040] Trial 65 finished with value: 0.7977189335698307 and parameters: {'max_depth': 11, 'n_estimators': 1530, 'class_weight': {0: 1.0, 1: 10}, 'min_samples_leaf': 7.771106438532338e-05}. Best is tri

In [18]:
print('The best roc_auc_score for the study is: ', best_study_score)

The best roc_auc_score for the study is:  0.7984710735207587


In [19]:
# Pass the label and the dict as two arguments; the extra pair of parentheses
# made print() receive a single tuple and emit its repr.
print('The best study parameters for the classifier are: ', best_study_params)

('The best study parameters for the classifier are: ', {'max_depth': 12, 'n_estimators': 1940, 'class_weight': {0: 1.0, 1: 9.0}, 'min_samples_leaf': 0.00014661034840158529})


#### Computing the  full feature roc_auc score for the test data using the best study Parameters

In [21]:
# Obtaining the best full feature RF model by setting best study parameters.
# set_params returns the estimator it was called on, so calling it on rf_s
# directly would make rf_f just another name for rf_s, and any later tuning run
# that reconfigures rf_s would silently change rf_f as well. A clone keeps the
# full feature model independent.
rf_f = clone(rf_s).set_params(**best_study_params)

In [22]:
# fitting the best Random Forest model on the full feature training set
rf_f.fit(X_train, y_train)

RandomForestClassifier(class_weight={0: 1.0, 1: 9.0}, max_depth=12,
                       min_samples_leaf=0.00014661034840158529,
                       n_estimators=1940, n_jobs=5, random_state=42)

In [26]:
# Defining the function to calculate the roc_auc score for the feature sets
def cal_roc_auc(X, y, cls, f_set, t_set, model_name):
    '''
    Prints the roc_auc score of an already fitted classifier on (X, y).

    X : features passed to cls.predict_proba
    y : true labels passed to roc_auc_score
    cls : classifier exposing predict_proba; column 1 of its output is scored
    f_set : String: feature set label, e.g. 'full feature', 'Reduced feature'
    t_set : String: data split label, e.g. 'training', 'test'
    model_name : String: name of the model used in the printed message

    Nothing is returned; the score is only printed.
    '''
    # Column 1 of predict_proba is taken as the score of the positive class
    positive_scores = cls.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, positive_scores)
    headline = 'The roc_auc_score for the {} {} set using the best {} classifier is '.format(f_set, t_set, model_name)
    print(headline, auc)

In [27]:
# Calculating the full feature training set roc_auc score using the best study parameters
cal_roc_auc(X_train, y_train, rf_f, 'full feature', 'training', 'Random Forest')

The roc_auc_score for the full feature training set using the best Random Forest classifier is  0.8813585851091872


In [28]:
# Calculating the full feature test set roc_auc score using the best study parameters
cal_roc_auc(X_test, y_test, rf_f, 'full feature', 'test', 'Random Forest')

The roc_auc_score for the full feature test set using the best Random Forest classifier is  0.8142587061889712


In [29]:
print('The accuracy for the full feature test set is: ',rf_f.score(X_test, y_test))

The accuracy for the full feature test set is:  0.8565185724690458


In [30]:
# Saving the full feature set best Random forest Classifier 
import joblib
joblib.dump(rf_f,'Rand_Forest_Full.joblib')

['Rand_Forest_Full.joblib']

#### Computing the best hyperparameters for the  Random Forest Classifier using Reduced feature Training Set.

In [33]:
# Extracting the best model parameters and best study score
best_study_score,best_study_params = train_test_roc_auc(X_train_red, y_train, rf_s, objective_wrappper_rf, cv_strat, n_trials=100)

[I 2020-10-01 18:03:32,343] A new study created in memory with name: no-name-42957682-1fe7-4281-978a-311a3b094c5c
[I 2020-10-01 18:03:59,369] Trial 0 finished with value: 0.795702317514923 and parameters: {'max_depth': 7, 'n_estimators': 1430, 'class_weight': {0: 1.0, 1: 9.0}, 'min_samples_leaf': 0.0009447228622446949}. Best is trial 0 with value: 0.795702317514923.
[I 2020-10-01 18:04:08,434] Trial 1 finished with value: 0.7888060115446315 and parameters: {'max_depth': 35, 'n_estimators': 490, 'class_weight': {0: 1.0, 1: 9.0}, 'min_samples_leaf': 0.011083727473134162}. Best is trial 0 with value: 0.795702317514923.
[I 2020-10-01 18:04:59,260] Trial 2 finished with value: 0.7818537196462192 and parameters: {'max_depth': 14, 'n_estimators': 1620, 'class_weight': {0: 1.0, 1: 12}, 'min_samples_leaf': 3.1794562444980524e-05}. Best is trial 0 with value: 0.795702317514923.
[I 2020-10-01 18:05:38,181] Trial 3 finished with value: 0.7938138334455298 and parameters: {'max_depth': 35, 'n_estima

[I 2020-10-01 18:41:02,137] Trial 62 finished with value: 0.7966046969720375 and parameters: {'max_depth': 20, 'n_estimators': 1280, 'class_weight': {0: 1.0, 1: 11}, 'min_samples_leaf': 0.0004909673882321841}. Best is trial 35 with value: 0.7996976437390577.
[I 2020-10-01 18:41:05,666] Trial 63 finished with value: 0.7925396074140748 and parameters: {'max_depth': 36, 'n_estimators': 100, 'class_weight': {0: 1.0, 1: 11}, 'min_samples_leaf': 0.00021657188066274285}. Best is trial 35 with value: 0.7996976437390577.
[I 2020-10-01 18:41:32,996] Trial 64 finished with value: 0.7985747610674434 and parameters: {'max_depth': 9, 'n_estimators': 1190, 'class_weight': {0: 1.0, 1: 11}, 'min_samples_leaf': 0.00013029424955964303}. Best is trial 35 with value: 0.7996976437390577.
[I 2020-10-01 18:42:20,949] Trial 65 finished with value: 0.7953396435979724 and parameters: {'max_depth': None, 'n_estimators': 1460, 'class_weight': None, 'min_samples_leaf': 0.00038302681502892447}. Best is trial 35 with

In [34]:
print('The best roc_auc_score for the study is: ', best_study_score)

The best roc_auc_score for the study is:  0.7997200159560963


In [35]:
# Pass the label and the dict as two arguments; the extra pair of parentheses
# made print() receive a single tuple and emit its repr.
print('The best study parameters for the classifier are: ', best_study_params)

('The best study parameters for the classifier are: ', {'max_depth': 11, 'n_estimators': 1560, 'class_weight': 'balanced', 'min_samples_leaf': 0.000264150675671259})


#### Computing the  reduced feature roc_auc score for the test data using the best study Parameters

In [36]:
# Obtaining the best reduced feature RF model by setting best study parameters.
# set_params returns the estimator it was called on, so calling it on rf_s
# directly would make rf_R the same object as rf_s (and as every other model
# derived from rf_s that way). A clone keeps the reduced feature model
# independent of the shared template.
rf_R = clone(rf_s).set_params(**best_study_params)

In [37]:
# fitting the best Random Forest model on the reduced feature training set
rf_R.fit(X_train_red, y_train)

RandomForestClassifier(class_weight='balanced', max_depth=11,
                       min_samples_leaf=0.000264150675671259, n_estimators=1560,
                       n_jobs=5, random_state=42)

In [38]:
# Calculating the reduced feature training set roc_auc score using the best study parameters
cal_roc_auc(X_train_red, y_train, rf_R, 'reduced feature', 'training', 'Random Forest')

The roc_auc_score for the reduced feature training set using the best Random Forest classifier is  0.8575020591265264


In [40]:
# Calculating the reduced feature test set roc_auc score using the best study parameters
cal_roc_auc(X_test_red, y_test, rf_R, 'reduced feature', 'test', 'Random Forest')

The roc_auc_score for the reduced feature test set using the best Random Forest classifier is  0.815116647601302


In [41]:
# Saving the reduced feature set best Random forest Classifier 
joblib.dump(rf_R,'Rand_Forest_Reduced.joblib')

['Rand_Forest_Reduced.joblib']

### Calculating R_R ratio for best Random forest Classifier.

In [8]:
# Loading the best Random forest Classifier model
import joblib
rf_R = joblib.load('Rand_Forest_Reduced.joblib')

In [9]:
# Computing the CV scores using sklearn's cross_val_score
score_Rand_forest = cross_val_score(rf_R, X_train_red, y_train, cv=cv_strat, n_jobs=5, scoring='roc_auc')

In [11]:
print('The reward associated with the best Random Forest Classifier using roc_auc metric is: ',np.mean(score_Rand_forest))

The reward associated with the best Random Forest Classifier using roc_auc metric is:  0.7997200159560963


In [12]:
print('The risk associated with the best Random Forest Classifier using roc_auc metric is: ',np.std(score_Rand_forest))

The risk associated with the best Random Forest Classifier using roc_auc metric is:  0.015116312612015882


In [13]:
R_R_Ratio_Rand_forest = np.mean(score_Rand_forest)/np.std(score_Rand_forest)

In [16]:
print('The reward risk ratio for the best Random Forest Classifier using roc_auc metric is: ',R_R_Ratio_Rand_forest)

The reward risk ratio for the best Random Forest Classifier using roc_auc metric is:  52.904437509475876


#### The R_R ratio for the best Random Forest Classifier using roc_auc metric is:  52.904437509475876

## Observations:
### 1) The above results indicate that the full feature dataset contains noisy components: the reduced feature set gives a slightly better test set roc_auc (0.8151 vs. 0.8143) than the full feature set. Furthermore, the amount of overfitting (the gap between training set and test set roc_auc) decreased when the reduced feature set was used. _Hence we will only use the reduced feature set from now on._
### 2) The R_R Ratio as well as the test set roc_auc of the best Random Forest classifier are higher than those of the corresponding Logistic Regression classifier; thus the former fits the dataset better than the latter.
### 3) Random Forest overfitting is generally due to high model variance. Perhaps using an Extra Trees classifier, which trades lower variance for higher bias, may help.

## Model_3: Extra Trees Classifier with Tuned Hyperparameters using Optuna.

In [42]:
# Defining the appropriate objective function for the Extra Trees classifier
def objective_wrappper_ext(X_tr, y_tr, cls=None, cv_strat=None):
    '''
    Builds an Optuna objective that tunes an Extra Trees classifier.

    Parameters
    ----------
    X_tr, y_tr : training features and labels, forwarded to cross_val_score.
    cls : estimator used as a template. A fresh clone is configured for every
        trial, so the object passed in is never modified.
    cv_strat : cross-validation splitter (or any other value accepted by the
        `cv` argument of cross_val_score).

    Returns
    -------
    objective : callable taking an Optuna trial and returning the mean
        cross-validated roc_auc of the sampled configuration.

    Raises
    ------
    ValueError : if no classifier is supplied.

    The class-weight candidates are read from the module-level `cl_weight` list.
    '''
    if cls is None:
        raise ValueError('objective_wrappper_ext requires a classifier instance (cls)')

    def objective(trial):
        params = {
            'max_depth': trial.suggest_categorical('max_depth', list(range(2, 50)) + [None]),
            # step must be passed by keyword: it is keyword-only in current Optuna
            'n_estimators': trial.suggest_int('n_estimators', 100, 2000, step=10),
            'class_weight': trial.suggest_categorical('class_weight', cl_weight),
            # suggest_float(log=True) is the supported spelling of the
            # deprecated suggest_loguniform and samples the same distribution
            'min_samples_leaf': trial.suggest_float('min_samples_leaf', .00001, .1, log=True),
        }

        # Configure a copy so trials do not leak state into the shared estimator
        model = clone(cls).set_params(**params)

        return np.mean(cross_val_score(model, X_tr, y_tr, cv=cv_strat, n_jobs=5, scoring='roc_auc'))
    return objective

In [43]:
# Importing Extra Trees Classifier from  Sklearn
from sklearn.ensemble import ExtraTreesClassifier

In [45]:
# Instantiating the Extra Trees classifier
ext_s = ExtraTreesClassifier(n_jobs=5,random_state=42)

#### Computing the best hyperparameters for the Extra Trees Classifier using Reduced feature Training Set.

In [46]:
# Extracting the best model parameters and best study score
best_study_score,best_study_params = train_test_roc_auc(X_train_red, y_train, ext_s, objective_wrappper_ext, cv_strat, n_trials=100)

[I 2020-10-01 19:44:52,922] A new study created in memory with name: no-name-cf31bdea-0d1b-42bc-8351-17e03f625c26
[I 2020-10-01 19:45:26,686] Trial 0 finished with value: 0.7901452752214027 and parameters: {'max_depth': 11, 'n_estimators': 1440, 'class_weight': {0: 1.0, 1: 10}, 'min_samples_leaf': 0.00485906968293306}. Best is trial 0 with value: 0.7901452752214027.
[I 2020-10-01 19:46:41,312] Trial 1 finished with value: 0.7819733080388087 and parameters: {'max_depth': 49, 'n_estimators': 1690, 'class_weight': None, 'min_samples_leaf': 5.5477758102182414e-05}. Best is trial 0 with value: 0.7901452752214027.
[I 2020-10-01 19:47:08,559] Trial 2 finished with value: 0.7857289436024057 and parameters: {'max_depth': 44, 'n_estimators': 1420, 'class_weight': {0: 1.0, 1: 12}, 'min_samples_leaf': 0.01445899080934333}. Best is trial 0 with value: 0.7901452752214027.
[I 2020-10-01 19:47:36,358] Trial 3 finished with value: 0.7835793756583126 and parameters: {'max_depth': 29, 'n_estimators': 181

[I 2020-10-01 20:25:21,312] Trial 63 finished with value: 0.7954530583205857 and parameters: {'max_depth': 12, 'n_estimators': 1640, 'class_weight': {0: 1.0, 1: 10}, 'min_samples_leaf': 0.0001897111307780357}. Best is trial 53 with value: 0.7955937625052967.
[I 2020-10-01 20:26:13,537] Trial 64 finished with value: 0.7954530583205857 and parameters: {'max_depth': 12, 'n_estimators': 1640, 'class_weight': {0: 1.0, 1: 10}, 'min_samples_leaf': 0.0001882759529498339}. Best is trial 53 with value: 0.7955937625052967.
[I 2020-10-01 20:27:02,856] Trial 65 finished with value: 0.7950059331746028 and parameters: {'max_depth': 12, 'n_estimators': 1630, 'class_weight': {0: 1.0, 1: 10}, 'min_samples_leaf': 0.0006451320984374319}. Best is trial 53 with value: 0.7955937625052967.
[I 2020-10-01 20:27:27,832] Trial 66 finished with value: 0.7894475035082407 and parameters: {'max_depth': 5, 'n_estimators': 1640, 'class_weight': {0: 1.0, 1: 10}, 'min_samples_leaf': 0.0002173167423454744}. Best is trial 

In [47]:
print('The best roc_auc_score for the study is: ', best_study_score)

The best roc_auc_score for the study is:  0.7955937625052967


In [48]:
print('The best study parameters for the classifier are: ', best_study_params)

('The best study parameters for the classifier are: ', {'max_depth': 10, 'n_estimators': 1880, 'class_weight': {0: 1.0, 1: 12}, 'min_samples_leaf': 8.519138749443575e-05})


#### Computing the  reduced feature roc_auc score for the test data using the best study Parameters

In [49]:
# Obtaining the best reduced feature Extra Trees model by setting best study parameters.
# set_params returns the estimator it was called on, so calling it on ext_s
# directly would make extr_R the same object as ext_s, and a later tuning run
# on ext_s would silently reconfigure extr_R too. A clone keeps this model
# independent of the shared template.
extr_R = clone(ext_s).set_params(**best_study_params)

In [50]:
# fitting the best Extra Trees model on the reduced feature training set
extr_R.fit(X_train_red, y_train)

ExtraTreesClassifier(class_weight={0: 1.0, 1: 12}, max_depth=10,
                     min_samples_leaf=8.519138749443575e-05, n_estimators=1880,
                     n_jobs=5, random_state=42)

In [51]:
# Calculating the reduced feature training set roc_auc score using the best study parameters
cal_roc_auc(X_train_red, y_train, extr_R, 'reduced feature', 'training', 'Extra Trees')

The roc_auc_score for the reduced feature training set using the best Extra Trees classifier is  0.8384501171715599


In [53]:
# Calculating the reduced feature test set roc_auc score using the best study parameters
cal_roc_auc(X_test_red, y_test, extr_R, 'reduced feature', 'test', 'Extra Trees')

The roc_auc_score for the reduced feature test set using the best Extra Trees classifier is  0.8092702633378932


In [54]:
# Saving the reduced feature set best Extra Trees Classifier  
joblib.dump(extr_R, 'Extra_Trees_Reduced.joblib')

['Extra_Trees_Reduced.joblib']

## Observations:
### 1) From the above analysis, we can see that the Extra Trees classifier did a good job of reducing overfitting (a smaller gap between training set and test set roc_auc), probably by reducing model variance.
### 2) Though overfitting was reduced, the test set roc_auc score also dropped, probably because the corresponding increase in bias outweighed the decrease in variance.

### Trying bias reduction in Extra Trees by further tuning the class weight parameter & n_estimators.

In [70]:
# Defining the new class weights: a wider set of `class_weight` candidates for
# the second Extra Trees study. Besides 'balanced', every dict keeps class 0 at
# weight 1.0 and gives class 1 a weight of 10 or of 12 through 20.
cl_weight_extra = ['balanced',{0:1.0,1:10},{0:1.0,1:12},{0:1.0,1:13},{0:1.0,1:14},{0:1.0,1:15},{0:1.0,1:16},
                  {0:1.0,1:17},{0:1.0,1:18},{0:1.0,1:19},{0:1.0,1:20}]

In [71]:
# Defining the appropriate objective function for the Extra Trees classifier
def objective_wrappper_ext_1(X_tr, y_tr, cls=None, cv_strat=None):
    '''
    Builds an Optuna objective that tunes an Extra Trees classifier over a
    narrower n_estimators range (1500-2000), max_depth from 5 upwards and the
    wider class-weight candidates.

    Parameters
    ----------
    X_tr, y_tr : training features and labels, forwarded to cross_val_score.
    cls : estimator used as a template. A fresh clone is configured for every
        trial, so the object passed in is never modified.
    cv_strat : cross-validation splitter (or any other value accepted by the
        `cv` argument of cross_val_score).

    Returns
    -------
    objective : callable taking an Optuna trial and returning the mean
        cross-validated roc_auc of the sampled configuration.

    Raises
    ------
    ValueError : if no classifier is supplied.

    The class-weight candidates are read from the module-level
    `cl_weight_extra` list.
    '''
    if cls is None:
        raise ValueError('objective_wrappper_ext_1 requires a classifier instance (cls)')

    def objective(trial):
        params = {
            'max_depth': trial.suggest_categorical('max_depth', list(range(5, 50)) + [None]),
            # step must be passed by keyword: it is keyword-only in current Optuna
            'n_estimators': trial.suggest_int('n_estimators', 1500, 2000, step=10),
            'class_weight': trial.suggest_categorical('class_weight', cl_weight_extra),
            # suggest_float(log=True) is the supported spelling of the
            # deprecated suggest_loguniform and samples the same distribution
            'min_samples_leaf': trial.suggest_float('min_samples_leaf', .00001, .1, log=True),
        }

        # Configure a copy so trials do not leak state into the shared estimator
        model = clone(cls).set_params(**params)

        return np.mean(cross_val_score(model, X_tr, y_tr, cv=cv_strat, n_jobs=5, scoring='roc_auc'))
    return objective

In [72]:
# Extracting the best model parameters and best study score
best_study_score,best_study_params = train_test_roc_auc(X_train_red, y_train, ext_s, objective_wrappper_ext_1, cv_strat, n_trials=100)

[I 2020-10-02 00:33:58,065] A new study created in memory with name: no-name-e401365a-c15f-4f96-9c00-d3804b428448
[I 2020-10-02 00:34:35,032] Trial 0 finished with value: 0.7879921196780325 and parameters: {'max_depth': 19, 'n_estimators': 1700, 'class_weight': {0: 1.0, 1: 17}, 'min_samples_leaf': 0.008256442099799309}. Best is trial 0 with value: 0.7879921196780325.
[I 2020-10-02 00:35:07,187] Trial 1 finished with value: 0.7843989815015627 and parameters: {'max_depth': 30, 'n_estimators': 1790, 'class_weight': {0: 1.0, 1: 18}, 'min_samples_leaf': 0.01785356641779981}. Best is trial 0 with value: 0.7879921196780325.
[I 2020-10-02 00:35:55,125] Trial 2 finished with value: 0.7907049728470049 and parameters: {'max_depth': 33, 'n_estimators': 1810, 'class_weight': {0: 1.0, 1: 16}, 'min_samples_leaf': 0.0035453057509942294}. Best is trial 2 with value: 0.7907049728470049.
[I 2020-10-02 00:36:34,402] Trial 3 finished with value: 0.7891285961949535 and parameters: {'max_depth': 21, 'n_estim

[I 2020-10-02 01:33:37,296] Trial 62 finished with value: 0.7872814283932863 and parameters: {'max_depth': 42, 'n_estimators': 1690, 'class_weight': {0: 1.0, 1: 15}, 'min_samples_leaf': 0.0001512437613727551}. Best is trial 60 with value: 0.7956730841106945.
[I 2020-10-02 01:34:28,313] Trial 63 finished with value: 0.7946457346424182 and parameters: {'max_depth': 11, 'n_estimators': 1720, 'class_weight': {0: 1.0, 1: 15}, 'min_samples_leaf': 9.932073753735719e-05}. Best is trial 60 with value: 0.7956730841106945.
[I 2020-10-02 01:35:05,252] Trial 64 finished with value: 0.7946195976480379 and parameters: {'max_depth': 8, 'n_estimators': 1660, 'class_weight': {0: 1.0, 1: 15}, 'min_samples_leaf': 0.00010164324725984903}. Best is trial 60 with value: 0.7956730841106945.
[I 2020-10-02 01:35:42,339] Trial 65 finished with value: 0.7946195976480379 and parameters: {'max_depth': 8, 'n_estimators': 1660, 'class_weight': {0: 1.0, 1: 15}, 'min_samples_leaf': 0.00010931423401279901}. Best is trial

In [73]:
print('The best roc_auc_score for the study is: ',best_study_score)

The best roc_auc_score for the study is:  0.7960527015478644


In [74]:
print('The best study parameters for the classifier are: ', best_study_params)

The best study parameters for the classifier are:  {'max_depth': 10, 'n_estimators': 1760, 'class_weight': {0: 1.0, 1: 17}, 'min_samples_leaf': 0.00022883489615884475}


#### Computing the  reduced feature roc_auc score for the test data using the fine tuned best study Parameters

In [75]:
# Obtaining the best reduced feature Extra Trees model by setting tuned best study parameters.
# set_params returns the estimator it was called on, so calling it on ext_s
# directly would make extr_R_tu the same object as ext_s and as any model
# previously derived from ext_s that way; fitting it would then overwrite that
# earlier model in memory. A clone keeps the tuned model independent.
extr_R_tu = clone(ext_s).set_params(**best_study_params)

In [76]:
# fitting the best tuned Extra Trees model on the reduced feature training set
extr_R_tu.fit(X_train_red, y_train)

ExtraTreesClassifier(class_weight={0: 1.0, 1: 17}, max_depth=10,
                     min_samples_leaf=0.00022883489615884475, n_estimators=1760,
                     n_jobs=5, random_state=42)

In [77]:
# Calculating the reduced feature training set roc_auc score using the tuned best study parameters
cal_roc_auc(X_train_red, y_train, extr_R_tu, 'reduced feature', 'training', 'tuned Extra Trees')

The roc_auc_score for the reduced feature training set using the best tuned Extra Trees classifier is  0.8341873714474077


In [78]:
# Calculating the reduced feature test set roc_auc score using the tuned best study parameters
cal_roc_auc(X_test_red, y_test, extr_R_tu, 'reduced feature', 'test', 'tuned Extra Trees')

The roc_auc_score for the reduced feature test set using the best tuned Extra Trees classifier is  0.80904722805321


In [79]:
# Saving the reduced feature set best Extra Trees Classifier  
joblib.dump(extr_R_tu,'Extra_Trees_Reduced_Tuned.joblib')

['Extra_Trees_Reduced_Tuned.joblib']

## Observations:
### 1) The tuned Extra Trees Classifier has further reduced overfitting by narrowing the gap between the training set & test set roc_auc scores. However, the test set roc_auc has also decreased by a small amount.


### Calculating R_R ratio for best Extra Trees Classifier .

In [17]:
# Loading the best Extra Trees Classifier model
extr_R = joblib.load('Extra_Trees_Reduced.joblib')

In [18]:
# Computing the CV scores using sklearn's cross_val_score
score_Extra_Trees = cross_val_score(extr_R, X_train_red, y_train, cv=cv_strat, n_jobs=5, scoring='roc_auc')

In [19]:
print('The reward associated with the best Extra Trees Classifier using roc_auc metric is: ',np.mean(score_Extra_Trees))

The reward associated with the best Extra Trees Classifier using roc_auc metric is:  0.7955937625052967


In [20]:
print('The risk associated with the best Extra Trees Classifier using roc_auc metric is: ',np.std(score_Extra_Trees))

The risk associated with the best Extra Trees Classifier using roc_auc metric is:  0.01526478266685032


In [21]:
R_R_Ratio_Extra_Trees = np.mean(score_Extra_Trees)/np.std(score_Extra_Trees)

In [22]:
print('The reward risk ratio for the best Extra trees Classifier using roc_auc metric is: ',R_R_Ratio_Extra_Trees)

The reward risk ratio for the best Extra trees Classifier using roc_auc metric is:  52.1195604201456


#### The R_R ratio for the best Extra trees Classifier using roc_auc metric is:  52.1195604201456

### Calculating R_R ratio for tuned Extra Trees Classifier model

In [23]:
# Loading the best tuned Extra Trees Classifier model
extr_R_tu = joblib.load('Extra_Trees_Reduced_Tuned.joblib')

In [26]:
# Computing the CV scores using sklearn's cross_val_score
score_Extra_Trees_tuned = cross_val_score(extr_R_tu, X_train_red, y_train, cv=cv_strat, n_jobs=5, scoring='roc_auc')

In [27]:
print('The reward associated with the tuned Extra Trees Classifier using roc_auc metric is: ',np.mean(score_Extra_Trees_tuned))

The reward associated with the tuned Extra Trees Classifier using roc_auc metric is:  0.7960527015478644


In [28]:
print('The risk associated with the tuned Extra Trees Classifier using roc_auc metric is: ',np.std(score_Extra_Trees_tuned))

The risk associated with the tuned Extra Trees Classifier using roc_auc metric is:  0.014914659429384768


In [30]:
R_R_Ratio_Extra_Trees_tuned = np.mean(score_Extra_Trees_tuned)/np.std(score_Extra_Trees_tuned)

In [31]:
print('The reward risk ratio for the tuned Extra trees Classifier using roc_auc metric is: ',R_R_Ratio_Extra_Trees_tuned)

The reward risk ratio for the tuned Extra trees Classifier using roc_auc metric is:  53.373843721800746


#### The R_R ratio for the best tuned Extra trees Classifier using roc_auc metric is:  53.373843721800746

## Observations:
### 1) The tuned Extra Trees classifier has the best R_R ratio of all the tree based classifiers trained on the reduced feature dataset, due to the low std. dev. of its cross validated roc_auc scores. Hence, based upon this metric, the best choice among the tree based models is the Tuned Extra Trees Classifier.

### R_R Ratio for the best Tree based bagging classifier using roc_auc score is:  53.373843721800746,  corresponding to Tuned Extra Trees classifier.

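### _However, taking everything into account (the Random Forest has the highest cross-validated roc_auc, 0.7997, and the highest test set roc_auc, 0.8151), for this dataset the best Tree based bagging classifier is the Optuna tuned Random Forest Classifier._  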