In [1]:
# Importing Required Python Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

In [2]:
# Loading Reduced feature Training set
X_train_red = pd.read_csv('X_train_final.csv')
y_train = pd.read_csv('y_train.final.csv')

In [3]:
# Loading Reduced feature Test set
X_test_red = pd.read_csv('X_test_final.csv')
y_test = pd.read_csv('y_test.final.csv')

## Model_15: Blender Model

#### Loading all the best models from the various categories

In [4]:
# Importing Joblib module
import joblib

In [5]:
# Importing tensorflow & keras
import tensorflow
from tensorflow import keras

In [6]:
# Importing best Logistic regression Classifier
lr = joblib.load('Log_Reg_Reduced.joblib')

In [7]:
# Importing best Random Forest Classifier
rf = joblib.load('Rand_Forest_Reduced.joblib')

In [8]:
# Importing best Light Gbm Classifier
lgbm = joblib.load('Light_Gbm_Reduced.joblib')

In [9]:
# Importing best Neural Net Classifier
neural = keras.models.load_model('Best_model_Selu_eq_Learn.h5')

In [10]:
# Importing best Linear Discriminant Analysis Classifier
lda = joblib.load('Linear_Dis_Reduced.joblib')

In [11]:
# Importing train test split from Sklearn to produce validation set
from sklearn.model_selection import train_test_split

In [12]:
# Performing a stratified train/validation split with test_size=0.4: 40% of the reduced
# Training set is held out as X_val / y_val and the remaining 60% becomes X_train_red_1 / y_train_1.
# The base classifiers' predictions on the held-out 40% are later used as the blender's
# training features, so a larger hold-out provides more Training data to the blender.
X_train_red_1, X_val, y_train_1, y_val = train_test_split(X_train_red, y_train, test_size=0.4, random_state=42,
                                                          stratify=y_train)

In [13]:
# Listing all classifiers 
clfs = [lr, rf, lgbm, lda, neural]

In [14]:
# Defining the exponential decay learning rate with step 4.
def exponential_decay_fn(epoch):
    """Return the learning rate for the given epoch.

    The rate starts at 0.01 for epoch 0 and is scaled by a factor of 0.1
    for every 4 epochs elapsed; because ``epoch / 4`` is a true division
    the decay is applied smoothly at every epoch, not in discrete steps.
    """
    # Number of (possibly fractional) 4-epoch periods elapsed so far
    periods_elapsed = epoch / 4
    return 0.01 * 0.1 ** periods_elapsed

In [15]:
# Re-fitting every listed classifier on the 60% split of the reduced Training set
for clf in clfs:
    if not isinstance(clf, keras.models.Sequential):
        # Non-Keras estimators only need a plain fit on the training split
        clf.fit(X_train_red_1, y_train_1)
        continue

    # Keras Sequential model: compile it before fitting
    clf.compile(
        loss="binary_crossentropy",
        optimizer=keras.optimizers.Nadam(beta_1=0.9, beta_2=0.999),
        metrics=["accuracy"],
    )

    # Callbacks: checkpoint written to 'best_model.h5', early stopping, per-epoch learning rate
    checkpoint_cb = keras.callbacks.ModelCheckpoint('best_model.h5', save_best_only=True)
    early_stopping_cb = keras.callbacks.EarlyStopping(patience=10)
    lr_scheduler_cb = keras.callbacks.LearningRateScheduler(exponential_decay_fn)

    # NOTE(review): (X_val, y_val) is passed as validation data here and the same split is
    # later used to build the blender's training features, so the checkpointed network has
    # been selected on the data the blender is trained on -- confirm this is intended.
    clf.fit(
        X_train_red_1,
        y_train_1,
        epochs=50,
        validation_data=(X_val, y_val),
        batch_size=32,
        class_weight={0: 1.0, 1: 10.0},
        callbacks=[checkpoint_cb, early_stopping_cb, lr_scheduler_cb],
    )
    

  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train on 19770 samples, validate on 13180 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50


In [16]:
# Preparing the Training set features matrix for the blender Classifier:
# one row per validation sample, one column per base classifier (same order as `clfs`)
X_val_predictions = np.empty((len(X_val), len(clfs)), dtype=np.float32)
# Filling in the values for the Blender's Training set
for index, estimator in enumerate(clfs):
    if isinstance(estimator, keras.models.Sequential):
        # Use the checkpoint written during training instead of the in-memory network
        estimator = keras.models.load_model('best_model.h5')
        # `Sequential.predict_proba` was only a deprecated alias of `predict` (removed in later
        # TensorFlow releases), so `predict` is called directly.
        # Assumes the network has a single output unit, so flatten() yields one value per row.
        X_val_predictions[:, index] = estimator.predict(X_val).flatten()
    else:
        # Second column of predict_proba = probability of the estimator's second class
        X_val_predictions[:, index] = estimator.predict_proba(X_val)[:, 1]

In [17]:
# Preparing the Testing set feature matrix for the blender Classifier:
# one row per test sample, one column per base classifier (same order as `clfs`)
X_test_predictions = np.empty((len(X_test_red), len(clfs)), dtype=np.float32)
# Filling in the values for the Blender's Test set
for index, estimator in enumerate(clfs):
    if isinstance(estimator, keras.models.Sequential):
        # Use the checkpoint written during training instead of the in-memory network
        estimator = keras.models.load_model('best_model.h5')
        # `Sequential.predict_proba` was only a deprecated alias of `predict` (removed in later
        # TensorFlow releases), so `predict` is called directly.
        # Assumes the network has a single output unit, so flatten() yields one value per row.
        X_test_predictions[:, index] = estimator.predict(X_test_red).flatten()
    else:
        # Second column of predict_proba = probability of the estimator's second class
        X_test_predictions[:, index] = estimator.predict_proba(X_test_red)[:, 1]

In [18]:
# Preparing both Training & Testing sets for Blender classifier.
# Each column holds one base classifier's predicted probabilities, in the order used in `clfs`.
blender_columns = ['Log_Reg', 'Rand_Forest', 'Light_Gbm', 'Lin_Dis', 'Neural_Net']
X_train_Blender, X_test_Blender = (
    pd.DataFrame(prediction_matrix, columns=blender_columns)
    for prediction_matrix in (X_val_predictions, X_test_predictions)
)
# Targets: the validation labels train the blender, the test labels evaluate it
y_train_Blender, y_test_Blender = y_val, y_test

In [22]:
# Saving the Blender's Training & testing sets as csv files
X_train_Blender.to_csv('X_train_Blender.csv',index=False)
y_train_Blender.to_csv('y_train_Blender.csv',index=False)
X_test_Blender.to_csv('X_test_Blender.csv',index=False)
y_test_Blender.to_csv('y_test_Blender.csv',index=False)

### Selecting the Random Forest Classifier as the Blender classifier, as it has one of the highest test set roc_auc scores as well as one of the highest R_R ratios for this dataset.

In [23]:
# Importing Optuna and the Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
import optuna

In [24]:
# Importing Cross_val_score & Stratified K fold 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [25]:
# Initializing the K Fold object (10 stratified, unshuffled folds).
# `random_state` only has an effect together with `shuffle=True`; passing it with the default
# shuffle=False is ignored by older scikit-learn releases and rejected by newer ones, so it is
# omitted here. The resulting folds are the same deterministic, unshuffled folds as before.
cv_strat = StratifiedKFold(n_splits=10)

In [32]:
# Candidate `class_weight` values for the search: the 'balanced' option plus explicit
# dictionaries that keep label 0 at weight 1.0 and give label 1 a weight from 9 to 18
cl_weight = ['balanced', {0: 1.0, 1: 9.0}]
cl_weight += [{0: 1.0, 1: label_1_weight} for label_1_weight in range(10, 19)]

In [33]:
# Defining the appropriate objective function for the Blender Random Forest classifier
def objective_wrappper_rf(X_tr, y_tr, cls=None, cv_strat=None, cl_weight=None):
    '''
    Builds an Optuna objective that tunes Random Forest parameters on the given
    training set X_tr, y_tr using the cv_strat cross-validation object & cl_weight.

    Parameters
    ----------
    X_tr, y_tr : training features and target, forwarded to cross_val_score
    cls        : estimator exposing set_params; it is re-configured IN PLACE on every trial
    cv_strat   : value forwarded as cross_val_score's `cv` argument
    cl_weight  : list of candidate values for the `class_weight` parameter

    Returns
    -------
    objective : callable taking an Optuna trial and returning the mean
                cross-validated roc_auc for the parameters sampled in that trial
    '''

    def objective(trial):
        params = {
            # depth is categorical so that `None` can be one of the options
            'max_depth': trial.suggest_categorical('max_depth', list(range(2, 50)) + [None]),
            # `step` passed by keyword: newer Optuna releases expect it keyword-only
            'n_estimators': trial.suggest_int('n_estimators', 100, 2000, step=10),
            'class_weight': trial.suggest_categorical('class_weight', cl_weight),
            # log-uniform float; replaces the deprecated suggest_loguniform
            'min_samples_leaf': trial.suggest_float('min_samples_leaf', .00001, .1, log=True),
        }

        cls.set_params(**params)  # configure the shared estimator with this trial's parameters

        return np.mean(cross_val_score(cls, X_tr, y_tr, cv=cv_strat, n_jobs=5, scoring='roc_auc'))

    return objective

In [34]:
# Defining the evaluation function for study's best parameters
def train_roc_auc(X_tr, y_tr, cls, obj_func, cv, cl_weight, n_trials=100, seed=None):
    ''' Runs an Optuna study that maximizes the objective built by obj_func on the Training set.

    Parameters
    ----------
    X_tr, y_tr : training features and target, forwarded to obj_func
    cls        : classifier forwarded to obj_func
    obj_func   : factory called as obj_func(X_tr, y_tr, cls, cv, cl_weight) that returns
                 the objective function handed to the study
    cv         : cross-validation object forwarded to obj_func
    cl_weight  : candidate class weights forwarded to obj_func
    n_trials   : number of trials to run
    seed       : optional seed for the study's TPE sampler, making the search reproducible;
                 None (default) keeps Optuna's default, unseeded sampler

    Returns
    -------
    (best_score, best_params) : the study's best objective value and the parameters of that trial
    '''
    # sampler=None lets Optuna pick its default sampler, exactly as when the argument is omitted
    sampler = None if seed is None else optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    objective = obj_func(X_tr, y_tr, cls, cv, cl_weight)
    study.optimize(objective, n_trials=n_trials)
    return (study.best_value, study.best_params)


In [35]:
# Instantiating the Random forest classifier
rf_s = RandomForestClassifier(n_jobs=5, random_state=42)

In [36]:
# Extracting the best model parameters and best study score
best_study_score,best_study_params = train_roc_auc(X_train_Blender, y_train_Blender, rf_s, objective_wrappper_rf,
                                                   cv_strat, cl_weight, n_trials=200) 

[32m[I 2020-10-22 01:03:48,783][0m A new study created in memory with name: no-name-384cdf6b-075a-4a92-90a0-40d52cc412c4[0m
[32m[I 2020-10-22 01:04:01,737][0m Trial 0 finished with value: 0.7849784000325033 and parameters: {'max_depth': 41, 'n_estimators': 590, 'class_weight': 'balanced', 'min_samples_leaf': 0.0010257109620746797}. Best is trial 0 with value: 0.7849784000325033.[0m
[32m[I 2020-10-22 01:04:07,535][0m Trial 1 finished with value: 0.768637309016785 and parameters: {'max_depth': 34, 'n_estimators': 240, 'class_weight': {0: 1.0, 1: 18}, 'min_samples_leaf': 1.9077970360523176e-05}. Best is trial 0 with value: 0.7849784000325033.[0m
[32m[I 2020-10-22 01:04:17,097][0m Trial 2 finished with value: 0.7952501943527945 and parameters: {'max_depth': 37, 'n_estimators': 1340, 'class_weight': 'balanced', 'min_samples_leaf': 0.09007935827707621}. Best is trial 2 with value: 0.7952501943527945.[0m
[32m[I 2020-10-22 01:04:20,129][0m Trial 3 finished with value: 0.799211150

[32m[I 2020-10-22 01:11:11,628][0m Trial 30 finished with value: 0.7772384253919339 and parameters: {'max_depth': 43, 'n_estimators': 640, 'class_weight': {0: 1.0, 1: 11}, 'min_samples_leaf': 0.0001976999022598786}. Best is trial 17 with value: 0.8021385103321064.[0m
[32m[I 2020-10-22 01:11:15,048][0m Trial 31 finished with value: 0.8020488102134709 and parameters: {'max_depth': 3, 'n_estimators': 390, 'class_weight': {0: 1.0, 1: 18}, 'min_samples_leaf': 1.0345866875657247e-05}. Best is trial 17 with value: 0.8021385103321064.[0m
[32m[I 2020-10-22 01:11:17,990][0m Trial 32 finished with value: 0.769055862302155 and parameters: {'max_depth': 25, 'n_estimators': 110, 'class_weight': {0: 1.0, 1: 18}, 'min_samples_leaf': 1.006912552352212e-05}. Best is trial 17 with value: 0.8021385103321064.[0m
[32m[I 2020-10-22 01:11:23,175][0m Trial 33 finished with value: 0.7847159269533481 and parameters: {'max_depth': 8, 'n_estimators': 320, 'class_weight': {0: 1.0, 1: 18}, 'min_samples_le

[32m[I 2020-10-22 01:16:04,348][0m Trial 61 finished with value: 0.8020488102134709 and parameters: {'max_depth': 3, 'n_estimators': 390, 'class_weight': {0: 1.0, 1: 18}, 'min_samples_leaf': 2.0168391482315777e-05}. Best is trial 17 with value: 0.8021385103321064.[0m
[32m[I 2020-10-22 01:16:11,281][0m Trial 62 finished with value: 0.8020171079346522 and parameters: {'max_depth': 3, 'n_estimators': 820, 'class_weight': {0: 1.0, 1: 18}, 'min_samples_leaf': 1.2459039858164343e-05}. Best is trial 17 with value: 0.8021385103321064.[0m
[32m[I 2020-10-22 01:16:14,521][0m Trial 63 finished with value: 0.8020450784631651 and parameters: {'max_depth': 3, 'n_estimators': 350, 'class_weight': {0: 1.0, 1: 18}, 'min_samples_leaf': 1.0192061478759632e-05}. Best is trial 17 with value: 0.8021385103321064.[0m
[32m[I 2020-10-22 01:16:24,413][0m Trial 64 finished with value: 0.7692883539037539 and parameters: {'max_depth': 39, 'n_estimators': 400, 'class_weight': {0: 1.0, 1: 13}, 'min_samples_

[32m[I 2020-10-22 01:20:21,739][0m Trial 92 finished with value: 0.7979649557457198 and parameters: {'max_depth': 15, 'n_estimators': 660, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.022219155492572508}. Best is trial 75 with value: 0.8021924484089415.[0m
[32m[I 2020-10-22 01:20:29,103][0m Trial 93 finished with value: 0.802145313396067 and parameters: {'max_depth': 3, 'n_estimators': 900, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.0072758331110604315}. Best is trial 75 with value: 0.8021924484089415.[0m
[32m[I 2020-10-22 01:20:37,037][0m Trial 94 finished with value: 0.802048631813632 and parameters: {'max_depth': 3, 'n_estimators': 960, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.006943175352849769}. Best is trial 75 with value: 0.8021924484089415.[0m
[32m[I 2020-10-22 01:20:46,253][0m Trial 95 finished with value: 0.7975611017500059 and parameters: {'max_depth': 47, 'n_estimators': 780, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf':

[32m[I 2020-10-22 01:24:15,584][0m Trial 123 finished with value: 0.8023756833102572 and parameters: {'max_depth': 3, 'n_estimators': 140, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.005341682272651386}. Best is trial 122 with value: 0.8023969577401138.[0m
[32m[I 2020-10-22 01:24:18,957][0m Trial 124 finished with value: 0.7927950146321121 and parameters: {'max_depth': 20, 'n_estimators': 200, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.005559585141026897}. Best is trial 122 with value: 0.8023969577401138.[0m
[32m[I 2020-10-22 01:24:21,742][0m Trial 125 finished with value: 0.7907719879009012 and parameters: {'max_depth': 33, 'n_estimators': 140, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.003442431905934052}. Best is trial 122 with value: 0.8023969577401138.[0m
[32m[I 2020-10-22 01:24:25,926][0m Trial 126 finished with value: 0.7932759869848947 and parameters: {'max_depth': 40, 'n_estimators': 260, 'class_weight': {0: 1.0, 1: 14}, 'min_sampl

[32m[I 2020-10-22 01:25:40,112][0m Trial 154 finished with value: 0.8023206030931688 and parameters: {'max_depth': 3, 'n_estimators': 160, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.004994109228659473}. Best is trial 153 with value: 0.8024123420692364.[0m
[32m[I 2020-10-22 01:25:43,414][0m Trial 155 finished with value: 0.7920477824072648 and parameters: {'max_depth': 34, 'n_estimators': 190, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.0053199508406276165}. Best is trial 153 with value: 0.8024123420692364.[0m
[32m[I 2020-10-22 01:25:45,412][0m Trial 156 finished with value: 0.8023414337357089 and parameters: {'max_depth': 3, 'n_estimators': 210, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.0033084410449988155}. Best is trial 153 with value: 0.8024123420692364.[0m
[32m[I 2020-10-22 01:25:48,427][0m Trial 157 finished with value: 0.8021495549440332 and parameters: {'max_depth': 3, 'n_estimators': 330, 'class_weight': {0: 1.0, 1: 11}, 'min_sampl

[32m[I 2020-10-22 01:26:48,559][0m Trial 185 finished with value: 0.8021789692239899 and parameters: {'max_depth': 3, 'n_estimators': 150, 'class_weight': {0: 1.0, 1: 10}, 'min_samples_leaf': 0.00554302030054362}. Best is trial 169 with value: 0.8024281495415904.[0m
[32m[I 2020-10-22 01:26:50,693][0m Trial 186 finished with value: 0.8023750770359397 and parameters: {'max_depth': 3, 'n_estimators': 210, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.003559114240811352}. Best is trial 169 with value: 0.8024281495415904.[0m
[32m[I 2020-10-22 01:26:56,287][0m Trial 187 finished with value: 0.7880861884979656 and parameters: {'max_depth': 47, 'n_estimators': 290, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.0022682501230628445}. Best is trial 169 with value: 0.8024281495415904.[0m
[32m[I 2020-10-22 01:26:57,443][0m Trial 188 finished with value: 0.8023716645022356 and parameters: {'max_depth': 3, 'n_estimators': 100, 'class_weight': {0: 1.0, 1: 14}, 'min_samples

In [37]:
print('The best roc_auc_score for the study is: ',best_study_score)

The best roc_auc_score for the study is:  0.8024321971233517


In [38]:
print(('The best study parameters for the classifier are: ',best_study_params))

('The best study parameters for the classifier are: ', {'max_depth': 3, 'n_estimators': 220, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.0043477331945775514})


#### Computing the test set  roc_auc score using the best study Parameters

In [39]:
# Obtaining the best blender RF model by setting best study parameters.
rf_blender = rf_s.set_params(**best_study_params)

In [40]:
# fitting the best blender rf model on the blender training set
rf_blender.fit(X_train_Blender, y_train_Blender)

RandomForestClassifier(class_weight={0: 1.0, 1: 14}, max_depth=3,
                       min_samples_leaf=0.0043477331945775514, n_estimators=220,
                       n_jobs=5, random_state=42)

In [41]:
# Defining the function to calculate the roc_auc score for the feature sets
def cal_roc_auc(X, y, cls, f_set, t_set, model_name):
    ''' Prints the roc auc score of an already fitted classifier on the given set.

        X : features passed to cls.predict_proba
        y : true labels the predicted probabilities are scored against
        cls : fitted classifier exposing predict_proba
        f_set : String: specifies 'full feature', 'Reduced feature'
        t_set : String: specifies 'training', 'test'
        model_name : String: specifies Name of the model

        Nothing is returned; the score is only printed. '''

    # Only the second probability column is scored
    second_class_proba = cls.predict_proba(X)[:, 1]
    score = roc_auc_score(y, second_class_proba)
    headline = 'The roc_auc_score for the {} {} set using the best {} classifier is '.format(f_set, t_set, model_name)
    print(headline, score)

In [42]:
# Importing the Sklearn's roc_auc_score module
from sklearn.metrics import roc_auc_score

In [44]:
# Calculating the reduced feature training probabilities set roc_auc score using the best study parameters
cal_roc_auc(X_train_Blender, y_train_Blender, rf_blender,'Reduced feature',
            'training probabilities', 'Random Forest Blender')

The roc_auc_score for the Reduced feature training probabilities set using the best Random Forest Blender classifier is  0.8084145430361763


In [45]:
# Calculating the reduced feature  test probabilities set roc_auc score using the best study parameters
cal_roc_auc(X_test_Blender, y_test_Blender, rf_blender,'Reduced feature',
            'test probabilities', 'Random Forest Blender')

The roc_auc_score for the Reduced feature test probabilities set using the best Random Forest Blender classifier is  0.8130888396858342


In [46]:
# Saving the Blender Model
import joblib
joblib.dump(rf_blender,'Random_Forest_Blender.joblib')

['Random_Forest_Blender.joblib']

### Calculating R_R ratio for the best Random forest Blender Classifier.

In [47]:
# Computing the CV scores using sklearn's cross_val_score
score_blender = cross_val_score(rf_blender, X_train_Blender, y_train_Blender, cv=cv_strat, n_jobs=5, scoring='roc_auc')

In [48]:
print('The reward associated with the best Random Forest blender classifier using roc_auc metric is: ',np.mean(score_blender))

The reward associated with the best Random Forest blender classifier using roc_auc metric is:  0.8024321971233517


In [49]:
print('The risk associated with the best Random Forest blender Classifier using roc_auc metric is: ',np.std(score_blender))

The risk associated with the best Random Forest blender Classifier using roc_auc metric is:  0.029289074299698722


In [50]:
R_R_Ratio_blender = np.mean(score_blender)/np.std(score_blender)

In [51]:
print('The reward risk ratio for the best Random Forest blender Classifier using roc_auc metric is: ',R_R_Ratio_blender)

The reward risk ratio for the best Random Forest blender Classifier using roc_auc metric is:  27.396980488782663


In [67]:
score_blender

array([0.81415743, 0.81953973, 0.78121102, 0.82570455, 0.83815835,
       0.81096388, 0.74971725, 0.77310097, 0.77314116, 0.83862763])

#### The R_R Ratio for the Random Forest Blender Classifier using roc_auc metric is: 27.396980488782663

### Removing the Linear Discriminant Analysis Classifier from the classifier list to see if it leads to better results

In [52]:
# Refining the Classifier list
clfs_red = [lr, rf, lgbm, neural]

In [53]:
# Removing the LDA probabilities column from the Blender's train & test feature set
X_train_Blender_red = X_train_Blender.drop(columns=['Lin_Dis'])
X_test_Blender_red = X_test_Blender.drop(columns=['Lin_Dis'])

In [54]:
# Instantiating the Random forest classifier
rf_s = RandomForestClassifier(n_jobs=5, random_state=42)

In [55]:
# Extracting the best model parameters and best study score
best_study_score,best_study_params = train_roc_auc(X_train_Blender_red, y_train_Blender, rf_s, objective_wrappper_rf,
                                                   cv_strat, cl_weight, n_trials=200) 

[32m[I 2020-10-22 01:51:01,040][0m A new study created in memory with name: no-name-48e474c6-f735-4632-ac2e-6eccc1cea2da[0m
[32m[I 2020-10-22 01:51:21,538][0m Trial 0 finished with value: 0.7624172761547174 and parameters: {'max_depth': 21, 'n_estimators': 850, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 5.354191395838755e-05}. Best is trial 0 with value: 0.7624172761547174.[0m
[32m[I 2020-10-22 01:51:47,052][0m Trial 1 finished with value: 0.7892351840312295 and parameters: {'max_depth': 49, 'n_estimators': 1620, 'class_weight': {0: 1.0, 1: 13}, 'min_samples_leaf': 0.004758954062131727}. Best is trial 1 with value: 0.7892351840312295.[0m
[32m[I 2020-10-22 01:52:22,223][0m Trial 2 finished with value: 0.7630701412372634 and parameters: {'max_depth': 27, 'n_estimators': 1450, 'class_weight': {0: 1.0, 1: 13}, 'min_samples_leaf': 1.025240429292137e-05}. Best is trial 1 with value: 0.7892351840312295.[0m
[32m[I 2020-10-22 01:52:55,848][0m Trial 3 finished with value:

[32m[I 2020-10-22 02:02:35,553][0m Trial 30 finished with value: 0.7812512641480712 and parameters: {'max_depth': 46, 'n_estimators': 1890, 'class_weight': {0: 1.0, 1: 12}, 'min_samples_leaf': 0.0012069044358255356}. Best is trial 21 with value: 0.8013311266349987.[0m
[32m[I 2020-10-22 02:02:48,506][0m Trial 31 finished with value: 0.8011947369833778 and parameters: {'max_depth': 2, 'n_estimators': 1990, 'class_weight': {0: 1.0, 1: 16}, 'min_samples_leaf': 0.0006604227908373574}. Best is trial 21 with value: 0.8013311266349987.[0m
[32m[I 2020-10-22 02:03:01,458][0m Trial 32 finished with value: 0.8011464649079688 and parameters: {'max_depth': 2, 'n_estimators': 2000, 'class_weight': {0: 1.0, 1: 15}, 'min_samples_leaf': 0.0006128527072705956}. Best is trial 21 with value: 0.8013311266349987.[0m
[32m[I 2020-10-22 02:03:12,303][0m Trial 33 finished with value: 0.8011994771606755 and parameters: {'max_depth': 2, 'n_estimators': 1670, 'class_weight': {0: 1.0, 1: 16}, 'min_samples

[32m[I 2020-10-22 02:10:35,221][0m Trial 61 finished with value: 0.8022327458399777 and parameters: {'max_depth': 3, 'n_estimators': 1440, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.013665059528544753}. Best is trial 49 with value: 0.8022709143607448.[0m
[32m[I 2020-10-22 02:10:45,725][0m Trial 62 finished with value: 0.8022776267054489 and parameters: {'max_depth': 3, 'n_estimators': 1320, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.01546950226810961}. Best is trial 62 with value: 0.8022776267054489.[0m
[32m[I 2020-10-22 02:11:01,326][0m Trial 63 finished with value: 0.7962146936979654 and parameters: {'max_depth': 35, 'n_estimators': 1300, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.017206592174987507}. Best is trial 62 with value: 0.8022776267054489.[0m
[32m[I 2020-10-22 02:11:21,468][0m Trial 64 finished with value: 0.7950158769988069 and parameters: {'max_depth': 45, 'n_estimators': 1570, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_le

[32m[I 2020-10-22 02:20:14,685][0m Trial 92 finished with value: 0.7883650006914384 and parameters: {'max_depth': 20, 'n_estimators': 1880, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.004178403528390928}. Best is trial 72 with value: 0.8023083774250737.[0m
[32m[I 2020-10-22 02:20:30,212][0m Trial 93 finished with value: 0.8022294037146548 and parameters: {'max_depth': 3, 'n_estimators': 1900, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.007009284822777035}. Best is trial 72 with value: 0.8023083774250737.[0m
[32m[I 2020-10-22 02:20:54,128][0m Trial 94 finished with value: 0.7958671114906797 and parameters: {'max_depth': 8, 'n_estimators': 1950, 'class_weight': {0: 1.0, 1: 10}, 'min_samples_leaf': 0.01294322212141763}. Best is trial 72 with value: 0.8023083774250737.[0m
[32m[I 2020-10-22 02:21:28,999][0m Trial 95 finished with value: 0.7848899956787371 and parameters: {'max_depth': 17, 'n_estimators': 1840, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_le

[32m[I 2020-10-22 02:29:18,512][0m Trial 123 finished with value: 0.8022337315838399 and parameters: {'max_depth': 3, 'n_estimators': 1490, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.013100057529734916}. Best is trial 72 with value: 0.8023083774250737.[0m
[32m[I 2020-10-22 02:29:37,205][0m Trial 124 finished with value: 0.7960134919609761 and parameters: {'max_depth': 38, 'n_estimators': 1530, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.015741887798369523}. Best is trial 72 with value: 0.8023083774250737.[0m
[32m[I 2020-10-22 02:29:49,731][0m Trial 125 finished with value: 0.8022853383345989 and parameters: {'max_depth': 3, 'n_estimators': 1570, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.009597068687278935}. Best is trial 72 with value: 0.8023083774250737.[0m
[32m[I 2020-10-22 02:30:11,171][0m Trial 126 finished with value: 0.794401658435537 and parameters: {'max_depth': 22, 'n_estimators': 1620, 'class_weight': {0: 1.0, 1: 13}, 'min_sample

[32m[I 2020-10-22 02:38:18,975][0m Trial 154 finished with value: 0.8022277541489956 and parameters: {'max_depth': 3, 'n_estimators': 1510, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.006702985461572605}. Best is trial 72 with value: 0.8023083774250737.[0m
[32m[I 2020-10-22 02:38:35,707][0m Trial 155 finished with value: 0.7962255409585415 and parameters: {'max_depth': 27, 'n_estimators': 1410, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.017295232292871866}. Best is trial 72 with value: 0.8023083774250737.[0m
[32m[I 2020-10-22 02:38:48,222][0m Trial 156 finished with value: 0.802273148197273 and parameters: {'max_depth': 3, 'n_estimators': 1560, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.010006672913636326}. Best is trial 72 with value: 0.8023083774250737.[0m
[32m[I 2020-10-22 02:39:10,395][0m Trial 157 finished with value: 0.7939773387133533 and parameters: {'max_depth': 31, 'n_estimators': 1620, 'class_weight': {0: 1.0, 1: 14}, 'min_sample

[32m[I 2020-10-22 02:47:41,069][0m Trial 185 finished with value: 0.7936527999945795 and parameters: {'max_depth': 8, 'n_estimators': 1770, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.0045794780112725295}. Best is trial 72 with value: 0.8023083774250737.[0m
[32m[I 2020-10-22 02:47:54,363][0m Trial 186 finished with value: 0.8021312926010411 and parameters: {'max_depth': 3, 'n_estimators': 1650, 'class_weight': {0: 1.0, 1: 16}, 'min_samples_leaf': 0.002959917539888778}. Best is trial 72 with value: 0.8023083774250737.[0m
[32m[I 2020-10-22 02:48:26,053][0m Trial 187 finished with value: 0.788190338938892 and parameters: {'max_depth': 17, 'n_estimators': 1860, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.003880961126915343}. Best is trial 72 with value: 0.8023083774250737.[0m
[32m[I 2020-10-22 02:48:39,097][0m Trial 188 finished with value: 0.8023054713038839 and parameters: {'max_depth': 3, 'n_estimators': 1610, 'class_weight': {0: 1.0, 1: 14}, 'min_sample

In [56]:
print('The best roc_auc_score for the study is: ',best_study_score)

The best roc_auc_score for the study is:  0.8023083774250737


In [57]:
print(('The best study parameters for the classifier are: ',best_study_params))

('The best study parameters for the classifier are: ', {'max_depth': 3, 'n_estimators': 1580, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 0.0041672078377099474})


#### Computing the test set  roc_auc score using the best study Parameters

In [58]:
# Obtaining the best blender tuned RF model by setting best study parameters.
rf_blender_tuned = rf_s.set_params(**best_study_params)

In [59]:
# fitting the best blender tuned RF model on the blender training set
rf_blender_tuned.fit(X_train_Blender_red, y_train_Blender)

RandomForestClassifier(class_weight={0: 1.0, 1: 14}, max_depth=3,
                       min_samples_leaf=0.0041672078377099474,
                       n_estimators=1580, n_jobs=5, random_state=42)

In [60]:
# Calculating the reduced feature training probabilities (without lda) set roc_auc score using the best study parameters
cal_roc_auc(X_train_Blender_red, y_train_Blender, rf_blender_tuned,'Reduced feature',
            'training probabilities', 'Random Forest Blender')

The roc_auc_score for the Reduced feature training probabilities set using the best Random Forest Blender classifier is  0.8081989339022259


In [61]:
# Calculating the reduced feature test probabilities (without lda) set roc_auc score using the best study parameters
cal_roc_auc(X_test_Blender_red, y_test_Blender, rf_blender_tuned,'Reduced feature',
            'test probabilities', 'Random Forest Blender')

The roc_auc_score for the Reduced feature test probabilities set using the best Random Forest Blender classifier is  0.8139186252889287


### Calculating R_R ratio for the best tuned Random forest Blender Classifier (without lda Predictions)

In [27]:
# Computing the CV scores using sklearn's cross_val_score
score_blender_1 = cross_val_score(rf_blender_tuned, X_train_Blender_red, y_train_Blender, cv=cv_strat, n_jobs=5, scoring='roc_auc')

In [28]:
print('The reward associated with the best Random Forest blender classifier (sans lda probs) using roc_auc metric is: ',np.mean(score_blender_1))

The reward associated with the best Random Forest blender classifier (sans lda probs) using roc_auc metric is:  0.8023083774250737


In [29]:
print('The risk associated with the best Random Forest blender Classifier (sans lda probs) using roc_auc metric is: ',np.std(score_blender_1))

The risk associated with the best Random Forest blender Classifier (sans lda probs) using roc_auc metric is:  0.02915336150549055


In [30]:
R_R_Ratio_blender_1 = np.mean(score_blender_1)/np.std(score_blender_1)

In [31]:
print('The reward risk ratio for the best Random Forest blender Classifier (sans lda probs) using roc_auc metric is: ',R_R_Ratio_blender_1)

The reward risk ratio for the best Random Forest blender Classifier (sans lda probs) using roc_auc metric is:  27.520269910348837


In [32]:
score_blender_1

array([0.81539039, 0.81914414, 0.78256237, 0.82492492, 0.83805151,
       0.80982139, 0.74873264, 0.77269335, 0.77397363, 0.83778943])

#### R_R Ratio for the Random Forest Blender Classifier (without LDA) using roc_auc metric is: 27.520269910348837

In [69]:
# Saving the tuned Blender Model.
# NOTE(review): despite the '_lda' suffix in the file name, rf_blender_tuned was fit on
# X_train_Blender_red, i.e. the blender feature set with the 'Lin_Dis' (LDA) column dropped.
joblib.dump(rf_blender_tuned,'Random_Forest_Blender_lda.joblib')

['Random_Forest_Blender_lda.joblib']

## Observations: 
### 1) The test set roc_auc for the Random Forest Blender Classifier is more or less the same in both scenarios (with or without the LDA) and is approximately equal to that of the plain Random Forest classifier.
### 2) However, the R_R ratios of both blender classifiers are the worst of all the classifiers, due to the high standard deviation of the CV roc_auc scores. This may be due to less training data being available to the blender classifier. _Hence we can all but rule out using a blender classifier for this dataset._


### Best R_R Ratio for the RF Blender Classifier family using roc_auc metric is:  27.520269910348837, corresponding to RF Blender Classifier (Without LDA)