In [1]:
# Importing Required Python Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

In [2]:
# Loading Reduced feature Training set
X_train_red = pd.read_csv('X_train_final.csv')
y_train = pd.read_csv('y_train.final.csv')

In [3]:
# Loading Reduced feature Test set
X_test_red = pd.read_csv('X_test_final.csv')
y_test = pd.read_csv('y_test.final.csv')

## Model_16: Stacking Classifier

####  Loading  the best Voting Classifier model & Neural Network (with Equal Nodes in all the Layers)

In [4]:
# Importing Joblib module
import joblib

In [5]:
import tensorflow
from tensorflow import keras

In [6]:
# Importing best Voting Classifier
voting_clf = joblib.load('Voting_Red.joblib')

### Preparing the Training Set for Stacking Classifier

In [7]:
# Instantiating an empty DataFrame to store training set for Stacking Classifier
df_train_stack = pd.DataFrame(columns=['Voting','Neural','True'])

In [8]:
# Initializing the 10 Fold object
from sklearn.model_selection import StratifiedKFold
cv_strat = StratifiedKFold(10,random_state=42)

#### Getting predictions of the Voting & Neural Net classifier for each of 10 folds of Training set.

In [9]:
# Importing the Sklearn's roc_auc_score module
from sklearn.metrics import roc_auc_score

In [10]:
# Importing cross val score from sklearn
from sklearn.model_selection import cross_val_score

In [11]:
# Importing train test split from Sklearn to produce validation set
from sklearn.model_selection import train_test_split

In [12]:
# Defining the exponential decay learning rate.
def exponential_decay_fn(epoch):
            return 0.01 * 0.1**(epoch /4 )

In [13]:
def Neural_Pred(clf, X_tr, y_tr, X_tst):
    ''' 
    This function returns the predictions of Neural Classifier(clf) for the test folds 
    Parameters:
    X_tr : Training set Features
    y_tr : Training set Labels
    X_tst : Test set Features
    '''
    # Splitting the Training set further into training & validation set.
    X_tr_r, X_val, y_tr_r, y_val = train_test_split(X_tr, y_tr, test_size=0.1, random_state=42, stratify=y_tr)
    
    # Compiling & Training the Neural Net
    clf.compile(loss="binary_crossentropy", optimizer=keras.optimizers.Nadam(beta_1=0.9, beta_2=0.999), metrics=["accuracy"])
    
    # defining Checkpoints
    checkpoint_cb = keras.callbacks.ModelCheckpoint('best_model.h5',save_best_only=True) # 1st Callback
    early_stopping_cb = keras.callbacks.EarlyStopping(patience=10)# 2nd Callback
    lr_scheduler_cb = keras.callbacks.LearningRateScheduler(exponential_decay_fn)# 3rd Callback
    
    # Fitting The model
    clf.fit(X_tr_r, y_tr_r, epochs=50, validation_data=(X_val,y_val),batch_size=32,
    class_weight={0: 1.0, 1: 10.0},callbacks=[checkpoint_cb,early_stopping_cb,lr_scheduler_cb])
    
    # Loading the best Neural model after training & before making predictions
    clf_best = keras.models.load_model('best_model.h5')    
    
    # Saving the predictions for every fold in a numpy array
    return (clf_best.predict_proba(X_tst).flatten())

In [14]:
# Computing Voting Classifier's & Neural Net 10 fold conditional prob predictions on the training set for Stacking classifier
for train_index, test_index in cv_strat.split(X_train_red, y_train):
    # Creating  Folds
    X_tr, X_tst = X_train_red.loc[train_index], X_train_red.loc[test_index]
    y_tr, y_tst = y_train.loc[train_index], y_train.loc[test_index]
    
    # Fitting the best Voting Classifier on the Training Folds
    voting_clf.fit(X_tr, y_tr)
    # Making Predictions on the testing Fold.
    y_pred_voting = voting_clf.predict_proba(X_tst)[:,1]
    
    
    # Loading the fresh best Neural net classifier
    neural = keras.models.load_model('Best_model_Selu_eq_Learn.h5')
    # Fitting the best Neural Net on Training Folds & obtaining  predictions
    y_pred_neural = Neural_Pred(neural, X_tr, y_tr, X_tst)
    
    # Storing the predictions in a dataframe
    df_temp = pd.DataFrame(columns=['Voting','Neural','True'])
    df_temp['Voting'] = y_pred_voting
    df_temp['Neural'] = y_pred_neural
    df_temp['True'] = y_tst.values.flatten() # As y_tst is a dataframe we first convert it into a 2d numpy vector
    # array, which is is then converted to 1d array using .flatten()
    
    # Appending the df_temp to df_train_stack
    df_train_stack = df_train_stack.append(df_temp,ignore_index=True)   
    

  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train on 26689 samples, validate on 2966 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train on 26689 samples, validate on 2966 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train on 26689 samples, validate on 2966 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
  ...
    to  
  ['...']
  ...
    to  
  ['..

Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train on 26689 samples, validate on 2966 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train on 26689 samples, validate on 2966 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train on 26689 samples, validate on 2966 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50

In [15]:
# Getting the info of the Training dataframe of the  Stacking Classifier
df_train_stack.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32950 entries, 0 to 32949
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Voting  32950 non-null  float64
 1   Neural  32950 non-null  float32
 2   True    32950 non-null  object 
dtypes: float32(1), float64(1), object(1)
memory usage: 643.7+ KB


In [16]:
# Converting the True Column to the int type
df_train_stack['True'] = df_train_stack['True'].astype('int8')

In [17]:
# Re-checking the info of the df_train_stack
df_train_stack.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32950 entries, 0 to 32949
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Voting  32950 non-null  float64
 1   Neural  32950 non-null  float32
 2   True    32950 non-null  int8   
dtypes: float32(1), float64(1), int8(1)
memory usage: 418.4 KB


In [18]:
# Printing the top 5 rows of the df_train_stack
df_train_stack.head()

Unnamed: 0,Voting,Neural,True
0,0.254285,0.320342,0
1,0.274904,0.356925,0
2,0.897182,0.964222,1
3,0.210015,0.224473,0
4,0.831096,0.877714,0


### Selecting Random forest as the Stacking classifier as it has one of highest  test set roc_auc as well as R_R ratio for this dataset.

### Tuning the hyperparameters of the Random Forest on the Stacked Training set using Optuna

In [19]:
# Defining the class weights
cl_weight = [None,'balanced',{0:1.0,1:9.0},{0:1.0,1:10},{0:1.0,1:11},{0:1.0,1:12},{0:1.0,1:13},{0:1.0,1:14},{0:1.0,1:15}]

In [20]:
# Defining the appropriate objective function for the Random Forest classifier
def objective_wrappper_rf(X_tr, y_tr, cls=None, cv_strat=None):
    '''
    Optimizes Random Forest parameters on the given training set X_tr,y_tr
    using cv_strat cross-validation object
    
    '''
    
    def objective(trial):
        params = {
        'max_depth': trial.suggest_categorical('max_depth',list(range(2,50))+ [None]),
        'n_estimators':trial.suggest_int('n_estimators',100,2000,10),
        'class_weight':trial.suggest_categorical('class_weight',cl_weight),
        'min_samples_leaf':trial.suggest_loguniform('min_samples_leaf',.00001,.1)
          }
        
        cls.set_params(**params)#Initializing the model with the parameters 
    
        return np.mean(cross_val_score(cls, X_tr, y_tr, cv=cv_strat, n_jobs=5, scoring='roc_auc'))  
    return objective

In [21]:
# Importing  hyperparamater tuning optimizer optuna
import optuna

In [22]:
# Defining the evaluation function for study's best parameters
def train_test_roc_auc(X_tr, y_tr, cls, obj_func, cv_strat, n_trials=100):
    ''' Computes the best hyper parameters of the classsifier and returns 
    Optuna's study's best score & clasifier parameters'''
    study = optuna.create_study(direction='maximize')
    study.optimize(obj_func(X_tr, y_tr, cls, cv_strat), n_trials)
    best_score = study.best_value
    best_params = study.best_params
    return (best_score,best_params)


In [23]:
# Importing Random Forest Classifier from  Sklearn
from sklearn.ensemble import RandomForestClassifier

In [24]:
# Instantiating the Random forest classifier
rf_s = RandomForestClassifier(n_jobs=5, random_state=42)

In [27]:
# Segregating the Features and class labels from the Training Dataset
X_train_stacked = df_train_stack[['Voting','Neural']]
y_train_stacked = df_train_stack['True']

In [28]:
# Extracting the best model parameters and best study score
best_study_score,best_study_params = train_test_roc_auc(X_train_stacked, y_train_stacked, rf_s, objective_wrappper_rf, cv_strat, n_trials=150)

[32m[I 2020-11-02 23:21:05,923][0m A new study created in memory with name: no-name-a5025eef-ac4f-4eff-8a7c-49889c4619e2[0m
[32m[I 2020-11-02 23:21:35,831][0m Trial 0 finished with value: 0.7985677591660422 and parameters: {'max_depth': 29, 'n_estimators': 1550, 'class_weight': {0: 1.0, 1: 11}, 'min_samples_leaf': 0.019211420621720456}. Best is trial 0 with value: 0.7985677591660422.[0m
[32m[I 2020-11-02 23:22:47,841][0m Trial 1 finished with value: 0.7776684448984348 and parameters: {'max_depth': 15, 'n_estimators': 1770, 'class_weight': {0: 1.0, 1: 10}, 'min_samples_leaf': 0.0001329767801953648}. Best is trial 0 with value: 0.7985677591660422.[0m
[32m[I 2020-11-02 23:23:54,335][0m Trial 2 finished with value: 0.7563436594490136 and parameters: {'max_depth': 47, 'n_estimators': 1280, 'class_weight': {0: 1.0, 1: 12}, 'min_samples_leaf': 1.0294047801488418e-05}. Best is trial 0 with value: 0.7985677591660422.[0m
[32m[I 2020-11-02 23:24:26,329][0m Trial 3 finished with valu

[32m[I 2020-11-02 23:46:18,847][0m Trial 31 finished with value: 0.8001869688358418 and parameters: {'max_depth': 4, 'n_estimators': 1910, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 2.6692206671107163e-05}. Best is trial 5 with value: 0.800206114248191.[0m
[32m[I 2020-11-02 23:46:50,828][0m Trial 32 finished with value: 0.8001821201652017 and parameters: {'max_depth': 4, 'n_estimators': 1860, 'class_weight': {0: 1.0, 1: 14}, 'min_samples_leaf': 1.290102664864045e-05}. Best is trial 5 with value: 0.800206114248191.[0m
[32m[I 2020-11-02 23:47:19,771][0m Trial 33 finished with value: 0.8002590206328056 and parameters: {'max_depth': 4, 'n_estimators': 1670, 'class_weight': {0: 1.0, 1: 10}, 'min_samples_leaf': 0.0001593845377949925}. Best is trial 33 with value: 0.8002590206328056.[0m
[32m[I 2020-11-02 23:48:21,798][0m Trial 34 finished with value: 0.7776296136132248 and parameters: {'max_depth': 21, 'n_estimators': 1390, 'class_weight': {0: 1.0, 1: 10}, 'min_samples_le

[32m[I 2020-11-03 00:14:58,363][0m Trial 62 finished with value: 0.8002905028605773 and parameters: {'max_depth': 4, 'n_estimators': 1770, 'class_weight': None, 'min_samples_leaf': 1.9011795186684577e-05}. Best is trial 42 with value: 0.8003822305738767.[0m
[32m[I 2020-11-03 00:16:03,315][0m Trial 63 finished with value: 0.7875491072962408 and parameters: {'max_depth': 14, 'n_estimators': 1570, 'class_weight': None, 'min_samples_leaf': 1.9150608464861085e-05}. Best is trial 42 with value: 0.8003822305738767.[0m
[32m[I 2020-11-03 00:17:38,816][0m Trial 64 finished with value: 0.7664606536460319 and parameters: {'max_depth': 32, 'n_estimators': 1790, 'class_weight': None, 'min_samples_leaf': 5.2109315566479745e-05}. Best is trial 42 with value: 0.8003822305738767.[0m
[32m[I 2020-11-03 00:19:13,028][0m Trial 65 finished with value: 0.7643837311122572 and parameters: {'max_depth': 27, 'n_estimators': 1710, 'class_weight': None, 'min_samples_leaf': 2.4904965238943712e-05}. Best i

[32m[I 2020-11-03 00:45:03,207][0m Trial 94 finished with value: 0.7650595685705328 and parameters: {'max_depth': 41, 'n_estimators': 1670, 'class_weight': None, 'min_samples_leaf': 3.903491920463432e-05}. Best is trial 42 with value: 0.8003822305738767.[0m
[32m[I 2020-11-03 00:45:29,480][0m Trial 95 finished with value: 0.8003767314243238 and parameters: {'max_depth': 4, 'n_estimators': 1540, 'class_weight': None, 'min_samples_leaf': 7.42733310028795e-05}. Best is trial 42 with value: 0.8003822305738767.[0m
[32m[I 2020-11-03 00:45:55,725][0m Trial 96 finished with value: 0.8003767314243238 and parameters: {'max_depth': 4, 'n_estimators': 1540, 'class_weight': None, 'min_samples_leaf': 7.451495640090427e-05}. Best is trial 42 with value: 0.8003822305738767.[0m
[32m[I 2020-11-03 00:46:47,576][0m Trial 97 finished with value: 0.7805602412739591 and parameters: {'max_depth': 13, 'n_estimators': 1410, 'class_weight': {0: 1.0, 1: 12}, 'min_samples_leaf': 0.00010623144067744437}. 

[32m[I 2020-11-03 01:10:54,733][0m Trial 126 finished with value: 0.7816768310326337 and parameters: {'max_depth': 35, 'n_estimators': 1520, 'class_weight': None, 'min_samples_leaf': 0.00030661798319488274}. Best is trial 114 with value: 0.8004241337179158.[0m
[32m[I 2020-11-03 01:12:05,869][0m Trial 127 finished with value: 0.7846403804556144 and parameters: {'max_depth': 25, 'n_estimators': 1600, 'class_weight': None, 'min_samples_leaf': 0.0004242877359032014}. Best is trial 114 with value: 0.8004241337179158.[0m
[32m[I 2020-11-03 01:13:20,185][0m Trial 128 finished with value: 0.7785927046868022 and parameters: {'max_depth': 38, 'n_estimators': 1550, 'class_weight': None, 'min_samples_leaf': 0.00022293034694246362}. Best is trial 114 with value: 0.8004241337179158.[0m
[32m[I 2020-11-03 01:14:18,056][0m Trial 129 finished with value: 0.7907258934965208 and parameters: {'max_depth': 44, 'n_estimators': 1480, 'class_weight': None, 'min_samples_leaf': 0.0012446029184334678}. 

In [29]:
print('The best roc_auc_score for the study is: ', best_study_score)

The best roc_auc_score for the study is:  0.8004241337179158


In [30]:
print('The best study parameters for the classifier are: ', best_study_params)

The best study parameters for the classifier are:  {'max_depth': 4, 'n_estimators': 1540, 'class_weight': None, 'min_samples_leaf': 0.00024403747920277233}


In [31]:
# Obtaining the best Stacking Random Forest model by setting best study parameters.
rf_stack = rf_s.set_params(**best_study_params)

In [32]:
# fitting the best Stacking Random Forest model on the whole training set
rf_stack.fit(X_train_stacked, y_train_stacked)

RandomForestClassifier(max_depth=4, min_samples_leaf=0.00024403747920277233,
                       n_estimators=1540, n_jobs=5, random_state=42)

### Preparing the Test set observations for the Stacking Classifier.

In [33]:
# Training the Voting Classifier on the whole Training set
voting_clf.fit(X_train_red,y_train)

VotingClassifier(estimators=[('logistic_Reg',
                              LogisticRegression(C=0.12725888493400458,
                                                 class_weight={0: 1.0, 1: 9.0},
                                                 l1_ratio=0.9851193622801032,
                                                 n_jobs=5, penalty='elasticnet',
                                                 random_state=42,
                                                 solver='saga')),
                             ('Random_Forest',
                              RandomForestClassifier(class_weight='balanced',
                                                     max_depth=11,
                                                     min_samples_leaf=0.000264150675671259,
                                                     n_estimators=1560,
                                                     n_jobs=5,
                                                     rando...
                              

In [34]:
# Computing the predictions of the voting classifier on the test set.
y_pred_test_voting = voting_clf.predict_proba(X_test_red)[:,1]

In [35]:
# Reloading the best Neural Classifier
neural = keras.models.load_model('Best_model_Selu_eq_Learn.h5')

In [36]:
# Computing the predictions of the Neural Classifier on the test set.
y_pred_test_neural = neural.predict_proba(X_test_red).flatten()

In [37]:
# Preparing the Test feature set for the Stacking Classifier
X_test_stacked = pd.DataFrame({'Voting':y_pred_test_voting,'Neural':y_pred_test_neural})

In [38]:
# Checking the info of the Stacking Test Features
X_test_stacked.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8238 entries, 0 to 8237
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Voting  8238 non-null   float64
 1   Neural  8238 non-null   float32
dtypes: float32(1), float64(1)
memory usage: 96.7 KB


In [41]:
# Getting the Stacking Classifier Predictions for the Test set
y_pred_stacked = rf_stack.predict_proba(X_test_stacked)[:,1]

In [42]:
# Getting the stacking Clasiifier roc_auc score for the Test Set.
print('The test set roc_auc score for the Stacking Classifier is: ',roc_auc_score(y_test,y_pred_stacked))

The test set roc_auc score for the Stacking Classifier is:  0.8145040007783386


### Calculating R_R ratio for Stacking Classifier.

In [44]:
# Computing the CV scores using sklearn's cross_val_score
score_Stacking = cross_val_score(rf_stack, X_train_stacked, y_train_stacked, cv=cv_strat, n_jobs=5, scoring='roc_auc')

In [45]:
print('The reward associated with the tuned Stacking Classifier using roc_auc metric is: ',np.mean(score_Stacking))

The reward associated with the tuned Stacking Classifier using roc_auc metric is:  0.8004241337179158


In [46]:
print('The risk associated with the tuned Stacking Classifier  using roc_auc metric is: ',np.std(score_Stacking))

The risk associated with the tuned Stacking Classifier  using roc_auc metric is:  0.016554226064379108


In [47]:
R_R_Ratio_Stacking = np.mean(score_Stacking)/np.std(score_Stacking)

In [48]:
print('The reward risk ratio for the tuned Stacking Classifier using roc_auc metric is: ',R_R_Ratio_Stacking)

The reward risk ratio for the tuned Stacking Classifier using roc_auc metric is:  48.351649337461005


## Observations: 
### 1) The test set roc_auc score for the Stacking classifier is more than that of  Neural Net classifier , but less than that of the Voting classifier , both of which were used to create the training  as well as test set for the stacking classifier.
### 2) Similarly the R_R ratio of the Stacking classifier is approx. equal (although more) to that of the Voting Classifier, but much less than that of the Neural Net. Thus, even with added complexity, the Stacking Classifier still hasn't been able to beat the tuned Random Forest Classifier on this dataset.




###  The R_R Ratio for the tuned Stacking Classifier using roc_auc metric is:  48.351649337461005