In [1]:
# Importing Required Python Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

In [2]:
# Loading Reduced feature Training set
X_train_red = pd.read_csv('X_train_final.csv')
y_train = pd.read_csv('y_train.final.csv')

In [3]:
# Loading Reduced feature Test set
X_test_red = pd.read_csv('X_test_final.csv')
y_test = pd.read_csv('y_test.final.csv')

## Loading all the best models from various categories

In [4]:
# Importing Joblib module
import joblib

In [5]:
import tensorflow

In [6]:
from tensorflow import keras

In [7]:
# Importing best Logistic regression Classifier
lr = joblib.load('Log_Reg_Reduced.joblib')

In [8]:
# Importing best Random Forest Classifier
rf = joblib.load('Rand_Forest_Reduced.joblib')

In [9]:
# Importing best Light Gbm Classifier
lgbm = joblib.load('Light_Gbm_Reduced.joblib')

In [10]:
# Importing best Neural Net Classifier
neural = keras.models.load_model('Best_model_Selu_eq_Learn.h5')

In [11]:
# Importing best Linear Discriminant Analysis Classifier
lda = joblib.load('Linear_Dis_Reduced.joblib')

## Model_13: Voting Classifier with Default Parameters & Soft_Voting.

In [12]:
# Importing Voting classifier from sklearn
from sklearn.ensemble import VotingClassifier

In [13]:
# Importing the Sklearn's roc_auc_score module
from sklearn.metrics import roc_auc_score

In [18]:
voting_clf = VotingClassifier(estimators=[('logistic_Reg',lr),('Random_Forest',rf),('Light_Gbm',lgbm),
                                           ('Linear_Dis',lda)],voting='soft',n_jobs=5)

In [19]:
# Fitting the vanilla voting classifier on the Reduced Feature Training set
voting_clf.fit(X_train_red, y_train)

VotingClassifier(estimators=[('logistic_Reg',
                              LogisticRegression(C=0.12725888493400458,
                                                 class_weight={0: 1.0, 1: 9.0},
                                                 l1_ratio=0.9851193622801032,
                                                 n_jobs=5, penalty='elasticnet',
                                                 random_state=42,
                                                 solver='saga')),
                             ('Random_Forest',
                              RandomForestClassifier(class_weight='balanced',
                                                     max_depth=11,
                                                     min_samples_leaf=0.000264150675671259,
                                                     n_estimators=1560,
                                                     n_jobs=5,
                                                     rando...
                              

In [17]:
#Defining the function to calculate the roc_auc score for the feature sets
def cal_roc_auc(X, y, cls, f_set, t_set, model_name):
    ''' Calculates the roc auc score using the best study parameters 
        f_set : String: specifies 'full feature', 'Reduced feature'
        t_set: String: specifies 'training', 'test'
        model_name: String: specifies Name of the model '''
        
    y_pred = cls.predict_proba(X)
    print('The roc_auc_score for the {} {} set using the {} is '.format(f_set,t_set,model_name),roc_auc_score(y,y_pred[:,1]))

In [61]:
# Calculating the reduced feature training set roc_auc score using the best study parameters
cal_roc_auc(X_train_red, y_train, voting_clf,'reduced feature','training',' default Voting Classifier')

The roc_auc_score for the reduced feature training set using the  default Voting Classifier is  0.8261782556386233


In [62]:
# Calculating the reduced feature test roc_auc score using the best study parameters
cal_roc_auc(X_test_red, y_test, voting_clf,'reduced feature','test','default Voting Classifier')

The roc_auc_score for the reduced feature test set using the default Voting Classifier is  0.8114581613519506


In [67]:
# Saving the default voting Classifier
import joblib
joblib.dump(voting_clf,'Voting_Red_default.joblib')

['Voting_Red_default.joblib']

### Calculating R_R ratio for default Voting Classifier.

In [24]:
# Importing required Libraries
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [25]:
# Instantiating the Stratified K fold object
cv_strat = StratifiedKFold(10,random_state=42)

In [26]:
# Computing the CV scores using sklearn's cross_val_score
score_voting_default = cross_val_score(voting_clf, X_train_red, y_train, cv=cv_strat, n_jobs=5, scoring='roc_auc')

In [28]:
print('The reward associated with the default Voting Classifier using roc_auc metric is: ',np.mean(score_voting_default))

The reward associated with the default Voting Classifier using roc_auc metric is:  0.7994986354233928


In [27]:
print('The risk associated with the default Voting Classifier using roc_auc metric is: ',np.std(score_voting_default))

The risk associated with the default Voting Classifier using roc_auc metric is:  0.016226315881551476


In [30]:
R_R_Ratio_voting_default = np.mean(score_voting_default)/np.std(score_voting_default)

In [32]:
print('The reward risk ratio for the default Voting Classifier using roc_auc metric is: ',R_R_Ratio_voting_default)

The reward risk ratio for the default Voting Classifier using roc_auc metric is:  49.271728792879195


#### R_R Ratio for the default Voting classifier using reduced feature set is: 49.271728792879195

## Observations: 
### 1) The test set roc_auc score for the default voting classifier is more than that of component Logistic Regression & LDA, but less than that of  Random Forest & LightGBM. May be changing the weights of voting classifier result in better results.
### 2) On the other hand, the R_R ratio for the default voting classifier model is more than that of Logistic regression & LightGBM , but less than that of Random forest & LDA. So clearly Random Forest bests  default Voting classifier hands down.

## Model_13: Voting Classifier with Tuned weights.

In [41]:
# Instantiating a new voting classifier object
voting_clf_2 = VotingClassifier(estimators=[('logistic_Reg',lr),('Random_Forest',rf),('Light_Gbm',lgbm),
                                           ('Linear_Dis',lda)],voting='soft',n_jobs=5)

In [42]:
# Defining the appropriate objective function for the best weights of Voting classifier.
def objective_wrappper_Vt_2(X_tr, y_tr, cls=None, cv_strat=None):
    '''
    Optimizes voting classifier (cls) weights parameter on the given training set X_tr,y_tr
      
    '''
    
    def objective(trial):
        w1 = trial.suggest_uniform('w1',-1,1)
        w2 = trial.suggest_uniform('w2',-1,1)
        w3 = trial.suggest_uniform('w3',-1,1)
        w4 = trial.suggest_uniform('w4',-1,1)
        
        params = {
            'weights':[w1,w2,w3,w4]                 
            }
        
        cls.set_params(**params)#Initializing the model with the parameter
               
        return np.mean(cross_val_score(cls, X_tr, y_tr, cv=cv_strat, n_jobs=5, scoring='roc_auc'))
    
    return objective

#### Computing the best weights for the Voting Classifier_2 using Optuna.

In [1]:
# Defining the evaluation function for study's best parameters
def train_test_roc_auc(X_tr, y_tr, cls, obj_func, cv_strat, n_trials=100):
    ''' Computes the best hyper parameters of the classsifier and returns 
    Optuna's study's best score & clasifier parameters'''
    study = optuna.create_study(direction='maximize')
    study.optimize(obj_func(X_tr, y_tr, cls, cv_strat), n_trials)
    best_score = study.best_value
    best_params = study.best_params
    return (best_score,best_params)


In [43]:
# Extracting the best model parameters and best study score for the 2nd Voting Classifier
best_study_score,best_study_params = train_roc_auc(X_train_red, y_train, voting_clf_2, objective_wrappper_Vt_2,
                                                   cv_strat)

[I 2020-10-09 01:49:51,417] A new study created in memory with name: no-name-10a255f2-6cbe-4a24-bd10-cd52b8d43b20
[I 2020-10-09 01:51:17,556] Trial 0 finished with value: 0.5829850701192909 and parameters: {'w1': 0.1038898457697357, 'w2': -0.7058492808036847, 'w3': 0.025250530887687583, 'w4': 0.44266115384040594}. Best is trial 0 with value: 0.5829850701192909.
[I 2020-10-09 01:53:01,586] Trial 1 finished with value: 0.5837460812121741 and parameters: {'w1': -0.9867733185292686, 'w2': 0.7856035983802627, 'w3': 0.83641583303722, 'w4': -0.7831608968403647}. Best is trial 1 with value: 0.5837460812121741.
[I 2020-10-09 01:54:49,553] Trial 2 finished with value: 0.7756791747079623 and parameters: {'w1': 0.3885807116486186, 'w2': 0.9994103258417006, 'w3': 0.05856411622978519, 'w4': -0.6883430815552989}. Best is trial 2 with value: 0.7756791747079623.
[I 2020-10-09 01:56:33,147] Trial 3 finished with value: 0.7946751883433738 and parameters: {'w1': -0.8966206571876429, 'w2': 0.30782819131362

[I 2020-10-09 03:46:50,759] Trial 64 finished with value: 0.798709979378647 and parameters: {'w1': -0.054264392642794035, 'w2': 0.19769572490578338, 'w3': -0.797129249461288, 'w4': 0.05141668280484202}. Best is trial 63 with value: 0.8012767991170631.
[I 2020-10-09 03:48:36,429] Trial 65 finished with value: 0.8007373172496075 and parameters: {'w1': 0.036728486130442856, 'w2': -0.03133266363989179, 'w3': -0.5590136259925164, 'w4': -0.1146057982520279}. Best is trial 63 with value: 0.8012767991170631.
[I 2020-10-09 03:50:26,775] Trial 66 finished with value: 0.8008098060871992 and parameters: {'w1': 0.01957174872373488, 'w2': -0.0592196571212438, 'w3': -0.5494146015229269, 'w4': -0.1406341471614976}. Best is trial 63 with value: 0.8012767991170631.
[I 2020-10-09 03:52:15,135] Trial 67 finished with value: 0.7905679995462143 and parameters: {'w1': -0.00491412504663756, 'w2': -0.3675543414259161, 'w3': -0.794107026625973, 'w4': 0.30558130111505255}. Best is trial 63 with value: 0.80127679

In [46]:
print('The best roc_auc_score for the study is: ',best_study_score)

The best roc_auc_score for the study is:  0.8014569106242455


In [47]:
print(('The best study parameters for the classifier are: ',best_study_params))

('The best study parameters for the classifier are: ', {'w1': -0.01892659811471946, 'w2': -0.12205104743904709, 'w3': -0.4097477051900742, 'w4': -0.007396756446858069})


In [48]:
# Obtaining the best tuned voting clasiifier model by setting best study parameters.
voting_clf_2 = voting_clf_2.set_params(weights=[best_study_params['w1'],best_study_params['w2'],
                                                 best_study_params['w3'],best_study_params['w4']])

In [49]:
# fitting the best Voting Classifier on the reduced feature training set
voting_clf_2.fit(X_train_red, y_train)

VotingClassifier(estimators=[('logistic_Reg',
                              LogisticRegression(C=0.12725888493400458,
                                                 class_weight={0: 1.0, 1: 9.0},
                                                 l1_ratio=0.9851193622801032,
                                                 n_jobs=5, penalty='elasticnet',
                                                 random_state=42,
                                                 solver='saga')),
                             ('Random_Forest',
                              RandomForestClassifier(class_weight='balanced',
                                                     max_depth=11,
                                                     min_samples_leaf=0.000264150675671259,
                                                     n_estimators=1560,
                                                     n_jobs=5,
                                                     rando...
                              

In [50]:
# Calculating the reduced feature training set roc_auc score using the best study parameters
cal_roc_auc(X_train_red, y_train, voting_clf_2, 'reduced feature', 'training', 'Voting Classifier')

The roc_auc_score for the reduced feature training set using the best Voting Classifier is  0.8327689992475545


In [51]:
# Calculating the reduced feature test roc_auc score using the best study parameters
cal_roc_auc(X_test_red, y_test, voting_clf_2, 'reduced feature', 'test', 'Voting Classifier')

The roc_auc_score for the reduced feature test set using the best Voting Classifier is  0.8175044371196755


In [30]:
# Saving the best voting Classifier
import joblib
joblib.dump(voting_clf_2,'Voting_Red.joblib')

['Voting_Red.joblib']

### Calculating R_R ratio for the tuned Voting Classifier.

In [48]:
# Loading the best SVM  Classifier model
import joblib
voting_clf_2 = joblib.load('Voting_Red.joblib')

In [34]:
# Computing the CV scores using sklearn's cross_val_score
score_voting_best = cross_val_score(voting_clf_2, X_train_red, y_train, cv=cv_strat, n_jobs=5, scoring='roc_auc')

In [49]:
print('The reward associated with the tuned Voting Classifier using roc_auc metric is: ',np.mean(score_voting_best))

The reward associated with the tuned Voting Classifier using roc_auc metric is:  0.8014569106242455


In [50]:
print('The risk associated with the tuned Voting Classifier using roc_auc metric is: ',np.std(score_voting_best))

The risk associated with the tuned Voting Classifier using roc_auc metric is:  0.01694969376455111


In [51]:
R_R_Ratio_voting = np.mean(score_voting_best)/np.std(score_voting_best)

In [52]:
print('The reward risk ratio for the tuned Voting Classifier using roc_auc metric is: ',R_R_Ratio_voting)

The reward risk ratio for the tuned voting Classifier using roc_auc metric is:  47.28444783471113


#### The R_R Ratio for the tuned Voting classifier using roc_auc metric is 47.28444783471113

## Observations: 
### 1)  The test set roc_auc score of the tuned voting classifier is more than that of all component models (by a good margin ), but for that of Light  GBM and also exceeds that of untuned  default voting classifier (voting_clf)
### 2) On the other hand, quite surprisingly the R_R ratio for the tuned voting classifier model is more than that of only Light GBM (among the component models) and is also beaten down by that of the default voting classifier.  
### 3) Again after accounting for all the factors, the tuned Random forest classifier still reigns supreme. 

## Model_14: Voting Classifier with Tuned weights without LDA.

In [42]:
voting_clf_3 = VotingClassifier(estimators=[('logistic_Reg',lr),('Random_Forest',rf),('Light_Gbm',lgbm)],
                                           voting='soft',n_jobs=5)

In [40]:
# Defining the appropriate objective function for the best weights of Voting classifier classifier
def objective_wrappper_Vt_3(X_tr, y_tr, cls=None, cv_strat=None):
    '''
    Optimizes voting classifier (cls) weights parameter on the given training set X_tr, y_tr
      
    '''
    
    def objective(trial):
        
        w1 = trial.suggest_uniform('w1',0,1)
        w2 = trial.suggest_uniform('w2',0,1)
        w3 = trial.suggest_uniform('w3',0,1)
        
        
        params = {
            'weights':[w1,w2,w3]                 
            }
        
        cls.set_params(**params)#Initializing the model with the parameter
               
        return np.mean(cross_val_score(cls, X_tr, y_tr, cv=cv_strat, n_jobs=5, scoring='roc_auc'))
    
    return objective

In [14]:
# Defining the evaluation function for study's best parameters
def train_roc_auc(X_tr, y_tr, cls, obj_func, cv_strat, n_trials=100):
    ''' Computes the best hyper parameters of the classsifier and returns 
    Optuna's study's best score & clasifier parameters'''
    study = optuna.create_study(direction='maximize')
    study.optimize(obj_func(X_tr, y_tr, cls, cv_strat), n_trials)
    best_score = study.best_value
    best_params = study.best_params
    return (best_score,best_params)


In [21]:
# Extracting the best model parameters and best study score
best_study_score,best_study_params = train_roc_auc(X_train_red, y_train, voting_clf_3, objective_wrappper_Vt_3,
                                                   cv_strat)

[I 2020-10-09 23:51:02,311] A new study created in memory with name: no-name-285ed665-90bf-48ff-a1ff-a9f29cb259b7
[I 2020-10-09 23:52:20,979] Trial 0 finished with value: 0.8013708950876163 and parameters: {'w1': 0.09102328342103017, 'w2': 0.6334135383619343, 'w3': 0.7309523997590622}. Best is trial 0 with value: 0.8013708950876163.
[I 2020-10-09 23:53:20,170] Trial 1 finished with value: 0.8008370564675362 and parameters: {'w1': 0.6229923080810127, 'w2': 0.7272435684711895, 'w3': 0.9696964393180374}. Best is trial 0 with value: 0.8013708950876163.
[I 2020-10-09 23:54:19,902] Trial 2 finished with value: 0.8008762774862301 and parameters: {'w1': 0.3262331688940634, 'w2': 0.177787139751987, 'w3': 0.6931711334747859}. Best is trial 0 with value: 0.8013708950876163.
[I 2020-10-09 23:55:18,761] Trial 3 finished with value: 0.8002959001262322 and parameters: {'w1': 0.41596117681920475, 'w2': 0.7675176376135828, 'w3': 0.30487589664687}. Best is trial 0 with value: 0.8013708950876163.
[I 2020

[I 2020-10-10 01:06:49,978] Trial 72 finished with value: 0.8014637107835071 and parameters: {'w1': 0.18416705829383978, 'w2': 0.4184385947894099, 'w3': 0.9304569036554005}. Best is trial 69 with value: 0.8014985302584225.
[I 2020-10-10 01:07:50,670] Trial 73 finished with value: 0.8014508560860051 and parameters: {'w1': 0.11353520623179661, 'w2': 0.5612279735176083, 'w3': 0.884841517590857}. Best is trial 69 with value: 0.8014985302584225.
[I 2020-10-10 01:08:50,636] Trial 74 finished with value: 0.801461029882678 and parameters: {'w1': 0.1528259996761602, 'w2': 0.302368982579299, 'w3': 0.9737016696242646}. Best is trial 69 with value: 0.8014985302584225.
[I 2020-10-10 01:09:51,430] Trial 75 finished with value: 0.8014156042911722 and parameters: {'w1': 0.21213033679478005, 'w2': 0.5088593536197344, 'w3': 0.8275012677018809}. Best is trial 69 with value: 0.8014985302584225.
[I 2020-10-10 01:10:51,857] Trial 76 finished with value: 0.8014157752252569 and parameters: {'w1': 0.2367909439

In [22]:
print('The best roc_auc_score for the study is: ',best_study_score)

The best roc_auc_score for the study is:  0.8014985302584225


In [23]:
print(('The best study parameters for the classifier are: ',best_study_params))

('The best study parameters for the classifier are: ', {'w1': 0.16295216012906855, 'w2': 0.4658787199076757, 'w3': 0.9256837614054116})


In [44]:
# Obtaining the best reduced feature LR model by setting best study parameters.
voting_clf_3 = voting_clf_3.set_params(weights=[best_study_params['w1'],best_study_params['w2'],
                                                 best_study_params['w3']])

In [45]:
# fitting the best Voting Classifier on the full feature training set
voting_clf_3.fit(X_train_red, y_train)

VotingClassifier(estimators=[('logistic_Reg',
                              LogisticRegression(C=0.12725888493400458,
                                                 class_weight={0: 1.0, 1: 9.0},
                                                 l1_ratio=0.9851193622801032,
                                                 n_jobs=5, penalty='elasticnet',
                                                 random_state=42,
                                                 solver='saga')),
                             ('Random_Forest',
                              RandomForestClassifier(class_weight='balanced',
                                                     max_depth=11,
                                                     min_samples_leaf=0.000264150675671259,
                                                     n_estimators=1560,
                                                     n_jobs=5,
                                                     rando...
                              

In [15]:
# Saving the best voting Classifier without lda
import joblib
voting_clf_3 = joblib.load('Voting_Red_without_lda.joblib')

In [18]:
# Calculating the reduced feature training set roc_auc score using the best study parameters
cal_roc_auc(X_train_red, y_train, voting_clf_3, 'reduced feature', 'training', 'tuned Voting Classifier without lda')

The roc_auc_score for the reduced feature training set using the tuned Voting Classifier without lda is  0.8340876676343492


In [19]:
# Calculating the reduced feature test roc_auc score using the best study parameters
cal_roc_auc(X_test_red, y_test, voting_clf_3, 'reduced feature', 'test', 'tuned Voting Classifier without lda')

The roc_auc_score for the reduced feature test set using the tuned Voting Classifier without lda is  0.8166479698334828


In [68]:
# Saving the best voting Classifier without lda
import joblib
joblib.dump(voting_clf_3,'Voting_Red_without_lda.joblib')

['Voting_Red_without_lda.joblib']

### Calculating R_R ratio for the tuned Voting Classifier without  LDA

In [53]:
voting_clf_3

VotingClassifier(estimators=[('logistic_Reg',
                              LogisticRegression(C=0.12725888493400458,
                                                 class_weight={0: 1.0, 1: 9.0},
                                                 l1_ratio=0.9851193622801032,
                                                 n_jobs=5, penalty='elasticnet',
                                                 random_state=42,
                                                 solver='saga')),
                             ('Random_Forest',
                              RandomForestClassifier(class_weight='balanced',
                                                     max_depth=11,
                                                     min_samples_leaf=0.000264150675671259,
                                                     n_estimators=1560,
                                                     n_jobs=5,
                                                     rando...
                              

In [54]:
# Computing the CV scores using sklearn's cross_val_score
score_voting_3 = cross_val_score(voting_clf_3, X_train_red, y_train, cv=cv_strat, n_jobs=5, scoring='roc_auc')

In [56]:
print('The reward associated with the tuned Voting Classifier without lda using roc_auc metric is: ',np.mean(score_voting_3))

The reward associated with the tuned Voting Classifier without lda using roc_auc metric is:  0.8014985302584225


In [57]:
print('The risk associated with the tuned Voting Classifier using roc_auc metric is: ',np.std(score_voting_3))

The risk associated with the tuned Voting Classifier using roc_auc metric is:  0.0166290787448371


In [58]:
R_R_Ratio_voting_without_lda = np.mean(score_voting_3)/np.std(score_voting_3)

In [59]:
print('The reward risk ratio for the tuned Voting Classifier without lda using roc_auc metric is: ',R_R_Ratio_voting_without_lda)

The reward risk ratio for the tuned Voting Classifier without lda using roc_auc metric is:  48.19861295727324


#### The R_R Ratio for the tuned Voting classifier without lda using roc_auc metric is 48.19861295727324

## Observations: 
### 1) The voting classifier with lda outperformed the one without lda, in terms of test set roc_auc score, which was expected as the voting classifier performs well when the constituent models are many and well diversified.
### 2)  The test set roc_auc score of the tuned voting classifier without lda is more than that of all component models (by a good margin ), but for that of Light  GBM and also exceeds that of untuned  default voting classifier (voting_clf)
### 3) Again, the R-R ratio for the tuned voting classifier without lda is more than that of only Light GBM (among the component models) & tuned voting classifier with lda. But is beaten down by that of the default voting classifier. 
### 4) Taking everything into consideration( such as computational cost), the tuned Random forest classifier has beaten all the voting classifiers & is clearly the winner till here.


### Best R_R Ratio for the voting classifier family using roc_auc metric is:  49.271728792879195 ,corresponding to default Voting Classifier

### _Thus keeping everything into account (including the computational costs), for this dataset , the best Voting Classifier is the tuned Voting Classifier with lda._