In [1]:
# Importing Required Python Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

In [2]:
# Loading Training set
X_train_red = pd.read_csv('X_train_final.csv')
y_train = pd.read_csv('y_train.final.csv')

In [3]:
# Loading the test set: one feature file and one label file.
# (Only a single version of the test features is read in this cell.)
X_test_red = pd.read_csv('X_test_final.csv')
y_test = pd.read_csv('y_test.final.csv')

In [4]:
# Importing the Sklearn's roc_auc_score module
from sklearn.metrics import roc_auc_score

In [5]:
# Importing the hyperparameter tuning optimizer Optuna
import optuna

In [6]:
# Importing required Libraries
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [7]:
# Instantiating the Stratified K fold object (10 folds).
# NOTE: random_state is deliberately not passed. With the default shuffle=False the
# folds are already deterministic and a seed has no effect; scikit-learn >= 0.24
# raises a ValueError when random_state is set without shuffle=True.
cv_strat = StratifiedKFold(n_splits=10)

## Model_4 : Linear Discriminant Analysis Classifiers 

### Computing the test set roc_auc score using Vanilla LDA classifier with SVD solver with no other hyperparameters to tune.

In [8]:
# Importing LDA Classifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [9]:
# Instantiating the classifier object 
lda_svd = LinearDiscriminantAnalysis(solver='svd')

In [12]:
# Fitting the classifier on the Training data directly as there are no hyperparameters to tune
lda_svd.fit(X_train_red,y_train)

LinearDiscriminantAnalysis()

In [13]:
# Defining the function to calculate the roc_auc score for the feature sets
def cal_roc_auc(X, y, cls, f_set, t_set, model_name):
    ''' Prints the roc auc score of the classifier `cls` on the data (X, y).

        X : feature set handed to cls.predict_proba
        y : true labels handed to roc_auc_score
        cls : classifier object exposing predict_proba
        f_set : String: specifies 'full feature', 'Reduced feature'
        t_set : String: specifies 'training', 'test'
        model_name : String: specifies Name of the model

        Nothing is returned; the score is only printed. '''

    # Column 1 of the predict_proba output is the score passed to roc_auc_score
    positive_scores = cls.predict_proba(X)[:, 1]
    message = 'The roc_auc_score for the {} {} set using the best {} is '.format(f_set, t_set, model_name)
    print(message, roc_auc_score(y, positive_scores))

In [16]:
# Calculating the Reduced feature training set roc_auc score 
cal_roc_auc(X_train_red, y_train, lda_svd, 'Reduced feature', 'training', 'LDA_SVD Classifier')

The roc_auc_score for the Reduced feature training set using the best LDA_SVD Classifier is  0.7920662651019811


In [17]:
# Calculating the Reduced feature test set roc_auc score 
cal_roc_auc(X_test_red, y_test, lda_svd, 'Reduced feature', 'test', 'LDA_SVD Classifier')

The roc_auc_score for the Reduced feature test set using the best LDA_SVD Classifier is  0.7966209343601113


### Computing the best hyperparameters for the Linear Discriminant Classifier with ‘eigen’ solver using Reduced Feature Training Set.

In [18]:
# Instantiating the classifier object 
lda_eigen = LinearDiscriminantAnalysis(solver='eigen')

In [19]:
# Defining the appropriate objective function for the LDA classifier
def objective_wrappper_lda(X_tr, y_tr, cls=None, cv_strat=None):
    '''
    Builds the Optuna objective for tuning the Linear Discriminant Analysis
    'shrinkage' parameter on the given training set X_tr, y_tr using the
    cv_strat cross-validation object.

    X_tr, y_tr : training features / labels passed to cross_val_score
    cls : classifier object; it is reconfigured through cls.set_params on every
          trial (no copy is made here)
    cv_strat : cross-validation object passed as `cv` to cross_val_score

    Returns a function objective(trial) that yields the mean cross-validated
    'roc_auc' score for the shrinkage value sampled by that trial.
    '''

    def objective(trial):
        # suggest_float replaces the deprecated suggest_uniform; the search
        # range for shrinkage is unchanged (0 to 1).
        params = {
            'shrinkage': trial.suggest_float('shrinkage', 0.0, 1.0)
        }

        cls.set_params(**params)  # Initializing the model with the sampled parameters

        # Mean of the per-fold roc_auc scores is the value Optuna optimizes
        fold_scores = cross_val_score(cls, X_tr, y_tr, cv=cv_strat, n_jobs=5, scoring='roc_auc')
        return np.mean(fold_scores)

    return objective

In [21]:
# Defining the evaluation function for study's best parameters
def train_test_roc_auc(X_tr, y_tr, cls, obj_func, cv_strat, n_trials=100, seed=None):
    ''' Runs an Optuna study that maximizes the objective built by obj_func and
    returns the study's best score & best classifier parameters.

    X_tr, y_tr : training features / labels forwarded to obj_func
    cls : classifier object forwarded to obj_func
    obj_func : callable (X_tr, y_tr, cls, cv_strat) -> objective(trial)
    cv_strat : cross-validation object forwarded to obj_func
    n_trials : number of trials handed to study.optimize
    seed : optional seed; when given, the study uses optuna.samplers.TPESampler(seed=seed)
           so the search can be repeated. When None (default) the study is created
           exactly as before, with Optuna's default sampler.

    Returns the tuple (best_score, best_params). '''
    # Only build an explicit sampler when a seed is requested, so the default
    # behaviour stays identical to the unseeded study.
    sampler = None if seed is None else optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(obj_func(X_tr, y_tr, cls, cv_strat), n_trials=n_trials)
    return (study.best_value, study.best_params)


In [22]:
# Extracting the best model parameters and best study score
best_study_score,best_study_params = train_test_roc_auc(X_train_red, y_train, lda_eigen, objective_wrappper_lda, cv_strat, n_trials=200)

[I 2020-10-05 17:54:47,805] A new study created in memory with name: no-name-843e6567-fb24-4479-a3b4-5edcb47f3bb7
[I 2020-10-05 17:54:49,479] Trial 0 finished with value: 0.7684882854891582 and parameters: {'shrinkage': 0.6073407651312152}. Best is trial 0 with value: 0.7684882854891582.
[I 2020-10-05 17:54:49,890] Trial 1 finished with value: 0.7880165236410314 and parameters: {'shrinkage': 0.06845025879399613}. Best is trial 1 with value: 0.7880165236410314.
[I 2020-10-05 17:54:50,455] Trial 2 finished with value: 0.7620961059684799 and parameters: {'shrinkage': 0.7571115363447368}. Best is trial 1 with value: 0.7880165236410314.
[I 2020-10-05 17:54:51,020] Trial 3 finished with value: 0.761748452432655 and parameters: {'shrinkage': 0.7670807635151834}. Best is trial 1 with value: 0.7880165236410314.
[I 2020-10-05 17:54:51,558] Trial 4 finished with value: 0.7887053852381439 and parameters: {'shrinkage': 0.026072894898244603}. Best is trial 4 with value: 0.7887053852381439.
[I 2020-1

[I 2020-10-05 17:55:33,878] Trial 90 finished with value: 0.778926611200582 and parameters: {'shrinkage': 0.36998683434637647}. Best is trial 34 with value: 0.7895780581063258.
[I 2020-10-05 17:55:34,580] Trial 91 finished with value: 0.7895118602345204 and parameters: {'shrinkage': 0.004479697066122862}. Best is trial 34 with value: 0.7895780581063258.
[I 2020-10-05 17:55:35,067] Trial 92 finished with value: 0.7894413361868589 and parameters: {'shrinkage': 0.0004170381732173183}. Best is trial 34 with value: 0.7895780581063258.
[I 2020-10-05 17:55:35,495] Trial 93 finished with value: 0.788593102996781 and parameters: {'shrinkage': 0.0295732280345621}. Best is trial 34 with value: 0.7895780581063258.
[I 2020-10-05 17:55:36,031] Trial 94 finished with value: 0.7878423542217815 and parameters: {'shrinkage': 0.07876647245192424}. Best is trial 34 with value: 0.7895780581063258.
[I 2020-10-05 17:55:36,705] Trial 95 finished with value: 0.7883227951991493 and parameters: {'shrinkage': 0.0

[I 2020-10-05 17:56:19,166] Trial 180 finished with value: 0.788540746985798 and parameters: {'shrinkage': 0.03170799843138182}. Best is trial 131 with value: 0.7895860007338827.
[I 2020-10-05 17:56:19,724] Trial 181 finished with value: 0.7895754133533023 and parameters: {'shrinkage': 0.0014308917087018124}. Best is trial 131 with value: 0.7895860007338827.
[I 2020-10-05 17:56:20,388] Trial 182 finished with value: 0.7895639961307074 and parameters: {'shrinkage': 0.0013073571673148296}. Best is trial 131 with value: 0.7895860007338827.
[I 2020-10-05 17:56:20,857] Trial 183 finished with value: 0.7891037824124754 and parameters: {'shrinkage': 0.015428210314241136}. Best is trial 131 with value: 0.7895860007338827.
[I 2020-10-05 17:56:21,284] Trial 184 finished with value: 0.7884968812889186 and parameters: {'shrinkage': 0.03394278649691965}. Best is trial 131 with value: 0.7895860007338827.
[I 2020-10-05 17:56:21,933] Trial 185 finished with value: 0.7894991805298763 and parameters: {'

In [23]:
print('The best roc_auc_score for the study is: ',best_study_score)

The best roc_auc_score for the study is:  0.7895860007338827


In [24]:
print('The best study parameters for the classifier are: ',best_study_params)

The best study parameters for the classifier are:  {'shrinkage': 0.0023665526949698365}


In [25]:
# Obtaining the best reduced feature LDA  model with eigen solver by setting best study parameters.
lda_eigen = lda_eigen.set_params(**best_study_params)

In [26]:
# fitting the best LDA_eigen  model on the reduced feature training set
lda_eigen.fit(X_train_red, y_train)

LinearDiscriminantAnalysis(shrinkage=0.0023665526949698365, solver='eigen')

In [27]:
# Calculating the Reduced feature training set roc_auc score using the best study parameters
cal_roc_auc(X_train_red ,y_train , lda_eigen, 'Reduced feature', 'training', 'LDA Classifier with eigen solver')

The roc_auc_score for the Reduced feature training set using the best LDA Classifier with eigen solver is  0.7921019598225975


In [28]:
# Calculating the Reduced feature test set roc_auc score using the best study parameters
cal_roc_auc(X_test_red ,y_test , lda_eigen, 'Reduced feature', 'test', 'LDA Classifier with eigen solver')

The roc_auc_score for the Reduced feature test set using the best LDA Classifier with eigen solver is  0.7966579349261759


###  Computing the test set roc_auc score using LDA classifier with eigen solver & shrinkage utilizing the Ledoit-Wolf lemma with no other hyperparameters to tune.


In [24]:
# Instantiating the LDA classifier object using  eigen solver utilizing shrinkage using the Ledoit-Wolf lemma
lda_eigen_auto = LinearDiscriminantAnalysis(solver='eigen',shrinkage='auto')

In [30]:
# fitting the best LDA_eigen model with Ledoit-Wolf shrinkage on the reduced feature training set
lda_eigen_auto.fit(X_train_red, y_train)

LinearDiscriminantAnalysis(shrinkage='auto', solver='eigen')

In [31]:
# Calculating the Reduced feature training set roc_auc score using Ledoit-Wolf shrinkage
cal_roc_auc(X_train_red , y_train, lda_eigen_auto, 'Reduced feature', 'training', 'LDA Classifier with Ledoit-Wolf shrinkage')

The roc_auc_score for the Reduced feature training set using the best LDA Classifier with Ledoit-Wolf shrinkage is  0.7918050136542901


In [32]:
# Calculating the Reduced feature test set roc_auc score using Ledoit-Wolf shrinkage
cal_roc_auc( X_test_red, y_test, lda_eigen_auto, 'Reduced feature', 'test', 'LDA Classifier with Ledoit-Wolf shrinkage')

The roc_auc_score for the Reduced feature test set using the best LDA Classifier with Ledoit-Wolf shrinkage is  0.7966489427567338


In [33]:
# Saving the Reduced feature best LDA model 
import joblib
joblib.dump(lda_eigen,'Linear_Dis_Reduced.joblib')

['Linear_Dis_Reduced.joblib']

### Calculating R_R ratio for Vanilla LDA Classifier using SVD solver

In [11]:
# Computing the CV scores using sklearn's cross_val_score
score_lda = cross_val_score(lda_svd, X_train_red, y_train, cv=cv_strat, n_jobs=5, scoring='roc_auc')

In [12]:
print('The reward associated with the vanilla lda Classifier using roc_auc metric is: ',np.mean(score_lda))

The reward associated with the vanilla lda Classifier using roc_auc metric is:  0.7893660018231028


In [13]:
print('The risk associated with the vanilla lda Classifier using roc_auc metric is: ',np.std(score_lda))

The risk associated with the vanilla lda Classifier using roc_auc metric is:  0.015431504650271223


In [14]:
R_R_Ratio_lda = np.mean(score_lda)/np.std(score_lda)

In [15]:
print('The reward risk ratio for the vanilla lda Classifier using roc_auc metric is: ',R_R_Ratio_lda)

The reward risk ratio for the vanilla lda Classifier using roc_auc metric is:  51.152886235836306


#### The R_R ratio for the vanilla lda Classifier using roc_auc metric is:  51.152886235836306

### Calculating R_R ratio for the best  lda Classifier using eigen solver & shrinkage

In [None]:
# Loading the best lda classifier with eigen solver & shrinkage model.
import joblib
lda_eigen = joblib.load('Linear_Dis_Reduced.joblib')

In [17]:
# Computing the CV scores using sklearn's cross_val_score
score_lda_eigen = cross_val_score(lda_eigen, X_train_red, y_train, cv=cv_strat, n_jobs=5, scoring='roc_auc')

In [20]:
print('The reward associated with the best lda classifier with eigen solver using roc_auc metric is: ',np.mean(score_lda_eigen))

The reward associated with the best lda classifier with eigen solver using roc_auc metric is:  0.7895860007338827


In [21]:
print('The risk associated with the best lda classifier with eigen solver using roc_auc metric is: ',np.std(score_lda_eigen))

The risk associated with the best lda classifier with eigen solver using roc_auc metric is:  0.015607689779301505


In [22]:
R_R_Ratio_lda_eigen = np.mean(score_lda_eigen)/np.std(score_lda_eigen)

In [23]:
print('The reward risk ratio for the  best lda classifier with eigen solver using roc_auc metric is: ',R_R_Ratio_lda_eigen)

The reward risk ratio for the  best lda classifier with eigen solver using roc_auc metric is:  50.589549888479354


#### The R_R ratio for the best lda classifier with eigen solver using roc_auc metric is:  50.589549888479354

### Calculating R_R ratio for the lda Classifier with eigen solver & shrinkage utilizing the Ledoit-Wolf lemma

In [25]:
# Computing the CV scores using sklearn's cross_val_score
score_lda_auto = cross_val_score(lda_eigen_auto, X_train_red, y_train, cv=cv_strat, n_jobs=5, scoring='roc_auc')

In [26]:
print('The reward associated with the lda Classifier utilizing LW lemma shrinkage using roc_auc metric is: ',np.mean(score_lda_auto))

The reward associated with the lda Classifier utilizing LW lemma shrinkage using roc_auc metric is:  0.7893280433729302


In [27]:
print('The risk associated with the lda Classifier utilizing LW lemma shrinkage using roc_auc metric is: ',np.std(score_lda_auto))

The risk associated with the lda Classifier utilizing LW lemma shrinkage using roc_auc metric is:  0.015975073971514493


In [28]:
R_R_Ratio_lda_auto = np.mean(score_lda_auto)/np.std(score_lda_auto)

In [29]:
print('The reward risk ratio for the lda Classifier utilizing LW lemma shrinkage using roc_auc metric is: ',R_R_Ratio_lda_auto)

The reward risk ratio for the lda Classifier utilizing LW lemma shrinkage using roc_auc metric is:  49.40997736726593


#### The R_R ratio for the lda Classifier utilizing LW lemma shrinkage using roc_auc metric is: 49.40997736726593

### R_R Ratio for the best LDA classifier using reduced feature set is:  51.152886235836306, corresponding to vanilla LDA Classifier.


## Observations:

### 1) The best LDA model with eigen solver & tuned shrinkage performed worse than the Logistic Regression model as well as the tree-based bagging models on the test set. This was expected, as the underlying feature space is not multivariate normal and does not have the same covariance matrix for both classes, which are the underlying assumptions of the LDA model. However, its R_R ratio was greater than that of the Logistic Regression model & less than those of the tree-based bagging classifiers.
### 2) Among all lda variants, the R_R ratio is greatest for plain vanilla lda classifier with SVD solver.


## Model_5 : Quadratic Discriminant Analysis Classifier

### Computing the Reduced Feature test set roc_auc score using QDA classifier 

In [34]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [35]:
# Instantiating the QDA classifier object 
QDA = QuadraticDiscriminantAnalysis()

In [36]:
# Defining the appropriate objective function for the QDA classifier
def objective_wrappper_qda(X_tr, y_tr, cls=None, cv_strat=None):
    '''
    Builds the Optuna objective for tuning the Quadratic Discriminant Analysis
    'reg_param' parameter on the given training set X_tr, y_tr using the
    cv_strat cross-validation object.

    X_tr, y_tr : training features / labels passed to cross_val_score
    cls : classifier object; it is reconfigured through cls.set_params on every
          trial (no copy is made here)
    cv_strat : cross-validation object passed as `cv` to cross_val_score

    Returns a function objective(trial) that yields the mean cross-validated
    'roc_auc' score for the reg_param value sampled by that trial.
    '''

    def objective(trial):
        # suggest_float replaces the deprecated suggest_uniform; the search
        # range for reg_param is unchanged (0 to 1).
        params = {
            'reg_param': trial.suggest_float('reg_param', 0.0, 1.0)
        }

        cls.set_params(**params)  # Initializing the model with the sampled parameters

        # Mean of the per-fold roc_auc scores is the value Optuna optimizes
        fold_scores = cross_val_score(cls, X_tr, y_tr, cv=cv_strat, n_jobs=5, scoring='roc_auc')
        return np.mean(fold_scores)

    return objective

In [37]:
# Extracting the best model parameters and best study score
best_study_score,best_study_params = train_test_roc_auc(X_train_red, y_train, QDA, objective_wrappper_qda, cv_strat, n_trials=200)

[I 2020-10-05 20:11:37,840] A new study created in memory with name: no-name-91ac7f4c-49b5-449f-84b5-96be86a9447f
[I 2020-10-05 20:11:39,459] Trial 0 finished with value: 0.7765753071782676 and parameters: {'reg_param': 0.7078196969220766}. Best is trial 0 with value: 0.7765753071782676.
[I 2020-10-05 20:11:39,931] Trial 1 finished with value: 0.7719732203201287 and parameters: {'reg_param': 0.8091096586699921}. Best is trial 0 with value: 0.7765753071782676.
[I 2020-10-05 20:11:40,389] Trial 2 finished with value: 0.7574210763089786 and parameters: {'reg_param': 0.9738162146137824}. Best is trial 0 with value: 0.7765753071782676.
[I 2020-10-05 20:11:40,735] Trial 3 finished with value: 0.7765107151192494 and parameters: {'reg_param': 0.7093569215722008}. Best is trial 0 with value: 0.7765753071782676.
[I 2020-10-05 20:11:41,209] Trial 4 finished with value: 0.7751665222376182 and parameters: {'reg_param': 0.7433205322743957}. Best is trial 0 with value: 0.7765753071782676.
[I 2020-10-

[I 2020-10-05 20:12:19,250] Trial 91 finished with value: 0.7824058427255867 and parameters: {'reg_param': 0.11268205315283532}. Best is trial 53 with value: 0.7824111163861736.
[I 2020-10-05 20:12:19,596] Trial 92 finished with value: 0.7823833138362933 and parameters: {'reg_param': 0.11757984855770283}. Best is trial 53 with value: 0.7824111163861736.
[I 2020-10-05 20:12:19,942] Trial 93 finished with value: 0.7820975106642039 and parameters: {'reg_param': 0.07431131540802241}. Best is trial 53 with value: 0.7824111163861736.
[I 2020-10-05 20:12:20,290] Trial 94 finished with value: 0.7824006385134676 and parameters: {'reg_param': 0.0997489386262268}. Best is trial 53 with value: 0.7824111163861736.
[I 2020-10-05 20:12:20,751] Trial 95 finished with value: 0.7821936640399392 and parameters: {'reg_param': 0.14036357178254655}. Best is trial 53 with value: 0.7824111163861736.
[I 2020-10-05 20:12:21,082] Trial 96 finished with value: 0.7819663843085591 and parameters: {'reg_param': 0.16

[I 2020-10-05 20:12:57,566] Trial 181 finished with value: 0.7824076913526641 and parameters: {'reg_param': 0.11076705928510958}. Best is trial 176 with value: 0.7824142568670662.
[I 2020-10-05 20:12:58,019] Trial 182 finished with value: 0.7824050120008763 and parameters: {'reg_param': 0.11278912719221927}. Best is trial 176 with value: 0.7824142568670662.
[I 2020-10-05 20:12:58,381] Trial 183 finished with value: 0.7823196217513977 and parameters: {'reg_param': 0.12480164887548191}. Best is trial 176 with value: 0.7824142568670662.
[I 2020-10-05 20:12:58,874] Trial 184 finished with value: 0.7823750325339391 and parameters: {'reg_param': 0.09578952687000243}. Best is trial 176 with value: 0.7824142568670662.
[I 2020-10-05 20:12:59,361] Trial 185 finished with value: 0.7821345248314955 and parameters: {'reg_param': 0.1492315906406596}. Best is trial 176 with value: 0.7824142568670662.
[I 2020-10-05 20:12:59,877] Trial 186 finished with value: 0.7821899373102511 and parameters: {'reg_p

In [38]:
print('The best roc_auc_score for the study is: ',best_study_score)

The best roc_auc_score for the study is:  0.7824142568670662


In [39]:
print('The best study parameters for the classifier are: ',best_study_params)

The best study parameters for the classifier are:  {'reg_param': 0.10638381702463794}


In [40]:
# Configuring the QDA classifier with the best study parameters (the tuned reg_param) found by the Optuna search.
QDA = QDA.set_params(**best_study_params)

In [42]:
# fitting the best QDA  model on the reduced feature training set
QDA.fit(X_train_red,y_train)

QuadraticDiscriminantAnalysis(reg_param=0.10638381702463794)

In [43]:
# Calculating the Reduced feature training set roc_auc score using the best study parameters
cal_roc_auc(X_train_red , y_train, QDA, 'Reduced feature', 'training', 'QDA classifier')

The roc_auc_score for the Reduced feature training set using the best QDA classifier is  0.7851086555035252


In [45]:
# Calculating the Reduced feature test set roc_auc score using the best study parameters
cal_roc_auc(X_test_red, y_test, QDA, 'Reduced feature', 'test', 'QDA classifier')

The roc_auc_score for the Reduced feature test set using the best QDA classifier is  0.790439849757064


In [46]:
# Saving the Reduced feature best QDA model 
joblib.dump(QDA,'Quadratic_Dis_Reduced.joblib')

['Quadratic_Dis_Reduced.joblib']

### Calculating R_R ratio for best QDA Classifier.

In [30]:
# Loading the best QDA Classifier model
import joblib
QDA = joblib.load('Quadratic_Dis_Reduced.joblib')

In [31]:
# Computing the CV scores using sklearn's cross_val_score
score_qda = cross_val_score(QDA, X_train_red, y_train, cv=cv_strat, n_jobs=5, scoring='roc_auc')

In [32]:
print('The reward associated with the best QDA Classifier using roc_auc metric is: ',np.mean(score_qda))

The reward associated with the best QDA Classifier using roc_auc metric is:  0.7824142568670662


In [33]:
print('The risk associated with the best QDA Classifier using roc_auc metric is: ',np.std(score_qda))

The risk associated with the best QDA Classifier using roc_auc metric is:  0.020311006351920957


In [34]:
R_R_Ratio_qda = np.mean(score_qda)/np.std(score_qda)

In [35]:
print('The reward risk ratio for the best QDA Classifier using roc_auc metric is: ',R_R_Ratio_qda)

The reward risk ratio for the best QDA Classifier using roc_auc metric is:  38.5216883550956


#### The R_R Ratio for the best QDA classifier using reduced feature set is:  38.5216883550956

## Observations:
### 1) The tuned QDA model has the worst test set roc_auc score as well as the worst R_R ratio of all the models fitted so far. This was expected, as the underlying feature space is not multivariate normal, which is the underlying assumption of the QDA model. Any departure from normality affects QDA more than LDA, which is also observed here.
### 2) Owing to both very low R_R ratio and test set roc_auc, we can rule out the QDA model for this dataset.

### R_R Ratio for the best  classifier  in Discriminant Analysis family utilizing roc_auc metric is:  51.152886235836306,  corresponding to Vanilla LDA Classifier.

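### _Thus, taking everything into account, for this dataset the best discriminant classifier is the LDA model with eigen solver & tuned shrinkage, which has the highest test-set and cross-validated roc_auc among the discriminant models, even though the vanilla LDA has a marginally higher R_R ratio._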