### Import the relevant packages 

In [14]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, balanced_accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import make_scorer

In [15]:
# Read both splits. The first CSV column is consumed as the index and then
# replaced by a fresh 0..n-1 index.
train_data = pd.read_csv('train_data.csv', index_col=0).reset_index(drop=True)
test_data = pd.read_csv('test_data.csv', index_col=0).reset_index(drop=True)
# Collapse OCCUPATION to a binary flag: every value other than 1 becomes 0.
for frame in (train_data, test_data):
    frame.loc[frame['OCCUPATION'] != 1, 'OCCUPATION'] = 0

## Prepping data

In [16]:
# Categorical columns that will be replaced by one-hot (dummy) columns.
cat_vars = [
    'MARRIAGE',
    'EDUCATION',
]

In [17]:
# One OneHotEncoder per categorical variable, so each keeps its own categories.
encoders = [OneHotEncoder(categories='auto') for _ in cat_vars]
# Encode every categorical column with its own encoder. `.toarray()` returns a
# plain ndarray; the previous `.todense()` returned np.matrix, which NumPy no
# longer recommends. The values handed to the next cell are unchanged.
encoded_tr = [enc.fit_transform(train_data[[cat_var]]).toarray()
              for enc, cat_var in zip(encoders, cat_vars)]
# NOTE(review): fit_transform is called again here, so each encoder is re-fitted
# on the test split and the test dummies only cover categories present in the
# test data. The column renaming in the following cell relies on that layout,
# so it is kept; consider fitting on train only and calling transform() on test
# (with handle_unknown='ignore'), updating the downstream column handling too.
encoded_test = [enc.fit_transform(test_data[[cat_var]]).toarray()
                for enc, cat_var in zip(encoders, cat_vars)]

In [18]:
# Build the feature matrices: take every column except the last one (the label),
# remove the raw categorical columns, and append their one-hot dummy columns.
train_dummies = pd.DataFrame(np.concatenate(encoded_tr, axis=1))
test_dummies = pd.DataFrame(np.concatenate(encoded_test, axis=1))

# The dummy columns arrive labelled 0..k-1 and are renamed by position below.
train_dummy_names = ['Marriage 1', 'Marriage 2', 'Marriage 3',
                     'Edu 1', 'Edu 2', 'Edu 3', 'Edu 4', 'Edu 5', 'Edu 6', 'Edu 7']
# Note that in the testing data, we do not have Marriage 3 and Edu 6
test_dummy_names = ['Marriage 1', 'Marriage 2',
                    'Edu 1', 'Edu 2', 'Edu 3', 'Edu 4', 'Edu 5', 'Edu 7']

# The name lists are hard-coded, so fail loudly if the encoded width ever
# changes instead of silently attaching the wrong names to the columns.
assert train_dummies.shape[1] == len(train_dummy_names), \
    "unexpected number of one-hot columns in the training data"
assert test_dummies.shape[1] == len(test_dummy_names), \
    "unexpected number of one-hot columns in the testing data"

X_train = pd.concat([train_data.iloc[:, :-1].drop(cat_vars, axis=1), train_dummies], axis=1)
X_test = pd.concat([test_data.iloc[:, :-1].drop(cat_vars, axis=1), test_dummies], axis=1)
# The label is the last column of each frame.
y_train = train_data.iloc[:, -1]
y_test = test_data.iloc[:, -1]

# dict(enumerate(names)) maps position -> name, i.e. {0: 'Marriage 1', 1: ...}.
X_train = X_train.rename(columns=dict(enumerate(train_dummy_names))).astype('float64')
X_test = X_test.rename(columns=dict(enumerate(test_dummy_names))).astype('float64')

### Normalize continuous features. Note that for the testing data, we still use the mean and standard deviation from the training data to do the normalization.

In [19]:
# Positions of the columns that are standardised. Both splits are scaled with
# the training column's mean and standard deviation.
continuous_positions = [0, 1, 2, 3, 4, 5, 8]
for pos in continuous_positions:
    train_col = X_train.iloc[:, pos]
    mu, sigma = train_col.mean(), train_col.std()
    X_train.iloc[:, pos] = (train_col - mu) / sigma
    X_test.iloc[:, pos] = (X_test.iloc[:, pos] - mu) / sigma

In [22]:
# Remove one dummy per categorical variable from the training features so the
# dummy columns are not perfectly collinear.
X_train = X_train.drop(columns=['Marriage 3', 'Edu 6'])

In [23]:
# Preview the first five rows of the prepared training features.
X_train.head(5)

Unnamed: 0,DEBT,YRS_IN_RESIDENT,AGE,YRS_OF_EMPLOYMENT,DTI,NUM_PREV_APP,OCCUPATION,PROVIDED_SIN,INCOME,CREDIT_PROFILE,Marriage 1,Marriage 2,Edu 1,Edu 2,Edu 3,Edu 4,Edu 5,Edu 7
0,0.835945,-0.686741,-1.085931,-0.642788,-0.074637,-0.555121,0.0,1.0,1.530698,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.294264,-1.364851,1.314706,-0.095677,-0.630176,-0.555121,1.0,1.0,-0.109336,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.072081,-1.025796,0.238559,-0.045939,1.03644,-0.555121,1.0,1.0,1.804037,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.233465,-0.347686,-1.00315,-0.841738,-0.667212,0.153818,0.0,1.0,-1.74937,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,-0.309851,1.008535,1.066364,0.451435,-0.185745,-0.555121,0.0,1.0,0.164003,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [24]:
# Preview the first five rows of the prepared testing features.
X_test.head(5)

Unnamed: 0,DEBT,YRS_IN_RESIDENT,AGE,YRS_OF_EMPLOYMENT,DTI,NUM_PREV_APP,OCCUPATION,PROVIDED_SIN,INCOME,CREDIT_PROFILE,Marriage 1,Marriage 2,Edu 1,Edu 2,Edu 3,Edu 4,Edu 5,Edu 7
0,-0.233465,-1.364851,-0.672028,0.35196,0.962368,-0.555121,1.0,1.0,-1.476031,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.683172,-0.00863,0.735242,0.053536,0.814224,0.862758,1.0,1.0,0.98402,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.309851,-0.347686,-1.168711,-0.941213,-0.667212,-0.555121,1.0,0.0,-0.656014,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,5.189971,1.008535,0.321339,-0.742263,-0.444997,0.390132,0.0,0.0,-0.382675,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.157078,1.34759,0.4869,1.048284,0.073506,0.862758,0.0,1.0,1.804037,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


### Our first task is to see whether the dataset has class imbalance.  In our case, the numbers of approvals and rejections are roughly the same in the training data.

In [25]:
# Number of training rows for each value of the label.
y_train.value_counts()

APPROVAL_STATUS
0    300
1    250
Name: count, dtype: int64

## Import a number of classifiers

In [26]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

### For GridSearchCV, the default scoring for a classifier is accuracy, which seems to be a reasonable choice in our problem given that the data does not have a serious imbalance problem. If there is a serious imbalance problem, other scoring choices may be more appropriate. These include:
### (1) Balanced accuracy
The balanced accuracy is the average of the sensitivity (true positive rate, or recall) and the specificity (true negative rate), which measures the average accuracy obtained from both the minority and majority classes. This quantity reduces to the traditional accuracy if a classifier performs equally well on both classes. Conversely, if the high value of the traditional accuracy is due to the classifier taking advantage of the distribution of the majority class, then the balanced accuracy will be lower than the accuracy.
### (2) F1-ratio
F1-ratio is the harmonic mean of precision and recall, i.e., F1-ratio = 2/(1/precision+1/recall).  In scenarios where the dataset is imbalanced, using the F1-ratio helps mitigate biased evaluations. Since the F1-ratio accounts for both precision and recall, it can provide a fairer assessment of the model's performance than the accuracy score.

Finally, the costs of type I and type II errors may be different.  In that case, we need a more explicit objective for what we are trying to maximize.  For example, if we want to maximize TP-4*FP, we can define a custom scoring function by using make_scorer.  In the following, we stay with the default scoring option of accuracy.


### Model 1: LDA. The only tuning parameter is shrinkage; we first use the default option, and then use GridSearchCV to find the best shrinkage parameter.

In [40]:
# Fit LDA with its default settings, then report the training accuracy and the
# test-set metrics (accuracy, balanced accuracy, TPR, TNR, precision, F1, AUC).
# NOTE(review): this fit/score/print sequence is repeated for every model in the
# notebook; consider factoring it into a single helper function.
model = LinearDiscriminantAnalysis()
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
bal_accuracy = balanced_accuracy_score(y_test, y_test_pred)

# For 0/1 labels, confusion_matrix(...).ravel() yields TN, FP, FN, TP in that order.
TN, FP, FN, TP = confusion_matrix(y_test, y_test_pred).ravel()
TPR = recall_score(y_test, y_test_pred)   # Sensitivity = TP/(TP+FN)
TNR = TN / (TN + FP)                      # Specificity
Precision = precision_score(y_test, y_test_pred)
F1score = f1_score(y_test, y_test_pred)

# Column 1 of predict_proba holds the score of class 1, which drives the ROC curve.
Q = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, Q)
roc_auc = auc(fpr, tpr)

print("Classifier: LDA (default)")
print(f"Accuracy Score on Training Data = {train_accuracy:.4f}")
print("Performance on Testing Data:")
print(f"Accuracy Score = {test_accuracy:.4f}")
print(f"Balanced Accuracy Score = {bal_accuracy:.4f}")  # label previously misspelt "Socre"
print(f"True Positive Rate = {TPR:.4f}")
print(f"True Negative Rate = {TNR:.4f}")
print(f"Precision = {Precision:.4f}")
print(f"F1-score = {F1score:.4f}")
print(f"AUC = {roc_auc:.4f}\n")

Classifier: LDA (default)
Accuracy Score on Training Data = 0.8600
Performance on Testing Data:
Accuracy Score = 0.8714
Balanced Accuracy Socre = 0.8723
True Positive Rate = 0.8772
True Negative Rate = 0.8675
Precision = 0.8197
F1-score = 0.8475
AUC = 0.9269



In [74]:
# 3-fold cross-validated search over 11 evenly spaced shrinkage values in [0, 1]
# for an LDA model using the 'lsqr' solver; scoring is GridSearchCV's default.
param_grid = {'shrinkage': np.linspace(0, 1, 11)}
model = LinearDiscriminantAnalysis(solver='lsqr')
clf_lda = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
clf_lda.fit(X_train, y_train)
# Keep the winning shrinkage value for the refit in the next cell.
clf_reg_params = clf_lda.best_params_['shrinkage']
print("Best Shrinkage Parameter = {:.4f}\n".format(clf_reg_params))

Best Shrinkage Parameter = 0.3000



In [52]:
# Refit LDA ('lsqr' solver) with the shrinkage value stored in clf_reg_params,
# then report the training accuracy and the test-set metrics.
model = LinearDiscriminantAnalysis(solver='lsqr', shrinkage=clf_reg_params)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
bal_accuracy = balanced_accuracy_score(y_test, y_test_pred)

# For 0/1 labels, confusion_matrix(...).ravel() yields TN, FP, FN, TP in that order.
TN, FP, FN, TP = confusion_matrix(y_test, y_test_pred).ravel()
TPR = recall_score(y_test, y_test_pred)   # Sensitivity = TP/(TP+FN)
TNR = TN / (TN + FP)                      # Specificity
Precision = precision_score(y_test, y_test_pred)
F1score = f1_score(y_test, y_test_pred)

# Column 1 of predict_proba holds the score of class 1, which drives the ROC curve.
Q = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, Q)
roc_auc = auc(fpr, tpr)

print("Classifier: LDA (shrinkage parameter based on 3-fold CV)")
print(f"Accuracy Score on Training Data = {train_accuracy:.4f}")
print("Performance on Testing Data:")
print(f"Accuracy Score = {test_accuracy:.4f}")
print(f"Balanced Accuracy Score = {bal_accuracy:.4f}")  # label previously misspelt "Socre"
print(f"True Positive Rate = {TPR:.4f}")
print(f"True Negative Rate = {TNR:.4f}")
print(f"Precision = {Precision:.4f}")
print(f"F1-score = {F1score:.4f}")
print(f"AUC = {roc_auc:.4f}\n")

Classifier: LDA (shrinkage parameter based on 3-fold CV)
Accuracy Score on Training Data = 0.8673
Performance on Testing Data:
Accuracy Score = 0.8929
Balanced Accuracy Socre = 0.8849
True Positive Rate = 0.8421
True Negative Rate = 0.9277
Precision = 0.8889
F1-score = 0.8649
AUC = 0.9334



### Model 2: Logistic regression, we first try the default option, i.e., penalty='l2', C=1.0

In [53]:
# Fit logistic regression with its default settings, then report the training
# accuracy and the test-set metrics.
model = LogisticRegression()
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
bal_accuracy = balanced_accuracy_score(y_test, y_test_pred)

# For 0/1 labels, confusion_matrix(...).ravel() yields TN, FP, FN, TP in that order.
TN, FP, FN, TP = confusion_matrix(y_test, y_test_pred).ravel()
TPR = recall_score(y_test, y_test_pred)   # Sensitivity = TP/(TP+FN)
TNR = TN / (TN + FP)                      # Specificity
Precision = precision_score(y_test, y_test_pred)
F1score = f1_score(y_test, y_test_pred)

# Column 1 of predict_proba holds the score of class 1, which drives the ROC curve.
Q = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, Q)
roc_auc = auc(fpr, tpr)

print("Classifier: Logistic Regression (default)")
print(f"Accuracy Score on Training Data = {train_accuracy:.4f}")
print("Performance on Testing Data:")
print(f"Accuracy Score = {test_accuracy:.4f}")
print(f"Balanced Accuracy Score = {bal_accuracy:.4f}")  # label previously misspelt "Socre"
print(f"True Positive Rate = {TPR:.4f}")
print(f"True Negative Rate = {TNR:.4f}")
print(f"Precision = {Precision:.4f}")
print(f"F1-score = {F1score:.4f}")
print(f"AUC = {roc_auc:.4f}\n")

Classifier: Logistic Regression (default)
Accuracy Score on Training Data = 0.8691
Performance on Testing Data:
Accuracy Score = 0.8643
Balanced Accuracy Socre = 0.8636
True Positive Rate = 0.8596
True Negative Rate = 0.8675
Precision = 0.8167
F1-score = 0.8376
AUC = 0.9302



### We now use GridSearchCV to find the best C for the L2-penalty

In [73]:
# 3-fold cross-validated search for C (L2 penalty) over 1000 evenly spaced
# values between 0.01 and 10; scoring is GridSearchCV's default.
param_grid = {'C': np.linspace(0.01, 10, 1000)}
model = LogisticRegression(penalty='l2')
clf_lgr = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1)
clf_lgr.fit(X_train, y_train)
# Keep the winning C for the refit in the next cell.
clf_reg_params = clf_lgr.best_params_['C']
print("Best regularization parameter = {:.3f}\n".format(clf_reg_params))

Best regularization parameter = 0.260



In [58]:
# Refit L2-penalised logistic regression with the C value stored in
# clf_reg_params, then report the training accuracy and the test-set metrics.
model = LogisticRegression(penalty='l2', C=clf_reg_params)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
bal_accuracy = balanced_accuracy_score(y_test, y_test_pred)

# For 0/1 labels, confusion_matrix(...).ravel() yields TN, FP, FN, TP in that order.
TN, FP, FN, TP = confusion_matrix(y_test, y_test_pred).ravel()
TPR = recall_score(y_test, y_test_pred)   # Sensitivity = TP/(TP+FN)
TNR = TN / (TN + FP)                      # Specificity
Precision = precision_score(y_test, y_test_pred)
F1score = f1_score(y_test, y_test_pred)

# Column 1 of predict_proba holds the score of class 1, which drives the ROC curve.
Q = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, Q)
roc_auc = auc(fpr, tpr)

print("Classifier: Logistic Regression (L2 penalty parameter based on 3-fold CV)")
print(f"Accuracy Score on Training Data = {train_accuracy:.4f}")
print("Performance on Testing Data:")
print(f"Accuracy Score = {test_accuracy:.4f}")
print(f"Balanced Accuracy Score = {bal_accuracy:.4f}")  # label previously misspelt "Socre"
print(f"True Positive Rate = {TPR:.4f}")
print(f"True Negative Rate = {TNR:.4f}")
print(f"Precision = {Precision:.4f}")
print(f"F1-score = {F1score:.4f}")
print(f"AUC = {roc_auc:.4f}\n")

Classifier: Logistic Regression (L2 penalty parameter based on 3-fold CV)
Accuracy Score on Training Data = 0.8709
Performance on Testing Data:
Accuracy Score = 0.8786
Balanced Accuracy Socre = 0.8756
True Positive Rate = 0.8596
True Negative Rate = 0.8916
Precision = 0.8448
F1-score = 0.8522
AUC = 0.9319



### We now repeat the exercise using L1 penalty

In [72]:
# 3-fold cross-validated search for C (L1 penalty, liblinear solver) over 100
# evenly spaced values between 0.05 and 5; scoring is GridSearchCV's default.
# random_state is fixed because liblinear shuffles the data internally, so an
# unseeded search is not guaranteed to pick the same C on every run.
model = LogisticRegression(solver='liblinear', penalty='l1', random_state=0)
param_grid = {'C': np.linspace(0.05, 5, 100)}
clf_lgr = GridSearchCV(model, param_grid, n_jobs=-1, cv=3)
clf_lgr.fit(X_train, y_train)
# Keep the winning C for the refit in the next cell.
clf_reg_params = clf_lgr.best_params_['C']
print(f"Best regularization parameter = {clf_reg_params:.3f}\n")

Best regularization parameter = 1.650



In [60]:
# Refit L1-penalised logistic regression (liblinear solver) with the C value
# stored in clf_reg_params, then report the training accuracy and the test-set
# metrics. random_state is fixed because liblinear shuffles the data internally.
model = LogisticRegression(solver='liblinear', penalty='l1', C=clf_reg_params,
                           random_state=0)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
bal_accuracy = balanced_accuracy_score(y_test, y_test_pred)

# For 0/1 labels, confusion_matrix(...).ravel() yields TN, FP, FN, TP in that order.
TN, FP, FN, TP = confusion_matrix(y_test, y_test_pred).ravel()
TPR = recall_score(y_test, y_test_pred)   # Sensitivity = TP/(TP+FN)
TNR = TN / (TN + FP)                      # Specificity
Precision = precision_score(y_test, y_test_pred)
F1score = f1_score(y_test, y_test_pred)

# Column 1 of predict_proba holds the score of class 1, which drives the ROC curve.
Q = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, Q)
roc_auc = auc(fpr, tpr)

print("Classifier: Logistic Regression (L1 penalty parameter based on 3-fold CV)")
print(f"Accuracy Score on Training Data = {train_accuracy:.4f}")
print("Performance on Testing Data:")
print(f"Accuracy Score = {test_accuracy:.4f}")
print(f"Balanced Accuracy Score = {bal_accuracy:.4f}")  # label previously misspelt "Socre"
print(f"True Positive Rate = {TPR:.4f}")
print(f"True Negative Rate = {TNR:.4f}")
print(f"Precision = {Precision:.4f}")
print(f"F1-score = {F1score:.4f}")
print(f"AUC = {roc_auc:.4f}\n")

Classifier: Logistic Regression (L1 penalty parameter based on 3-fold CV)
Accuracy Score on Training Data = 0.8727
Performance on Testing Data:
Accuracy Score = 0.8571
Balanced Accuracy Socre = 0.8575
True Positive Rate = 0.8596
True Negative Rate = 0.8554
Precision = 0.8033
F1-score = 0.8305
AUC = 0.9332



### Model 3: K nearest neighbors, we first try the default option, i.e., n_neighbors=5

In [62]:
# Fit k-nearest neighbours with its default settings, then report the training
# accuracy and the test-set metrics.
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
bal_accuracy = balanced_accuracy_score(y_test, y_test_pred)

# For 0/1 labels, confusion_matrix(...).ravel() yields TN, FP, FN, TP in that order.
TN, FP, FN, TP = confusion_matrix(y_test, y_test_pred).ravel()
TPR = recall_score(y_test, y_test_pred)   # Sensitivity = TP/(TP+FN)
TNR = TN / (TN + FP)                      # Specificity
Precision = precision_score(y_test, y_test_pred)
F1score = f1_score(y_test, y_test_pred)

# Column 1 of predict_proba holds the score of class 1, which drives the ROC curve.
Q = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, Q)
roc_auc = auc(fpr, tpr)

print("Classifier: KNN (default)")
print(f"Accuracy Score on Training Data = {train_accuracy:.4f}")
print("Performance on Testing Data:")
print(f"Accuracy Score = {test_accuracy:.4f}")
print(f"Balanced Accuracy Score = {bal_accuracy:.4f}")  # label previously misspelt "Socre"
print(f"True Positive Rate = {TPR:.4f}")
print(f"True Negative Rate = {TNR:.4f}")
print(f"Precision = {Precision:.4f}")
print(f"F1-score = {F1score:.4f}")
print(f"AUC = {roc_auc:.4f}\n")

Classifier: KNN (default)
Accuracy Score on Training Data = 0.8582
Performance on Testing Data:
Accuracy Score = 0.8643
Balanced Accuracy Socre = 0.8526
True Positive Rate = 0.7895
True Negative Rate = 0.9157
Precision = 0.8654
F1-score = 0.8257
AUC = 0.9195



### We now use GridSearchCV to search for the best value of the tuning parameter n_neighbors.  There are other things that we can tune as well, like metric and weights, but the most important one is the number of neighbors.

In [80]:
# 3-fold cross-validated search over n_neighbors = 1..10; scoring is
# GridSearchCV's default.
param_grid = {'n_neighbors': range(1, 11)}
model = KNeighborsClassifier()
clf_knn = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1)
clf_knn.fit(X_train, y_train)
# Keep the winning neighbour count for the refit in the next cell.
clf_knn_params = clf_knn.best_params_['n_neighbors']
print("Best number of neighbors = {:.0f}\n".format(clf_knn_params))

Best number of neighbors = 7



In [81]:
# Refit k-nearest neighbours with the neighbour count stored in clf_knn_params,
# then report the training accuracy and the test-set metrics.
model = KNeighborsClassifier(n_neighbors=clf_knn_params)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
bal_accuracy = balanced_accuracy_score(y_test, y_test_pred)

# For 0/1 labels, confusion_matrix(...).ravel() yields TN, FP, FN, TP in that order.
TN, FP, FN, TP = confusion_matrix(y_test, y_test_pred).ravel()
TPR = recall_score(y_test, y_test_pred)   # Sensitivity = TP/(TP+FN)
TNR = TN / (TN + FP)                      # Specificity
Precision = precision_score(y_test, y_test_pred)
F1score = f1_score(y_test, y_test_pred)

# Column 1 of predict_proba holds the score of class 1, which drives the ROC curve.
Q = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, Q)
roc_auc = auc(fpr, tpr)

# The header previously ended in "based on )"; it now names the 3-fold CV used
# by the grid search, matching the recorded output of this cell.
print("Classifier: KNN (Number of Neighbors based on 3-fold CV)")
print(f"Accuracy Score on Training Data = {train_accuracy:.4f}")
print("Performance on Testing Data:")
print(f"Accuracy Score = {test_accuracy:.4f}")
print(f"Balanced Accuracy Score = {bal_accuracy:.4f}")  # label previously misspelt "Socre"
print(f"True Positive Rate = {TPR:.4f}")
print(f"True Negative Rate = {TNR:.4f}")
print(f"Precision = {Precision:.4f}")
print(f"F1-score = {F1score:.4f}")
print(f"AUC = {roc_auc:.4f}\n")

Classifier: KNN (Number of Neighbors based on 3-fold CV)
Accuracy Score on Training Data = 0.8691
Performance on Testing Data:
Accuracy Score = 0.8714
Balanced Accuracy Socre = 0.8613
True Positive Rate = 0.8070
True Negative Rate = 0.9157
Precision = 0.8679
F1-score = 0.8364
AUC = 0.9130



### Model 4: Decision Tree Classifier, we first try the default option, i.e., criterion=gini, max_depth=None, min_samples_split=2, min_samples_leaf=1, etc.

In [89]:
# Fit a decision tree with default hyper-parameters, then report the training
# accuracy and the test-set metrics. random_state is fixed because scikit-learn
# permutes the features at every split, so ties between equally good splits
# would otherwise be broken differently from run to run.
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
bal_accuracy = balanced_accuracy_score(y_test, y_test_pred)

# For 0/1 labels, confusion_matrix(...).ravel() yields TN, FP, FN, TP in that order.
TN, FP, FN, TP = confusion_matrix(y_test, y_test_pred).ravel()
TPR = recall_score(y_test, y_test_pred)   # Sensitivity = TP/(TP+FN)
TNR = TN / (TN + FP)                      # Specificity
Precision = precision_score(y_test, y_test_pred)
F1score = f1_score(y_test, y_test_pred)

# Column 1 of predict_proba holds the score of class 1, which drives the ROC curve.
Q = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, Q)
roc_auc = auc(fpr, tpr)

print("Classifier: Decision Tree (default)")
print(f"Accuracy Score on Training Data = {train_accuracy:.4f}")
print("Performance on Testing Data:")
print(f"Accuracy Score = {test_accuracy:.4f}")
print(f"Balanced Accuracy Score = {bal_accuracy:.4f}")  # label previously misspelt "Socre"
print(f"True Positive Rate = {TPR:.4f}")
print(f"True Negative Rate = {TNR:.4f}")
print(f"Precision = {Precision:.4f}")
print(f"F1-score = {F1score:.4f}")
print(f"AUC = {roc_auc:.4f}\n")

Classifier: Decision Tree (default)
Accuracy Score on Training Data = 1.0000
Performance on Testing Data:
Accuracy Score = 0.8571
Balanced Accuracy Socre = 0.8438
True Positive Rate = 0.7719
True Negative Rate = 0.9157
Precision = 0.8627
F1-score = 0.8148
AUC = 0.8438



### We now try to tune the decision tree based on: (1) max_depth, (2) min_samples_split, and (3) min_samples_leaf.  There are other options that we can tune as well, but it would take longer to try all combinations.

In [92]:
# 3-fold cross-validated search over max_depth (3..12), min_samples_split and
# min_samples_leaf; scoring is GridSearchCV's default. random_state is fixed
# because scikit-learn permutes the features at every split, so an unseeded
# search may select different hyper-parameters on each run.
model = DecisionTreeClassifier(random_state=0)
param_grid = {'max_depth': range(3, 13),
              'min_samples_split': [10, 20, 30, 40, 50],
              'min_samples_leaf': [1, 5, 10, 15, 20]}
clf_dt = GridSearchCV(model, param_grid, n_jobs=-1, cv=3)
clf_dt.fit(X_train, y_train)
# Keep the winning values for the refit in the next cell.
clf_dt_param1 = clf_dt.best_params_['max_depth']
clf_dt_param2 = clf_dt.best_params_['min_samples_split']
clf_dt_param3 = clf_dt.best_params_['min_samples_leaf']
print(f"Best Max Depth = {clf_dt_param1:.0f}")
print(f"Best Min Samples Split = {clf_dt_param2:.0f}")
print(f"Best Min Samples Leaf = {clf_dt_param3:.0f}\n")

Best Max Depth = 7
Best Min Samples Split = 20
Best Min Samples Leaf = 5



In [95]:
# Refit the decision tree with the hyper-parameters stored in clf_dt_param1..3,
# then report the training accuracy and the test-set metrics. random_state is
# fixed because scikit-learn permutes the features at every split, so ties
# between equally good splits would otherwise vary from run to run.
model = DecisionTreeClassifier(max_depth=clf_dt_param1,
                               min_samples_split=clf_dt_param2,
                               min_samples_leaf=clf_dt_param3,
                               random_state=0)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
bal_accuracy = balanced_accuracy_score(y_test, y_test_pred)

# For 0/1 labels, confusion_matrix(...).ravel() yields TN, FP, FN, TP in that order.
TN, FP, FN, TP = confusion_matrix(y_test, y_test_pred).ravel()
TPR = recall_score(y_test, y_test_pred)   # Sensitivity = TP/(TP+FN)
TNR = TN / (TN + FP)                      # Specificity
Precision = precision_score(y_test, y_test_pred)
F1score = f1_score(y_test, y_test_pred)

# Column 1 of predict_proba holds the score of class 1, which drives the ROC curve.
Q = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, Q)
roc_auc = auc(fpr, tpr)

print("Classifier: Decision Tree (tuning parameters based on 3-fold CV)")
print(f"Accuracy Score on Training Data = {train_accuracy:.4f}")
print("Performance on Testing Data:")
print(f"Accuracy Score = {test_accuracy:.4f}")
print(f"Balanced Accuracy Score = {bal_accuracy:.4f}")  # label previously misspelt "Socre"
print(f"True Positive Rate = {TPR:.4f}")
print(f"True Negative Rate = {TNR:.4f}")
print(f"Precision = {Precision:.4f}")
print(f"F1-score = {F1score:.4f}")
print(f"AUC = {roc_auc:.4f}\n")

Classifier: Decision Tree (tuning parameters based on 3-fold CV)
Accuracy Score on Training Data = 0.8873
Performance on Testing Data:
Accuracy Score = 0.8857
Balanced Accuracy Socre = 0.8789
True Positive Rate = 0.8421
True Negative Rate = 0.9157
Precision = 0.8727
F1-score = 0.8571
AUC = 0.9211



### In general, we find that cross-validation helps us improve the model.  It generally improves the accuracy, balanced accuracy, F1-ratio and AUC.  Overall, the LDA classifier with the best shrinkage parameter appears to perform the best, with an AUC of 0.9334