## Data Understanding

### Import Data

In [1]:
from __future__ import division, print_function, unicode_literals

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


In [2]:
# Load the personal-loan campaign dataset from the working directory
DATA_FILE = 'Bank_Personal_Loan_Modelling.csv'
bank = pd.read_csv(DATA_FILE)

FileNotFoundError: ignored

### Missing Values

In [None]:
# Column dtypes and non-null counts (the original author found no missing data).
# info() prints its report itself and returns None, so it is not wrapped in print().
bank.info()

# Number of fully duplicated rows (the original author found none)
print(bank.duplicated().sum())

### Data Description & Distribution

In [None]:
# Preview the first rows of the raw dataset
bank.head()

In [None]:
# Dataset dimensions as a (rows, columns) tuple
bank.shape

In [None]:
# List all the column names of the raw dataset
bank.columns

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for the numeric attributes
bank.describe()

In [None]:
# Binary attributes: the target (Personal Loan) plus Securities Account, CD Account, Online and CreditCard.
# For each one, print the count of both classes (NaN would be listed too because dropna=False).
#
## Personal Loan      - Did this customer accept the personal loan offered in the last campaign? (target variable)
## Securities Account - Does the customer have a securities account with the bank?
## CD Account         - Does the customer have a certificate of deposit (CD) account with the bank?
## Online             - Does the customer use internet banking facilities?
## CreditCard         - Does the customer use a credit card issued by UniversalBank?
binary_columns = ['Personal Loan', 'Securities Account', 'CD Account', 'Online', 'CreditCard']

for binary_column in binary_columns:
    class_counts = bank[binary_column].value_counts(dropna=False)
    print(class_counts)

In [None]:
# Interval attributes: Experience, Age, Income, CCAvg and Mortgage.

## Experience - years of experience. Some rows carry negative values, which are
## invalid; show a sample and count them (the original author counted 52).
negative_experience_rows = bank.loc[bank.Experience < 0]
print(negative_experience_rows.head())
print(len(negative_experience_rows))

## Histograms of the remaining interval attributes:
##   Age      - customer age
##   Income   - annual income in dollars (the scale is not documented in the data)
##   CCAvg    - average credit card spending
##   Mortgage - value of house mortgage
# `kind` must be passed by keyword: Series.plot() rejects positional arguments
# in pandas >= 1.0.
for interval_column in ['Age', 'Income', 'CCAvg', 'Mortgage']:
    bank[interval_column].plot(kind='hist')
    plt.title(interval_column)
    plt.show()

In [None]:
# Ordinal attributes: Family and Education (Education is the customer's education level).
# Print the frequency of every level; the original author notes that level 1
# is the most common value for both attributes.
for ordinal_column in ('Family', 'Education'):
    level_counts = bank[ordinal_column].value_counts()
    print(level_counts)

In [None]:
# Class counts of the target variable
## Personal Loan - Did this customer accept the personal loan offered in the last campaign?
target_counts = bank['Personal Loan'].value_counts(dropna=False)
print(target_counts)

In [None]:
# Visualize the two classes of the target variable as a bar chart
count = bank["Personal Loan"].value_counts()
count.plot.bar(title="count")

## Data Preparation

### Deal with Negative Values

In [None]:
# Deal with negative values in Experience
## Replace each negative value with the median Experience of the customers of
## the same Age who reported a valid (>= 0) value.
median_by_age = bank.loc[bank.Experience >= 0].groupby('Age')['Experience'].median()

# Mapping Age -> median (rather than merging) keeps the row order and index and
# lets the cell be re-run without producing duplicated '_x'/'_y' columns.
# The 'Experience Median' helper column is dropped again in the Correlation section.
bank['Experience Median'] = bank['Age'].map(median_by_age)

# Substitute the median, rounded to the nearest integer
is_negative = bank.Experience < 0
bank.loc[is_negative, 'Experience'] = np.round(bank.loc[is_negative, 'Experience Median'])

# An Age with no valid Experience at all has no median, so the substitution leaves NaN.
## Original author's note: nobody aged 23 reported a valid experience; based on
## the values seen for ages 22 and 24, those rows are set to 0.
bank.loc[bank.Experience.isnull(), 'Experience'] = 0

### Categorized Categorical Values

In [None]:
# ZIP Code: treat it as a label rather than a number, and derive a coarser
# 'Area' feature from its first three characters. Both end up categorical.
zip_as_text = bank['ZIP Code'].astype(str)
bank['Area'] = zip_as_text.str[:3].astype('category')
bank['ZIP Code'] = zip_as_text.astype('category')

In [None]:
# Education and Family are level codes, so store them as categorical columns
for level_column in ('Education', 'Family'):
    bank[level_column] = bank[level_column].astype('category')

In [None]:
# Show the data type of every column after the categorical conversions above
bank.dtypes

### Correlation

In [None]:
# Remove the columns that are not used from here on (row identifier, the helper
# median column and the raw ZIP code) before computing correlations.
unused_columns = ['ID', 'Experience Median', 'ZIP Code']
bank.drop(columns=unused_columns, inplace=True)

In [None]:
# Pairwise correlation of the numeric attributes only. Categorical columns
# (Family, Education, Area) are excluded explicitly, because DataFrame.corr()
# no longer drops non-numeric columns silently in pandas 2.x.
corr = bank.select_dtypes(include=['number', 'bool']).corr()

# Generate a mask for the upper triangle (diagonal included) so each pair is drawn once.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, vmin=-1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

# Display the correlation table itself as the cell output
corr

### Dummy Variables

In [None]:
# Family
## Family has 4 levels; one indicator column per level is generated and the
## last one (Family_4) is dropped, leaving 3 dummy variables.

familydummy = pd.get_dummies(bank['Family'], prefix='Family')
bank_family = pd.concat([bank, familydummy], axis=1).drop(columns=['Family_4'])

In [None]:
# Education
## Education has 3 levels; one indicator column per level is generated and the
## last one (Education_3) is dropped, leaving 2 dummy variables.

edudummy = pd.get_dummies(bank_family['Education'], prefix='Education')
bank_edu = pd.concat([bank_family, edudummy], axis=1).drop(columns=['Education_3'])

### Describe Cleaned Dataset

In [None]:
# Total number of rows and columns of the cleaned dataset, as a (rows, columns) tuple
bank_edu.shape

In [None]:
# Descriptive statistics of the cleaned dataset (dummy variables included)
bank_edu.describe()

In [None]:
# Column names of the cleaned dataset as a plain Python list
bank_edu.columns.tolist()

### Define Variables

In [None]:
# Two feature matrices are defined here:
## x     - used for KNN and Logistic Regression; Family and Education enter as dummy variables.
## xtree - used for the decision tree; Family and Education (and Area) enter as categorical columns.
## The target variable is 'Personal Loan'.

knn_lr_features = [
    'Age', 'Income', 'CCAvg', 'Mortgage',
    'Securities Account', 'CD Account', 'Online', 'CreditCard',
    'Family_1', 'Family_2', 'Family_3',
    'Education_1', 'Education_2',
]

tree_features = [
    'Age', 'Income', 'Family', 'CCAvg', 'Education', 'Mortgage',
    'Securities Account', 'CD Account', 'Online', 'CreditCard',
    'Area',
]

x = bank_edu[knn_lr_features]
xtree = bank_edu[tree_features]
y = bank_edu["Personal Loan"]

### Train/Test Data Split

In [None]:
# 70/30 train/test split of the dummy-encoded features (KNN and Logistic Regression).
# Stratifying on y keeps the class ratio the same in both parts.

from sklearn.model_selection import train_test_split

knn_lr_split = train_test_split(x, y, test_size=0.30, random_state=0, stratify=y)
x_train, x_test, y_train, y_test = knn_lr_split
x_train.describe()

In [None]:
# 70/30 train/test split of the categorical features (decision tree).
# Same test_size, random_state and stratification as the split above;
# y_train / y_test are reassigned here.

tree_split = train_test_split(xtree, y, test_size=0.30, random_state=0, stratify=y)
xtree_train, xtree_test, y_train, y_test = tree_split
xtree_train.describe()

## Modeling & Evaluation (Pre-Resample)

### Decision Tree

In [None]:
# Use GridSearch to find the best decision tree parameters, and fit the training data to apply to test data

from sklearn import tree
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Exhaustive search over split criterion, depth, number of features considered
# per split and leaf/split size limits, scored by 10-fold CV accuracy.
gstree = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),
                      param_grid=[{"criterion": ["gini", "entropy"],
                                   'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50],
                                   "max_features": list(range(1, 11)),
                                   'min_samples_leaf': [1, 2, 3, 4, 5],
                                   'min_samples_split': [2, 3, 4, 5]
                                   }],
                      cv=10,
                      n_jobs=4,
                      scoring='accuracy')
gstree_fit = gstree.fit(xtree_train, y_train)
y_pred_tree = gstree_fit.predict(xtree_test)

## best parameter
print(gstree.best_params_)

## best estimator
print("Tree parameters: \n", gstree_fit.best_estimator_)

## best score
print("Best score: ", gstree.best_score_)

## The overall accuracy on the training set:
print("Training score: ", gstree.score(xtree_train, y_train))

## The overall accuracy on the test set:
print("Test accuracy: ", gstree.score(xtree_test, y_test))

In [None]:
# Generalization performance of the tuned decision tree on the held-out test data

from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sklearn.metrics import confusion_matrix

# Share of test rows predicted correctly, and its complement
n_correct = (y_pred_tree == y_test).sum()
accuracy = n_correct / len(y_test)
error = 1 - accuracy
print("The predictive accuracy is: ", round(accuracy, 2))
print("The classification error is: ", round(error, 2))
print(classification_report(y_test, y_pred_tree))

# Confusion matrix (rows: actual class, columns: predicted class)
cnf_matrix = pd.DataFrame(confusion_matrix(y_test, y_pred_tree),
                          index=['Actual 0', 'Actual 1'],
                          columns=['Predict 0', 'Predict 1'])
print("The Confusion matrix: \n", cnf_matrix)

In [None]:
# Graph for the best decision tree model

from sklearn import tree
import graphviz
from sklearn.tree import export_graphviz

# Hyper-parameters copied from the grid-search result above. Only non-default
# values are passed: `presort` and `min_impurity_split` were removed from
# DecisionTreeClassifier in later scikit-learn releases, and the remaining
# arguments of the original call were library defaults.
model = DecisionTreeClassifier(criterion='gini',
                               max_depth=20,
                               max_features=10,
                               min_samples_leaf=1,
                               min_samples_split=3,
                               random_state=0)
model = model.fit(xtree_train, y_train)

# Feature names are taken from the training frame so they always match the
# columns the model was fitted on.
dot_data = tree.export_graphviz(model, out_file=None,
                                feature_names=list(xtree_train.columns),
                                class_names=['0', '1'],
                                filled=True,
                                rounded=True)

graph = graphviz.Source(dot_data)
graph

In [None]:
# Cross-validation accuracy of the decision tree: the grid search itself is
# re-run inside each of the 10 outer folds.

from sklearn.model_selection import cross_val_score

scores_tree = cross_val_score(gstree, xtree_train, y_train, cv=10, scoring='accuracy')
print('CV accuracy: %.3f +/- %.3f' % (scores_tree.mean(), scores_tree.std()))

### KNN

In [None]:
# Standardize x: the scaler learns mean/std from the training data only and the
# same transformation is then applied to the test data and to the full matrix.

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

x_train_std = sc.fit_transform(x_train)
x_test_std = sc.transform(x_test)
x_std = sc.transform(x)

In [None]:
# Use GridSearch to find the best KNN parameters, and fit the training data to apply to test data.

from sklearn import neighbors, datasets
from sklearn.model_selection import GridSearchCV

# Odd neighbourhood sizes 1..21, with uniform or distance-based vote weights
knn_param_grid = [{'n_neighbors': list(range(1, 22, 2)),
                   'weights': ['uniform', 'distance']}]
knn_estimator = neighbors.KNeighborsClassifier(p=2, metric='minkowski')

gsknn = GridSearchCV(estimator=knn_estimator,
                     param_grid=knn_param_grid,
                     scoring='accuracy',
                     cv=10,
                     n_jobs=4)

gsknn_fit = gsknn.fit(x_train_std, y_train)
y_pred_knn = gsknn_fit.predict(x_test_std)

## best parameter
print(gsknn.best_params_)

## best estimator
print("KNN parameters: \n", gsknn_fit.best_estimator_)

## best score
print("Best score: ", gsknn.best_score_)

## The overall accuracy on the training set:
print("Training score: ", gsknn.score(x_train_std, y_train))

## The overall accuracy on the test set:
print("Test accuracy: ", gsknn.score(x_test_std, y_test))

In [None]:
# Generalization performance of the tuned KNN model on the held-out test data

# Share of test rows predicted correctly, and its complement
n_correct = (y_pred_knn == y_test).sum()
accuracy = n_correct / len(y_test)
error = 1 - accuracy
print("The predictive accuracy is: ", round(accuracy, 2))
print("The classification error is: ", round(error, 2))
print(classification_report(y_test, y_pred_knn))

# Confusion matrix (rows: actual class, columns: predicted class)
cnf_matrix = pd.DataFrame(confusion_matrix(y_test, y_pred_knn),
                          index=['Actual 0', 'Actual 1'],
                          columns=['Predict 0', 'Predict 1'])
print("The Confusion matrix: \n", cnf_matrix)

In [None]:
# Cross-validation accuracy of the KNN model: the grid search is re-run inside
# each of the 10 outer folds on the standardized training data.

scores_knn = cross_val_score(gsknn, x_train_std, y_train, cv=10, scoring='accuracy')
print('CV accuracy: %.3f +/- %.3f' % (scores_knn.mean(), scores_knn.std()))

### Logistic Regression

In [None]:
# Use GridSearch to find the best logistic regression parameters, and fit the training data to apply to test data.

from sklearn.linear_model import LogisticRegression

# The grid includes the 'l1' penalty, which the default solver ('lbfgs' since
# scikit-learn 0.22) does not support. 'liblinear' handles both 'l1' and 'l2'
# and is the solver the ROC section uses for its logistic regression.
gslr = GridSearchCV(estimator=LogisticRegression(random_state=0, solver='liblinear'),
                    param_grid=[{'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100,
                                       1000, 10000, 100000, 1000000, 10000000],
                                 'penalty': ['l1', 'l2']}],
                    scoring='accuracy',
                    cv=10)

gslr_fit = gslr.fit(x_train, y_train)
y_pred_lr = gslr_fit.predict(x_test)

## best parameter
print(gslr.best_params_)

## best estimator
print("Logistic parameters: \n", gslr_fit.best_estimator_)

## best score
print("Best score: ", gslr.best_score_)

## The overall accuracy on the training set:
print("Training score: ", gslr.score(x_train, y_train))

## The overall accuracy on the test set:
print("Test accuracy: ", gslr.score(x_test, y_test))

In [None]:
# Generalization performance of the tuned logistic regression on the held-out test data

# Share of test rows predicted correctly, and its complement
n_correct = (y_pred_lr == y_test).sum()
accuracy = n_correct / len(y_test)
error = 1 - accuracy
print("The predictive accuracy is: ", round(accuracy, 2))
print("The classification error is: ", round(error, 2))
print(classification_report(y_test, y_pred_lr))

# Confusion matrix (rows: actual class, columns: predicted class)
cnf_matrix = pd.DataFrame(confusion_matrix(y_test, y_pred_lr),
                          index=['Actual 0', 'Actual 1'],
                          columns=['Predict 0', 'Predict 1'])
print("The Confusion matrix: \n", cnf_matrix)

In [None]:
# Cross validation performance for logistic regression: the grid search is
# re-run inside each of the 10 outer folds.

scores_lr = cross_val_score(gslr, x_train, y_train,
                            scoring='accuracy', cv=10)
# Mean and spread are both taken from the logistic-regression scores
# (the spread was previously read from the KNN scores by mistake).
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores_lr), np.std(scores_lr)))

### ROC Graph

In [None]:
# ROC graph containing roc curve for decision tree, KNN, and logistic regression models

np.random.seed(0)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# The hyper-parameters below are the ones hard-coded by the original author
# (presumably copied from the grid-search output). Only non-default values are
# passed: arguments such as `presort`, `min_impurity_split` and `multi_class`
# were removed or deprecated in later scikit-learn releases, and the remaining
# arguments of the original calls were library defaults.

## Decision Tree Classifier
clf1 = DecisionTreeClassifier(criterion='gini', max_depth=20, max_features=10,
                              min_samples_leaf=1, min_samples_split=3,
                              random_state=0)

## kNN Classifier
clf2 = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski',
                            weights='uniform')

## Logistic Regression Classifier
clf3 = LogisticRegression(C=1, penalty='l1', solver='liblinear',
                          random_state=0)

# Label the classifiers
clf_labels = ['Decision tree',  'KNN', 'Logistic regression',]
all_clf = [clf1, clf2, clf3]

# (train, test) feature matrices for each classifier, in the same order as all_clf:
# the tree uses the categorical features, KNN the standardized dummy-encoded
# features, logistic regression the raw dummy-encoded features.
roc_datasets = [(xtree_train, xtree_test),
                (x_train_std, x_test_std),
                (x_train, x_test)]

print('10-fold cross validation:\n')
for clf, label, (train_features, _) in zip(all_clf, clf_labels, roc_datasets):
    scores = cross_val_score(estimator=clf,  # Estimate AUC based on cross validation
                             X=train_features,
                             y=y_train,
                             cv=10,
                             scoring='roc_auc')
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]"  # Print performance statistics based on cross-validation
          % (scores.mean(), scores.std(), label))

colors = ['red', 'blue', 'green']      # Colors for visualization
linestyles = [':', '--', '-.']         # Line styles for visualization (one per classifier)
for clf, label, clr, ls, (train_features, test_features) in zip(
        all_clf, clf_labels, colors, linestyles, roc_datasets):

    # Probability of the positive class; assumes the positive class label is 1
    y_pred = clf.fit(train_features, y_train).predict_proba(test_features)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_true=y_test,  # Build ROC curve
                                     y_score=y_pred)
    roc_auc = auc(x=fpr, y=tpr)                # Compute Area Under the Curve (AUC)
    plt.plot(fpr, tpr,                         # Plot ROC Curve and create label with AUC values
             color=clr,
             linestyle=ls,
             label='%s (auc = %0.2f)' % (label, roc_auc))

plt.legend(loc='lower right')    # Where to place the legend
plt.plot([0, 1], [0, 1],  # Visualize random classifier
         linestyle='--',
         color='gray',
         linewidth=2)

plt.xlim([-0.1, 1.1])   # limits for x axis
plt.ylim([-0.1, 1.1])   # limits for y axis
plt.grid(alpha=0.5)
plt.xlabel('False positive rate (FPR)')
plt.ylabel('True positive rate (TPR)')


#plt.savefig('ROC_all_classifiers', dpi=300)
plt.show()

## Modeling & Evaluation (Resampling)

### Resampling

In [None]:
# The dataset is imbalanced: the rare class (Personal Loan == 1) is sampled with
# replacement up to the size of the larger class to reach a 50%:50% ratio.
# Only the training data is resampled; the test data is left untouched.

from sklearn.utils import resample

minority_rows = y_train == 1
n_majority = int((y_train == 0).sum())

## up-sampled minority rows of the dummy-encoded features (KNN and logistic regression)
x_train_resampled, y_train_resampled = resample(x_train[minority_rows],
                                                y_train[minority_rows],
                                                replace=True,
                                                n_samples=n_majority,
                                                random_state=0)

## up-sampled minority rows of the categorical features (decision tree);
## same random_state as above, y_train_resampled is reassigned here
xtree_train_resampled, y_train_resampled = resample(xtree_train[minority_rows],
                                                    y_train[minority_rows],
                                                    replace=True,
                                                    n_samples=n_majority,
                                                    random_state=0)

In [None]:
## Build the balanced training sets: up-sampled minority rows followed by all
## majority (Personal Loan == 0) rows, for both feature matrices and the target.

majority_rows = y_train == 0

x_train_zero = x_train[majority_rows]
xtree_train_zero = xtree_train[majority_rows]
y_train_zero = y_train[majority_rows]

x_train_resample = pd.concat([x_train_resampled, x_train_zero])
xtree_train_resample = pd.concat([xtree_train_resampled, xtree_train_zero])
y_train_resample = pd.concat([y_train_resampled, y_train_zero])

### Decision Tree (Resampling)

In [None]:
# Use GridSearch to find the best decision tree parameters after resampling, and fit the training data to apply to test data
from sklearn import tree
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Same search space as for the non-resampled tree, scored by 10-fold CV accuracy
gstree_resample = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),
                               param_grid=[{"criterion": ["gini", "entropy"],
                                            'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50],
                                            "max_features": list(range(1, 11)),
                                            'min_samples_leaf': [1, 2, 3, 4, 5],
                                            'min_samples_split': [2, 3, 4, 5]
                                            }],
                               cv=10,
                               n_jobs=4,
                               scoring='accuracy')
gstree_resample_fit = gstree_resample.fit(xtree_train_resample, y_train_resample)
y_pred_tree_resample = gstree_resample_fit.predict(xtree_test)

## best parameter
print(gstree_resample.best_params_)

## best estimator
print("Tree parameters: \n", gstree_resample_fit.best_estimator_)

## best score
print("Best score: ", gstree_resample.best_score_)

## The overall accuracy on the training set:
print("Training score: ", gstree_resample.score(xtree_train_resample, y_train_resample))

## The overall accuracy on the test set:
print("Test accuracy: ", gstree_resample.score(xtree_test, y_test))

In [None]:
# Refit a decision tree on the resampled training data with hard-coded
# hyper-parameters (presumably the grid-search result) to inspect its feature
# importances. Only non-default values are passed: `presort` and
# `min_impurity_split` were removed from DecisionTreeClassifier in later
# scikit-learn releases, and the other original arguments were defaults.
# NOTE: this rebinds `tree`, which earlier cells import as the sklearn.tree
# module; the name is kept because the feature-importance cell reads
# tree.feature_importances_, and the later graph cell re-imports the module.
tree = DecisionTreeClassifier(criterion='entropy',
                              max_depth=20,
                              max_features=10,
                              min_samples_leaf=1,
                              min_samples_split=2,
                              random_state=0).fit(xtree_train_resample, y_train_resample)
tree.feature_importances_

In [None]:
# Column names of the resampled tree training set
xtree_train_resample.columns

In [None]:
# Pair every tree feature with the importance the fitted tree assigned to it
importance_columns = {
    'features': xtree_train_resample.columns.tolist(),
    'importance': tree.feature_importances_.tolist(),
}
feature_importance = pd.DataFrame(importance_columns)

In [None]:
# Bar chart of the feature importances, most important feature first
tmp = feature_importance.sort_values(by='importance', ascending=False)
tmp.plot(x='features', y='importance', kind='bar', color='black', legend=None)
plt.xlabel('Features')
plt.ylabel('Importance')

In [None]:
# Generalization performance of the decision tree trained on resampled data, measured on the test data

from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sklearn.metrics import confusion_matrix

# Share of test rows predicted correctly, and its complement
n_correct = (y_pred_tree_resample == y_test).sum()
accuracy = n_correct / len(y_test)
error = 1 - accuracy
print("The predictive accuracy is: ", round(accuracy, 2))
print("The classification error is: ", round(error, 2))
print(classification_report(y_test, y_pred_tree_resample))

# Confusion matrix (rows: actual class, columns: predicted class)
cnf_matrix = pd.DataFrame(confusion_matrix(y_test, y_pred_tree_resample),
                          index=['Actual 0', 'Actual 1'],
                          columns=['Predict 0', 'Predict 1'])
print("The Confusion matrix: \n", cnf_matrix)

In [None]:
# Where to save the figures
PROJECT_ROOT_DIR = "."


def image_path(fig_id):
    """Return the path of `fig_id` inside the project's "images" directory."""
    return os.path.join(PROJECT_ROOT_DIR, "images", fig_id)


def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as images/<fig_id>.png at 300 dpi.

    fig_id       -- file name without extension.
    tight_layout -- when true, call plt.tight_layout() before saving.
    """
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    target_file = image_path(fig_id) + ".png"
    # Create the images directory on first use; savefig does not create it.
    os.makedirs(os.path.dirname(target_file), exist_ok=True)
    plt.savefig(target_file, format='png', dpi=300)

In [None]:
# Graph for the best decision tree model (trained on the resampled data)

from sklearn import tree
import graphviz
from sklearn.tree import export_graphviz

# Hyper-parameters hard-coded by the original author (presumably the
# grid-search result). Only non-default values are passed: `presort` and
# `min_impurity_split` were removed from DecisionTreeClassifier in later
# scikit-learn releases, and the other original arguments were defaults.
model = DecisionTreeClassifier(criterion='entropy',
                               max_depth=20,
                               max_features=10,
                               min_samples_leaf=1,
                               min_samples_split=2,
                               random_state=0)
model = model.fit(xtree_train_resample, y_train_resample)

# export_graphviz returns the DOT source only when out_file is None (with a
# path it writes the file and returns None), so the source is requested as a
# string and written to images/of_tree.dot explicitly.
dot_data = tree.export_graphviz(model, out_file=None,
                                feature_names=list(xtree_train_resample.columns),
                                class_names=['0', '1'],
                                filled=True,
                                rounded=True)

dot_file_path = image_path("of_tree.dot")
# The images directory may not exist yet
os.makedirs(os.path.dirname(dot_file_path), exist_ok=True)
with open(dot_file_path, "w", encoding="utf-8") as dot_file:
    dot_file.write(dot_data)

graph = graphviz.Source(dot_data)

In [None]:
# Converting .dot file to PNG Example: Run command "dot -Tpng of_tree.dot -o of_tree.png" in the terminal after installing graphviz package
# and making sure you are in the right directory (same directory as the .dot file)
# Image was never imported in this notebook; it comes from IPython.display.
from IPython.display import Image

path_png = os.path.join(PROJECT_ROOT_DIR, "images", "of_tree.png")
Image(filename=path_png)

In [None]:
# Cross Validation score for decision tree after resampling

from sklearn.model_selection import cross_val_score

# Uses the grid search defined for the resampled data (gstree_resample), in
# line with the KNN and logistic regression resampling cells, rather than the
# pre-resampling gstree object.
scores_tree_resample = cross_val_score(gstree_resample, xtree_train_resample, y_train_resample,
                                       scoring='accuracy', cv=10)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores_tree_resample), np.std(scores_tree_resample)))

### KNN (Resampling)

In [None]:
# Standardize x after resampling: the existing scaler `sc` is refitted on the
# resampled training data, and the test data is re-transformed with the new
# statistics (x_test_std is overwritten).

x_train_resample_std = sc.fit_transform(x_train_resample)
x_test_std = sc.transform(x_test)

In [None]:
# Use GridSearch to find the best KNN parameters after resampling, and fit the training data to apply to test data

# Odd neighbourhood sizes 1..21, with uniform or distance-based vote weights
knn_resample_grid = [{'n_neighbors': list(range(1, 22, 2)),
                      'weights': ['uniform', 'distance']}]
knn_resample_estimator = neighbors.KNeighborsClassifier(p=2, metric='minkowski')

gsknn_resample = GridSearchCV(estimator=knn_resample_estimator,
                              param_grid=knn_resample_grid,
                              scoring='accuracy',
                              cv=10,
                              n_jobs=4)

gsknn_resample_fit = gsknn_resample.fit(x_train_resample_std, y_train_resample)
y_pred_knn_resample = gsknn_resample_fit.predict(x_test_std)

## best parameter
print(gsknn_resample.best_params_)

## best estimator
print("KNN parameters: \n", gsknn_resample_fit.best_estimator_)

## best score
print("Best score: ", gsknn_resample.best_score_)

## The overall accuracy on the training set:
print("Training score: ", gsknn_resample.score(x_train_resample_std, y_train_resample))

## The overall accuracy on the test set:
print("Test accuracy: ", gsknn_resample.score(x_test_std, y_test))

In [None]:
# Generalization performance of the KNN model trained on resampled data, measured on the test data

from sklearn.metrics import confusion_matrix

# Share of test rows predicted correctly, and its complement
n_correct = (y_pred_knn_resample == y_test).sum()
accuracy = n_correct / len(y_test)
error = 1 - accuracy
print("The predictive accuracy is: ", round(accuracy, 2))
print("The classification error is: ", round(error, 2))
print(classification_report(y_test, y_pred_knn_resample))

# Confusion matrix (rows: actual class, columns: predicted class)
cnf_matrix = pd.DataFrame(confusion_matrix(y_test, y_pred_knn_resample),
                          index=['Actual 0', 'Actual 1'],
                          columns=['Predict 0', 'Predict 1'])
print("The Confusion matrix: \n", cnf_matrix)

In [None]:
# Cross-validation score for KNN after resampling: the grid search is re-run
# inside each of the 10 outer folds on the standardized resampled data.

scores_knn_resample = cross_val_score(gsknn_resample, x_train_resample_std, y_train_resample,
                                      cv=10, scoring='accuracy')
print('CV accuracy: %.3f +/- %.3f' % (scores_knn_resample.mean(), scores_knn_resample.std()))

### Logistic Regression (Resampling)

In [None]:
# Use GridSearch to find the best logistic regression parameters after resampling, and fit the training data to apply to test data.

# The grid includes the 'l1' penalty, which the default solver ('lbfgs' since
# scikit-learn 0.22) does not support. 'liblinear' handles both 'l1' and 'l2'
# and is the solver the ROC section uses for its logistic regression.
gslr_resample = GridSearchCV(estimator=LogisticRegression(random_state=0, solver='liblinear'),
                             param_grid=[{'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100,
                                                1000, 10000, 100000, 1000000, 10000000],
                                          'penalty': ['l1', 'l2']}],
                             scoring='accuracy',
                             cv=10)

gslr_resample_fit = gslr_resample.fit(x_train_resample, y_train_resample)
y_pred_lr_resample = gslr_resample_fit.predict(x_test)

## best parameter
print(gslr_resample.best_params_)

## best estimator
print("Logistic parameters: \n", gslr_resample_fit.best_estimator_)

## best score
print("Best score: ", gslr_resample.best_score_)

## The overall accuracy on the training set:
print("Training score: ", gslr_resample.score(x_train_resample, y_train_resample))

## The overall accuracy on the test set:
print("Test accuracy: ", gslr_resample.score(x_test, y_test))

In [None]:
# Generalization performance of the logistic regression trained on resampled data, measured on the test data

from sklearn.metrics import confusion_matrix

# Share of test rows predicted correctly, and its complement
n_correct = (y_pred_lr_resample == y_test).sum()
accuracy = n_correct / len(y_test)
error = 1 - accuracy
print("The predictive accuracy is: ", round(accuracy, 2))
print("The classification error is: ", round(error, 2))
print(classification_report(y_test, y_pred_lr_resample))

# Confusion matrix (rows: actual class, columns: predicted class)
cnf_matrix = pd.DataFrame(confusion_matrix(y_test, y_pred_lr_resample),
                          index=['Actual 0', 'Actual 1'],
                          columns=['Predict 0', 'Predict 1'])
print("The Confusion matrix: \n", cnf_matrix)

In [None]:
# Cross-validation score for logistic regression after resampling: the grid
# search is re-run inside each of the 10 outer folds.

scores_lr_resample = cross_val_score(gslr_resample, x_train_resample, y_train_resample,
                                     cv=10, scoring='accuracy')
print('CV accuracy: %.3f +/- %.3f' % (scores_lr_resample.mean(), scores_lr_resample.std()))

### ROC Graph (Resampling)

In [None]:
# ROC graph containing roc curve for decision tree, KNN, and logistic regression models after resampling

np.random.seed(0)

# The hyper-parameters below are the ones hard-coded by the original author
# (presumably copied from the grid-search output). Only non-default values are
# passed: arguments such as `presort`, `min_impurity_split` and `multi_class`
# were removed or deprecated in later scikit-learn releases, and the remaining
# arguments of the original calls were library defaults.

## Decision Tree Classifier
clf1_resample = DecisionTreeClassifier(criterion='entropy', max_depth=20, max_features=10,
                                       min_samples_leaf=1, min_samples_split=2,
                                       random_state=0)

## kNN Classifier
clf2_resample = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski',
                                     weights='uniform')

## Logistic Regression Classifier
clf3_resample = LogisticRegression(C=0.1, penalty='l1', solver='liblinear',
                                   random_state=0)

# Label the classifiers
clf_labels = ['Decision tree',  'KNN', 'Logistic regression',]
all_clf = [clf1_resample, clf2_resample, clf3_resample]

# (train, test) feature matrices for each classifier, in the same order as all_clf:
# the tree uses the categorical features, KNN the standardized dummy-encoded
# features, logistic regression the raw dummy-encoded features.
roc_datasets_resample = [(xtree_train_resample, xtree_test),
                         (x_train_resample_std, x_test_std),
                         (x_train_resample, x_test)]

print('10-fold cross validation:\n')
for clf, label, (train_features, _) in zip(all_clf, clf_labels, roc_datasets_resample):
    scores = cross_val_score(estimator=clf,  # Estimate AUC based on cross validation
                             X=train_features,
                             y=y_train_resample,
                             cv=10,
                             scoring='roc_auc')
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]"  # Print performance statistics based on cross-validation
          % (scores.mean(), scores.std(), label))

colors = ['red', 'blue', 'green']      # Colors for visualization
linestyles = [':', '--', '-.']         # Line styles for visualization (one per classifier)
for clf, label, clr, ls, (train_features, test_features) in zip(
        all_clf, clf_labels, colors, linestyles, roc_datasets_resample):

    # Probability of the positive class; assumes the positive class label is 1
    y_pred = clf.fit(train_features, y_train_resample).predict_proba(test_features)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_true=y_test,  # Build ROC curve
                                     y_score=y_pred)
    roc_auc = auc(x=fpr, y=tpr)                # Compute Area Under the Curve (AUC)
    plt.plot(fpr, tpr,                         # Plot ROC Curve and create label with AUC values
             color=clr,
             linestyle=ls,
             label='%s (auc = %0.2f)' % (label, roc_auc))

plt.legend(loc='lower right')    # Where to place the legend
plt.plot([0, 1], [0, 1],  # Visualize random classifier
         linestyle='--',
         color='gray',
         linewidth=2)

plt.xlim([-0.1, 1.1])   # limits for x axis
plt.ylim([-0.1, 1.1])   # limits for y axis
plt.grid(alpha=0.5)
plt.xlabel('False positive rate (FPR)')
plt.ylabel('True positive rate (TPR)')

#plt.savefig('ROC_all_classifiers', dpi=300)
plt.show()