In [None]:
# STAT8017 Data mining techniques – Group project
# Data Analysis of Cardiovascular Disease Dataset
#------------------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
from itertools import product

# data transformation & splitting
from sklearn.preprocessing import RobustScaler, label_binarize
from sklearn.model_selection import train_test_split, GridSearchCV

# decision tree, logistic
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
from sklearn.linear_model import LogisticRegressionCV

# clustering analysis
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
import scipy.cluster.hierarchy as sch
from sklearn.mixture import GaussianMixture
from sklearn.metrics import jaccard_score, adjusted_rand_score, silhouette_score, calinski_harabasz_score, roc_curve, auc, accuracy_score, classification_report, confusion_matrix
from sklearn.metrics.cluster import contingency_matrix

# ensemble methods, MLP
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
#------------------------------------------------------------------------------------------   
# Input data files are available in the read-only "../input/" directory
import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the semicolon-separated training file; the `id` column becomes the row index
df = pd.read_csv(
    '/kaggle/input/cardiovascular-disease-dataset/cardio_train.csv',
    sep=';',
    index_col='id',
)

# Render floats with thousands separators and two decimals, then preview the first rows
pd.options.display.float_format = '{:,.2f}'.format
display(df.head())

# Data Cleaning

### Check missing values:

In [None]:
# count missing values per column
df.isna().sum()

**None of the variables have missing values.**

### Examining the variables:

In [None]:
# summary statistics of every column before any cleaning is applied
display(df.describe())

**Age : Converting to years for ease of understanding.**

In [None]:
# convert age from days to years (365-day year)
# NOTE: this overwrites `age` in place, so re-running the cell divides again
df['age'] = df['age'].div(365)

**Gender: Converting female to 0 and male to 1**

In [None]:
# shift the gender code down by one so that 0=female and 1=male
# NOTE: overwrites `gender` in place, so re-running the cell shifts again
df['gender'] = df['gender'].sub(1)

**Height and Weight : Using BMI as an indicator to remove records that do not make sense.**

In [None]:
# BMI = weight / height^2, with height converted from cm to m
df['BMI'] = df['weight'] / (df['height'] / 100) ** 2

# summary statistics and histogram of the BMI values
print(df['BMI'].describe())
seaborn.histplot(data=df, x='BMI', bins=100)
plt.show()

In [None]:
# index labels of the records whose BMI exceeds 150
drop_criteria_bmi = df.index[df['BMI'] > 150]

# how many records will be removed
print(drop_criteria_bmi.size)

# drop them from df in place
df.drop(drop_criteria_bmi, inplace=True)

**AP_HI and AP_LO : Blood pressure should always be positive and within a plausible range (AP_HI between 60 and 210, AP_LO between 30 and 140). AP_HI > AP_LO should also be enforced.**

In [None]:
# ap_hi is higher than 210 or lower than 60
drop_criteria_aphi = df[(df['ap_hi'] > 210) | (df['ap_hi'] < 60)].index

# ap_lo is higher than 140 or lower than 30
drop_criteria_aplo = df[(df['ap_lo'] > 140) | (df['ap_lo'] < 30)].index

# ap_lo is higher than ap_hi
drop_criteria_ap = df[df['ap_lo'] > df['ap_hi']].index

# number of records to be removed
# Index.union returns a new Index rather than modifying in place, so every
# union must be assigned; otherwise the ap_lo > ap_hi records are never dropped.
drop_criteria = (
    drop_criteria_aphi
    .union(drop_criteria_aplo)
    .union(drop_criteria_ap)
)
print(drop_criteria.size)

In [None]:
# remove the blood-pressure records collected in `drop_criteria` (mutates df in place)
df.drop(drop_criteria, inplace = True)

### Data cleaning result:

In [None]:
# summary statistics after the cleaning steps
display(df.describe())

# class counts of the response variable `cardio`
display(df['cardio'].value_counts().to_frame())

**The data is balanced. There is a fairly even split between individuals with the disease and without the disease.**

# Visualizing Variables

In [None]:
def pie_chart(df, col, labels):
    """Draw a pie chart of the category counts of `df[col]`.

    Parameters
    ----------
    df : DataFrame holding the column to summarise.
    col : name of the categorical column; also used as the legend title.
    labels : slice labels, given in ascending order of the category codes
        (e.g. ['No', 'Yes'] for codes 0 and 1).

    Each slice is annotated with its absolute count and its percentage.
    """
    # value_counts() orders by frequency, which would pair the labels with the
    # wrong slices whenever a higher code is the more frequent one.
    # Sorting by index restores code order so counts line up with `labels`.
    data = df[col].value_counts().sort_index().to_numpy()

    def absolute_value(val):
        # `val` is the slice percentage supplied by matplotlib; recover the count
        a = np.round(val / 100 * data.sum(), 0)
        return str('%0.0f' % a) + '\n(' + ('%0.2f' % val) + '%)'

    plt.pie(data, labels=labels, autopct=absolute_value)
    plt.legend(title=col)
    plt.show()

# one pie chart per categorical variable, in the same order as before
yes_no = ['No', 'Yes']
three_levels = ['Normal', 'Above normal', 'Well above normal']
for column, slice_labels in [
    ('gender', ['Female', 'Male']),
    ('cardio', yes_no),
    ('cholesterol', three_levels),
    ('gluc', three_levels),
    ('smoke', yes_no),
    ('alco', yes_no),
    ('active', yes_no),
]:
    pie_chart(df, column, slice_labels)

In [None]:
# numeric columns to inspect for outliers
df_subset = df[['age', 'height', 'weight', 'ap_hi', 'ap_lo']]

# outlier markers: light-blue filled circles
flierprops = {'markerfacecolor': 'lightblue', 'marker': 'o', 'markeredgecolor': 'lightblue'}
# box styling: `color` is the outline colour, `facecolor` the fill colour
boxprops = {'facecolor': 'lightblue', 'color': 'lightblue'}

plt.figure(figsize=(10, 5))
plt.boxplot(
    df_subset.values,
    labels=df_subset.columns,
    flierprops=flierprops,
    boxprops=boxprops,
    patch_artist=True,
)
plt.show()

# the same data drawn with the pandas boxplot helper
df_subset.boxplot()

In [None]:
from sklearn.preprocessing import QuantileTransformer

# quantile-transform the numeric columns and plot a 5-bin histogram of each
quantile_transformer = QuantileTransformer(random_state=0)
X_trans = quantile_transformer.fit_transform(df_subset)
transformed = pd.DataFrame(X_trans, columns=df_subset.columns)
transformed.hist(bins=5)
plt.tight_layout()

In [None]:
# NOTE(review): this cell repeats the matplotlib boxplot of `df_subset` drawn earlier
# outlier markers: light-blue filled circles
flierprops = {'markerfacecolor': 'lightblue', 'marker': 'o', 'markeredgecolor': 'lightblue'}
# box styling: `color` is the outline colour, `facecolor` the fill colour
boxprops = {'facecolor': 'lightblue', 'color': 'lightblue'}

plt.figure(figsize=(10, 5))
plt.boxplot(
    df_subset.values,
    labels=df_subset.columns,
    flierprops=flierprops,
    boxprops=boxprops,
    patch_artist=True,
)
plt.show()

In [None]:
%%time

# pair-plot of the numeric variables, coloured by the response `cardio`
seaborn.pairplot(
    df,
    vars=['age', 'height', 'weight', 'ap_hi', 'ap_lo'],
    hue='cardio',
)
plt.show()

# Pairplots by gender

In [None]:
# one pair-plot per gender code (0 first, then 1), coloured by `cardio`
for gender_code in (0, 1):
    seaborn.pairplot(
        df[df.gender == gender_code],
        vars=['age', 'height', 'weight', 'ap_hi', 'ap_lo'],
        hue='cardio',
    )
    plt.show()

# Correlation heatmap

In [None]:
# correlation heatmap of all columns in df, each cell annotated to 3 decimals
plt.figure(figsize=(16, 8))
seaborn.heatmap(df.corr(), annot=True, fmt='.3f')

# Data Transformation & Train-Test Splitting 

In [None]:
# explanatory variables: every column except the response and the derived BMI
x = df.drop(['cardio', 'BMI'], axis=1)

# response variable
y = df.loc[:, 'cardio']

In [None]:
# hold out 40% of the records for testing
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=8017
)

# RobustScaler on ALL variables: fitted on the training split only,
# then applied to both splits while keeping index and column labels
scaler = RobustScaler().fit(x_train)
x_train = pd.DataFrame(scaler.transform(x_train), index=x_train.index, columns=x_train.columns)
x_test = pd.DataFrame(scaler.transform(x_test), index=x_test.index, columns=x_test.columns)

# show four decimals from here on and inspect the scaled training data
pd.options.display.float_format = '{:,.4f}'.format
display(x_train.head(5))
display(x_train.describe())

# Decision Tree

In [None]:
%%time

# candidate tree depths: 2 through 31
parameters = {'max_depth': range(2, 32)}

# cross-validated grid search over the depths; keep the refitted best tree
DecisionTree_GSCV = GridSearchCV(
    DecisionTreeClassifier(random_state=8017),
    parameters,
    n_jobs=-1,
    verbose=3,
    return_train_score=True,
)
DecisionTree_model = DecisionTree_GSCV.fit(x_train, y_train).best_estimator_

In [None]:
# mean cross-validation scores per candidate depth
train_dt_scores = DecisionTree_GSCV.cv_results_['mean_train_score']
test_dt_scores = DecisionTree_GSCV.cv_results_['mean_test_score']

# position and value of the best mean validation score
best_dt_pos = np.argmax(test_dt_scores)
best_dt_score = np.max(test_dt_scores)

plt.plot(train_dt_scores, "g.--")
plt.plot(test_dt_scores, "g.-")
plt.ylim(0.4, 1.05)
# positions 0..29 correspond to max_depth 2..31
plt.xticks(range(30), range(2, 32))
plt.legend(["DT training score", "DT test score"])
plt.axvline(best_dt_pos, linestyle="dotted", color="red")
plt.annotate(
    best_dt_score.round(4),
    (best_dt_pos, best_dt_score),
    xycoords="data",
    xytext=(50, 25),
    textcoords="offset pixels",
    arrowprops=dict(facecolor="black", shrink=0.1),
    fontsize=10,
    horizontalalignment="center",
    verticalalignment="top",
)
plt.show()

In [None]:
# chosen hyper-parameters, then accuracy on each split
print(DecisionTree_model.get_params())
for split_name, X_part, y_part in (("Training", x_train, y_train), ("Testing", x_test, y_test)):
    print(f"{split_name} Score: {round(DecisionTree_model.score(X_part, y_part),4)}")

In [None]:
# feature importances of the selected tree, largest first
importances = DecisionTree_model.feature_importances_
d = {'feature importance': list(importances)}
table = pd.DataFrame(d, index=x_train.columns)

ranked = table.sort_values(by='feature importance', ascending=False)
display(ranked)

# Logistic Regression

In [None]:
%%time

# logistic regression with built-in 5-fold CV over 50 candidate values of C
Logistic_model = LogisticRegressionCV(Cs=50, cv=5, random_state=8017).fit(x_train, y_train)
print(Logistic_model.get_params())

In [None]:
# regularization candidates that were searched
print('Candidates of Regularization Parameter C:')
print(Logistic_model.Cs_, '\n')

# accuracy on each split
for split_name, X_part, y_part in (("Training", x_train, y_train), ("Testing", x_test, y_test)):
    print(f"{split_name} Score: {round(Logistic_model.score(X_part, y_part),4)}")

In [None]:
# selected C and intercept
print(f'Best Regularization Parameter C = {round(Logistic_model.C_[0],4)}')
print(f'intercept = {round(Logistic_model.intercept_[0],4)}')

# coefficient table; the helper column `absolute` is used only for sorting
coefficients = Logistic_model.coef_[0]
d = {
    'estimates': list(coefficients),
    'absolute': np.abs(list(coefficients)),
}
table = pd.DataFrame(d, index=x_train.columns)

by_magnitude = table.sort_values(by='absolute', ascending=False)
display(by_magnitude.drop(columns='absolute'))

# Cluster Analysis
**Using the two most important features: `age` and `ap_hi`**

In [None]:
# keep only the first 2500 training records (by position) due to computation limit
sX = x_train[['age', 'ap_hi']].iloc[:2500].to_numpy()
sY = y_train.iloc[:2500].to_numpy()

### K-means

In [None]:
# function to plot decision boundary
def plot_decision_boundary(x, y, model, title):
    """Shade the label `model` predicts over a 2-D grid and overlay the data points.

    x     : array whose first two columns are the plotted coordinates.
    y     : values used to colour the scattered points.
    model : fitted object exposing `predict` on two-column input.
    title : figure title.
    """
    step = 0.02   # grid resolution
    margin = 0.1  # padding around the data range

    first, second = x[:, 0], x[:, 1]
    xx, yy = np.meshgrid(
        np.arange(first.min() - margin, first.max() + margin, step),
        np.arange(second.min() - margin, second.max() + margin, step),
    )

    # predicted label for every grid point, reshaped back onto the grid
    grid_points = np.column_stack([xx.ravel(), yy.ravel()])
    Z = model.predict(grid_points).reshape(xx.shape)

    # coloured background first, then the observations on top
    plt.clf()
    plt.imshow(
        Z,
        interpolation='nearest',
        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
        cmap=plt.cm.Paired,
        aspect='auto',
        origin='lower',
    )
    plt.scatter(x[:, 0:1], x[:, 1:2], c=y, edgecolors='k')
    plt.title(title, fontsize=20)

In [None]:
# K-means with random initial centroids
kmean1 = KMeans(n_clusters=2, init='random', random_state=0).fit(sX)
plot_decision_boundary(sX, sY, kmean1, "Prediction Boundary of K-Means")
# mark the two fitted centroids with red stars
plt.plot(kmean1.cluster_centers_[:, 0], kmean1.cluster_centers_[:, 1],
         '*', markersize=20, color="red")

In [None]:
# K-means with k-means++ initialisation
kmean2 = KMeans(n_clusters=2, init='k-means++', random_state=0).fit(sX)
plot_decision_boundary(sX, sY, kmean2, "Prediction Boundary of K-Means ++")
# mark the two fitted centroids with red stars
plt.plot(kmean2.cluster_centers_[:, 0], kmean2.cluster_centers_[:, 1],
         '*', markersize=20, color="red")

In [None]:
# NOTE(review): cluster ids are arbitrary, so 0/1 may be swapped relative to `cardio`.
# The accuracies below are only meaningful if the ids happen to line up; confirm
# before interpreting them.

# training accuracy scores
kmeans1_pred = kmean1.predict(sX) # K-means random
kmeans2_pred = kmean2.predict(sX) # K-means++
print('K-means(random) training accuracy: ', accuracy_score(sY, kmeans1_pred))
print('K-means++ training accuracy: ', accuracy_score(sY, kmeans2_pred), '\n')

# testing accuracy scores
# The models were fitted on a plain ndarray, so predict on an ndarray as well;
# passing the DataFrame makes scikit-learn warn about unexpected feature names.
x_test_2d = x_test[['age', 'ap_hi']].to_numpy()
kmeans1_pred_test = kmean1.predict(x_test_2d) # K-means random
kmeans2_pred_test = kmean2.predict(x_test_2d) # K-means++
print('K-means(random) testing accuracy: ', accuracy_score(y_test, kmeans1_pred_test))
print('K-means++ testing accuracy: ', accuracy_score(y_test, kmeans2_pred_test))

### Agglomerative Clustering

In [None]:
# function to plot dendrogram
def plot_dendrogram(model, **kwargs): # provided by Mathew Kallada.
    """Draw a dendrogram from a fitted model that exposes `children_`.

    The model's merge distances are not used: merge k is simply given height k,
    and the cluster-size column is filled with 2, 3, ... as a placeholder.
    Extra keyword arguments are forwarded to `scipy.cluster.hierarchy.dendrogram`.
    """
    merges = model.children_
    n_merges = merges.shape[0]

    # uniform, increasing heights stand in for the real merge distances
    heights = np.arange(n_merges)

    # placeholder observation counts, one per merge
    sizes = np.arange(2, n_merges + 2)

    # linkage matrix columns: [child_a, child_b, height, size], as floats
    linkage_matrix = np.column_stack([merges, heights, sizes]).astype(float)

    sch.dendrogram(linkage_matrix, **kwargs)

In [None]:
# two agglomerative models with 2 clusters each: Ward linkage and complete linkage
H_C_ward = AgglomerativeClustering(n_clusters=2) # no linkage argument is passed, so the library default is used
H_C_complete = AgglomerativeClustering(n_clusters=2, linkage='complete')

# fit on the first 250 sampled records only, so the dendrograms stay readable
hc_ward_pred = H_C_ward.fit_predict(sX[0:250])
hc_complete_pred = H_C_complete.fit_predict(sX[0:250])

# side-by-side dendrograms; each plot_dendrogram call draws on the subplot
# created just before it, so the statement order matters
fig = plt.figure(figsize=(25, 10))
ax = fig.add_subplot(1, 2, 1)
plot_dendrogram(H_C_ward)
ax.set_title('Linkage method is ward')

ax = fig.add_subplot(1, 2, 2)
# plot_dendrogram has no return statement, so Z2 is None
Z2 = plot_dendrogram(H_C_complete)
ax.set_title('Linkage method is complete')
plt.show()

In [None]:
# refit both hierarchical models on all 2500 sampled records and keep their labels
hc_ward_pred = H_C_ward.fit(sX).labels_
hc_complete_pred = H_C_complete.fit(sX).labels_

# agreement between the cluster ids and the true classes
for message, labels in (
    ("ward's linkage training accuracy: ", hc_ward_pred),
    ('complete linkage training accuracy: ', hc_complete_pred),
):
    print(message, accuracy_score(sY, labels))

### DBSCAN

In [None]:
# DBSCAN on the sampled records
dbscan = DBSCAN(eps=0.26, min_samples=20)
dbscan_pred = dbscan.fit_predict(sX)

# label -1 marks noise; count clusters without it and count the noise points
noise_mask = dbscan_pred == -1
n_clusters_ = len(set(dbscan_pred[~noise_mask]))
n_noise_ = int(noise_mask.sum())

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

In [None]:
# scatter of the sampled points, coloured by DBSCAN label
plt.scatter(sX[:, 0], sX[:, 1], c=dbscan_pred, edgecolors='k')
plt.show()

# accuracy score: DBSCAN labels compared directly with the true classes
# NOTE(review): labels include -1 for noise and are not tied to class codes, so
# this figure may not be a meaningful accuracy; confirm before relying on it
print("DBSCAN training accuracy: ", accuracy_score(sY, dbscan_pred))

### Gaussian Mixture

In [None]:
# two-component Gaussian mixture with full covariance matrices
gmm = GaussianMixture(
    n_components=2,
    covariance_type='full',
    max_iter=20,
    random_state=8017,
)
gmm.fit(sX)
plot_decision_boundary(sX, sY, gmm, "Gaussian Mixture")

In [None]:
# NOTE(review): component ids are arbitrary, so 0/1 may be swapped relative to
# `cardio`; confirm before interpreting these accuracies.

# training accuracy scores
gmm_pred = gmm.predict(sX)
print('Gaussian Mixture Model training accuracy: ', accuracy_score(sY, gmm_pred), '\n')

# testing accuracy scores
# The mixture was fitted on a plain ndarray, so predict on an ndarray as well;
# passing the DataFrame makes scikit-learn warn about unexpected feature names.
gmm_pred_test = gmm.predict(x_test[['age','ap_hi']].to_numpy())
print('Gaussian Mixture Model testing accuracy: ', accuracy_score(y_test, gmm_pred_test))

### Clustering Performance

In [None]:
# function to calculate entropy score
def get_entropy(y, pred, n_class):
    """Weighted entropy of the cluster assignment within each true class.

    A contingency table is built with one row per distinct value of `y` and one
    column per distinct value of `pred`. For every row, the entropy (natural log)
    of the row's distribution over the columns is computed; the rows are then
    averaged with weights proportional to the row totals.

    Parameters
    ----------
    y : 1-D array-like of true labels.
    pred : 1-D array-like of cluster labels (any integers, e.g. -1 for noise).
    n_class : kept for backward compatibility and no longer used. The table's
        own shape is used, so every cluster is counted even when the number of
        clusters differs from the number of classes.

    Returns
    -------
    numpy array of shape (1,) holding the entropy (callers index it with [0]).
    """
    # rows: sorted distinct true labels, columns: sorted distinct predicted labels
    tb = pd.crosstab(np.asarray(y), np.asarray(pred)).to_numpy().astype(float)

    # row-wise proportions p[i, j] = share of class i assigned to cluster j
    row_totals = tb.sum(axis=1, keepdims=True)
    p = tb / row_totals

    # p * log(p) with the convention 0 * log(0) = 0
    log_p = np.zeros_like(p)
    np.log(p, out=log_p, where=p > 0)
    E = -(p * log_p).sum(axis=1, keepdims=True)  # shape (n_rows, 1)

    # weight each row's entropy by that row's share of all records
    Entropy = np.dot(row_totals.ravel() / tb.sum(), E)
    return Entropy

In [None]:
# labels produced by each clustering method on the sampled records, in display order
cluster_preds = {
    'K-means (Random)': kmeans1_pred,
    'K-means (K-means++)': kmeans2_pred,
    'Dendrogram (Ward)': hc_ward_pred,
    'Dendrogram (Complete)': hc_complete_pred,
    'DBSCAN': dbscan_pred,
    'Gaussian Mixture Model': gmm_pred,
}

# one row per method, one column per score
result = pd.DataFrame({
    'Model': list(cluster_preds),
    'Training Accuracy': [accuracy_score(sY, labels) for labels in cluster_preds.values()],

    # external measurement (compares with the true classes)
    'Entropy': [get_entropy(sY, labels, 2)[0] for labels in cluster_preds.values()],
    'Adjusted Rand Index': [adjusted_rand_score(sY, labels) for labels in cluster_preds.values()],

    # internal measurement (uses only the features and the cluster labels)
    'Silhouette Coefficient': [silhouette_score(sX, labels) for labels in cluster_preds.values()],
    # ratio of between-cluster dispersion to within-cluster dispersion
    'Calinski Harabasz Score': [calinski_harabasz_score(sX, labels) for labels in cluster_preds.values()],
})
result

**K-means has the highest training accuracy.**

**Dendrogram (Complete) performs best on entropy, while K-means (Random) performs best on the Adjusted Rand Index.**

**Dendrogram (Complete) has the best silhouette coefficient, while K-means (K-means++) has the best Calinski-Harabasz score.**

# Ensemble Methods

### Bagging Classifier

In [None]:
%%time

# bagging ensemble of decision trees
bagging_estimator = BaggingClassifier(DecisionTreeClassifier(), random_state=8017)

# The wrapped tree is exposed as `estimator` in newer scikit-learn releases and
# as `base_estimator` in older ones; pick whichever name this version provides
# so the nested grid key is valid either way.
base_param = 'estimator' if 'estimator' in bagging_estimator.get_params(deep=False) else 'base_estimator'

# parameters candidates
parameters = {f'{base_param}__max_depth': [4,6,8,12,24],
              'n_estimators': [20, 50, 100, 200]}

# fitting
Bagging_GSCV = GridSearchCV(bagging_estimator,
                            parameters, n_jobs=-1, verbose=3, return_train_score=True)
Bagging_GSCV.fit(x_train, y_train)
Bagging_model = Bagging_GSCV.best_estimator_
Bagging_model

In [None]:
# mean cross-validation scores, one entry per grid candidate
train_bagging_scores = Bagging_GSCV.cv_results_['mean_train_score']
test_bagging_scores = Bagging_GSCV.cv_results_['mean_test_score']

# Label each x position with the candidate it represents. The positions index
# (max_depth, n_estimators) combinations rather than a single numeric parameter.
bagging_candidate_labels = [
    ", ".join(f"{name.split('__')[-1]}={value}" for name, value in candidate.items())
    for candidate in Bagging_GSCV.cv_results_['params']
]

plt.plot(train_bagging_scores, 'ro--')
plt.plot(test_bagging_scores, 'ro-')
plt.ylim(0.4, 1.05)
plt.xticks(range(len(bagging_candidate_labels)), bagging_candidate_labels, rotation=90)
plt.legend(["Bagging training score", "Bagging test score"])
# highlight the candidate with the best mean validation score
plt.axvline(np.argmax(test_bagging_scores), linestyle="dotted", color="red")
plt.annotate(np.max(test_bagging_scores).round(4), (np.argmax(test_bagging_scores), np.max(test_bagging_scores)), xycoords="data",
                 xytext=(-40, 30), textcoords="offset pixels", arrowprops=dict(facecolor="black", shrink=0.1), fontsize=10,
                 horizontalalignment="center", verticalalignment="top")
plt.show()

In [None]:
# chosen hyper-parameters, then accuracy on each split
print(Bagging_model.get_params())
for split_name, X_part, y_part in (("Training", x_train, y_train), ("Testing", x_test, y_test)):
    print(f"{split_name} Score: {round(Bagging_model.score(X_part, y_part),4)}")

### Random Forest Classifier

In [None]:
%%time

# candidate forest sizes and tree depths
parameters = {
    'n_estimators': [20, 50, 100, 200],
    'max_depth': [6, 8, 12, 24, 48],
}

# cross-validated grid search; keep the refitted best forest
RandomForest_GSCV = GridSearchCV(
    RandomForestClassifier(random_state=8017),
    parameters,
    n_jobs=-1,
    verbose=3,
)
RandomForest_model = RandomForest_GSCV.fit(x_train, y_train).best_estimator_
RandomForest_model

In [None]:
# chosen hyper-parameters, then accuracy on each split
print(RandomForest_model.get_params())
for split_name, X_part, y_part in (("Training", x_train, y_train), ("Testing", x_test, y_test)):
    print(f"{split_name} Score: {round(RandomForest_model.score(X_part, y_part),4)}")

### Adaboost

In [None]:
%%time

# AdaBoost ensemble of decision trees
adaboost_estimator = AdaBoostClassifier(DecisionTreeClassifier(), random_state=8017)

# The wrapped tree is exposed as `estimator` in newer scikit-learn releases and
# as `base_estimator` in older ones; pick whichever name this version provides
# so the nested grid key is valid either way.
base_param = 'estimator' if 'estimator' in adaboost_estimator.get_params(deep=False) else 'base_estimator'

# parameters candidates
parameters = {f'{base_param}__max_depth': [2,3,4,6,8,12],
              'n_estimators': [20, 50, 100, 200]}

# fitting
Adaboost_GSCV = GridSearchCV(adaboost_estimator,
                             parameters, n_jobs=-1, verbose=3)
Adaboost_GSCV.fit(x_train, y_train)
Adaboost_model = Adaboost_GSCV.best_estimator_
Adaboost_model

In [None]:
# chosen hyper-parameters, then accuracy on each split
print(Adaboost_model.get_params())
for split_name, X_part, y_part in (("Training", x_train, y_train), ("Testing", x_test, y_test)):
    print(f"{split_name} Score: {round(Adaboost_model.score(X_part, y_part),4)}")

### Gradient Boosting

In [None]:
%%time

# only the tree depth is searched; ensemble size and learning rate are fixed
parameters = {
    'max_depth': [2, 4, 6, 8, 10, 12],
    'n_estimators': [200],
    'learning_rate': [0.01],
}

# cross-validated grid search; keep the refitted best model
GradientBoost_GSCV = GridSearchCV(
    GradientBoostingClassifier(random_state=8017),
    parameters,
    n_jobs=-1,
    verbose=3,
)
GradientBoost_model = GradientBoost_GSCV.fit(x_train, y_train).best_estimator_
GradientBoost_model

In [None]:
# chosen hyper-parameters, then accuracy on each split
print(GradientBoost_model.get_params())
for split_name, X_part, y_part in (("Training", x_train, y_train), ("Testing", x_test, y_test)):
    print(f"{split_name} Score: {round(GradientBoost_model.score(X_part, y_part),4)}")

# Support Vector Machine

### Linear SVC

In [None]:
%%time

# 50 log-spaced candidates for C between 1e-4 and 1e4
parameters = {'C': np.logspace(-4, 4, 50)}

# cross-validated grid search; keep the refitted best model
LinearSVC_GSCV = GridSearchCV(
    LinearSVC(dual=False, random_state=8017),
    parameters,
    n_jobs=-1,
    verbose=3,
)
LinearSVC_model = LinearSVC_GSCV.fit(x_train, y_train).best_estimator_
LinearSVC_model


In [None]:
# chosen hyper-parameters, then accuracy on each split
print(LinearSVC_model.get_params())
for split_name, X_part, y_part in (("Training", x_train, y_train), ("Testing", x_test, y_test)):
    print(f"{split_name} Score: {round(LinearSVC_model.score(X_part, y_part),4)}")

### Non-Linear SVC

In [None]:
%%time

# RBF-kernel SVC fitted once, without a hyper-parameter search
SVC_model = SVC(kernel='rbf', random_state=8017).fit(x_train, y_train)
SVC_model

In [None]:
%%time

# hyper-parameters in use, then accuracy on each split
print(SVC_model.get_params())
for split_name, X_part, y_part in (("Training", x_train, y_train), ("Testing", x_test, y_test)):
    print(f"{split_name} Score: {round(SVC_model.score(X_part, y_part),4)}")

# MLP

In [None]:
# hidden-layer candidates: one-layer sizes, then every two- and three-layer combination
one_layer = [4, 8, 16, 32]
two_layers = list(product([2, 4, 8], [4, 8]))
three_layers = list(product([2, 4, 8], [8, 16], [4, 8]))
ls = one_layer + two_layers + three_layers
ls

In [None]:
%%time

# search over the hidden-layer layouts listed in `ls`
parameters = {'hidden_layer_sizes': ls}

# cross-validated grid search; keep the refitted best network
MLP_GSCV = GridSearchCV(
    MLPClassifier(random_state=8017),
    parameters,
    n_jobs=-1,
    verbose=3,
)
MLP_model = MLP_GSCV.fit(x_train, y_train).best_estimator_
MLP_model

In [None]:
# the ten best-ranked MLP candidates from the grid search
mlp_cv_table = pd.DataFrame(MLP_GSCV.cv_results_)
mlp_cv_table.sort_values(by='rank_test_score').iloc[:10]

In [None]:
# selected network, then accuracy on each split
print(MLP_model)
for split_name, X_part, y_part in (("Training", x_train, y_train), ("Testing", x_test, y_test)):
    print(f"{split_name} Score: {round(MLP_model.score(X_part, y_part),4)}")

# Compare All Models

In [None]:
# fitted classifiers in display order
fitted_models = {
    'Decision Tree': DecisionTree_model,
    'Logistic': Logistic_model,
    'Bagging': Bagging_model,
    'Random Forest': RandomForest_model,
    'Adaboost': Adaboost_model,
    'Gradient Boost': GradientBoost_model,
    'Linear SVC': LinearSVC_model,
    'Non-linear SVC': SVC_model,
    'MLP': MLP_model,
}

# test-set accuracy of every model, best first
result = pd.DataFrame({
    'Model': list(fitted_models),
    'Prediction Accuracy': [model.score(x_test, y_test) for model in fitted_models.values()],
})
result.sort_values('Prediction Accuracy', ascending=False)

# Classification Report & Confusion Matrix

In [None]:
def plot_confusion_matrix(classifier, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print accuracy figures and a classification report, then plot the
    test-set confusion matrix of `classifier`.

    The classifier must expose `predict_proba`; the predicted label is the
    column index with the highest probability. The global train/test splits
    (`x_train`, `y_train`, `x_test`, `y_test`) are used.

    classes   : display names for the labels, in column order.
    normalize : when True, plot row-normalised proportions instead of counts.
    title     : plot title.
    cmap      : colour map for the matrix image.
    """
    # most probable class index per record, for each split
    train_labels = np.argmax(classifier.predict_proba(x_train), 1)
    test_labels = np.argmax(classifier.predict_proba(x_test), 1)

    acc_train = accuracy_score(y_train, train_labels)
    acc_test = accuracy_score(y_test, test_labels)
    print("Training ACC:", round(acc_train, 4), "Testing ACC:", round(acc_test, 4))

    cm = confusion_matrix(y_test, test_labels)
    print("Confusion matrix: \n", cm)
    print("Testing:\n",classification_report(y_test, test_labels, target_names=classes))

    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        # divide every row by its total so each row sums to 1
        cm = cm.astype('float') / cm.sum(axis=1, keepdims=True)
        print("Normalized confusion matrix")

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # write each cell's value, in white on dark cells and black on light ones
    cell_format = '.2f' if normalize else 'd'
    cutoff = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            text_colour = "white" if cm[row, col] > cutoff else "black"
            plt.text(col, row, format(cm[row, col], cell_format),
                     horizontalalignment="center",
                     color=text_colour)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# report and plot for the tuned random forest (raw counts, since normalize is left at False)
plot_confusion_matrix(RandomForest_model, classes=['No','Yes'],
                      title='Confusion matrix of Random Forest')

In [None]:
# report and plot for the tuned decision tree (raw counts, since normalize is left at False)
plot_confusion_matrix(DecisionTree_model, classes=['No','Yes'],
                      title='Confusion matrix of Decision Tree')

In [None]:
# report and plot for the logistic regression (raw counts, since normalize is left at False)
plot_confusion_matrix(Logistic_model, classes=['No','Yes'],
                      title='Confusion matrix of Logistic Regression')

# ROC Curve

In [None]:
# binarised test labels, read by plot_roc_curve
yy_test = label_binarize(y_test, classes=[0, 1])
# No figure is created here. With the inline backend a figure opened in this
# definition-only cell is rendered empty and closed before any curve is drawn,
# so it cannot size the ROC plot. Create the figure in the cell that plots.
def plot_roc_curve(classifier, label):
    """Add the test-set ROC curve of `classifier` to the current axes.

    classifier : fitted model exposing `predict_proba`; the second probability
        column is used as the score.
    label : legend text; the area under the curve is appended to it.

    Reads the globals `x_test` and `yy_test`. Nothing is returned and the
    figure is not shown, so several curves can share one plot.
    """
    pred_test = classifier.predict_proba(x_test)
    fpr, tpr, _ = roc_curve(yy_test, pred_test[:, 1])
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, label=label+(' (area = %0.4f)' % roc_auc))
def show_roc_curve():
    """Finish the ROC plot on the current axes and display it.

    Adds the dashed diagonal reference line, fixes the axis limits,
    sets axis labels, title and legend, then shows the figure.
    """
    ax = plt.gca()
    ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC curve')
    ax.legend(loc="lower right")
    plt.show()

In [None]:
# Create the figure in the same cell that draws the curves, so the requested
# size applies to the ROC plot itself.
plt.figure(figsize=(20, 20))
plot_roc_curve(RandomForest_model, label='Random Forest')
plot_roc_curve(DecisionTree_model, label='Decision Tree')
plot_roc_curve(Logistic_model, label='Logistic Regression')
show_roc_curve()