In [1]:
import warnings
# Install a global "ignore" filter that matches every Warning subclass.
# NOTE(review): this also hides any diagnostics emitted by the modelling cells
# further down (e.g. from scikit-learn) -- consider narrowing the category.
warnings.filterwarnings(action="ignore", category=Warning)

In [2]:
import pandas as pd
from sklearn.metrics import (
        make_scorer,
        confusion_matrix, 
        cohen_kappa_score, 
        accuracy_score, 
        precision_score, 
        recall_score, 
        f1_score, 
        roc_auc_score
)
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier # decision trees for classification
from sklearn.neural_network import  MLPClassifier # neural networks for classification
from sklearn.naive_bayes import GaussianNB # naive bayes for classification
from sklearn.svm import SVC # support vector machines for classification

In [3]:
def specificity_score(y_true, y_pred):
    """Return the specificity (true-negative rate) of binary predictions.

    Specificity is ``TN / (TN + FP)``, taken from the first row of the
    confusion matrix, so the first label in the matrix ordering is treated
    as the negative class.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    float
        Fraction of negative samples predicted as negative. Returns ``0.0``
        when ``y_true`` contains no negative sample (the ratio would
        otherwise be 0/0 and silently evaluate to NaN).

    Raises
    ------
    ValueError
        If the labels found in ``y_true`` / ``y_pred`` do not produce a 2x2
        confusion matrix (i.e. the problem is not binary).
    """
    matrix = confusion_matrix(y_true, y_pred)
    if matrix.shape != (2, 2):
        # Previously surfaced as an opaque tuple-unpacking ValueError.
        raise ValueError(
            "specificity_score expects exactly two classes, got a confusion "
            f"matrix of shape {matrix.shape}"
        )

    tn, fp = matrix[0]
    negatives = tn + fp
    if negatives == 0:
        # No negative ground-truth samples: avoid a silent NaN from 0/0.
        return 0.0
    return tn / negatives

In [4]:
# Scorers handed to cross_validate; each key shows up as a `test_<key>` column
# in the returned scores.
METRICS = {
    "accuracy": make_scorer(accuracy_score),
    "precision": make_scorer(precision_score),
    "recall": make_scorer(recall_score),
    "f1": make_scorer(f1_score),
    # Use the built-in scorer name rather than
    # make_scorer(roc_auc_score, needs_proba=True): the `needs_proba` argument
    # was deprecated in scikit-learn 1.4 and dropped in later releases, where
    # it raises a TypeError. The named scorer works across versions and ranks
    # samples by continuous scores (decision_function or predict_proba).
    "AUC": "roc_auc",
    "specificity": make_scorer(specificity_score),
    "kappa": make_scorer(cohen_kappa_score),
}

In [5]:
# Read heart.csv from the current working directory and display the frame.
d = pd.read_csv("heart.csv")
d

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [15]:
# Show the dtype of each column in the loaded frame.
d.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [6]:
# Separate the label column from the predictors.
y = d["target"]
X = d.drop(columns="target")

In [7]:
# Display the feature frame (every column of `d` except "target").
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2


In [8]:
# Display the "target" label series.
y

0       0
1       0
2       0
3       0
4       0
       ..
1020    1
1021    0
1022    0
1023    1
1024    0
Name: target, Length: 1025, dtype: int64

In [9]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
colunas = X.columns

# Learn the scaling parameters from every feature column, then overwrite the
# columns of X with their rescaled values.
# NOTE(review): the scaler is fitted on all rows before the cross-validation
# cells run, so each fold's held-out rows contribute to the scaling
# statistics -- consider fitting it per fold inside a Pipeline.
scaler.fit(X[colunas])
X[colunas] = scaler.transform(X[colunas])
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,-0.307692,0.0,-0.5,-0.25,-0.437500,0.0,0.0,0.470588,0.0,0.111111,1.0,2.0,1.0
1,-0.230769,0.0,-0.5,0.50,-0.578125,1.0,-1.0,0.088235,1.0,1.277778,-1.0,0.0,1.0
2,1.076923,0.0,-0.5,0.75,-1.031250,0.0,0.0,-0.794118,1.0,1.000000,-1.0,0.0,1.0
3,0.384615,0.0,-0.5,0.90,-0.578125,0.0,0.0,0.264706,0.0,-0.444444,1.0,1.0,1.0
4,0.461538,-1.0,-0.5,0.40,0.843750,1.0,0.0,-1.352941,0.0,0.611111,0.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,0.230769,0.0,0.0,0.50,-0.296875,0.0,0.0,0.352941,1.0,-0.444444,1.0,0.0,0.0
1021,0.307692,0.0,-0.5,-0.25,0.281250,0.0,-1.0,-0.323529,1.0,1.111111,0.0,1.0,1.0
1022,-0.692308,0.0,-0.5,-1.00,0.546875,0.0,-1.0,-1.000000,1.0,0.111111,0.0,1.0,0.0
1023,-0.461538,-1.0,-0.5,-1.00,0.218750,0.0,-1.0,0.205882,0.0,-0.444444,1.0,0.0,0.0


In [10]:
# Shared 10-fold stratified splitter with shuffling; the fixed random_state
# makes the fold assignment repeatable across the model cells that use it.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

In [11]:
# Decision tree capped at depth 3, cross-validated on the shared `splitter`
# defined in the previous cell (the identical re-definition that used to live
# here was redundant; the other model cells already reuse the shared one).
dt = DecisionTreeClassifier(max_depth=3, random_state=1234)
scores = cross_validate(dt, X, y, cv=splitter, scoring=METRICS)
dt_scores = pd.DataFrame(scores)

# One-row summary: mean of every timing / metric column across the folds.
pd.DataFrame(dt_scores.mean()).T

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,0.002119,0.004765,0.839006,0.822185,0.883926,0.849045,0.902075,0.791796,0.677076


In [12]:
# Multi-layer perceptron with two hidden layers of 50 units each.
# NOTE(review): max_iter=20 is low and likely stops before convergence; any
# resulting warning would be hidden by the global warnings filter -- confirm
# this cap is intentional.
nn = MLPClassifier(
    hidden_layer_sizes=(50, 50),
    max_iter=20,
    random_state=1234,
)
scores_nn = cross_validate(
    estimator=nn,
    X=X,
    y=y,
    cv=splitter,
    scoring=METRICS,
)
nn_scores = pd.DataFrame(data=scores_nn)

# One-row summary: mean of every timing / metric column across the folds.
pd.DataFrame(data=nn_scores.mean()).T

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,0.073629,0.005868,0.862383,0.842295,0.906749,0.87209,0.928015,0.815306,0.72369


In [13]:
# Gaussian naive Bayes with default settings, scored on the shared folds.
nb = GaussianNB()
scores_nb = cross_validate(
    estimator=nb,
    X=X,
    y=y,
    cv=splitter,
    scoring=METRICS,
)
nb_scores = pd.DataFrame(data=scores_nb)

# One-row summary: mean of every timing / metric column across the folds.
pd.DataFrame(data=nb_scores.mean()).T

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,0.001901,0.005922,0.825262,0.81465,0.861176,0.835833,0.902805,0.787224,0.649421


In [14]:
# Support vector classifier; probability=True enables predict_proba so that
# probability-based scorers can be computed for it.
svm = SVC(
    probability=True,
    random_state=1234,
)
scores_svm = cross_validate(
    estimator=svm,
    X=X,
    y=y,
    cv=splitter,
    scoring=METRICS,
)
svm_scores = pd.DataFrame(data=scores_svm)

# One-row summary: mean of every timing / metric column across the folds.
pd.DataFrame(data=svm_scores.mean()).T

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,0.095202,0.011518,0.91501,0.898175,0.946662,0.920688,0.969369,0.881388,0.829375
