In [11]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, auc
from sklearn.preprocessing import OrdinalEncoder
from sklearn.neural_network import MLPClassifier

In [2]:
# Load the heart-disease table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='heart.csv')

In [3]:
# Display the loaded frame (pandas abbreviates the rendering to the first and last rows).
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


Tento dataset obsahuje data o lidech vyšetřovaných na klinikách kvůli srdečním nemocem. Data obsahují atributy jako
hladinu cholesterolu, hladinu cukru v krvi nalačno, klidový krevní tlak a další atributy podobného rázu. Nejdůležitější je atribut 'HeartDisease',
který budu zkoumat (predikovat).

In [59]:
# Summary statistics; with no arguments describe() reports the numeric columns only.
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [4]:
# Summarise the non-numeric columns: count, number of unique values, most frequent value and its frequency.
df.describe(exclude=np.number)

Unnamed: 0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope
count,918,918,918,918,918
unique,2,4,3,2,3
top,M,ASY,Normal,N,Flat
freq,725,496,552,547,460


In [5]:
# Summarise the numeric columns explicitly (count, mean, std, min, quartiles, max).
df.describe(include=np.number)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [6]:
# Count NaN entries per column and list the columns with the most missing values first.
df.isna().sum().sort_values(ascending=False)

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

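Dataset neobsahuje žádné chybějící (NaN) hodnoty, takže nebudeme muset nic dropovat. Z výstupu `describe()` je ale vidět, že `RestingBP` a `Cholesterol` mají minimum 0, což pravděpodobně značí chybějící měření zakódovaná nulou.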

In [30]:
# Class balance of the target: number of rows per HeartDisease value.
df['HeartDisease'].value_counts()

1    508
0    410
Name: HeartDisease, dtype: int64

O našem datasetu se dá říct, že je zhruba vyvážený, protože třídy predikovaného atributu `HeartDisease` jsou v poměru 508 : 410, tedy přibližně 5 : 4.

In [8]:
# One-hot encode every non-numeric column; numeric columns (including the target) pass through unchanged.
dummies = pd.get_dummies(data=df)
dummies

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,0,1,0,...,0,0,0,1,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,1,0,0,...,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,0,1,0,...,0,0,0,0,1,1,0,0,0,1
3,48,138,214,0,108,1.5,1,1,0,1,...,0,0,0,1,0,0,1,0,1,0
4,54,150,195,0,122,0.0,0,0,1,0,...,1,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,1,0,1,0,...,0,1,0,1,0,1,0,0,1,0
914,68,144,193,1,141,3.4,1,0,1,1,...,0,0,0,1,0,1,0,0,1,0
915,57,130,131,0,115,1.2,1,0,1,1,...,0,0,0,1,0,0,1,0,1,0
916,57,130,236,0,174,0.0,1,1,0,0,...,0,0,1,0,0,1,0,0,1,0


Všechny nenumerické atributy se musí zakódovat na numerické (one-hot kódování pomocí funkce 'pd.get_dummies'), aby s nimi mohl následně klasifikační algoritmus pracovat.

In [9]:
# Separate the target column from the encoded feature matrix.
X = dummies.drop(columns='HeartDisease')
y = dummies['HeartDisease']
X.shape, y.shape

((918, 20), (918,))

Dataset si rozdělím na cílový sloupec, který zkoumám (v mém případě HeartDisease), a na všechny ostatní sloupce, které slouží jako vstupní příznaky.

In [43]:
# Sanity check before modelling: feature matrix and target must have the same number of rows.
X.shape, y.shape

((918, 20), (918,))

In [90]:
# Estimate how well a single decision tree predicts HeartDisease using
# stratified 40-fold cross-validation, scoring every held-out fold with F1.
skf = StratifiedKFold(n_splits=40, shuffle=True, random_state=10)
scores = []
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # Seed the tree as well: scikit-learn permutes the features at every split,
    # so without random_state the fitted tree (and the scores) vary between runs.
    clf = DecisionTreeClassifier(random_state=10)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    scores.append(f1_score(y_test, y_pred))

# Only one model configuration is evaluated, so there is no "best" candidate to
# pick: the summary below is simply taken over the F1 scores of all folds.
best_scores = scores

print(f'Průměrné skóre je {np.mean(best_scores)}, Min: {np.min(best_scores)}, Max: {np.max(best_scores)} při algoritmu'
     f' DecisionTreeClassifier  ')

Průměrné skóre je 0.8254023773855108, Min: 0.5833333333333334, Max: 0.962962962962963 při algoritmu DecisionTreeClassifier  


Při klasifikaci pomocí klasifikátoru DecisionTreeClassifier jsem dosáhl průměrného F1 skóre predikce srdeční nemoci přibližně
82,5 %. Použil jsem 40 splitů, což je celkem hodně: každý testovací fold má jen asi 23 záznamů, a proto se nejhorší a nejlepší výsledek liší o skoro
40 procentních bodů (58 % vs. 96 %).

In [92]:
# Sweep the number of trees of a random forest (n_estimators = 1..99) and keep
# the setting with the highest mean F1 over a stratified 5-fold cross-validation.
# NOTE(review): the winning setting is selected on the same folds that produce
# the reported score, so the printed mean is an optimistic estimate.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

result = float('-inf')  # best mean F1 so far; -inf guarantees the first setting is recorded
result_score = []       # per-fold F1 scores of the best setting
index_inner = 0         # n_estimators of the best setting
scores_field = []       # per-fold F1 scores of every setting, in sweep order

for i in range(1, 100):
    scores = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        clf = RandomForestClassifier(n_estimators=i, random_state=10)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        scores.append(f1_score(y_test, y_pred))

    scores_field.append(scores)
    mean_score = np.mean(scores)
    # Strict comparison: on ties the smaller forest found first is kept.
    if result < mean_score:
        result = mean_score
        result_score = scores
        index_inner = i

print(f'Průměrné skóre je {np.mean(result_score)}, Min: {np.min(result_score)}, Max: {np.max(result_score)}, '
      f' počtem estimátorů {index_inner}')

Průměrné skóre je 0.8904233666448329, Min: 0.8585365853658536, Max: 0.9320388349514563,  počtem estimátorů 91


Při klasifikaci pomocí klasifikátoru RandomForestClassifier jsem dosáhl nejlepšího průměrného F1 skóre predikce srdeční nemoci 89 % s počtem estimátorů 91.

In [58]:
# Try every activation/solver combination of a small MLP (hidden layers 20 and 5)
# and keep the one with the highest mean F1 over a stratified 5-fold cross-validation.
# NOTE(review): the features are fed to the network unscaled; scikit-learn's MLP
# guide recommends standardising inputs - worth confirming the effect on these scores.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
activ = ['relu', 'identity', 'logistic', 'tanh']
solve = ['sgd', 'adam']

# Best combination so far as [activation, solver, mean F1, min F1, max F1];
# an empty list means nothing has been evaluated yet.
result = []
for acti in activ:
    for solv in solve:
        scores = []
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            clf = MLPClassifier(
                hidden_layer_sizes=(20, 5),
                activation=acti,
                solver=solv,
                max_iter=100000,
                random_state=13,
            )
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            scores.append(f1_score(y_test, y_pred))

        # The first combination is always recorded; later ones only on a strictly better mean.
        if not result or result[2] < np.mean(scores):
            result = [acti, solv, np.mean(scores), np.min(scores), np.max(scores)]

print(result)

['relu', 'adam', 0.8780162741165458, 0.84, 0.8975609756097561]


Při klasifikaci pomocí MLPClassifier jsem dosáhl nejlepšího průměrného F1 skóre 87,8 % s
parametry solver = adam a activation = relu.

Závěrem bych dodal, že nejvyšší dosažená úspěšnost odhadu srdeční nemoci v tomhle projektu je průměrně 89 % pomocí klasifikátoru random forest. Úspěšnost zhruba devět lidí z deseti mi přijde jako solidní. Pro výpočet úspěšnosti jsem použil f1_score, což je harmonický průměr precision a recall; bere tedy v potaz true positive, false positive a false negative, ale ne true negative.