In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from matplotlib import pyplot as plt

In [9]:
data = pd.read_csv('heart.csv')

In [10]:
data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [11]:
# Разделяем на тренировочные и тестовые данные

data_train, data_test, y_train, y_test = train_test_split(data.drop(columns='HeartDisease'), data['HeartDisease'], test_size=0.20, random_state=42)

In [12]:
# Находим категориальные признаки

categorials = list(data.dtypes[data.dtypes == object].index)

categorials

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

In [13]:
# Отфильтровываем непрерывные признаки
integer = [f for f in data if f not in (categorials + ['HeartDisease'])]

In [14]:
# Создаем дамми-переменные для категорий
dummy_train = pd.get_dummies(data_train[categorials], columns=categorials)
dummy_test = pd.get_dummies(data_test[categorials], columns=categorials)

dummy_cols = list(set(dummy_train) & set(dummy_test))

dummy_train = dummy_train[dummy_cols]
dummy_test = dummy_test[dummy_cols]

X_train = pd.concat([data_train[integer], dummy_train], axis=1)

X_test = pd.concat([data_test[integer], dummy_test], axis=1)

In [15]:
X_train.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,ST_Slope_Down,ChestPainType_NAP,ST_Slope_Flat,RestingECG_ST,ChestPainType_ATA,ChestPainType_TA,RestingECG_LVH,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Up,Sex_M,ChestPainType_ASY,RestingECG_Normal,Sex_F
795,42,120,240,1,194,0.8,1,1,0,0,0,0,0,1,0,0,1,0,1,0
25,36,130,209,0,178,0.0,0,1,0,0,0,0,0,1,0,1,1,0,1,0
84,56,150,213,1,125,1.0,0,0,1,0,0,0,0,0,1,0,1,1,1,0
10,37,130,211,0,142,0.0,0,1,0,0,0,0,0,1,0,1,0,0,1,1
344,51,120,0,1,104,0.0,0,0,1,0,0,0,0,1,0,0,1,1,1,0


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate

In [17]:
# Обучаем логистическую регрессию

lr = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
lr.fit(X_train, y_train)

lr.score(X_test, y_test)

0.8532608695652174

In [18]:
# Подсчитаем основные метрики модели 

cross_validate(lr, X_test, y_test, cv=10, scoring=['accuracy', 'recall', 'precision', 'f1'])

{'fit_time': array([0.01900315, 0.01300049, 0.01199961, 0.01002979, 0.01099944,
        0.00999379, 0.01004529, 0.0099988 , 0.00994754, 0.01100469]),
 'score_time': array([0.01899409, 0.00600505, 0.00700045, 0.00696397, 0.00600505,
        0.00700784, 0.00598788, 0.00598502, 0.00500107, 0.00699854]),
 'test_accuracy': array([0.89473684, 0.78947368, 0.84210526, 0.78947368, 0.77777778,
        1.        , 0.83333333, 0.72222222, 0.83333333, 0.83333333]),
 'test_recall': array([1.        , 0.90909091, 0.81818182, 0.81818182, 0.8       ,
        1.        , 0.8       , 0.81818182, 0.81818182, 0.90909091]),
 'test_precision': array([0.84615385, 0.76923077, 0.9       , 0.81818182, 0.8       ,
        1.        , 0.88888889, 0.75      , 0.9       , 0.83333333]),
 'test_f1': array([0.91666667, 0.83333333, 0.85714286, 0.81818182, 0.8       ,
        1.        , 0.84210526, 0.7826087 , 0.85714286, 0.86956522])}

In [27]:
# Осуществим подбор параметров при помощи GridSearchCV

from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Определяем сеть параметров

grid={'C':[0.1, 0.5, 1, 5, 10, 50, 100],\
      'penalty':['l1','l2','none'],\
      'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag'],\
      'max_iter':[1000, 5000, 10000]}

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [28]:
# Определяем гиперпараметры

lr_cv = GridSearchCV(LogisticRegression(), grid, cv=10, n_jobs=-1)
lr_cv.fit(X_train, y_train)

print('best parameters: ', lr_cv.best_params_)
print('accuracy :', lr_cv.best_score_)

best parameters:  {'C': 0.5, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'newton-cg'}
accuracy : 0.8663643095149943


In [30]:
# Попробуем разные модели

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV

In [31]:
models=[
      {'name':'SVC',"model": SVC(), 'params':{'kernel':['linear', 'poly', 'rbf', 'sigmoid'], 'gamma':['scale', 'auto']}},
      {'name':'RF',"model": RandomForestClassifier(), 'params':{'n_estimators':[10,25,50,100,150,200], 'criterion':['gini', 'entropy'], 'max_depth':[3,5,7,9,11]}},
      {'name':'KN',"model": KNeighborsClassifier(), 'params':{'n_neighbors':list(range(1,30)),'weights': ['uniform', 'distance'], 'p':[1,2,3]}},
      {'name':'DT',"model": DecisionTreeClassifier(), 'params':{'criterion':['gini', 'entropy'], 'max_depth':[3,5,7,9,11]}}

]

res=[]
for v in  models:
    res.append((v['name'], RandomizedSearchCV(v['model'], v['params'], cv=10).fit(X_train, y_train)))

In [32]:
for r in res:
    print(r[0], r[1].best_score_, r[1].best_params_)

SVC 0.8677156608663458 {'kernel': 'rbf', 'gamma': 'scale'}
RF 0.8745834875971863 {'n_estimators': 50, 'max_depth': 7, 'criterion': 'entropy'}
KN 0.8758793039614957 {'weights': 'distance', 'p': 1, 'n_neighbors': 25}
DT 0.8432062199185488 {'max_depth': 3, 'criterion': 'entropy'}


In [None]:
# Лучше всех сработала модель KNN-классификатор, почти также "Случайный лес"