In [160]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [161]:
df = pd.read_csv('heart14.csv')
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [162]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [163]:
for col in df.select_dtypes(include=['object']).columns:
    print(col)
    print(df[col].value_counts())
    print('-'*20)


Sex
Sex
M    725
F    193
Name: count, dtype: int64
--------------------
ChestPainType
ChestPainType
ASY    496
NAP    203
ATA    173
TA      46
Name: count, dtype: int64
--------------------
RestingECG
RestingECG
Normal    552
LVH       188
ST        178
Name: count, dtype: int64
--------------------
ExerciseAngina
ExerciseAngina
N    547
Y    371
Name: count, dtype: int64
--------------------
ST_Slope
ST_Slope
Flat    460
Up      395
Down     63
Name: count, dtype: int64
--------------------


In [164]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [165]:
X = df.drop('HeartDisease', axis=1)   
y = df['HeartDisease']

In [166]:
categorial = OneHotEncoder(
    sparse=False, 
    handle_unknown='ignore' 
    ).fit_transform(X.select_dtypes(include=['object']))

In [167]:
scaled = StandardScaler().fit_transform(X.select_dtypes(exclude=['object']))

In [168]:
X = np.concatenate([categorial, scaled], axis=1)

In [169]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2, 
    random_state=1589
)


In [170]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

In [171]:
models = {}

In [172]:
def print_scores(key=None):
    print(f'{"Модель":20s}\taccuracy\trecall\tprecision\tf1')
    for name, score  in models.items():
        if key is None or name == key:     
            print(
                f'{name:20s}\t{score["accuracy"]:.3f}\t\t'
                f'{score["recall"]:.3f}\t{score["precision"]:.3f}\t\t'
                f'{score["fi"]:.3f}',
            )

In [173]:
def add_score(name, accuracy, recall, precision, fi):
    models[name] = {
        'accuracy': accuracy,
        'recall': recall,
        'precision': precision,
        'fi': fi
    }
    print_scores(name)

In [174]:
name = 'LR без CV'
lr = LogisticRegression(random_state=1589)
lr.fit(X_train, y_train)
add_score(
    name, 
    lr.score(X_test, y_test), 
    recall_score(lr.predict(X_test), y_test), 
    precision_score(lr.predict(X_test), y_test), 
    f1_score(lr.predict(X_test), y_test)
)



Модель              	accuracy	recall	precision	f1
LR без CV           	0.875		0.871	0.939		0.904


In [175]:
scoring = ['accuracy','recall', 'f1', 'precision']

In [176]:
name = 'LR с CV'
scores = cross_validate(
    LogisticRegression(
        random_state=1589
    ),
    X_train,
    y_train,
    cv=10,
    scoring= scoring,
)

In [177]:
add_score(
    name, 
    scores['test_accuracy'].mean(),
    scores['test_recall'].mean(),
    scores['test_precision'].mean(),
    scores['test_f1'].mean()
)

Модель              	accuracy	recall	precision	f1
LR с CV             	0.864		0.883	0.866		0.873


In [178]:
from sklearn.model_selection import GridSearchCV

name  =  'GridSearchCV'
param = {
    'penalty': ['l1', 'l2', None],
    'max_iter': [100, 200, 300],
    'C': [0.1,  0.2,  0.3,  0.4,  0.5, 1],
}
grid = GridSearchCV(
    LogisticRegression(random_state=1589),
    param_grid= param,
    cv=10,
    refit = 'f1',
    scoring=scoring
).fit(X_train, y_train)

In [179]:
add_score(
    name,
    grid.cv_results_['mean_test_accuracy'][grid.best_index_],
    grid.cv_results_['mean_test_recall'][grid.best_index_],
    grid.cv_results_['mean_test_precision'][grid.best_index_],
    grid.best_score_
)
grid.best_params_

Модель              	accuracy	recall	precision	f1
GridSearchCV        	0.866		0.880	0.872		0.875


{'C': 0.1, 'max_iter': 100, 'penalty': 'l2'}

In [180]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

model_param = [
    [
        'LR Random', 
        LogisticRegression(random_state=1589),
        {
            'penalty': ['l1', 'l2', None],
            'max_iter': [100, 200, 300],
            'C': [0.1,  0.2,  0.3,  0.4,  0.5, 1],
        }
    ],
    [
        'DT Random', 
        DecisionTreeClassifier(random_state=1589),
        {           
            'criterion': ['gini', 'entropy', 'log_loss'],
            'max_depth': [5, 10, None],
            'min_samples_leaf': [1, 3, 5]
        }
    ],
    [
        'KNN Random',
        KNeighborsClassifier(), 
        {
            'n_neighbors': [3, 5, 10],
            'algorithm': ['ball_tree', 'kd_tree', 'brute'],
            'weights': ['uniform',  'distance']
        }
    ],
    [
        'SVM Random',
        SVC(random_state=1589),
        {
            'kernel': ['linear',  'poly',  'rbf', 'sigmoid'],
            'degree': [2, 3, 4],
            'C': [0.1,  0.2,  0.3,  0.4,  0.5, 1],
        },
    ]
]

In [181]:
for name, model, model_par  in model_param:
    grid = RandomizedSearchCV(
        model,
        model_par,
        cv=10,
        refit = 'f1',
        scoring=scoring
    ).fit(X_train, y_train)
    add_score(
    name,
    grid.cv_results_['mean_test_accuracy'][grid.best_index_],
    grid.cv_results_['mean_test_recall'][grid.best_index_],
    grid.cv_results_['mean_test_precision'][grid.best_index_],
    grid.best_score_
    )

Модель              	accuracy	recall	precision	f1
LR Random           	0.866		0.880	0.872		0.875
Модель              	accuracy	recall	precision	f1
DT Random           	0.832		0.832	0.857		0.842
Модель              	accuracy	recall	precision	f1
KNN Random          	0.876		0.891	0.880		0.885
Модель              	accuracy	recall	precision	f1
SVM Random          	0.875		0.898	0.874		0.885


In [182]:
add_score('Best DZ ', 0.88, 0.85, 0.89, 0.86)

Модель              	accuracy	recall	precision	f1
Best DZ             	0.880		0.850	0.890		0.860


# Итог

In [183]:
print_scores()

Модель              	accuracy	recall	precision	f1
LR без CV           	0.875		0.871	0.939		0.904
LR с CV             	0.864		0.883	0.866		0.873
GridSearchCV        	0.866		0.880	0.872		0.875
LR Random           	0.866		0.880	0.872		0.875
DT Random           	0.832		0.832	0.857		0.842
KNN Random          	0.876		0.891	0.880		0.885
SVM Random          	0.875		0.898	0.874		0.885
Best DZ             	0.880		0.850	0.890		0.860


Без CV получился самый оптимистичный прогноз.

При использовании CV наилучшими выглядят модели KNN и SVM