In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

In [2]:
df = pd.read_csv('heart.csv')
df = df.drop('Oldpeak', axis=1)
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,Up,0
...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,Flat,1


In [3]:
#Преобразуем MaxHR в категориальный признак
for j in df.index:
    if df['Sex'][j]=='M':
        if 202 - 0.55*int(df['Age'][j]) > int(df['MaxHR'][j]):
           df.loc[j, 'MaxHR'] = 'Повышенный'
        else:
            df.loc[j, 'MaxHR'] = 'В норме'
    else:
        if 2016 - 1.09*int(df['Age'][j]) > int(df['MaxHR'][j]):
           df.loc[j, 'MaxHR'] = 'Повышенный'
        else:
            df.loc[j, 'MaxHR'] = 'В норме'
df

  df.loc[j, 'MaxHR'] = 'Повышенный'


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,Повышенный,N,Up,0
1,49,F,NAP,160,180,0,Normal,Повышенный,N,Flat,1
2,37,M,ATA,130,283,0,ST,Повышенный,N,Up,0
3,48,F,ASY,138,214,0,Normal,Повышенный,Y,Flat,1
4,54,M,NAP,150,195,0,Normal,Повышенный,N,Up,0
...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,Повышенный,N,Flat,1
914,68,M,ASY,144,193,1,Normal,Повышенный,N,Flat,1
915,57,M,ASY,130,131,0,Normal,Повышенный,Y,Flat,1
916,57,F,ATA,130,236,0,LVH,Повышенный,N,Flat,1


In [4]:
#Преобразуем Age в категориальный признак
for j in df.index:
    if int(df['Age'][j]) < 18.0:
        df.loc[j, 'Age'] = 'Молодой'
    elif int(df['Age'][j]) >= 18.0 and int(df['Age'][j]) <= 65.0:
        df.loc[j, 'Age'] = 'Взрослый'
    else:
        df.loc[j, 'Age'] = 'Пенсионер'
df

  df.loc[j, 'Age'] = 'Взрослый'


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,ST_Slope,HeartDisease
0,Взрослый,M,ATA,140,289,0,Normal,Повышенный,N,Up,0
1,Взрослый,F,NAP,160,180,0,Normal,Повышенный,N,Flat,1
2,Взрослый,M,ATA,130,283,0,ST,Повышенный,N,Up,0
3,Взрослый,F,ASY,138,214,0,Normal,Повышенный,Y,Flat,1
4,Взрослый,M,NAP,150,195,0,Normal,Повышенный,N,Up,0
...,...,...,...,...,...,...,...,...,...,...,...
913,Взрослый,M,TA,110,264,0,Normal,Повышенный,N,Flat,1
914,Пенсионер,M,ASY,144,193,1,Normal,Повышенный,N,Flat,1
915,Взрослый,M,ASY,130,131,0,Normal,Повышенный,Y,Flat,1
916,Взрослый,F,ATA,130,236,0,LVH,Повышенный,N,Flat,1


In [5]:
#Преобразуем RestingBP в категориальный признак
for j in df.index:
    if int(df['RestingBP'][j]) <= 110 and int(df['RestingBP'][j]) >= 130:
        df.loc[j, 'RestingBP'] = 'Нормальное'
    elif int(df['RestingBP'][j]) > 130:
        df.loc[j, 'RestingBP'] = 'Высокое'
    else:
        df.loc[j, 'RestingBP'] = 'Низкое'
df

  df.loc[j, 'RestingBP'] = 'Высокое'


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,ST_Slope,HeartDisease
0,Взрослый,M,ATA,Высокое,289,0,Normal,Повышенный,N,Up,0
1,Взрослый,F,NAP,Высокое,180,0,Normal,Повышенный,N,Flat,1
2,Взрослый,M,ATA,Низкое,283,0,ST,Повышенный,N,Up,0
3,Взрослый,F,ASY,Высокое,214,0,Normal,Повышенный,Y,Flat,1
4,Взрослый,M,NAP,Высокое,195,0,Normal,Повышенный,N,Up,0
...,...,...,...,...,...,...,...,...,...,...,...
913,Взрослый,M,TA,Низкое,264,0,Normal,Повышенный,N,Flat,1
914,Пенсионер,M,ASY,Высокое,193,1,Normal,Повышенный,N,Flat,1
915,Взрослый,M,ASY,Низкое,131,0,Normal,Повышенный,Y,Flat,1
916,Взрослый,F,ATA,Низкое,236,0,LVH,Повышенный,N,Flat,1


In [6]:
#Преобразуем Cholesterol в категориальный признак
for j in df.index:
    if int(df['Cholesterol'][j]) <= 239 and int(df['Cholesterol'][j]) >= 200:
        df.loc[j, 'Cholesterol'] = 'Повышенный'
    elif int(df['Cholesterol'][j]) >= 240:
        df.loc[j, 'Cholesterol'] = 'Высокий'
    else:
        df.loc[j, 'Cholesterol'] = 'В норме'
df

  df.loc[j, 'Cholesterol'] = 'Высокий'


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,ST_Slope,HeartDisease
0,Взрослый,M,ATA,Высокое,Высокий,0,Normal,Повышенный,N,Up,0
1,Взрослый,F,NAP,Высокое,В норме,0,Normal,Повышенный,N,Flat,1
2,Взрослый,M,ATA,Низкое,Высокий,0,ST,Повышенный,N,Up,0
3,Взрослый,F,ASY,Высокое,Повышенный,0,Normal,Повышенный,Y,Flat,1
4,Взрослый,M,NAP,Высокое,В норме,0,Normal,Повышенный,N,Up,0
...,...,...,...,...,...,...,...,...,...,...,...
913,Взрослый,M,TA,Низкое,Высокий,0,Normal,Повышенный,N,Flat,1
914,Пенсионер,M,ASY,Высокое,В норме,1,Normal,Повышенный,N,Flat,1
915,Взрослый,M,ASY,Низкое,В норме,0,Normal,Повышенный,Y,Flat,1
916,Взрослый,F,ATA,Низкое,Повышенный,0,LVH,Повышенный,N,Flat,1


In [7]:
transformer = make_column_transformer((OneHotEncoder(), ['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS', 'RestingECG', 'MaxHR', 'MaxHR', 'ExerciseAngina', 'ST_Slope']), remainder='passthrough', sparse_threshold=0)
transformed = transformer.fit_transform(df)
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
transformed_df

Unnamed: 0,onehotencoder__Age_Взрослый,onehotencoder__Age_Пенсионер,onehotencoder__Sex_F,onehotencoder__Sex_M,onehotencoder__ChestPainType_ASY,onehotencoder__ChestPainType_ATA,onehotencoder__ChestPainType_NAP,onehotencoder__ChestPainType_TA,onehotencoder__RestingBP_Высокое,onehotencoder__RestingBP_Низкое,...,onehotencoder__MaxHR_В норме,onehotencoder__MaxHR_Повышенный,onehotencoder__MaxHR_В норме.1,onehotencoder__MaxHR_Повышенный.1,onehotencoder__ExerciseAngina_N,onehotencoder__ExerciseAngina_Y,onehotencoder__ST_Slope_Down,onehotencoder__ST_Slope_Flat,onehotencoder__ST_Slope_Up,remainder__HeartDisease
0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
4,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
914,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
915,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
916,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0


In [29]:
X, y = transformed_df.loc[:, ~transformed_df.columns.isin(['remainder__HeartDisease'])], transformed_df['remainder__HeartDisease']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
lr = LogisticRegression().fit(X_train, y_train)
#Делаем кросс-валидацию
scores = cross_validate(lr, X_train, y_train, cv=10, scoring=['accuracy', 'recall', 'precision', 'f1'])
for key, value in scores.items():
    print(f'{key}: {value.mean()}')

fit_time: 0.003374052047729492
score_time: 0.003949880599975586
test_accuracy: 0.8541095890410959
test_recall: 0.8826219512195121
test_precision: 0.859257501428554
test_f1: 0.8689929049653019


In [25]:
#На обучаемой выборке
print(f'accuracy: {accuracy_score(y_train, lr.predict(X_train))}')
print(f'recall: {recall_score(y_train, lr.predict(X_train))}')
print(f'precision: {precision_score(y_train, lr.predict(X_train))}')
print(f'f1: {f1_score(y_train, lr.predict(X_train))}')

accuracy: 0.8692098092643051
recall: 0.8977556109725686
precision: 0.8674698795180723
f1: 0.8823529411764706


In [26]:
#На тестовой выборке
print(f'accuracy: accuracy_score(y_test, lr.predict(X_test))}')
print(f'recall: recall_score(y_test, lr.predict(X_test))}')
print(f'precision: precision_score(y_test, lr.predict(X_test))}')
print(f'f1: f1_score(y_test, lr.predict(X_test))}')

accuracy: 0.8641304347826086
recall: 0.8504672897196262
precision: 0.91
f1: 0.8792270531400966


In [32]:
#Оптимизируем параметры модели с помощью GridSearchCV
param_grid = 'C': [0.001, 0.01, 0.1, 1, 10], 'solver': ['liblinear', 'lbfgs', 'sag', 'newton-cg', 'saga'], 'max_iter': [100, 200, 500, 1000]}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=10, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)
print(f"Лучшие параметры, найденные GridSearchCV: {grid_search.best_params_}")
print(f'Лучшее значение метрики на тренировачной выборке: {grid_search.best_score_}')
print(f'Лучшее значение метрики на тестовой выборке: {grid_search.best_estimator_.score(X_test, y_test)}')

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
Лучшие параметры, найденные GridSearchCV: {'C': 0.01, 'max_iter': 100, 'solver': 'liblinear'}
Лучшее значение метрики на тренировачной выборке: 0.8636616068122918
Лучшее значение метрики на тренировачной выборке: 0.875


In [22]:
#Обучаем модель лог.регрессии с этими параметрами
model_grid = LogisticRegression(**grid_search.best_params_)
model_grid.fit(X_train, y_train)

In [None]:
#На обучаемой выборке
print(f'accuracy: {accuracy_score(y_train, model_grid.predict(X_train))}')
print(f'recall: {recall_score(y_train, model_grid.predict(X_train))}')
print(f'precision: {precision_score(y_train, model_grid.predict(X_train))}')
print(f'f1: {f1_score(y_train, model_grid.predict(X_train))}')

In [None]:
#На тестовой выборке
print(f'accuracy: {accuracy_score(y_test, model_grid.predict(X_test))}')
print(f'recall: {recall_score(y_test, model_grid.predict(X_test))}')
print(f'precision: {precision_score(y_test, model_grid.predict(X_test))}')
print(f'f1: {f1_score(y_test, model_grid.predict(X_test))}')

In [41]:
#Оптимизируем параметры модели с помощью RandomizedSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10], 'solver': ['liblinear', 'lbfgs', 'sag', 'newton-cg', 'saga'], 'max_iter': [100, 200, 500, 1000]}
random_grid_search = RandomizedSearchCV(LogisticRegression(), param_grid, cv=10)
random_grid_search.fit(X_train, y_train)
print(f"Лучшие параметры, найденные RandomizedSearchCV: {random_grid_search.best_params_}")
print(f'Лучшее значение метрики на тренировачной выборке: {random_grid_search.best_score_}')
print(f'Лучшее значение метрики на тестовой выборке: {random_grid_search.best_estimator_.score(X_test, y_test)}')

Лучшие параметры, найденные RandomizedSearchCV: {'solver': 'saga', 'max_iter': 1000, 'C': 0.01}
Лучшее значение метрики на тренировачной выборке: 0.8636616068122918
Лучшее значение метрики на тестовой выборке: 0.875


In [36]:
#Обучаем модель лог.регрессии с этими параметрами
random_grid_search = LogisticRegression(**random_grid_search.best_params_)
random_grid_search.fit(X_train, y_train)

In [37]:
#На обучаемой выборке
print(f'accuracy: {accuracy_score(y_train, random_grid_search.predict(X_train))}')
print(f'recall: {recall_score(y_train, random_grid_search.predict(X_train))}')
print(f'precision: {precision_score(y_train, random_grid_search.predict(X_train))}')
print(f'f1: {f1_score(y_train, random_grid_search.predict(X_train))}')

accuracy: 0.8637602179836512
recall: 0.9077306733167082
precision: 0.8524590163934426
f1: 0.8792270531400965


In [38]:
#На тестовой выборке
print(f'accuracy: {accuracy_score(y_test, random_grid_search.predict(X_test))}')
print(f'recall: {recall_score(y_test, random_grid_search.predict(X_test))}')
print(f'precision: {precision_score(y_test, random_grid_search.predict(X_test))}')
print(f'f1: {f1_score(y_test, random_grid_search.predict(X_test))}')

accuracy: 0.875
recall: 0.8878504672897196
precision: 0.8962264150943396
f1: 0.892018779342723


In [40]:
pd.set_option("max_colwidth", 500)
relult = (pd.DataFrame({'Модель': ['Логистическая регрессия без подбора гиперпараметров', 'Логистическая регрессия с подбором гиперпараметров через GridSearchCV', 'Логистическая регрессия с подбором гиперпараметров через RandomizedSearchCV'], 
                        'Accuracy': [accuracy_score(y_test, lr.predict(X_test)), accuracy_score(y_test, model_grid.predict(X_test)), accuracy_score(y_test, random_grid_search.predict(X_test))],
                        'Recall': [recall_score(y_test, lr.predict(X_test)), recall_score(y_test, model_grid.predict(X_test)), recall_score(y_test, random_grid_search.predict(X_test))],
                        'Precision': [precision_score(y_test, lr.predict(X_test)), precision_score(y_test, model_grid.predict(X_test)), precision_score(y_test, random_grid_search.predict(X_test))],
                        'F1': [f1_score(y_test, lr.predict(X_test)), f1_score(y_test, model_grid.predict(X_test)), f1_score(y_test, random_grid_search.predict(X_test))]
                        }))
relult

Unnamed: 0,Модель,Accuracy,Recall,Precision,F1
0,Логистическая регрессия без подбора гиперпараметров,0.86413,0.850467,0.91,0.879227
1,Логистическая регрессия с подбором гиперпараметров через GridSearchCV,0.875,0.88785,0.896226,0.892019
2,Логистическая регрессия с подбором гиперпараметров через RandomizedSearchCV,0.875,0.88785,0.896226,0.892019
