### Улучшение качества модели

In [144]:
#импортируем библиотеки
import os
import pandas as pd
import zipfile
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [125]:
#загрузка файла с kaggle
def load_data_from_kaggle():
    data_path = 'heart-failure-prediction'

    if not os.path.exists(data_path):
        !kaggle datasets download -d fedesoriano/heart-failure-prediction      
        with zipfile.ZipFile('heart-failure-prediction.zip', 'r') as zipp:
            zipp.extractall(data_path)

    train_path = "\\".join([data_path, os.listdir(data_path)[0]])
    
    return pd.read_csv(train_path)

def show_score(model):
    print("{}: {}".format('test_accuracy', model['test_accuracy'].sum() / len(model['test_accuracy'])))
    print("{}: {}".format('test_recall', model['test_recall'].sum() / len(model['test_recall'])))
    print("{}: {}".format('test_precision', model['test_precision'].sum() / len(model['test_precision'])))
    print("{}: {}".format('test_f1', model['test_f1'].sum() / len(model['test_f1'])))

#### Предобработка данных

In [8]:
df = load_data_from_kaggle()
df

Downloading heart-failure-prediction.zip to C:\Users\Suile




  0%|          | 0.00/8.56k [00:00<?, ?B/s]
100%|##########| 8.56k/8.56k [00:00<00:00, 8.78MB/s]


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


Категориальные переменные привести в цифровые значения:
1. Sex
2. ChestPainType
3. RestingECG
4. ExerciseAngina
5. ST_Slope

**1. Sex**

In [24]:
processing_df = df.copy()

Преобразуем пол пациента в цифровой 0/1. Где 1 - мужской, 0 - женский.

In [28]:
processing_df.Sex = pd.get_dummies(df.Sex, dtype=int, drop_first=True)
processing_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,0,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,1,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,0,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,1,NAP,150,195,0,Normal,122,N,0.0,Up,0


**2. ChestPainType**

В столбце 4 уникальных значения 'ATA', 'NAP', 'ASY', 'TA'

In [37]:
df.ChestPainType.unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

Используя LabelEncoder закодируем значения в цифры

In [35]:
le = LabelEncoder()
le.fit(df.ChestPainType)
processing_df.ChestPainType = le.transform(df.ChestPainType)
processing_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,Normal,172,N,0.0,Up,0
1,49,0,2,160,180,0,Normal,156,N,1.0,Flat,1
2,37,1,1,130,283,0,ST,98,N,0.0,Up,0
3,48,0,0,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,1,2,150,195,0,Normal,122,N,0.0,Up,0


Такое соответсвие первоначальным значениям мы получили 

In [39]:
print(df.ChestPainType.unique())
print(le.transform(df.ChestPainType.unique()))

['ATA' 'NAP' 'ASY' 'TA']
[1 2 0 3]


**3. RestingECG**

In [40]:
df.RestingECG.unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

In [41]:
le = LabelEncoder()
le.fit(df.RestingECG)
processing_df.RestingECG = le.transform(df.RestingECG)
processing_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,N,0.0,Up,0
1,49,0,2,160,180,0,1,156,N,1.0,Flat,1
2,37,1,1,130,283,0,2,98,N,0.0,Up,0
3,48,0,0,138,214,0,1,108,Y,1.5,Flat,1
4,54,1,2,150,195,0,1,122,N,0.0,Up,0


In [42]:
print(df.RestingECG.unique())
print(le.transform(df.RestingECG.unique()))

['Normal' 'ST' 'LVH']
[1 2 0]


**4. ExerciseAngina**

In [44]:
df.ExerciseAngina.unique()

array(['N', 'Y'], dtype=object)

In [45]:
processing_df.ExerciseAngina = pd.get_dummies(df.ExerciseAngina, dtype=int, drop_first=True)
processing_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,Up,0
1,49,0,2,160,180,0,1,156,0,1.0,Flat,1
2,37,1,1,130,283,0,2,98,0,0.0,Up,0
3,48,0,0,138,214,0,1,108,1,1.5,Flat,1
4,54,1,2,150,195,0,1,122,0,0.0,Up,0


**5. ST_Slope**

In [46]:
df.ST_Slope.unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [48]:
le = LabelEncoder()
le.fit(df.ST_Slope)
processing_df.ST_Slope = le.transform(df.ST_Slope)
processing_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0


In [50]:
print(df.ST_Slope.unique())
print(le.transform(df.ST_Slope.unique()))

['Up' 'Flat' 'Down']
[2 1 0]


#### Обучение модели с параметрами по умолчанию

In [51]:
clean_df = processing_df.copy()

In [137]:
X_train, X_test, Y_train, Y_test = train_test_split(
    clean_df[clean_df.columns[:-1]], 
    clean_df[clean_df.columns[-1]], 
    test_size=0.2, 
    random_state=2)

In [186]:
logreg = LogisticRegression(random_state=13)
_ = logreg.fit(X_train, Y_train)

In [187]:
default_logreg = cross_validate(logreg, X_test, Y_test, cv=10, scoring=['accuracy', 'recall', 'precision', 'f1'])

In [188]:
show_score(default_logreg)

test_accuracy: 0.8304093567251464
test_recall: 0.8511111111111112
test_precision: 0.8381962481962482
test_f1: 0.8378651173388016


#### Оптимизация модели - подбор гиперпараметров

##### 1. GridSearchCV

In [189]:
parameters = {'penalty': ['l1', 'l2', 'elasticnet'], 
              'C': [0.001, 1, 5, 10, 20], 
              'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
              'fit_intercept': [False, True]}
logreg_grid_search_opt = LogisticRegression(random_state=13)
clf = GridSearchCV(logreg_grid_search_opt, parameters)
clf.fit(X_train, Y_train)

In [190]:
clf.best_estimator_

In [191]:
logreg_cv = cross_validate(clf.best_estimator_, X_test, Y_test, cv=10, scoring=['accuracy', 'recall', 'precision', 'f1'])

In [192]:
show_score(logreg_cv)

test_accuracy: 0.8192982456140351
test_recall: 0.8300000000000001
test_precision: 0.8328679653679654
test_f1: 0.8248203842940685


##### 2. RandomizedSearchCV

###### 2.1 LogisticRegression

In [97]:
logreg_random_search_opt = LogisticRegression()
distributions = {'penalty': ['l1', 'l2', 'elasticnet'], 
                 'C': uniform(loc=0, scale=20), 
                 'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
                 'fit_intercept': [False, True]}
randomCV = RandomizedSearchCV(logreg_random_search_opt, distributions, random_state=0)
randomCV.fit(X_train, Y_train)

In [98]:
randomCV.best_estimator_

In [134]:
logreg_random = cross_validate(randomCV.best_estimator_, X_test, Y_test, cv=10, scoring=['accuracy', 'recall', 'precision', 'f1'])

In [135]:
show_score(logreg_random)

test_accuracy: 0.8245614035087719
test_recall: 0.8400000000000001
test_precision: 0.8351406926406927
test_f1: 0.8304480899217742


###### 2.2 SGDClassifier

In [102]:
sgd_random_search_opt = SGDClassifier()
distributions = {'penalty': ['l1', 'l2', 'elasticnet'], 
                 'alpha': uniform(loc=0, scale=5), 
                 'l1_ratio' : uniform(loc=0, scale=1),
                 'fit_intercept': [False, True],
                 'loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']}
randomCV_sgd = RandomizedSearchCV(sgd_random_search_opt, distributions, random_state=0)
randomCV_sgd.fit(X_train, Y_train)

In [103]:
randomCV_sgd.best_estimator_

In [123]:
sgd = cross_validate(randomCV_sgd.best_estimator_, X_test, Y_test, cv=10, scoring=['accuracy', 'recall', 'precision', 'f1'])

In [127]:
show_score(sgd)

test_accuracy: 0.6736842105263159
test_recall: 0.6722222222222223
test_precision: 0.7513170163170163
test_f1: 0.6414414812240901


###### 2.2 KNeighborsClassifier

In [113]:
k_neighbors_random_search_opt = KNeighborsClassifier()
distributions = {'n_neighbors': list(range(2, 20, 1)), 
                 'weights': ['uniform', 'distance'],
                 'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                 'leaf_size': list(range(5, 40, 5))}
randomCV_k_neighbors = RandomizedSearchCV(k_neighbors_random_search_opt, distributions, random_state=0)
randomCV_k_neighbors.fit(X_train, Y_train)

In [114]:
randomCV_k_neighbors.best_estimator_

In [117]:
k_neighbors = cross_validate(randomCV_k_neighbors.best_estimator_, X_test, Y_test, cv=10, scoring=['accuracy', 'recall', 'precision', 'f1'])

In [126]:
show_score(k_neighbors)

test_accuracy: 0.6625730994152047
test_recall: 0.6744444444444444
test_precision: 0.6744444444444444
test_f1: 0.6643988759778234


###### 2.3 RandomForestClassifier

In [179]:
forest_random_search_opt = RandomForestClassifier(random_state=13)
distributions = {'n_estimators': list(range(10, 500, 10)), 
                 'criterion': ['gini', 'entropy', 'log_loss'],
                 'max_features': ['auto', 'sqrt', 'log2'],
                 'min_samples_leaf': list(range(1, 10, 1)), 
                 'min_samples_split': list(range(1, 10, 1)),
                 'max_depth' : list(range(4, 15, 1))}

forest_random_search_opt = RandomizedSearchCV(forest_random_search_opt, distributions, random_state=13)
forest_random_search_opt.fit(X_train, Y_train)

In [180]:
forest_random_search_opt.best_estimator_

In [181]:
forest = cross_validate(forest_random_search_opt.best_estimator_, X_test, Y_test, cv=10, scoring=['accuracy', 'recall', 'precision', 'f1'])

In [182]:
show_score(forest)

test_accuracy: 0.8195906432748539
test_recall: 0.8522222222222222
test_precision: 0.8333333333333333
test_f1: 0.831580554696048


### Выводы

Попробовал обучить несколько моделей:
1. LogisticRegression (Default)
2. LogisticRegression (GridSearchCV)
3. LogisticRegression (RandomizedSearchCV)
4. SGDClassifier (RandomizedSearchCV)
5. KNeighborsClassifier (RandomizedSearchCV)
6. RandomForestClassifier (RandomForestClassifier)

Получилось, что лучшие результаты дала модель с параметрами по умолчанию, хотя и с небольшими улучшениями

* **test_accuracy:** 0.8304093567251464
* **test_recall:** 0.8511111111111112
* **test_precision:** 0.8381962481962482
* **test_f1:** 0.8378651173388016