**Домашнее задание к лекции «Улучшение качества модели»**

**Задание**\
**Цель:**\
Применить на практике алгоритмы по автоматической оптимизации параметров моделей машинного обучения.\
**Описание задания:**\
В домашнем задании нужно решить задачу классификации наличия болезни сердца у пациентов наиболее эффективно. Данные для обучения моделей необходимо загрузить самостоятельно с сайта. Целевая переменная – наличие болезни сердца (HeartDisease). Она принимает значения 0 или 1 в зависимости от отсутствия или наличия болезни соответственно. Подробное описание признаков можно прочесть в описании датасета на сайте. Для выполнения работы не обязательно вникать в медицинские показатели.

**1.Получите данные и загрузите их в рабочую среду.\
2.Подготовьте датасет к обучению моделей:\
a) Категориальные переменные переведите в цифровые значения. Можно использовать pd.get_dummies, preprocessing.LabelEncoder. Старайтесь не использовать для этой задачи циклы.\
3.Разделите выборку на обучающее и тестовое подмножество. 80% данных оставить на обучающее множество, 20% на тестовое.\
4.Обучите модель логистической регрессии с параметрами по умолчанию.\
5.Подсчитайте основные метрики модели. Используйте следующие метрики и функцию:
cross_validate(…, cv=10, scoring=[‘accuracy’,‘recall’,‘precision’,‘f1’])**

In [100]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, make_scorer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier

In [2]:
heart_df = pd.read_csv('heart.csv')
heart_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
heart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [4]:
heart_df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [5]:
# Преобразуем категориальные признаки
# Для признаков Sex и ExerciseAngina можно использовать LabelEncoder

In [6]:
from sklearn.preprocessing import LabelEncoder
le_sex = LabelEncoder()
le_exang = LabelEncoder()

In [7]:
le_sex.fit(heart_df['Sex'])
le_exang.fit(heart_df['ExerciseAngina'])

In [8]:
heart_df['Sex'] = le_sex.transform(heart_df['Sex'])
heart_df['ExerciseAngina'] = le_exang.transform(heart_df['ExerciseAngina'])
heart_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,ATA,140,289,0,Normal,172,0,0.0,Up,0
1,49,0,NAP,160,180,0,Normal,156,0,1.0,Flat,1
2,37,1,ATA,130,283,0,ST,98,0,0.0,Up,0
3,48,0,ASY,138,214,0,Normal,108,1,1.5,Flat,1
4,54,1,NAP,150,195,0,Normal,122,0,0.0,Up,0


In [9]:
# Остальные категориальные признаки преобразуем с помощью get_dummies

In [10]:
selectedColumns = heart_df.select_dtypes(include=['object']).columns
X = pd.get_dummies (heart_df.loc[ : , heart_df.columns != 'HeartDisease'], 
                    columns = selectedColumns)
X.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,1,140,289,0,172,0,0.0,0,1,0,0,0,1,0,0,0,1
1,49,0,160,180,0,156,0,1.0,0,0,1,0,0,1,0,0,1,0
2,37,1,130,283,0,98,0,0.0,0,1,0,0,0,0,1,0,0,1
3,48,0,138,214,0,108,1,1.5,1,0,0,0,0,1,0,0,1,0
4,54,1,150,195,0,122,0,0.0,0,0,1,0,0,1,0,0,0,1


In [11]:
y = heart_df['HeartDisease']
y.head()

0    0
1    1
2    0
3    1
4    0
Name: HeartDisease, dtype: int64

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=42)

In [36]:
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter = 1000, 
                                                          random_state = 42))
pipe.fit(X_train, y_train)

In [None]:
# Напишем функцию, которая будет подсчитывать основные метрики модели с помощью функции cross validate

In [68]:
def CV_scores(model):
    '''Функция, которая подсчитывает основные метрики модели:
    - accuracy
    - precision
    - recall
    - f1
    с помощью функции cross validate'''

    scoring = {'accuracy' : make_scorer(accuracy_score), 
               'precision' : make_scorer(precision_score),
               'recall' : make_scorer(recall_score), 
               'f1_score' : make_scorer(f1_score)}

    scores = cross_validate(model, X, y, cv=10, scoring=scoring)

    acc_mean = round(scores['test_accuracy'].mean(), 3)
    prec_mean = round(scores['test_precision'].mean(), 3)
    rec_mean = round(scores['test_recall'].mean(), 3)
    f1_mean = round(scores['test_f1_score'].mean(), 3)

    return print(f'accuracy: {acc_mean}, precision: {prec_mean}, recall: {rec_mean}, f1_score: {f1_mean}')

In [61]:
CV_scores(pipe)

accuracy: 0.851, precision: 0.867, recall: 0.87, f1_score: 0.864


**6.Оптимизируйте 3-4 параметра модели:\
a) Используйте GridSearchCV.\
b) Используйте RandomizedSearchCV.\
c) *Добавьте в п. 6b 2-5 моделей классификации и вариации их параметров.\
d) Повторите п. 5 после каждого итогового изменения параметров.**

In [None]:
# Используйте GridSearchCV

In [73]:
X_train = StandardScaler().fit_transform(X_train)

LR = LogisticRegression(max_iter = 10000, random_state = 42)

params = dict(penalty = ['l2', 'l1'],
              C = [1, 10],
              solver = ['liblinear', 'saga'])

grid_search = GridSearchCV(estimator = LR, param_grid = params, 
                           scoring = 'recall', error_score = 'raise')
grid_search.fit(X_train, y_train)

In [74]:
LR_GS_best = grid_search.best_estimator_

In [75]:
CV_scores(LR_GS_best)

accuracy: 0.849, precision: 0.863, recall: 0.872, f1_score: 0.863


In [None]:
# Используйте RandomizedSearchCV

In [124]:
params = dict(penalty = ['l2', 'l1'],
              C = list(range(1,10)),
              solver = ['liblinear', 'saga'])

random_search = RandomizedSearchCV(LR, params, scoring = 'recall', random_state = 42)

random_search.fit(X_train, y_train)

In [125]:
LR_RS_best = random_search.best_estimator_

In [126]:
CV_scores(LR_RS_best)

accuracy: 0.847, precision: 0.861, recall: 0.87, f1_score: 0.861


In [None]:
# Добавьте в п. 6b 2-5 моделей классификации и вариации их параметров.

In [None]:
# SVC

In [127]:
svc = svm.SVC()

params = {'kernel':('linear', 'rbf'), 
          'C':list(range(1,10))}

random_search = RandomizedSearchCV(svc, params, scoring = 'recall', random_state = 42)
random_search.fit(X_train, y_train)

In [128]:
svc_best = random_search.best_estimator_

In [129]:
CV_scores(svc_best)

accuracy: 0.706, precision: 0.757, recall: 0.711, f1_score: 0.709


In [None]:
# BaggingClassifier

In [118]:
BC_model = BaggingClassifier(random_state = 42, n_jobs = -1)

params = {'n_estimators':list(range(10, 200)),
          'max_samples':np.arange(0.2, 0.7, 0.1).tolist(),
          'max_features': np.arange(0.2, 0.7, 0.1).tolist()}

random_search = RandomizedSearchCV(BC_model, params, scoring = 'recall', random_state = 42)
random_search.fit(X_train, y_train)

In [119]:
BC_model_best = random_search.best_estimator_

In [120]:
CV_scores(BC_model_best)

accuracy: 0.855, precision: 0.851, recall: 0.899, f1_score: 0.872


In [None]:
# RandomForestClassifier

In [121]:
RF_model = RandomForestClassifier(random_state = 42, n_jobs = -1)

params = {'max_depth':list(range(2,70)),
         'min_samples_leaf':list(range(5,50)),
         'n_estimators':list(range(10, 200)), 
         'max_features': np.arange(0.2, 0.5, 0.05).tolist()}

random_search = RandomizedSearchCV(RF_model, params, scoring = 'recall', random_state = 42)
random_search.fit(X_train, y_train)

In [122]:
RF_model_best = random_search.best_estimator_

In [123]:
CV_scores(RF_model_best)

accuracy: 0.837, precision: 0.831, recall: 0.893, f1_score: 0.858


**7.Сформулируйте выводы по проделанной работе:\
a) Сравните метрики построенных моделей.\
b) *Сравните с полученными результатами в домашнем задании по теме «Ансамблирование».**

In [130]:
data = {'model': ['LR без подбора параметров', 'LR c GridSearch', 
                  'LR c RandomSearch', 'SVC c RandomSearch', 
                  'BaggingClassifier с GridSearch (ensembles)',
                  'BaggingClassifier с RandomSearch', 
                  'RandomForest c GridSearch (ensembles)', 
                  'RandomForest c RandomSearch'],
        'accuracy': [0.851, 0.849, 0.847, 0.706, 0.86, 0.855, 0.86, 0.837],
        'precision': [0.867, 0.863, 0.861, 0.757, 0.9, 0.851, 0.89, 0.831],
        'recall': [0.87, 0.872, 0.87, 0.711, 0.87, 0.89, 0.88, 0.893],
        'f1-score': [0.864, 0.863, 0.861, 0.709, 0.88, 0.872, 0.88, 0.858]
        } 
 
df_compare_models = pd.DataFrame(data)

df_compare_models

Unnamed: 0,model,accuracy,precision,recall,f1-score
0,LR без подбора параметров,0.851,0.867,0.87,0.864
1,LR c GridSearch,0.849,0.863,0.872,0.863
2,LR c RandomSearch,0.847,0.861,0.87,0.861
3,SVC c RandomSearch,0.706,0.757,0.711,0.709
4,BaggingClassifier с GridSearch (ensembles),0.86,0.9,0.87,0.88
5,BaggingClassifier с RandomSearch,0.855,0.851,0.89,0.872
6,RandomForest c GridSearch (ensembles),0.86,0.89,0.88,0.88
7,RandomForest c RandomSearch,0.837,0.831,0.893,0.858


In [None]:
# Поскольку подбор параметров осуществлялся на основе метрики recall, то показатели по ней в некоторых случаях становились лучше. 