# Неделя 3. Понедельник
## Обучение с учителем

### Применение базовых методов классификации

In [180]:
import pandas as pd
import numpy as np

import sklearn
sklearn.set_config(transform_output="pandas")

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score

#### 0. Ознакомьтесь с датасетом

In [119]:
df = pd.read_csv('aux/heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40.0,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49.0,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37.0,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48.0,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54.0,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


* __Age__: age of the patient [years]
* __Sex__: sex of the patient [M: Male, F: Female]
* __ChestPainType__: chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]
* __RestingBP__: resting blood pressure [mm Hg]
* __Cholesterol__: serum cholesterol [mm/dl]
* __FastingBS__: fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]
* __RestingECG__: resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite * left ventricular hypertrophy by Estes' criteria]
* __MaxHR__: maximum heart rate achieved [Numeric value between 60 and 202]
* __ExerciseAngina__: exercise-induced angina [Y: Yes, N: No]
* __Oldpeak__: oldpeak = ST [Numeric value measured in depression]
* __ST_Slope__: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
* __HeartDisease__: output class [1: heart disease, 0: Normal]

* __Age__: возраст пациента [лет]
* __Sex__: пол пациента [M: Мужской, F: Женский]
* __ChestPainType__: тип боли в груди [TA: типичная стенокардия, ATA: атипичная стенокардия, NAP: неангинальная боль, ASY: бессимптомная]
* __RestingBP__: артериальное давление в состоянии покоя [мм рт. ст.]
* __Холестерин__: холестерин сыворотки [мм/дл]
* __FastingBS__: уровень сахара в крови натощак [1: если FastingBS > 120 мг/дл, 0: в противном случае]
* __ЭКГ покоя__: результаты электрокардиограммы покоя [Нормальный: нормальный, ST: наличие аномалий ST-T (инверсия зубца T и/или элевация или депрессия ST > 0,05 мВ), ГЛЖ: возможна или определенна * гипертрофия левого желудочка по критериям Эстеса ]
* __MaxHR__: достигнутая максимальная частота пульса [числовое значение от 60 до 202]
* __ExercisionAngin__: стенокардия, вызванная физической нагрузкой [Д: Да, Н: Нет]
* __Oldpeak__: oldpeak = ST [числовое значение, измеренное в депрессии]
* __ST_Slope__: наклон пикового сегмента ST при нагрузке [Вверх: наклон вверх, Плоский: плоский, Вниз: наклон вниз]
* __HeartDisease__: выходной класс [1: болезнь сердца, 0: норма]

* Таргетом является столбец `HeartDisease`. Необходимо предсказать по имеющимся данным, есть ли проблемы с сердцем

#### 1. Небольшие рекомендации ниже 


* __Baseline pipeline (базовый пайплайн)__ - это простой пайплайн, который используется как отправная точка или точка сравнения при разработке и оценке более сложных моделей или алгоритмов. 

* Для этого сначала используйте самые простые идеи по заполнению пропусков(средними, медианами, модами) и кодированию категориальных данных, которые вам приходят в голову. 

* После того, как вы построите модели провалидируете их. Можно будет приступать к попыткам улучшить свою модель с помощью ваших идей - пробовать создавать новые фичи, кодировать данные по-другому, заполнять иначе NaN и тд

#### 2. Заполните пропущенные значения(`Imputing`), как считаете нужным.  

- Не забывайте памятку выше, сначала заполняйте самыми тривиальными идеями. Наприсер, средними, медианами и т.д

In [120]:
pd.DataFrame(data={'Nan_count' : df.isna().mean(), 'data_type' : df.dtypes})

Unnamed: 0,Nan_count,data_type
Age,0.010893,float64
Sex,0.0,object
ChestPainType,0.0,object
RestingBP,0.0,int64
Cholesterol,0.0,int64
FastingBS,0.0,int64
RestingECG,0.0,object
MaxHR,0.0,int64
ExerciseAngina,0.0,object
Oldpeak,0.0,float64


In [121]:
X , y = df.drop('HeartDisease', axis=1), df['HeartDisease']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

##### 2.1 Оберните в `ColumnTransformer` свой `Imputing` данных. Проверьте корректность его работы. Для этого необходимо сделать:

1. Обучить и трансформировать свой `Imputer` с помощью `your_imputer.fit_transform` - на тренировочных данных
2. Заполнить с помощью `your_imputer.transform` - на тестовых данных

Убедитесь, что данные прошли через этап `Imputing'а` и пропусков в них больше нет

In [122]:
my_imputer = ColumnTransformer(
    transformers = 
    [
        ('Age_imp', SimpleImputer(strategy='median'), ['Age'])
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough'
)

In [123]:
my_imputer

In [124]:
filled_df = my_imputer.fit_transform(X_train)

In [126]:
pd.DataFrame(data={'Nan_count' : filled_df.isna().sum(), 'data_type' : filled_df.dtypes})

Unnamed: 0,Nan_count,data_type
Age,0,float64
Sex,0,object
ChestPainType,0,object
RestingBP,0,int64
Cholesterol,0,int64
FastingBS,0,int64
RestingECG,0,object
MaxHR,0,int64
ExerciseAngina,0,object
Oldpeak,0,float64


In [127]:
X_test.isna().sum()

Age               2
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
dtype: int64

In [128]:
my_imputer.transform(X_test).isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
dtype: int64

#### 3. Закодируйте категориальные переменные, как считаете нужным

* `OneHotEncoding` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)  
* `TargetEncoding` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.TargetEncoder.html)  
* `OrdinalEncoding` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html)  
* `CatBoostEncoding` (https://www.geeksforgeeks.org/categorical-encoding-with-catboost-encoder/)  

In [129]:
one_hot_encoding_columns = ['ChestPainType', 'RestingECG']
ordinal_encoding_columns = ['Sex', 'ST_Slope', 'ExerciseAngina']

In [130]:
filled_df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
485,63.0,M,ATA,139,217,1,ST,128,Y,1.2,Flat
486,55.0,M,ATA,110,214,1,ST,180,N,0.4,Up
117,59.0,F,ASY,130,338,1,ST,130,Y,1.5,Flat
361,47.0,M,ASY,160,0,0,Normal,124,Y,0.0,Flat
296,50.0,M,ASY,145,0,1,Normal,139,Y,0.7,Flat
...,...,...,...,...,...,...,...,...,...,...,...
276,51.0,M,NAP,135,160,0,Normal,150,N,2.0,Flat
201,46.0,M,NAP,120,230,0,Normal,150,N,0.0,Up
462,59.0,M,ASY,122,233,0,Normal,117,Y,1.3,Down
252,61.0,M,ASY,125,292,0,ST,115,Y,0.0,Up


##### 3.1 Оберните в `ColumnTransformer` свой `Encoding` данных. Проверьте корректность его работы. 

In [131]:
my_cat_encoder = ColumnTransformer(
    [
        ('one_hot_encoding_columns', OneHotEncoder(sparse_output=False), one_hot_encoding_columns),
        ('ordinal_encoding_columns', OrdinalEncoder(), ordinal_encoding_columns)
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough'
)

In [132]:
my_cat_encoder

In [133]:
cat_encod_df = my_cat_encoder.fit_transform(filled_df, y)
cat_encod_df

Unnamed: 0,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,Sex,ST_Slope,ExerciseAngina,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak
485,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,63.0,139,217,1,128,1.2
486,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,55.0,110,214,1,180,0.4
117,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,59.0,130,338,1,130,1.5
361,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,47.0,160,0,0,124,0.0
296,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,50.0,145,0,1,139,0.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,51.0,135,160,0,150,2.0
201,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,46.0,120,230,0,150,0.0
462,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,59.0,122,233,0,117,1.3
252,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0,61.0,125,292,0,115,0.0


In [134]:
my_cat_encoder.transform(X_test)

Unnamed: 0,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,Sex,ST_Slope,ExerciseAngina,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak
356,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,46.0,115,0,0,113,1.5
763,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,58.0,132,224,0,173,3.2
817,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,60.0,125,258,0,141,2.8
735,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,49.0,120,188,0,139,2.0
892,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,39.0,138,220,0,152,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,45.0,140,224,1,122,0.0
752,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,56.0,125,249,1,144,1.2
492,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,49.0,130,0,0,145,3.0
622,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,59.0,110,239,0,142,1.2


#### 4. То же самое проделать с нормализацией данных

* `StandardScaler` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)
* `MinMaxScaler` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)
* `RobustScaler` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html)

In [137]:
num_columns = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']

#### 4.1 Оберните в `ColumnTransformer` свой `Scaling` данных, проверьте корректность работы.

In [138]:
my_scaler = ColumnTransformer(
    [
        ('num_scaler', StandardScaler(), num_columns)
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough'
)

In [139]:
my_scaler

In [140]:
num_cat_encod_df = my_scaler.fit_transform(cat_encod_df, y)

In [143]:
num_cat_encod_df

Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,Sex,ST_Slope,ExerciseAngina,FastingBS
485,0.969838,0.339016,0.127137,-0.324520,0.317046,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1
486,0.117054,-1.266031,0.099443,1.689837,-0.440356,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,1
117,0.543446,-0.159102,1.244113,-0.247045,0.601071,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1
361,-0.735729,1.501291,-1.876035,-0.479470,-0.819056,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0
296,-0.415935,0.671094,-1.876035,0.101594,-0.156330,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276,-0.309337,0.117630,-0.399042,0.527708,1.074447,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0
201,-0.842327,-0.712567,0.247142,0.527708,-0.819056,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,0
462,0.543446,-0.601874,0.274836,-0.750634,0.411721,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0
252,0.756642,-0.435834,0.819477,-0.828109,-0.819056,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0,0


#### 5. Соберите весь препроцессинг в общий Pipeline.

In [145]:
preprocessor = Pipeline(
    [
        ('imputer', my_imputer),
        ('encoder', my_cat_encoder),
        ('scaler', my_scaler)
    ]
)

In [146]:
preprocessor

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



##### 5.1 Прогоните свои данные через `preprocessor` и убедитесь, что ваши данные проходят через него корректно и уже готовы к ML-модели

In [147]:
preprocessor.fit_transform(X_train)

Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,Sex,ST_Slope,ExerciseAngina,FastingBS
485,0.969838,0.339016,0.127137,-0.324520,0.317046,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1
486,0.117054,-1.266031,0.099443,1.689837,-0.440356,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,1
117,0.543446,-0.159102,1.244113,-0.247045,0.601071,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1
361,-0.735729,1.501291,-1.876035,-0.479470,-0.819056,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0
296,-0.415935,0.671094,-1.876035,0.101594,-0.156330,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276,-0.309337,0.117630,-0.399042,0.527708,1.074447,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0
201,-0.842327,-0.712567,0.247142,0.527708,-0.819056,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,0
462,0.543446,-0.601874,0.274836,-0.750634,0.411721,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0
252,0.756642,-0.435834,0.819477,-0.828109,-0.819056,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0,0


In [148]:
preprocessor.transform(X_test)

Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,Sex,ST_Slope,ExerciseAngina,FastingBS
356,-0.842327,-0.989299,-1.876035,-0.905584,0.601071,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0
763,0.436848,-0.048409,0.191755,1.418673,2.210549,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,0
817,0.650044,-0.435834,0.505616,0.179069,1.831848,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0
735,-0.522533,-0.712567,-0.140568,0.101594,1.074447,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0
892,-1.588513,0.283669,0.154830,0.605183,-0.819056,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,-0.948925,0.394362,0.191755,-0.556946,-0.819056,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,1
752,0.223652,-0.435834,0.422535,0.295282,0.317046,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1
492,-0.522533,-0.159102,-1.876035,0.334020,2.021198,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0
622,0.543446,-1.266031,0.330223,0.217807,0.317046,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0


#### 6.ML-модели

* `LogisticRegression` (из `sklearn.linear_model`)  
* `LogisticRegression with regularization` (из `sklearn.linear_model`)  
* `KNeighborsClassifier` (из `sklearn.neighbors`)  
* `DecisionTree` (из `sklearn.tree`)  

##### 6.1 Обучите свой `Pipeline` с помощью метода `.fit()` с разными моделями.

In [161]:
ml_pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('LogistReg', LogisticRegression())
    ]
)

In [162]:
ml_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



#### 7. С помощью метода `.predict()` (на вход поступают только матрица признаков, без целевой переменной) предсказать значения на обучающей выборке (`X_train`) и валидационной выборке (`X_valid`).

In [163]:
print('train_predict:', accuracy_score(y_train, ml_pipeline.predict(X_train)))
print('test predict:', accuracy_score(y_test, ml_pipeline.predict(X_test)))

train_predict: 0.8460490463215259
test predict: 0.8695652173913043


In [164]:
coeffs = np.round(ml_pipeline['LogistReg'].coef_[0], 3)
features = list(preprocessor.transform(X_train).columns)

In [165]:
pd.DataFrame(data={'weight': coeffs}, index=features).sort_values(by='weight', key=lambda x: abs(x), ascending=False)

Unnamed: 0,weight
ST_Slope,-1.822
ChestPainType_ASY,1.19
ExerciseAngina,1.021
Sex,0.993
FastingBS,0.912
ChestPainType_ATA,-0.515
ChestPainType_NAP,-0.503
Cholesterol,-0.46
RestingECG_LVH,0.242
MaxHR,-0.239


##### 7.1 С помощью функции оценки качества (`accuracy_score`) собрать следующую таблицу ниже

* значение функции на обучающих данных
* значение функции на валидационных данных 
    
Результатом выполнения этого пункта будет `DataFrame` формата: 
    
|  |train|valid|
|--|-----|-----|
|**LogReg**|  train_score  | valid_score    |
|**LogReg with l1**|  train_score  | valid_score    |
|**LogReg with l2**|  train_score  | valid_score    |
|**KNN**| train_score  |  valid_score   |
|**SVC**| train_score  |  valid_score   |
|**Tree**| train_score | valid_score    |

In [173]:
models = {
    'LogReg': LogisticRegression(),
    'LogReg with l1': LogisticRegression(penalty='l1', solver='liblinear'), 
    'LogReg with l2': LogisticRegression(penalty='l2'),  
    'KNN': KNeighborsClassifier(),
    'SVC': SVC(),
    'Tree': DecisionTreeClassifier()
}
results = []

In [174]:
for model_name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)  
    ])
    
    pipeline.fit(X_train, y_train)

    train_pred = pipeline.predict(X_train)
    valid_pred = pipeline.predict(X_test)
    train_score = accuracy_score(y_train, train_pred)
    valid_score = accuracy_score(y_test, valid_pred)

    results.append({
        'Model': model_name,
        'Train Accuracy': train_score,
        'Valid Accuracy': valid_score
    })

results_df = pd.DataFrame(results)

In [178]:
results_df

Unnamed: 0,Model,Train Accuracy,Valid Accuracy
0,LogReg,0.846049,0.869565
1,LogReg with l1,0.848774,0.869565
2,LogReg with l2,0.846049,0.869565
3,KNN,0.865123,0.880435
4,SVC,0.885559,0.880435
5,Tree,1.0,0.782609


#### 8. Теперь реализуйте __кросс-валидацию__ с KFold=5 и выведите средний __score__

In [190]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

results_kf = {
    'Model': [],
    'cross_val_score': []
}

for model_name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)  
    ])
    scores = cross_val_score(pipeline, X, y, cv=kf, scoring='accuracy')
    
    results_kf['Model'].append(model_name)
    results_kf['cross_val_score'].append(scores.mean()) 


In [193]:
results_df1 = pd.DataFrame(results_kf)
results_df1.set_index('Model', inplace=True)  
print(results_df1)

                cross_val_score
Model                          
LogReg                 0.848539
LogReg with l1         0.848533
LogReg with l2         0.848539
KNN                    0.868140
SVC                    0.864879
Tree                   0.802804


|  |cross_val_score|
|--|-----|
|**LogReg**|  your_score |
|**LogReg with l1**|  your_score  |
|**LogReg with l2**|  your_score  |
|**KNN**| your_score  |
|**SVC**| your_score  |
|**Tree**| your_score |

<img src="https://icons.iconarchive.com/icons/icons8/windows-8/256/Programming-Github-icon.png" width=32 /> Пора сохранить изменения для __github__. 

1. Перейди в командной строке в папку, в которой расположен этот нотбук. 
2. Выполни команду `git add 06-01-task.ipynb`
3. Выполни команду `git commit -m "base models in progress"`
4. Выполни команду `git push`

##### 9. Теперь, когда вы проделали весь pipeline и обучили базовую модель, можно вернуться к началу и пробовать новые идеи и искать точки роста для ваших моделей, в том числе и добавление новых фичей

<img src="https://icons.iconarchive.com/icons/icons8/windows-8/256/Programming-Github-icon.png" width=32 /> Сохрани файл для __github__ и выполни команду `!git status` в ячейке ниже.


In [15]:
# code