In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
#from sklearn.linear_model import MultiTaskLasso
from sklearn.neighbors import KNeighborsRegressor
#from sklearn.metrics import f1_score, roc_auc_score, roc_curve, precision_score, recall_score
#from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
#from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
#from sklearn.utils import shuffle
#from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

from catboost import CatBoostClassifier
#from imblearn.over_sampling import SMOTE

from phik.report import plot_correlation_matrix
from phik import report

import warnings
warnings.filterwarnings("ignore")

In [None]:
sns.set_style('darkgrid')
sns.set(rc={"figure.figsize":(12, 5)})

In [43]:
data = pd.read_csv('/Users/roman/Desktop/Яндекс Практикум/Medicine/train.csv')
data_test = pd.read_csv('/Users/roman/Desktop/Яндекс Практикум/Medicine/test.csv')

In [None]:
def general_info(dataset):
    display(dataset.info())
    display(dataset.head(5))
    display(dataset.describe())
    dataset.hist(bins=40, figsize=(18,18), linewidth=0.5)
plt.show()

In [None]:
corr_data = data.drop(['id', 'cardio'], axis=1)

In [None]:
plt.subplots(figsize=(29, 8))
sns.heatmap(corr_data.corr(), annot=True, fmt=".1f", square=True, cmap='YlGnBu')\
.set(title='Матрица корреляции признаков')
plt.show()

In [None]:
data.columns

In [None]:
phik_data = data.drop(['id', 'cardio'], axis=1).copy()
interval_cols = ['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active']
data_phik_overview = phik_data.phik_matrix(interval_cols=interval_cols)
plot_correlation_matrix(data_phik_overview.values, 
                        x_labels=data_phik_overview.columns, 
                        y_labels=data_phik_overview.index, 
                        vmin=0, vmax=1, color_map='YlGnBu', 
                        title='Матрица кореляции $\phi_K$', 
                        figsize=(29, 12))

In [None]:
def explore(series_name):
    display(data[series_name].describe().to_frame())
    
    sns.boxplot(data=data, x=series_name,)\
    .set(xlabel=f'Значения признака');
    plt.show()

    sns.histplot(data=data, x=series_name, kde=True)\
    .set(title=f'Гистограмма распределения признака',
         ylabel='Количество записей');

In [None]:
explore('age')

In [None]:
data[data['age'] < 12000]

In [None]:
explore('height')

In [None]:
data[data['height'] < 100]

In [44]:
def ed_h(height):
    if height < 100:
        height = height + 100
    return height

In [45]:
data['height'] = data.apply(lambda x: ed_h(x['height']), axis=1)
data_test['height'] = data_test.apply(lambda x: ed_h(x['height']), axis=1)

In [46]:
data = data[data['height'] <= 200]

In [47]:
data = data[data['weight'] >= 35]

In [None]:
explore('ap_hi')

In [None]:
data[data['ap_hi'] < 0]

In [None]:
data[data['ap_hi'] > 1000]

In [48]:
def ap_edit(ap):
    if ap < 0:
        ap = ap * (-1)
    if 0 < ap <= 20:
        ap = ap * 10
    if 300 < ap <= 1000:
        ap = ap / 10
    if 1000 < ap <= 10000:
        ap = ap / 100
    if ap > 10000:
        ap = ap / 1000
    return ap

In [49]:
data['ap_hi'] = data.apply(lambda x: ap_edit(x['ap_hi']), axis=1)
data['ap_lo'] = data.apply(lambda x: ap_edit(x['ap_lo']), axis=1)

data_test['ap_hi'] = data_test.apply(lambda x: ap_edit(x['ap_hi']), axis=1)
data_test['ap_lo'] = data_test.apply(lambda x: ap_edit(x['ap_lo']), axis=1)

In [None]:
data[data['ap_hi'] < 50]

In [50]:
def idd_edit(idd, ap):
    if idd == 12494:
        ap = 120
    if idd == 60477:
        ap = 120
    if idd == 7657:
        ap = 120
    if idd == 57646:
        ap = 120
    return ap

data['ap_hi'] = data.apply(lambda x: idd_edit(x['id'], x['ap_hi']), axis=1)

In [None]:
data[data['ap_hi'] < data['ap_lo']]

In [51]:
data_tmp = data[['ap_hi', 'ap_lo']]
data['ap_hi'], data['ap_lo'] = data_tmp.max(axis=1), data_tmp.min(axis=1)

data_test_tmp = data_test[['ap_hi', 'ap_lo']]
data_test['ap_hi'], data_test['ap_lo'] = data_test_tmp.max(axis=1), data_test_tmp.min(axis=1)

In [37]:
data[data['ap_lo'].isna()]

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio


In [56]:
data[data['ap_lo'] == 0] = data[data['ap_lo'] > 0].median()

In [60]:
data_test[data_test['ap_lo'] == 0]['ap_lo'] = data[data['ap_lo'] > 0].median()

In [62]:
data[['gender', 'gluc', 'smoke', 'alco', 'active']] =\
     data[['gender', 'gluc', 'smoke', 'alco', 'active']].astype('str')

In [63]:
data_test[['gender', 'gluc', 'smoke', 'alco', 'active']] =\
     data_test[['gender', 'gluc', 'smoke', 'alco', 'active']].astype('str')

In [125]:
encoder_ohe = OneHotEncoder(drop='first', handle_unknown='ignore', sparse=False)

#scaler = StandardScaler()
scaler = MinMaxScaler()

In [126]:
x_coded = data.drop(['id', 'cardio'], axis=1).copy()

cat_features = x_coded.select_dtypes(include='object').columns.to_list()

num_features = x_coded.drop(['gender', 'gluc', 'smoke', 'alco', 'active'],\
                            axis=1).select_dtypes(exclude='object').columns.to_list()

encoder_ohe.fit(x_coded[cat_features])

x_coded[encoder_ohe.get_feature_names_out()] = \
        encoder_ohe.transform(x_coded[cat_features])

x_coded = x_coded.drop(cat_features, axis=1)
        
x_coded[num_features] = scaler.fit_transform(x_coded[num_features])

In [127]:
test_coded = data_test.drop(['id'], axis=1).copy()

In [128]:
test_coded[encoder_ohe.get_feature_names_out()] =\
       encoder_ohe.transform(test_coded[cat_features])

test_coded = test_coded.drop(cat_features, axis=1)

test_coded[num_features] = scaler.transform(test_coded[num_features])

In [129]:
x = x_coded.copy()
y = data['cardio'].copy()

cat_features2 = x_coded.select_dtypes(include='object').columns.to_list()

In [130]:
x2 = data.drop(['id', 'cardio'], axis=1).copy()

In [144]:
x_coded.sample(3)

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cholesterol,gender_2,gluc_2,gluc_3,smoke_1,alco_1,active_1
8992,0.957569,0.66,0.2,0.444444,0.523256,1.0,1.0,0.0,0.0,0.0,0.0,1.0
55424,0.476268,0.65,0.236364,0.333333,0.406977,0.0,1.0,0.0,0.0,0.0,0.0,1.0
28533,0.596283,0.65,0.175758,0.333333,0.406977,0.5,0.0,0.0,0.0,0.0,0.0,0.0


### CatBoost

In [131]:
cb = CatBoostClassifier(random_state = 42,
                      cat_features = cat_features2,
                       verbose=False)#,
                      #objective = 'logloss')
cb.fit(x2, y)

<catboost.core.CatBoostClassifier at 0x7fad3b6f46a0>

### Дерево решений

In [72]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_params = { 'max_depth': range (1,26,5),
              'min_samples_leaf': range (1,5),
              'min_samples_split': range (2,12,2) }

In [132]:
dt_model.fit(x, y)

In [None]:
dt_grid = GridSearchCV(dt_model, dt_params, cv=5, scoring='roc_auc', error_score='raise')
dt_grid.fit(x, y)
dt_grid.best_params_

### Случайный лес

In [74]:
rf_model = RandomForestClassifier(random_state=1488)
rf_params = { 'n_estimators': range(200,231,10),
              'max_depth': range(2,18,3) }

In [133]:
rf_model.fit(x, y)

In [None]:
rf_grid = GridSearchCV(rf_model, rf_params, cv=5, scoring='roc_auc')
rf_grid.fit(x, y)
rf_grid.best_params_

In [134]:
gbm = GradientBoostingClassifier(random_state=42,
                                 n_estimators=73,
                                 max_depth=5)
gbm.fit(x, y)

In [82]:
gbm_model = GradientBoostingClassifier(random_state=42)
gbm_params = { 'n_estimators': range(70,91,5),
              'max_depth': range(5,16,3),
              'min_samples_leaf': range (1,9),
              'min_samples_split': range (2,25,3)}

In [84]:
%%time
gbm_model.fit(x,y)

CPU times: user 4.31 s, sys: 15 ms, total: 4.33 s
Wall time: 4.33 s


In [85]:
%%time
gbm_grid = GridSearchCV(gbm_model, gbm_params, cv=5)#, scoring='roc_auc')
gbm_grid.fit(x, y)
gbm_grid.best_params_

KeyboardInterrupt: 

In [135]:
def get_scores(x_trn, y_trn, x2_trn):
    #DecisionTree
    dt_model = DecisionTreeClassifier(max_depth=6,
                                  min_samples_leaf=4,
                                  min_samples_split=10,
                                  #class_weight='balanced',
                                  random_state=42)
    #dt_model.fit(x_trn, y_trn)

    dt_auc = cross_val_score(dt_model, 
                             x_trn, y_trn,
                             n_jobs=-1, 
                             scoring='roc_auc')
    print('Дерево решений')
    print('Метрика AUC:', f'{dt_auc.mean():.5f}')
    print()
    
    #RandomForest
    rf_model = RandomForestClassifier(n_estimators=210,
                                max_depth=10,
                                #class_weight='balanced',
                                random_state=42,
                                n_jobs=-1)    
    #rf_model.fit(x_trn, y_trn)

    rf_auc = cross_val_score(rf_model, 
                             x_trn, y_trn,
                             #n_jobs=-1, 
                             scoring='roc_auc')
    print('Случайный лес')
    print('Метрика AUC:', f'{rf_auc.mean():.5f}')
    print()
    
    # gbm
    gbm = GradientBoostingClassifier(random_state=42,
                                          n_estimators=73,
                                          max_depth=5)
    
    gbm_auc = cross_val_score(gbm,
                              x_trn, y_trn,
                              scoring='roc_auc')
    
    print('Метод GBM')
    print('Метрика AUC:', f'{gbm_auc.mean():.5f}')
    print()
    
    #CatBoost
    cb = CatBoostClassifier(random_state = 42,
                           cat_features = cat_features2,
                           verbose=False)
    cb_auc = cross_val_score(cb,
                              x2_trn, y_trn,
                              scoring='roc_auc')
    print('Метод CB')
    print('Метрика AUC:', f'{cb_auc.mean():.5f}')
    print()

In [136]:
get_scores(x, y, x2)

Дерево решений
Метрика AUC: 0.79502

Случайный лес
Метрика AUC: 0.80247

Метод GBM
Метрика AUC: 0.80341

Метод CB
Метрика AUC: 0.80237



In [None]:
##### x_test = data_test.drop(['id'], axis=1)

In [None]:
gbm_test_model = GradientBoostingClassifier(random_state=42,
                                          n_estimators=59,
                                          max_depth=5)
gbm_test_model.fit(x, y)

In [None]:
rftm = RandomForestClassifier(n_estimators=210,
                                max_depth=10,
                                random_state=1488)
rftm.fit(x, y)

In [137]:
test_coded = test_coded.drop(['cardio'], axis=1)

KeyError: "['cardio'] not found in axis"

In [138]:
test_coded['cardio'] = gbm.predict_proba(test_coded)[:, 1]

In [143]:
gbm.feature_importances_

array([0.13502057, 0.00882878, 0.02735192, 0.71866515, 0.01364808,
       0.07219768, 0.00171868, 0.00169388, 0.00660817, 0.00408533,
       0.0024874 , 0.00769437])

In [115]:
test_coded.head(3)

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cholesterol,gender_2,gluc_2,gluc_3,smoke_1,alco_1,active_1,cardio
0,-0.235476,-1.302733,0.749994,0.171732,-0.118017,-0.539198,0.0,0.0,0.0,0.0,0.0,1.0,0.496164
1,-0.173046,0.701015,-0.363153,0.171732,0.833124,-0.539198,1.0,0.0,0.0,0.0,0.0,1.0,0.571574
2,0.390449,-0.551327,-0.293582,-0.41353,-0.593587,-0.539198,0.0,0.0,0.0,0.0,0.0,0.0,0.431884


In [139]:
test_coded['id'] = data_test['id']

In [None]:
data_test.info()

In [140]:
test_coded[['id', 'cardio']].to_csv('/Users/roman/Desktop/Яндекс Практикум/Medicine/gbm_prefinal5.csv', index=False)

In [141]:
pd.Series(data=gbm.feature_importances_,
          index=data_test.columns,
          name='feature_importance').sort_values(ascending=False).head(15).to_frame()

Unnamed: 0,feature_importance
height,0.718665
id,0.135021
ap_hi,0.072198
gender,0.027352
weight,0.013648
age,0.008829
active,0.007694
gluc,0.006608
smoke,0.004085
alco,0.002487


In [148]:
import pickle

In [152]:
pickle.dump(gbm, '/Users/roman/Desktop/Яндекс Практикум/Medicine/model.pkl')

TypeError: file must have a 'write' attribute