In [None]:
import catboost
!pip install imblearn
!pip install phik
!pip install lightgbm
!pip install xgboost
!pip install optuna
!pip install catboost

# Import Libraries

In [None]:
from functools import reduce
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy import stats as st
import warnings
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from phik.report import plot_correlation_matrix
from scipy import stats
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split,GridSearchCV,KFold
from sklearn.dummy import DummyClassifier
from sklearn.metrics import make_scorer,plot_confusion_matrix
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.metrics import roc_curve, roc_auc_score,accuracy_score,precision_score,recall_score
import catboost as cb
import optuna


In [None]:
warnings.filterwarnings('ignore')
%matplotlib inline
plt.rcParams['figure.figsize']=16,8
random_seed=12345

# Read data

In [None]:
# Load both splits from the local ./data directory.
test_df = pd.read_csv('./data/test.csv')
train_df = pd.read_csv('./data/train.csv')

In [None]:
# Summary of the training frame: columns, dtypes and non-null counts.
train_df.info()

In [None]:
# Summary of the test frame: columns, dtypes and non-null counts.
test_df.info()

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Preview the first rows of the test frame.
test_df.head()

# EDA

In [None]:
# Total number of missing cells in each split (all columns together).
print('Количество пропусков в train -', train_df.isna().to_numpy().sum())
print('Количество пропусков в test -', test_df.isna().to_numpy().sum())

In [None]:
# Number of fully duplicated rows in each split (repeats after the first occurrence).
print('Количество дубликатов в train -', train_df.duplicated().to_numpy().sum())
print('Количество дубликатов в test -', test_df.duplicated().to_numpy().sum())

Переведём признак age из дней в годы (делим на 365)

In [None]:
# Rescale age by 1/365 in both splits (presumably days -> years; confirm
# against the dataset description).
# Note: the cell is not idempotent - re-running it divides the column again.
train_df['age'] = train_df['age'].div(365)
test_df['age'] = test_df['age'].div(365)

Удалим неинформативный признак id

In [None]:
# id is treated as uninformative for training, so it is removed from the
# training frame only (test_df keeps it for the submission file).
train_df = train_df.drop('id', axis=1)

In [None]:
# Frequency of each value of the gender column in the training data.
train_df['gender'].value_counts()

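Наблюдается дисбаланс по признаку gender: одна категория встречается примерно в 2 раза реже другой (предположительно, женщин в 2 раза меньше, чем мужчин — кодировку значений нужно уточнить по описанию данных)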

In [None]:
# Column groups used by the preprocessing pipeline and by CatBoost.
# gender is listed explicitly: a column that appears in neither list is
# silently dropped by the ColumnTransformer built from these lists
# (its default is remainder='drop').
categorical_features = ['cholesterol', 'gluc', 'smoke', 'alco', 'active', 'gender']
numeric_features = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']

In [None]:
# Descriptive statistics of the numeric features, used to spot implausible values.
train_df[numeric_features].describe()

Как видно из описательной статистики, некорректные значения имеются практически во всех столбцах

In [None]:
# Rows whose ap_hi value is negative.
train_df[train_df['ap_hi']<0]

In [None]:
# Rows whose ap_lo value is negative.
train_df[train_df['ap_lo']<0]

Отрицательных значений мало, поэтому строки с ними можно спокойно удалить

In [None]:
# Keep only rows where both pressure readings are strictly positive
# (this removes zeros as well as negative values).
train_df = train_df.loc[(train_df['ap_hi'] > 0) & (train_df['ap_lo'] > 0)]

In [None]:
# Summary statistics of ap_lo after the non-positive readings were removed.
train_df['ap_lo'].describe()

Посмотрим 99% и 1% квантили

In [None]:
# 1st percentile of every column.
train_df.quantile(.01)

In [None]:
# 99th percentile of every column.
train_df.quantile(.99)

In [None]:
def quantile(df, col, lower=.01, upper=.99):
    """Return a pair of quantiles of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    col : str
        Name of the column to summarise.
    lower, upper : float, optional
        Quantile levels in [0, 1]. The defaults (0.01 and 0.99) reproduce the
        1st and 99th percentiles used for outlier trimming.

    Returns
    -------
    tuple
        ``(df[col].quantile(lower), df[col].quantile(upper))``.

    Raises
    ------
    ValueError
        If ``lower`` is greater than ``upper``.
    """
    if lower > upper:
        raise ValueError(f"lower ({lower}) must not exceed upper ({upper})")
    column = df[col]
    return column.quantile(lower), column.quantile(upper)

In [None]:
# Columns whose low-end outliers are trimmed by the loop in the next cell.
cols = ['height','weight']

In [None]:
# Drop rows at or below the 1st percentile of each listed column.
# Only the lower bound returned by quantile() is applied here; the bound for a
# column is computed on the frame already filtered by the previous columns.
for col in cols:
    lower_bound, _ = quantile(train_df, col)
    train_df = train_df.loc[train_df[col] > lower_bound]

In [None]:
# Re-check the summary statistics after trimming height and weight.
train_df.describe()

Удалим выбросы по артериальному давлению (АД)

In [None]:
# Keep blood-pressure readings inside the open ranges
# 60 < ap_hi < 300 and 40 < ap_lo < 250.
train_df = train_df.loc[
    (train_df['ap_hi'] > 60)
    & (train_df['ap_lo'] > 40)
    & (train_df['ap_hi'] < 300)
    & (train_df['ap_lo'] < 250)
]

In [None]:
# Keep only rows where ap_lo is strictly below ap_hi.
train_df = train_df.loc[train_df['ap_lo'] < train_df['ap_hi']]

In [None]:
# Upper cut-off for height: keep rows strictly below 200.
train_df = train_df.loc[train_df['height'] < 200]

In [None]:
# Inspect the rows with weight above 140.
train_df[train_df['weight']>140]

Довольно странно видеть людей с ожирением и нормальным АД; удалим эти строки позже

In [None]:
# Summary statistics after all the filters above.
train_df.describe()

In [None]:
# Pairwise correlation of the numeric features (DataFrame.corr defaults to Pearson).
train_df[numeric_features].corr()

In [None]:
# Pairwise scatter plots of the numeric features, coloured by the cardio target;
# the figure is resized to 16x8 inches.
pair_grid = sns.pairplot(train_df, vars=numeric_features, hue='cardio')
pair_grid.fig.set_size_inches(16, 8)

In [None]:
# Phi_K correlation matrix over all columns of train_df, drawn as a heat map.
# phik_matrix is presumably the DataFrame method registered by the phik
# package; interval_cols=None leaves the choice of interval (continuous)
# columns to phik - confirm against the phik documentation.
phik_overview = train_df.phik_matrix(interval_cols=None)
plot_correlation_matrix(phik_overview.values,
                        x_labels=phik_overview.columns,
                        y_labels=phik_overview.index,
                        vmin=0, vmax=1, color_map="Greens",
                        title=r"correlation $\phi_K$",
                        fontsize_factor=0.8,
                        figsize=(16, 16))
plt.tight_layout()

Мультиколлинеарность не обнаружена, можно приступать к подбору гиперпараметров

In [None]:
# Column summary of the cleaned training frame (row count after filtering).
train_df.info()

# Подготовка признаков и выбор модели

In [None]:
def importance(model, features):
    """Plot and return the feature importances of a fitted model.

    Parameters
    ----------
    model : estimator exposing ``feature_importances_``.
    features : pandas.DataFrame
        Frame whose column names label the importances; its column order must
        match the order of ``model.feature_importances_``.

    Returns
    -------
    pandas.Series
        Importances indexed by feature name, sorted in descending order.
        A horizontal bar plot of the same values is drawn as a side effect.
    """
    ranked = pd.Series(model.feature_importances_, index=features.columns)
    ranked = ranked.sort_values(ascending=False)
    axes = sns.barplot(x=ranked, y=ranked.index)
    axes.set(xlabel='Важность признаков', ylabel='Признаки')
    return ranked

In [None]:
# Separate the cardio target from the remaining columns.
target_train = train_df['cardio']
features_train = train_df.drop('cardio', axis=1)

In [None]:
# Sanity check: the feature matrix and the target must have the same number of rows.
features_train.shape,target_train.shape

In [None]:
# Preprocessing for linear models: standardise the numeric columns and one-hot
# encode the categorical ones. Categories unseen during fit are encoded as
# all zeros (handle_unknown="ignore"). Columns listed in neither group are
# dropped, which is ColumnTransformer's default remainder behaviour.
numeric_transformer = Pipeline([("scaler", StandardScaler())])
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

## Logistic Regression

In [None]:
# Logistic-regression pipeline: shared preprocessing followed by the classifier.
clf = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(random_state=random_seed)),
])

In [None]:
# Tune the regularisation strength C of the logistic regression with 5-fold
# cross-validation scored by ROC AUC, then list the candidates best-first.
param_grid = {"classifier__C": [0.1, 1.0, 10, 100]}
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(features_train, target_train)

print("Best params:")
print(grid_search.best_params_)
print(f"Internal CV score: {grid_search.best_score_:.3f}")

cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(
    "mean_test_score", ascending=False
)
cv_results.loc[
    :, ["mean_test_score", "std_test_score", "param_classifier__C"]
].head(5)

## CatBoost

In [None]:
# Pool bundles the training features, the target and the list of categorical
# columns for CatBoost.
train_dataset = cb.Pool(features_train, target_train, cat_features=categorical_features)
model = cb.CatBoostClassifier(iterations=100,
                              loss_function='Logloss',
                              eval_metric='AUC',
                              logging_level='Silent',
                              random_seed=random_seed)

# Declare parameters to tune and values to try.
# CatBoost limits the tree depth to 16, so every depth candidate stays within
# that bound; larger values such as 20 or 30 abort the search with an error.
grid = {'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.5, 1],
        'depth': [4, 6, 8, 10],
        'l2_leaf_reg': [1, 3, 5, 10, 30],
        }

# Find optimum parameters with 5-fold stratified, shuffled cross-validation.
# NOTE(review): the later prediction cell relies on grid_search refitting
# `model` with the best parameters (CatBoost's default refit=True) - confirm.
model.grid_search(grid, train_dataset, cv=5,
                  plot=True,
                  calc_cv_statistics=True,
                  stratified = True,
                  shuffle=True)

In [None]:
# Test features: everything except the id column, which is kept in test_df for the submission.
features_test = test_df.drop('id', axis=1)

In [None]:
# Predictions of the tuned CatBoost model on the test set:
# hard class labels plus the second column of predict_proba.
predicted_test = model.predict(features_test)
pred_proba_test = model.predict_proba(features_test)[:, 1]
predicted_test

In [None]:
# Build the submission table from raw values so that every prediction is paired
# with its id by position. Concatenating Series along axis=1 aligns on index
# labels instead, which would misalign rows (and introduce NaNs) if test_df
# ever carried a non-default index, e.g. after filtering.
sub = pd.DataFrame({'id': test_df['id'].to_numpy(), 'cardio': predicted_test})
sub.to_csv('submission.csv', index=False)

In [None]:
%%time
roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True,
                             needs_threshold=True)

param_dist ={
    'max_features':[int(x) for x in np.linspace (start = 3, stop = 11, num =2)],
    'min_samples_leaf' : np.arange(3, 11, 2),
    'min_samples_split':[2,5,10,15,100],
    'n_estimators': [100,300,500,800]   }
clf = GridSearchCV(RandomForestClassifier(random_state=random_seed),
                   param_dist,
                   scoring=roc_auc_scorer,
                   )
clf.fit(features_train, target_train)
print('Best params:',clf.best_params_,'Best roc_auc:',clf.best_score_)

# Feature Engineering