In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score, classification_report

import catboost as cb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import time

In [6]:
data = pd.read_csv('data/medical_examination.csv')

In [7]:
oof_data = pd.read_csv('data/oof.csv')

In [8]:
data_oof = pd.concat([data, oof_data], axis=1)

In [9]:
data_oof.head()

Unnamed: 0,id,age,sex,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,best_best_catboost_2_0,best_best_catboost_3_alco,best_best_catboost_3_smoke
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,0.160621,0.163162,0.168077
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,0.869001,0.858915,0.85079
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,0.748102,0.755549,0.767979
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,0.89187,0.898499,0.878329
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,0.110501,0.120273,0.118629


In [10]:
# CatBoost
def catboost_base(x, x_val, y, y_val):
    cb_params = {
    "n_estimators": 2000,
    "learning_rate": 0.001,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": False,
    "max_depth": 6,
    "l2_leaf_reg": 10,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 42
    }

    model = cb.CatBoostClassifier(**cb_params)
    model.fit(x, y,
        )
    y_pred = model.predict(x_val)
    ra = roc_auc_score(y_val, y_pred)
    print("roc_auc_score: ",ra)
    print("confusion_matrix: ",confusion_matrix(y_val, y_pred))
    print("accuracy_score: ",accuracy_score(y_val, y_pred))
    return model


In [11]:
def train_split(x, y, Random_State= 27):
    """x: dataframe, y: name target name"""
    x_train, x_valid, y_train, y_valid = train_test_split(x.drop(y, axis=1), x[y], test_size = 0.2, random_state=Random_State)
    x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size = 0.2, random_state=Random_State)
    print(x_train.shape, x_valid.shape, y_train.shape, y_valid.shape, x_test.shape, y_test.shape)
    return x_train, x_valid, y_train, y_valid, x_test, y_test

In [15]:
x_train, x_valid, y_train, y_valid, x_test, y_test = train_split(data_oof, "cardio")

(44800, 15) (14000, 15) (44800,) (14000,) (11200, 15) (11200,)


In [16]:
model = catboost_base(x_train, x_valid, y_train, y_valid)

roc_auc_score:  0.7407072167560713
confusion_matrix:  [[5588 1396]
 [2236 4780]]
accuracy_score:  0.7405714285714285


In [17]:
model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,best_best_catboost_2_0,48.936342
1,best_best_catboost_3_alco,29.367375
2,best_best_catboost_3_smoke,14.569888
3,ap_hi,1.509906
4,age,1.364358
5,weight,0.710488
6,cholesterol,0.686696
7,gluc,0.50994
8,id,0.505568
9,ap_lo,0.436463
