In [1]:
import pandas as pd
import numpy as np
import random
import os
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'NanumGothic' # 폰트 변경
plt.rcParams['axes.unicode_minus'] = False # 축 값 마이너스 깨짐 해결
from collections import defaultdict
from scipy import stats as stats
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import KFold
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [4]:
test_cols = [
    # '나이', 
    '키(cm)', 
    '몸무게(kg)', 
    # 'BMI', 
    '시력', 
    # '충치', 
    # '공복 혈당', 
    '혈압',
    # '중성 지방', 
    # '혈청 크레아티닌', 
    '콜레스테롤', 
    '고밀도지단백', 
    '저밀도지단백', 
    '헤모글로빈', 
    '요 단백',
    # '간 효소율'
]
train_cols = test_cols + ['label']

In [6]:
train = train[train_cols]
x_test = test[test_cols]

In [10]:
from pycaret.classification import *

x_train = setup(data=train,
                target='label',
                train_size=0.8,
                fold=5)

Unnamed: 0,Description,Value
0,Session id,559
1,Target,label
2,Target type,Binary
3,Original data shape,"(7000, 10)"
4,Transformed data shape,"(7000, 10)"
5,Transformed train set shape,"(5600, 10)"
6,Transformed test set shape,"(1400, 10)"
7,Numeric features,9
8,Preprocess,True
9,Imputation type,simple


In [14]:
best = compare_models(sort='Accuracy')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.7148,0.7911,0.6441,0.6056,0.6241,0.3948,0.3954,0.036
gbc,Gradient Boosting Classifier,0.7139,0.789,0.6952,0.5952,0.6411,0.4058,0.4093,0.012
lightgbm,Light Gradient Boosting Machine,0.7095,0.7856,0.6257,0.6012,0.6127,0.3805,0.3811,0.018
qda,Quadratic Discriminant Analysis,0.7093,0.7697,0.7905,0.576,0.6663,0.4198,0.4365,0.008
rf,Random Forest Classifier,0.7086,0.7874,0.6028,0.6037,0.603,0.3729,0.3731,0.038
xgboost,Extreme Gradient Boosting,0.7075,0.7815,0.5887,0.6055,0.5966,0.3673,0.3677,0.01
et,Extra Trees Classifier,0.7055,0.7891,0.5946,0.6006,0.5972,0.3653,0.3655,0.054
ada,Ada Boost Classifier,0.7046,0.7823,0.6558,0.5885,0.62,0.3797,0.3815,0.026
lr,Logistic Regression,0.6982,0.7824,0.5435,0.5994,0.5696,0.3382,0.3395,0.014
nb,Naive Bayes,0.6973,0.7636,0.7457,0.5671,0.6442,0.3893,0.4005,0.008


In [15]:
best_tune = tune_model(best)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7205,0.796,0.708,0.6012,0.6503,0.4201,0.424
1,0.7259,0.7967,0.6886,0.6126,0.6483,0.425,0.427
2,0.6911,0.7691,0.6813,0.5657,0.6181,0.3625,0.3669
3,0.7125,0.7874,0.6383,0.6032,0.6203,0.3893,0.3897
4,0.7295,0.8056,0.6796,0.6208,0.6489,0.4296,0.4307
Mean,0.7159,0.791,0.6792,0.6007,0.6372,0.4053,0.4077
Std,0.0137,0.0123,0.0228,0.0189,0.0147,0.0257,0.0251


Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [16]:
evaluate_model(best_tune)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [17]:
predict_model(best_tune)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,CatBoost Classifier,0.6993,0.7825,0.6654,0.5787,0.619,0.3726,0.3751


Unnamed: 0,키(cm),몸무게(kg),시력,혈압,콜레스테롤,고밀도지단백,저밀도지단백,헤모글로빈,요 단백,label,prediction_label,prediction_score
189,165,70,1.20,57,206,61,106,16.6,1,0,1,0.6927
3141,160,70,0.95,52,200,54,128,15.8,1,0,0,0.5830
6776,175,75,1.20,40,175,48,119,17.0,1,1,0,0.5136
2103,170,70,0.35,56,251,58,159,13.7,1,1,0,0.6383
3583,150,55,1.00,52,186,63,112,12.1,1,0,0,0.9777
...,...,...,...,...,...,...,...,...,...,...,...,...
2235,170,65,1.35,49,172,67,96,14.4,1,0,1,0.5216
4291,160,50,1.00,50,131,50,59,15.0,1,0,0,0.5585
1397,170,70,1.10,40,125,51,63,14.0,1,1,0,0.5236
5001,175,80,0.75,59,203,47,124,14.9,1,1,0,0.5177


In [18]:
predict_model(best_tune, data= x_test)

Unnamed: 0,키(cm),몸무게(kg),시력,혈압,콜레스테롤,고밀도지단백,저밀도지단백,헤모글로빈,요 단백,prediction_label,prediction_score
0,165,55,0.90,47,229,59,155,13.700000,1,0,0.8379
1,145,50,0.50,59,200,65,115,12.200000,1,0,0.9793
2,160,75,1.00,34,170,50,73,15.100000,1,1,0.5021
3,180,90,1.35,45,197,55,98,15.200000,1,1,0.6860
4,155,55,0.75,67,230,66,136,15.000000,1,0,0.9092
...,...,...,...,...,...,...,...,...,...,...,...
2995,170,90,0.85,60,221,55,137,14.300000,1,1,0.5341
2996,175,80,1.00,50,167,47,100,13.900000,1,0,0.5391
2997,155,60,1.20,32,186,61,114,12.800000,1,0,0.9738
2998,175,90,1.35,42,163,44,91,16.299999,1,0,0.5541
