In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from category_encoders import BinaryEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer 
from sklearn.metrics import fbeta_score, roc_auc_score
import time
from pycaret.classification import *
import shap


In [None]:
df = pd.read_csv('loan_data.csv', low_memory=False)
df_ind = df[(df.application_type == "Individual")]

In [None]:
#총 21개 컬럼 (annual inc(특성 엔지니어링에 사용후 제거)
select_features = ["loan_amnt", "term", "int_rate", "installment", "sub_grade",
                    "emp_length", "verification_status", "addr_state", "dti", 
                    "fico_range_low", "fico_range_high", "last_fico_range_high", "last_fico_range_low",
                    "avg_cur_bal", "open_acc", "revol_util", "total_acc", 'annual_inc',
                    "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", "pct_tl_nvr_dlq", "loan_status"]

target = "loan_status"

In [None]:
df = df_ind[select_features].dropna()
X, y = df.loc[:, [column for column in df.columns if column != target]], df.loan_status
#Charged Off : 1 / Fully Paid : 0으로 라벨인코딩
binary_y = y.replace({'Charged Off': 1, 'Fully Paid': 0})
X_train, X_1, label_y_train, label_y_1 = train_test_split(X, binary_y, test_size=0.2, random_state=6, stratify=binary_y)
X_val, X_test, y_val, y_test = train_test_split(X_1, label_y_1, test_size=0.5, random_state=6, stratify=label_y_1)

In [None]:
data = pd.concat((X_train, label_y_train), axis=1)
test = pd.concat((X_val, y_val), axis=1)

In [None]:
def new_features(df):
    transformed_df = df.copy()
    
    #dti 0~40으로 제한
    transformed_df['dti'] = np.clip(transformed_df['dti'], 0, 40)

    #비율 특성 0~100으로 제한
    for col in ['revol_util', 'pct_tl_nvr_dlq']:
        transformed_df[col] = np.clip(transformed_df[col], 0, 100)

    # installment_rate : 전체 대출액대비 한달 상환액의 비율
    transformed_df['installment_ratio'] = transformed_df['installment'] / transformed_df['loan_amnt']

    # installment_rate : 전체 대출액대비 한달 상환액의 비율
    transformed_df['loan_vs_inc'] =  transformed_df['loan_amnt'] / (transformed_df['annual_inc'] + 1) #annual_inc==0인 값도 있으므로 분모 0를 방지하기 위헤 1을 더함
    transformed_df.drop(columns=['loan_amnt','annual_inc'], inplace=True)
    
    #CountEncoding
    state_counts = transformed_df['addr_state'].value_counts()
    transformed_df['addr_state_count'] = transformed_df['addr_state'].map(state_counts)
    transformed_df.drop(columns=['addr_state'], inplace=True)

    #avg_cur_bal 구간화
    # 동등한 분포를 가진 5구간 계산
    transformed_df['avg_cur_bal_level'] = pd.qcut(transformed_df['avg_cur_bal'], 5, labels=[1,2,3,4,5])
    transformed_df.drop(columns=['avg_cur_bal'], inplace=True)
    
    #mo_sin_rcnt_rev_tl_op 구간화
    # 동등한 분포를 가진 4구간 계산
    transformed_df['mo_sin_rcnt_rev_tl_op_level'] = pd.qcut(transformed_df['mo_sin_rcnt_rev_tl_op'], 4, labels=[1,2,3,4])
    transformed_df.drop(columns=['mo_sin_rcnt_rev_tl_op'], inplace=True)

    #pct_tl_nvr_dlq
    transformed_df['pct_tl_nvr_dlq_level'] = pd.qcut(transformed_df['pct_tl_nvr_dlq'].rank(method='first'), 3, labels=[1,2,3])
    transformed_df.drop(columns=['pct_tl_nvr_dlq'], inplace=True)

    return transformed_df

In [None]:
pre_data = new_features(data)
pre_test = new_features(test)

In [8]:
pre_data.columns

Index(['term', 'int_rate', 'installment', 'sub_grade', 'emp_length',
       'verification_status', 'dti', 'fico_range_low', 'fico_range_high',
       'last_fico_range_high', 'last_fico_range_low', 'open_acc', 'revol_util',
       'total_acc', 'mo_sin_old_rev_tl_op', 'loan_status', 'installment_ratio',
       'loan_vs_inc', 'addr_state_count', 'avg_cur_bal_level',
       'mo_sin_rcnt_rev_tl_op_level', 'pct_tl_nvr_dlq_level'],
      dtype='object')

In [None]:
#총 22개 컬럼 
nominal_features = ["term", "sub_grade","verification_status"]
ordinal_features = {'mo_sin_rcnt_rev_tl_op_level': ["1", "2", "3", "4"],
                    'avg_cur_bal_level': ["1", "2", "3", "4", "5"],
                    'pct_tl_nvr_dlq_level' : ["1", "2", "3"],
                    "emp_length": ["< 1 year", "1 year", "2 years", "3 years", "4 years", "5 years", "6 years", "7 years", "8 years", "9 years", "10+ years"]    
                   }

numeric_features = ["int_rate", "installment",  "dti", "fico_range_low", "fico_range_high",  
                    "revol_util", "total_acc", 'open_acc', "last_fico_range_high", "last_fico_range_low", 
                    "mo_sin_old_rev_tl_op", "addr_state_count",'installment_ratio', 'loan_vs_inc']

In [10]:
exp = ClassificationExperiment()

##  이상치 remove_outliers = True
catboost AUC : 0.9569, Recall : 0.9298	

In [11]:
#sub_grade max 25넘어가므로, target encoding 

exp.setup(data=pre_data, 
          target=target, 
          test_data=pre_test , 
          ordinal_features=ordinal_features, 
          numeric_features=numeric_features, 
          categorical_features=nominal_features, 
          fix_imbalance=True, 
          fix_imbalance_method="RandomUnderSampler", 
          remove_multicollinearity= True,
          multicollinearity_threshold = 0.9,
          normalize = True,
          normalize_method = 'robust',
          remove_outliers = True,
          fold=3, 
          session_id=6)

Unnamed: 0,Description,Value
0,Session id,6
1,Target,loan_status
2,Target type,Binary
3,Original data shape,"(1060568, 22)"
4,Transformed data shape,"(436367, 22)"
5,Transformed train set shape,"(318526, 22)"
6,Transformed test set shape,"(117841, 22)"
7,Ordinal features,5
8,Numeric features,14
9,Categorical features,3


<pycaret.classification.oop.ClassificationExperiment at 0x1decf2b0160>

In [94]:
#fico_range_low 제거됨
pre_col = pre_data.columns.tolist()
sel_col = exp.X_train_transformed.columns.tolist()
remove_col = [i for i in pre_col if i not in sel_col]
remove_col

['verification_status', 'fico_range_low', 'loan_status']

In [99]:
cell_start_time = time.time() 
exp.compare_models(exclude=["Knn"], fold = 5, round = 4, sort = 'AUC')
cell_end_time = time.time() 
print("CELL RUN TIME : ",cell_end_time - cell_start_time)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.8851,0.9569,0.9298,0.6446,0.7614,0.689,0.709,44.0633
lightgbm,Light Gradient Boosting Machine,0.8845,0.9564,0.931,0.643,0.7607,0.6879,0.7084,26.1267
xgboost,Extreme Gradient Boosting,0.8845,0.9556,0.9286,0.6434,0.7601,0.6873,0.7074,24.6667
gbc,Gradient Boosting Classifier,0.8852,0.9552,0.9296,0.6448,0.7614,0.689,0.709,46.5
ada,Ada Boost Classifier,0.8796,0.9535,0.931,0.6322,0.753,0.6772,0.6994,30.3867
lr,Logistic Regression,0.8944,0.953,0.9098,0.6712,0.7725,0.7058,0.7196,23.1967
et,Extra Trees Classifier,0.8855,0.953,0.9257,0.6462,0.7611,0.6889,0.7081,32.15
lda,Linear Discriminant Analysis,0.8918,0.9527,0.9157,0.6634,0.7694,0.7011,0.7166,21.78
rf,Random Forest Classifier,0.8846,0.9523,0.9281,0.6438,0.7602,0.6875,0.7074,36.6167
nb,Naive Bayes,0.8726,0.9381,0.8762,0.6263,0.7305,0.65,0.6657,22.9767


CELL RUN TIME :  2221.710957765579


- AUC : 0.9571 로 - 0.001 / Recall  : 0.9300 +0.5 상승
- lr 순위권 진입

데이터셋 무작위 섞어 폴드하기 fold_shuffle=True

In [13]:
exp.setup(data=pre_data, 
          target=target, 
          test_data=pre_test , 
          ordinal_features=ordinal_features, 
          numeric_features=numeric_features, 
          categorical_features=nominal_features, 
          fix_imbalance=True, 
          fix_imbalance_method="RandomUnderSampler", 
          remove_multicollinearity= True,
          multicollinearity_threshold = 0.9,
          normalize = True,
          normalize_method = 'robust',
          remove_outliers = True,
          fold=3, 
          fold_shuffle=True,
          session_id=6)

Unnamed: 0,Description,Value
0,Session id,6
1,Target,loan_status
2,Target type,Binary
3,Original data shape,"(1060568, 22)"
4,Transformed data shape,"(436367, 22)"
5,Transformed train set shape,"(318526, 22)"
6,Transformed test set shape,"(117841, 22)"
7,Ordinal features,5
8,Numeric features,14
9,Categorical features,3


<pycaret.classification.oop.ClassificationExperiment at 0x1decf2b0160>

In [101]:
exp.X_train_transformed.columns

Index(['term', 'installment', 'sub_grade', 'emp_length',
       'verification_status_Verified', 'verification_status_Source Verified',
       'verification_status_Not Verified', 'dti', 'fico_range_high',
       'last_fico_range_high', 'last_fico_range_low', 'open_acc', 'revol_util',
       'total_acc', 'mo_sin_old_rev_tl_op', 'installment_ratio', 'loan_vs_inc',
       'addr_state_count', 'avg_cur_bal_level', 'mo_sin_rcnt_rev_tl_op_level',
       'pct_tl_nvr_dlq_level'],
      dtype='object')

In [102]:
cell_start_time = time.time() 
exp.compare_models(exclude=["Knn"], fold = 5, round = 4, sort = 'AUC')
cell_end_time = time.time() 
print("CELL RUN TIME : ",cell_end_time - cell_start_time)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.885,0.9569,0.9299,0.6443,0.7612,0.6887,0.7088,39.738
lightgbm,Light Gradient Boosting Machine,0.8846,0.9565,0.9309,0.6433,0.7608,0.6881,0.7085,19.672
xgboost,Extreme Gradient Boosting,0.8846,0.9559,0.9295,0.6434,0.7604,0.6877,0.7079,20.846
gbc,Gradient Boosting Classifier,0.885,0.9553,0.9299,0.6443,0.7612,0.6887,0.7088,39.846
ada,Ada Boost Classifier,0.881,0.9535,0.9303,0.6353,0.755,0.6801,0.7016,25.066
lr,Logistic Regression,0.8942,0.953,0.9101,0.6707,0.7723,0.7054,0.7194,20.756
et,Extra Trees Classifier,0.8856,0.9528,0.9266,0.6463,0.7615,0.6893,0.7087,28.632
lda,Linear Discriminant Analysis,0.892,0.9527,0.916,0.6639,0.7698,0.7016,0.7171,19.644
rf,Random Forest Classifier,0.8847,0.9523,0.9285,0.644,0.7605,0.6879,0.7079,32.572
nb,Naive Bayes,0.8838,0.9434,0.8835,0.6514,0.7499,0.6765,0.6898,19.276


CELL RUN TIME :  2694.0213103294373


##  **Best** 이상치 remove_outliers = False / fold_shuffle=True / robust / all OneHot
catboost AUC : 0.9573, Recall : 0.9251

In [15]:
exp.setup(data=pre_data, 
          target=target, 
          test_data=pre_test , 
          ordinal_features=ordinal_features, 
          numeric_features=numeric_features, 
          categorical_features=nominal_features, 
          fix_imbalance=True, 
          fix_imbalance_method="RandomUnderSampler", 
          remove_multicollinearity= True,
          multicollinearity_threshold = 0.9,
          normalize=True,
          normalize_method='robust',
          fold=3, 
          fold_shuffle=True,
          session_id=6)

Unnamed: 0,Description,Value
0,Session id,6
1,Target,loan_status
2,Target type,Binary
3,Original data shape,"(1060568, 22)"
4,Transformed data shape,"(489477, 22)"
5,Transformed train set shape,"(371636, 22)"
6,Transformed test set shape,"(117841, 22)"
7,Ordinal features,5
8,Numeric features,14
9,Categorical features,3


<pycaret.classification.oop.ClassificationExperiment at 0x1decf2b0160>

In [114]:
cell_start_time = time.time() 
exp.compare_models(exclude=["knn"], fold = 5, round = 4, sort = 'AUC')
cell_end_time = time.time() 
print("CELL RUN TIME : ",cell_end_time - cell_start_time)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.8887,0.9573,0.9251,0.6538,0.7661,0.6959,0.7139,26.118
lightgbm,Light Gradient Boosting Machine,0.888,0.9566,0.9259,0.652,0.7652,0.6945,0.713,4.39
xgboost,Extreme Gradient Boosting,0.8879,0.9563,0.9243,0.6522,0.7647,0.694,0.7122,4.338
gbc,Gradient Boosting Classifier,0.8884,0.9554,0.9243,0.6533,0.7655,0.6951,0.7131,29.082
ada,Ada Boost Classifier,0.8835,0.9536,0.9261,0.6416,0.758,0.6846,0.7046,11.17
rf,Random Forest Classifier,0.8878,0.9535,0.9236,0.6522,0.7645,0.6937,0.7118,18.508
et,Extra Trees Classifier,0.8885,0.9535,0.921,0.6543,0.7651,0.6947,0.7122,14.508
lr,Logistic Regression,0.8976,0.9531,0.9017,0.6815,0.7763,0.7115,0.7232,3.904
lda,Linear Discriminant Analysis,0.8951,0.9527,0.9085,0.6735,0.7735,0.7072,0.7206,3.248
nb,Naive Bayes,0.8904,0.9438,0.863,0.6731,0.7563,0.687,0.6958,3.05


CELL RUN TIME :  735.5935678482056


In [None]:
exp.setup(data=pre_data, 
          target=target, 
          test_data=pre_test , 
          ordinal_features=ordinal_features, 
          numeric_features=numeric_features, 
          categorical_features=nominal_features, 
          max_encoding_ohe=36,
          fix_imbalance=True, 
          fix_imbalance_method="RandomUnderSampler", 
          remove_multicollinearity= True,
          multicollinearity_threshold = 0.9,
          normalize=True,
          normalize_method='robust',
          fold=3, 
          fold_shuffle=True,
          session_id=6)

In [23]:
exp.X_train_transformed.shape

(371636, 56)

In [19]:
#Best
cell_start_time = time.time() 
exp.compare_models(exclude=["knn"], fold = 5, round = 4, sort = 'AUC')
cell_end_time = time.time() 
print("CELL RUN TIME : ",cell_end_time - cell_start_time)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.8888,0.9576,0.9249,0.6541,0.7663,0.6961,0.7141,39.808
lightgbm,Light Gradient Boosting Machine,0.888,0.9566,0.926,0.6519,0.7652,0.6945,0.7129,9.794
xgboost,Extreme Gradient Boosting,0.8879,0.9564,0.9242,0.6522,0.7647,0.694,0.7122,12.772
gbc,Gradient Boosting Classifier,0.8884,0.9553,0.924,0.6535,0.7655,0.6951,0.7131,40.582
rf,Random Forest Classifier,0.8879,0.9537,0.9236,0.6522,0.7645,0.6938,0.7118,22.83
et,Extra Trees Classifier,0.8891,0.9537,0.9175,0.6565,0.7653,0.6953,0.712,24.972
ada,Ada Boost Classifier,0.883,0.9536,0.9265,0.6406,0.7575,0.6837,0.704,16.078
lr,Logistic Regression,0.8975,0.9531,0.9017,0.6814,0.7763,0.7115,0.7232,17.99
lda,Linear Discriminant Analysis,0.8953,0.9526,0.908,0.6739,0.7737,0.7075,0.7208,6.616
dt,Decision Tree Classifier,0.8421,0.8404,0.8375,0.5674,0.6765,0.5771,0.5962,14.606


CELL RUN TIME :  1290.7405123710632


In [25]:
#Best
cell_start_time = time.time() 
catboost0 = exp.compare_models(include=["catboost", "rf", "lr"], fold = 5, round = 4, sort = 'AUC')
cell_end_time = time.time() 
print("CELL RUN TIME : ",cell_end_time - cell_start_time)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.8888,0.9576,0.9249,0.6541,0.7663,0.6961,0.7141,34.284
rf,Random Forest Classifier,0.8879,0.9537,0.9236,0.6522,0.7645,0.6938,0.7118,23.502
lr,Logistic Regression,0.8975,0.9531,0.9017,0.6814,0.7763,0.7115,0.7232,8.112


CELL RUN TIME :  391.3228232860565


## 1.통계기반 lr 모델 생성

In [18]:
lr = exp.create_model('lr', return_train_score=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8977,0.9531,0.901,0.682,0.7763,0.7116,0.7232
CV-Train,1,0.8979,0.9532,0.902,0.6823,0.7769,0.7124,0.7241
CV-Train,2,0.8973,0.953,0.9011,0.681,0.7757,0.7108,0.7225
CV-Val,0,0.8977,0.9531,0.9021,0.6818,0.7766,0.712,0.7237
CV-Val,1,0.8967,0.9528,0.9011,0.6793,0.7747,0.7093,0.7212
CV-Val,2,0.8984,0.9534,0.9015,0.6837,0.7777,0.7134,0.7249
CV-Train,Mean,0.8976,0.9531,0.9014,0.6818,0.7763,0.7116,0.7233
CV-Train,Std,0.0002,0.0001,0.0005,0.0006,0.0005,0.0006,0.0006
CV-Val,Mean,0.8976,0.9531,0.9016,0.6816,0.7763,0.7116,0.7233
CV-Val,Std,0.0007,0.0003,0.0004,0.0018,0.0012,0.0017,0.0015


### lr 하이퍼파라미터 서치 
- optimize = 'AUC'
- search_library="optuna" / search_algorithm="tpe"

In [40]:
best_lr = exp.tune_model(lr, optimize = 'AUC',search_library = 'optuna', search_algorithm="tpe", return_train_score=True, choose_better=True, n_iter=20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8977,0.9531,0.901,0.682,0.7763,0.7116,0.7232
CV-Train,1,0.8979,0.9532,0.902,0.6823,0.7769,0.7124,0.7241
CV-Train,2,0.8973,0.953,0.9011,0.681,0.7757,0.7108,0.7225
CV-Val,0,0.8977,0.9531,0.9021,0.6818,0.7766,0.712,0.7237
CV-Val,1,0.8967,0.9528,0.9011,0.6793,0.7746,0.7093,0.7212
CV-Val,2,0.8984,0.9534,0.9015,0.6837,0.7777,0.7134,0.7249
CV-Train,Mean,0.8976,0.9531,0.9014,0.6818,0.7763,0.7116,0.7233
CV-Train,Std,0.0002,0.0001,0.0005,0.0006,0.0005,0.0006,0.0006
CV-Val,Mean,0.8976,0.9531,0.9015,0.6816,0.7763,0.7116,0.7233
CV-Val,Std,0.0007,0.0003,0.0004,0.0018,0.0013,0.0017,0.0015


[I 2024-02-27 16:06:12,569] Searching the best hyperparameters using 942727 samples...
[I 2024-02-27 16:12:38,518] Finished hyperparameter search!


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


## 2. 부스팅 모델 - catboost/ xgb / lgbm/ gbc 생성

In [None]:
catboost = exp.create_model('catboost', return_train_score=True)

In [36]:
catboost.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'Logloss',
 'iterations': 1000,
 'sampling_frequency': 'PerTree',
 'leaf_estimation_method': 'Newton',
 'random_score_type': 'NormalWithModelSizeDecrease',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'eval_fraction': 0,
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': False,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'subsample': 0.800000011920929,
 'use_best_model': False,
 'class_names': [0, 1],
 'random_seed': 6,
 'depth': 6,
 'posterior_sampling': False,
 'border_count': 254,
 'classes_count': 0,
 'auto_class_weights': 'None',
 'sparse_features_conflict_fraction': 0,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'best_model_min_trees': 1,
 'model_shrink_rate': 0,
 'min_data_in_leaf': 1,
 'lo

In [15]:
xgb = exp.create_model('xgboost', return_train_score=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8914,0.9623,0.9337,0.6584,0.7722,0.7038,0.7223
CV-Train,1,0.8917,0.9625,0.9329,0.6593,0.7726,0.7043,0.7225
CV-Train,2,0.8915,0.9618,0.9323,0.6587,0.772,0.7035,0.7218
CV-Val,0,0.8874,0.9558,0.924,0.651,0.7639,0.6928,0.7111
CV-Val,1,0.8872,0.9557,0.925,0.6504,0.7638,0.6926,0.7112
CV-Val,2,0.8895,0.9564,0.9236,0.656,0.7672,0.6974,0.715
CV-Train,Mean,0.8915,0.9622,0.933,0.6588,0.7723,0.7038,0.7222
CV-Train,Std,0.0001,0.0003,0.0006,0.0004,0.0002,0.0003,0.0003
CV-Val,Mean,0.888,0.956,0.9242,0.6525,0.7649,0.6943,0.7124
CV-Val,Std,0.001,0.0003,0.0006,0.0025,0.0016,0.0022,0.0018


In [39]:
xgb

In [16]:
lgbm = exp.create_model('lightgbm', return_train_score=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8886,0.9586,0.9289,0.6527,0.7667,0.6964,0.7151
CV-Train,1,0.8895,0.959,0.9275,0.6552,0.7679,0.6982,0.7163
CV-Train,2,0.889,0.9585,0.9274,0.6541,0.7671,0.6971,0.7154
CV-Val,0,0.8875,0.9565,0.9266,0.6507,0.7645,0.6935,0.7122
CV-Val,1,0.8877,0.956,0.9259,0.6513,0.7647,0.6938,0.7123
CV-Val,2,0.8894,0.9569,0.9249,0.6556,0.7673,0.6975,0.7153
CV-Train,Mean,0.889,0.9587,0.9279,0.654,0.7672,0.6972,0.7156
CV-Train,Std,0.0004,0.0002,0.0007,0.001,0.0005,0.0007,0.0005
CV-Val,Mean,0.8882,0.9565,0.9258,0.6525,0.7655,0.695,0.7133
CV-Val,Std,0.0009,0.0003,0.0007,0.0022,0.0013,0.0018,0.0014


In [41]:
lgbm

In [12]:
gbc = exp.create_model('gbc', return_train_score=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8885,0.9558,0.9248,0.6535,0.7658,0.6955,0.7135
CV-Train,1,0.8891,0.956,0.9243,0.6549,0.7666,0.6966,0.7144
CV-Train,2,0.8885,0.9557,0.9243,0.6536,0.7657,0.6954,0.7133
CV-Val,0,0.8884,0.9554,0.9245,0.6533,0.7656,0.6952,0.7132
CV-Val,1,0.8879,0.9549,0.9243,0.6522,0.7648,0.6941,0.7122
CV-Val,2,0.8897,0.9558,0.9235,0.6565,0.7675,0.6978,0.7153
CV-Train,Mean,0.8887,0.9558,0.9245,0.654,0.766,0.6958,0.7138
CV-Train,Std,0.0003,0.0001,0.0002,0.0007,0.0004,0.0006,0.0005
CV-Val,Mean,0.8887,0.9553,0.9241,0.654,0.7659,0.6957,0.7136
CV-Val,Std,0.0007,0.0004,0.0004,0.0018,0.0011,0.0016,0.0013


In [13]:
gbc

## 3. (트리기반) rf 모델 생성

In [17]:
rf = exp.create_model('rf', return_train_score=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.9268,0.9893,0.9999,0.7293,0.8434,0.7972,0.8141
CV-Train,1,0.9277,0.9893,0.9999,0.7317,0.845,0.7994,0.8159
CV-Train,2,0.9268,0.9891,1.0,0.7293,0.8435,0.7972,0.8142
CV-Val,0,0.8876,0.9531,0.9236,0.6516,0.7641,0.6932,0.7113
CV-Val,1,0.8878,0.9525,0.923,0.6521,0.7642,0.6934,0.7115
CV-Val,2,0.889,0.9537,0.9235,0.655,0.7664,0.6964,0.714
CV-Train,Mean,0.9271,0.9892,0.9999,0.7301,0.844,0.7979,0.8147
CV-Train,Std,0.0004,0.0001,0.0,0.0011,0.0007,0.001,0.0009
CV-Val,Mean,0.8881,0.9531,0.9233,0.6529,0.7649,0.6943,0.7123
CV-Val,Std,0.0006,0.0005,0.0002,0.0015,0.0011,0.0015,0.0012


## optuna 하이퍼 파라미터 서치 
- catboost 다양한 optimize metrics 사용
  - 1. AUC
    2. F1
    3. MCC
    4. Recall
   
- **optimize = 'F1', search_library = 'optuna', search_algorithm="tpe" - AUC:0.9570 Recall:0.925**
- **하이퍼파라미터 서치 하기전 pycaret 자동화 파라미터가 더 높음** -> <u>return original model</u>

In [24]:
#2. 0.9568
best_catboost= exp.tune_model(catboost, optimize = 'AUC',search_library = 'optuna', search_algorithm="tpe", return_train_score=True, choose_better=True, n_iter=50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8909,0.9609,0.9324,0.6573,0.7711,0.7022,0.7207
CV-Train,1,0.8917,0.9612,0.9313,0.6595,0.7722,0.7038,0.7219
CV-Train,2,0.891,0.9607,0.9312,0.6578,0.771,0.7022,0.7205
CV-Val,0,0.8882,0.9567,0.9257,0.6527,0.7656,0.6951,0.7134
CV-Val,1,0.8883,0.9565,0.925,0.6529,0.7655,0.695,0.7132
CV-Val,2,0.89,0.9572,0.9249,0.6571,0.7683,0.6989,0.7165
CV-Train,Mean,0.8912,0.9609,0.9316,0.6582,0.7714,0.7028,0.721
CV-Train,Std,0.0004,0.0002,0.0006,0.0009,0.0005,0.0008,0.0006
CV-Val,Mean,0.8889,0.9568,0.9252,0.6542,0.7665,0.6963,0.7143
CV-Val,Std,0.0008,0.0003,0.0004,0.002,0.0013,0.0018,0.0015


[I 2024-02-27 11:19:48,913] Searching the best hyperparameters using 942727 samples...
[I 2024-02-27 11:43:27,415] Finished hyperparameter search!


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [22]:
#1. 0.9570
best_catboost3= exp.tune_model(catboost, optimize = 'F1',search_library = 'optuna', search_algorithm="tpe", return_train_score=True, choose_better=True, n_iter=50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.89,0.9593,0.9297,0.6558,0.7691,0.6996,0.718
CV-Train,1,0.8909,0.9599,0.9286,0.6583,0.7704,0.7016,0.7195
CV-Train,2,0.8903,0.9594,0.929,0.6567,0.7695,0.7003,0.7184
CV-Val,0,0.8882,0.9568,0.926,0.6526,0.7656,0.6951,0.7135
CV-Val,1,0.8885,0.9567,0.9249,0.6533,0.7657,0.6954,0.7134
CV-Val,2,0.8901,0.9573,0.9241,0.6574,0.7682,0.6989,0.7163
CV-Train,Mean,0.8904,0.9595,0.9291,0.657,0.7697,0.7005,0.7186
CV-Train,Std,0.0004,0.0003,0.0005,0.001,0.0006,0.0008,0.0006
CV-Val,Mean,0.8889,0.957,0.925,0.6544,0.7665,0.6965,0.7144
CV-Val,Std,0.0008,0.0003,0.0008,0.0021,0.0012,0.0017,0.0013


[I 2024-02-27 10:26:16,380] Searching the best hyperparameters using 942727 samples...
[I 2024-02-27 10:48:48,061] Finished hyperparameter search!


In [23]:
#3. 0.9567
best_catboost2= exp.tune_model(catboost, optimize = 'MCC',search_library = 'optuna', search_algorithm="tpe", return_train_score=True, choose_better=True, n_iter=50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8892,0.9575,0.9267,0.6547,0.7673,0.6974,0.7155
CV-Train,1,0.89,0.9582,0.9265,0.6565,0.7685,0.699,0.7169
CV-Train,2,0.8895,0.9579,0.9268,0.6553,0.7677,0.698,0.716
CV-Val,0,0.8885,0.9566,0.9252,0.6534,0.7659,0.6956,0.7137
CV-Val,1,0.8882,0.9563,0.9251,0.6527,0.7654,0.6949,0.7131
CV-Val,2,0.89,0.9573,0.9244,0.657,0.7681,0.6986,0.7162
CV-Train,Mean,0.8895,0.9579,0.9266,0.6555,0.7678,0.6981,0.7161
CV-Train,Std,0.0003,0.0003,0.0001,0.0007,0.0005,0.0007,0.0005
CV-Val,Mean,0.8889,0.9567,0.9249,0.6544,0.7665,0.6964,0.7143
CV-Val,Std,0.0008,0.0004,0.0003,0.0019,0.0012,0.0016,0.0013


[I 2024-02-27 10:52:16,770] Searching the best hyperparameters using 942727 samples...
[I 2024-02-27 11:16:35,219] Finished hyperparameter search!


In [127]:
#2.0.9568
best_catboost1 = exp.tune_model(catboost, optimize = 'AUC', return_train_score=True, choose_better=True, n_iter=50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.889,0.9574,0.9269,0.6542,0.7671,0.6971,0.7153
CV-Train,1,0.8906,0.9591,0.9278,0.6578,0.7698,0.7008,0.7186
CV-Train,2,0.8901,0.9588,0.9279,0.6564,0.7689,0.6995,0.7175
CV-Val,0,0.8884,0.9565,0.9256,0.6531,0.7658,0.6955,0.7137
CV-Val,1,0.8884,0.9566,0.9247,0.6532,0.7656,0.6952,0.7133
CV-Val,2,0.8901,0.9574,0.9242,0.6573,0.7682,0.6988,0.7162
CV-Train,Mean,0.8899,0.9584,0.9275,0.6562,0.7686,0.6991,0.7171
CV-Train,Std,0.0007,0.0008,0.0004,0.0015,0.0011,0.0016,0.0014
CV-Val,Mean,0.889,0.9568,0.9248,0.6545,0.7665,0.6965,0.7144
CV-Val,Std,0.0008,0.0004,0.0006,0.0019,0.0012,0.0017,0.0013


Fitting 3 folds for each of 50 candidates, totalling 150 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [32]:
best_xgb= exp.tune_model(xgb, optimize = 'AUC',search_library = 'optuna', search_algorithm="tpe", return_train_score=True, choose_better=True, n_iter=50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.5358,0.9574,0.9965,0.2976,0.4584,0.2223,0.3521
CV-Train,1,0.5352,0.9576,0.9964,0.2973,0.458,0.2217,0.3514
CV-Train,2,0.5392,0.9572,0.9963,0.2991,0.4601,0.2252,0.3546
CV-Val,0,0.5344,0.9559,0.9945,0.2968,0.4571,0.2204,0.3494
CV-Val,1,0.535,0.9553,0.9951,0.2971,0.4576,0.2211,0.3504
CV-Val,2,0.5385,0.9563,0.9945,0.2986,0.4593,0.2241,0.3527
CV-Train,Mean,0.5367,0.9574,0.9964,0.298,0.4588,0.2231,0.3527
CV-Train,Std,0.0017,0.0001,0.0001,0.0008,0.0009,0.0015,0.0014
CV-Val,Mean,0.536,0.9558,0.9947,0.2975,0.458,0.2219,0.3508
CV-Val,Std,0.0018,0.0004,0.0003,0.0008,0.0009,0.0016,0.0014


[I 2024-02-27 14:39:30,227] Searching the best hyperparameters using 942727 samples...
[I 2024-02-27 15:02:44,015] Finished hyperparameter search!


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [31]:
best_lgbm= exp.tune_model(lgbm, optimize = 'AUC',search_library = 'optuna', search_algorithm="tpe", return_train_score=True, choose_better=True, n_iter=50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8911,0.962,0.9351,0.6573,0.772,0.7033,0.7222
CV-Train,1,0.8924,0.9621,0.934,0.6607,0.7739,0.706,0.7243
CV-Train,2,0.8913,0.9617,0.934,0.658,0.7721,0.7035,0.7221
CV-Val,0,0.8875,0.9567,0.9265,0.6508,0.7645,0.6936,0.7123
CV-Val,1,0.8877,0.9562,0.9258,0.6513,0.7647,0.6938,0.7123
CV-Val,2,0.8891,0.9571,0.9248,0.6549,0.7668,0.6968,0.7147
CV-Train,Mean,0.8916,0.962,0.9344,0.6586,0.7727,0.7043,0.7228
CV-Train,Std,0.0006,0.0002,0.0005,0.0014,0.0009,0.0013,0.001
CV-Val,Mean,0.8881,0.9567,0.9257,0.6523,0.7653,0.6947,0.7131
CV-Val,Std,0.0007,0.0004,0.0007,0.0018,0.001,0.0015,0.0011


[I 2024-02-27 14:14:18,637] Searching the best hyperparameters using 942727 samples...
[I 2024-02-27 14:37:26,413] Finished hyperparameter search!




In [30]:
best_rf= exp.tune_model(rf, optimize = 'AUC',search_library = 'optuna', search_algorithm="tpe", return_train_score=True, choose_better=True, n_iter=50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8882,0.9601,0.9324,0.6512,0.7668,0.6963,0.7157
CV-Train,1,0.8898,0.9602,0.9299,0.6554,0.7689,0.6993,0.7178
CV-Train,2,0.8882,0.9597,0.9314,0.6514,0.7666,0.6962,0.7154
CV-Val,0,0.8864,0.9557,0.9277,0.648,0.763,0.6914,0.7107
CV-Val,1,0.887,0.9551,0.9255,0.6498,0.7635,0.6922,0.7109
CV-Val,2,0.8877,0.956,0.9265,0.6513,0.7649,0.6941,0.7127
CV-Train,Mean,0.8888,0.96,0.9312,0.6527,0.7674,0.6973,0.7163
CV-Train,Std,0.0007,0.0002,0.001,0.0019,0.001,0.0015,0.001
CV-Val,Mean,0.8871,0.9556,0.9265,0.6497,0.7638,0.6926,0.7114
CV-Val,Std,0.0005,0.0004,0.0009,0.0013,0.0008,0.0011,0.0009


[I 2024-02-27 12:58:09,053] Searching the best hyperparameters using 942727 samples...
[I 2024-02-27 14:09:49,739] Finished hyperparameter search!


## 하이퍼파라미터 서치 (커스텀 파라미터)

In [None]:
from scipy.stats import randint
from sklearn.utils.fixes import loguniform


custom_param = {
    'n_estimators': randint(100, 300),
    'depth': randint(1, 5),
    'learning_rate': loguniform(1e-3, 0.1),
    'min_child_samples': randint(10, 40),
    'grow_policy': ['SymmetricTree', 'Lossguide', 'Depthwise']
}

#custom_grid=custom_param

In [47]:
best_catboost1 = exp.tune_model(catboost, return_train_score=True, 
                                return_tuner=True, choose_better=True, custom_grid=custom_param)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8883,0.956,0.9265,0.6524,0.7657,0.6952,0.7136
CV-Train,1,0.8893,0.956,0.9251,0.6551,0.767,0.6972,0.715
CV-Train,2,0.8883,0.9558,0.9249,0.6528,0.7654,0.6949,0.7131
CV-Val,0,0.8881,0.9553,0.9246,0.6524,0.765,0.6944,0.7126
CV-Val,1,0.8878,0.9552,0.9245,0.6517,0.7645,0.6937,0.712
CV-Val,2,0.8892,0.956,0.9261,0.6546,0.767,0.6971,0.7152
CV-Train,Mean,0.8886,0.9559,0.9255,0.6534,0.766,0.6958,0.7139
CV-Train,Std,0.0005,0.0001,0.0007,0.0012,0.0007,0.001,0.0008
CV-Val,Mean,0.8883,0.9555,0.9251,0.6529,0.7655,0.6951,0.7132
CV-Val,Std,0.0006,0.0004,0.0007,0.0012,0.0011,0.0015,0.0014


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


## 앙상블 blend_model

In [126]:
#부스팅 모델 3개 앙상블 : 0.9575, Recall: 0.9258
blender = exp.blend_models(estimator_list=[best_catboost, best_xgb, best_lgbm], fold=5, method = 'soft' )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8879,0.9573,0.925,0.652,0.7649,0.6942,0.7125
1,0.8886,0.9578,0.9275,0.6531,0.7665,0.6962,0.7147
2,0.887,0.9567,0.9254,0.6498,0.7635,0.6922,0.7109
3,0.8894,0.958,0.9266,0.6551,0.7676,0.6978,0.7158
4,0.8895,0.9576,0.9243,0.6559,0.7673,0.6975,0.7152
Mean,0.8885,0.9575,0.9258,0.6532,0.7659,0.6956,0.7138
Std,0.0009,0.0005,0.0012,0.0022,0.0015,0.0021,0.0018


In [34]:
#부스팅 모델 3개 앙상블(catboost;optimize :F1) : 0.9573
blender2 = exp.blend_models(estimator_list=[best_catboost3,best_lgbm, best_xgb], fold=5, method = 'soft' )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8878,0.957,0.925,0.6519,0.7648,0.694,0.7123
1,0.8887,0.9576,0.9276,0.6533,0.7667,0.6964,0.7149
2,0.8874,0.9565,0.9255,0.6506,0.7641,0.693,0.7116
3,0.889,0.9578,0.9265,0.6543,0.7669,0.6969,0.7151
4,0.8893,0.9574,0.9236,0.6556,0.7669,0.697,0.7146
Mean,0.8884,0.9573,0.9257,0.6531,0.7659,0.6955,0.7137
Std,0.0007,0.0005,0.0013,0.0018,0.0012,0.0016,0.0014


In [42]:
# 트리계열 + 부스팅 : 0.9571 / Recall : 0.9266(Best)
blender3 = exp.blend_models(estimator_list=[best_rf,best_catboost], fold=5, method = 'soft' )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8875,0.9569,0.9257,0.6509,0.7644,0.6934,0.712
1,0.8884,0.9575,0.929,0.6523,0.7664,0.696,0.7148
2,0.8866,0.9564,0.9263,0.6488,0.7631,0.6916,0.7105
3,0.8885,0.9576,0.9273,0.6529,0.7663,0.696,0.7144
4,0.8889,0.9573,0.9249,0.6543,0.7664,0.6963,0.7142
Mean,0.888,0.9571,0.9266,0.6518,0.7653,0.6946,0.7132
Std,0.0008,0.0004,0.0014,0.0019,0.0014,0.0018,0.0017


In [45]:
# 트리계열 + 부스팅 2개(catboost, xgb): 0.9572 
blender7 = exp.blend_models(estimator_list=[best_rf,best_catboost, best_xgb], fold=5, method = 'soft' )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8878,0.957,0.9256,0.6515,0.7648,0.6939,0.7124
1,0.8884,0.9576,0.928,0.6524,0.7662,0.6958,0.7144
2,0.8867,0.9564,0.9254,0.6491,0.763,0.6915,0.7103
3,0.8887,0.9578,0.927,0.6535,0.7666,0.6964,0.7147
4,0.8891,0.9574,0.9246,0.6549,0.7667,0.6967,0.7146
Mean,0.8881,0.9572,0.9261,0.6523,0.7654,0.6949,0.7133
Std,0.0008,0.0005,0.0012,0.002,0.0014,0.0019,0.0017


In [41]:
# 통계 모델 + 부스팅 : 0.9565
blender4 = exp.blend_models(estimator_list=[lr,best_catboost], fold=5, method = 'soft' )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8934,0.9563,0.9158,0.6673,0.772,0.7047,0.7197
1,0.8936,0.9569,0.9187,0.6671,0.7729,0.7057,0.7211
2,0.8918,0.9558,0.915,0.6635,0.7693,0.7009,0.7164
3,0.894,0.957,0.9158,0.6688,0.773,0.7061,0.7209
4,0.894,0.9565,0.9135,0.6694,0.7727,0.7057,0.7202
Mean,0.8934,0.9565,0.9158,0.6672,0.772,0.7046,0.7197
Std,0.0008,0.0004,0.0017,0.002,0.0014,0.0019,0.0017


In [43]:
# gbc, catboost :  0.9571
blender5 = exp.blend_models(estimator_list=[gbc,best_catboost], fold=5, method = 'soft' )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8882,0.9569,0.9239,0.6529,0.7651,0.6946,0.7126
1,0.8887,0.9575,0.9279,0.6533,0.7667,0.6965,0.715
2,0.8875,0.9564,0.9247,0.6511,0.7641,0.6931,0.7115
3,0.8894,0.9576,0.9254,0.6553,0.7673,0.6975,0.7153
4,0.89,0.9571,0.924,0.6572,0.7681,0.6986,0.7161
Mean,0.8888,0.9571,0.9252,0.654,0.7663,0.6961,0.7141
Std,0.0009,0.0004,0.0015,0.0021,0.0014,0.002,0.0017


## Stacking Ensemble

In [35]:
#부스팅 모델 3개 : 0.9573
stack_cat = exp.stack_models(estimator_list=[best_catboost, best_xgb, best_lgbm], optimize='AUC', choose_better=True)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8906,0.9573,0.9215,0.659,0.7685,0.6994,0.7163
1,0.8906,0.9569,0.9211,0.6593,0.7685,0.6994,0.7162
2,0.8921,0.9577,0.9201,0.6631,0.7707,0.7026,0.7187
Mean,0.8911,0.9573,0.9209,0.6605,0.7692,0.7005,0.7171
Std,0.0007,0.0003,0.0006,0.0019,0.0011,0.0015,0.0012


In [44]:
#lr, rf, lgbm : 0.9564
stack_lr = exp.stack_models(estimator_list=[best_lr, best_rf, best_lgbm], optimize='AUC', choose_better=True)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.892,0.9565,0.9187,0.6632,0.7703,0.7021,0.718
1,0.8916,0.956,0.9174,0.6625,0.7694,0.701,0.7168
2,0.8934,0.9568,0.917,0.6669,0.7722,0.7048,0.7201
Mean,0.8923,0.9564,0.9177,0.6642,0.7706,0.7026,0.7183
Std,0.0007,0.0003,0.0007,0.0019,0.0012,0.0016,0.0013


Original model was better than the stacked model, hence it will be returned. NOTE: The display metrics are for the stacked model (not the original one).


## bagging

In [None]:
#catboost : 0.9574
bag_cat = exp.ensemble_model(estimator=catboost, fold=5, optimize = 'AUC')

In [27]:
#catboost : 0.9577
bag_cat = exp.ensemble_model(estimator=catboost0, fold=5, optimize = 'AUC')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8879,0.9575,0.925,0.6521,0.7649,0.6942,0.7125
1,0.889,0.9581,0.9274,0.654,0.7671,0.6971,0.7154
2,0.8876,0.957,0.9244,0.6514,0.7642,0.6933,0.7116
3,0.8895,0.9582,0.9261,0.6555,0.7676,0.6979,0.7158
4,0.89,0.9579,0.9247,0.657,0.7682,0.6988,0.7163
Mean,0.8888,0.9577,0.9255,0.654,0.7664,0.6962,0.7143
Std,0.0009,0.0004,0.0011,0.0021,0.0016,0.0021,0.0019


In [29]:
#catboost : 0.9577
bag_cat = exp.ensemble_model(estimator=catboost0,round=2, fold=5, optimize = 'AUC')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.89,0.96,0.92,0.65,0.76,0.69,0.71
1,0.89,0.96,0.93,0.65,0.77,0.7,0.72
2,0.89,0.96,0.92,0.65,0.76,0.69,0.71
3,0.89,0.96,0.93,0.66,0.77,0.7,0.72
4,0.89,0.96,0.92,0.66,0.77,0.7,0.72
Mean,0.89,0.96,0.93,0.65,0.77,0.7,0.71
Std,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- https://dacon.io/en/codeshare/2430

## 최종 예측 모델

In [None]:
final_model = exp.finalize_model(bag_cat)

In [None]:
exp.dashboard(bag_cat, dashboard_kwargs={"shap_interaction": False})