## 0. 데이터 로드

In [1]:
import numpy as np
import pandas as pd
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.base import BaseEstimator, TransformerMixin
from pycaret.classification import ClassificationExperiment
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from category_encoders import CountEncoder, OneHotEncoder, OrdinalEncoder

In [2]:
# Load the accepted-loans CSV into a DataFrame (low_memory=False is forwarded to the pandas reader).
accept = pd.read_csv("data/accepted_2007_to_2018Q4.csv", low_memory=False)

In [3]:
# Raw input columns kept for modelling (one per line so additions diff cleanly).
select_features = [
    "loan_amnt",
    "term",
    "int_rate",
    "sub_grade",
    "verification_status",
    "addr_state",
    "dti",
    "open_acc",
    "revol_util",
    "total_acc",
    "last_fico_range_high",
    "last_fico_range_low",
    "avg_cur_bal",
    "mo_sin_old_rev_tl_op",
    "mo_sin_rcnt_rev_tl_op",
    "pct_tl_nvr_dlq",
]

# Name of the label column.
target = "loan_status"

In [4]:
# Keep individual applications whose loan_status is either "Charged Off" or "Fully Paid".
is_individual = accept.application_type == "Individual"
has_final_status = accept.loan_status.isin(["Charged Off", "Fully Paid"])
raw_data = accept[has_final_status & is_individual]

# Restrict to the modelling columns plus the label, then discard rows with any missing value.
data = raw_data[select_features + [target]].dropna()

In [5]:
# Flag rows whose values fall outside the accepted ranges.
dti_out_of_range = (data.dti < 0) | (data.dti > 40)
revol_util_too_high = data.revol_util > 100
fico_too_low = data.last_fico_range_low < 300

# Original author's note: 37,486 rows in total after de-duplication, 3.18% of the dataset.
drop_rows = np.unique(data.index[dti_out_of_range | revol_util_too_high | fico_too_low])

use_data = data.drop(index=drop_rows)

In [6]:
# Separate the predictors from the label column.
X = use_data.drop(columns=[target])
y = use_data[target]

# Hold out 10% of the rows for testing, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=6,
)

In [7]:
# Fit the label encoder on the training labels only.
le = LabelEncoder()
le.fit(y_train)
# Reverse the fitted class order so the integer codes are flipped relative to the fitted default.
# NOTE(review): presumably this makes "Charged Off" the positive class (1) and "Fully Paid" 0 — confirm.
le.classes_ = le.classes_[::-1]
# Wrap the encoded arrays in Series that reuse the index of the matching feature frame.
y_train_transformed = pd.Series(le.transform(y_train), index=X_train.index, name=target)
y_test_transformed = pd.Series(le.transform(y_test), index=X_test.index, name=target)

In [8]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives loan features from the raw columns.

    ``transform`` reads the columns ``term``, ``open_acc``, ``total_acc``,
    ``last_fico_range_low``, ``last_fico_range_high``, ``loan_amnt`` and
    ``int_rate`` from the incoming DataFrame.
    """

    def __init__(self):
        # No hyper-parameters.
        pass

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with engineered columns; ``X`` is not modified.

        * ``term``: the digits found in the original string, as float64.
        * ``open_acc_rate``: ``open_acc / total_acc * 100`` rounded to 1 decimal.
        * ``last_fico_score``: rounded mean of the low/high FICO range columns.
        * ``installment``: monthly payment from the annuity formula
          ``P * r * (1 + r) ** n / ((1 + r) ** n - 1)`` with
          ``r = int_rate * 0.01 / 12`` and ``n = term``, rounded to 2 decimals.

        The columns ``open_acc``, ``total_acc``, ``last_fico_range_low`` and
        ``last_fico_range_high`` are dropped from the result.
        """
        X_copy = X.copy()
        # Raw string: "\d" inside a normal literal is an invalid escape sequence.
        X_copy["term"] = X_copy.term.str.extract(r"(\d+)", expand=False).astype(np.float64)
        X_copy["open_acc_rate"] = (X_copy.open_acc / X_copy.total_acc * 100).round(1)
        X_copy["last_fico_score"] = X_copy[["last_fico_range_low", "last_fico_range_high"]].mean(axis=1).round()

        # Column-wise arithmetic instead of a per-row np.vectorize lambda: same
        # operation order, but no Python-level loop and it also accepts empty frames.
        # A zero interest rate gives 0 / 0 and therefore NaN here.
        monthly_rate = X_copy.int_rate * 0.01 / 12
        growth = (1 + monthly_rate) ** X_copy.term
        first_month_interest = X_copy.loan_amnt * X_copy.int_rate * 0.01 / 12
        X_copy["installment"] = (first_month_interest * growth / (growth - 1)).round(2)

        X_copy = X_copy.drop(["open_acc", "total_acc", "last_fico_range_low", "last_fico_range_high"], axis=1)
        return X_copy

In [9]:
# Ordinal scale for sub_grade: "F5" is the lowest rank (0) and "A1" the highest (29).
# NOTE(review): sub-grades outside A-F are not part of this mapping — confirm how the encoder should treat them.
sub_grade_labels = [f"{grade}{level}" for grade in "FEDCBA" for level in range(5, 0, -1)]
sub_grade_mapping = {label: rank for rank, label in enumerate(sub_grade_labels)}

# Encoders first, then feature engineering, and finally random under-sampling of the training data.
preprocessing = Pipeline(
    [
        (
            "ordinal_encoder",
            OrdinalEncoder(
                mapping=[{"col": "sub_grade", "mapping": sub_grade_mapping}],
                cols=["sub_grade"],
            ),
        ),
        ("count_encoder", CountEncoder(cols=["addr_state"])),
        ("onehot_encoder", OneHotEncoder(cols=["verification_status"], use_cat_names=True)),
        ("feature_engineer", FeatureEngineer()),
        ("under_resample", RandomUnderSampler(random_state=6)),
    ]
)

In [10]:
# Fit every pipeline step on the training split and under-sample it.
X_train_preprocessed, y_train_preprocessed = preprocessing.fit_resample(X_train, y_train_transformed)

# The test split only passes through the fitted transformers: the trailing
# sampler step is sliced off so the test rows are not resampled.
X_test_preprocessed = preprocessing[:-1].transform(X_test)
y_test_preprocessed = y_test_transformed

In [11]:
# Re-attach the encoded label as the last column of each frame.
data = pd.concat([X_train_preprocessed, y_train_preprocessed], axis="columns")
test_data = pd.concat([X_test_preprocessed, y_test_preprocessed], axis="columns")

## 1. 기본모델 생성

In [12]:
# Create a fresh pycaret classification experiment object.
exp = ClassificationExperiment()

In [13]:
# Configure the experiment with the prepared train/test frames, 5 shuffled folds and a fixed session id.
# preprocess=False: presumably because both frames already went through the `preprocessing` pipeline — confirm.
exp.setup(data=data, 
          target=target, 
          test_data=test_data, 
          preprocess=False, 
          fold=5, 
          fold_shuffle=True, 
          session_id=6)

Unnamed: 0,Description,Value
0,Session id,6
1,Target,loan_status
2,Target type,Binary
3,Original data shape,"(517684, 18)"
4,Transformed data shape,"(517684, 18)"
5,Transformed train set shape,"(396596, 18)"
6,Transformed test set shape,"(121088, 18)"
7,Numeric features,17


<pycaret.classification.oop.ClassificationExperiment at 0x2938a762890>

In [14]:
# Drop the metrics that are not reported, leaving the AUC and Recall columns seen in the result tables.
for metric_id in ("acc", "precision", "f1", "kappa", "mcc"):
    exp.remove_metric(metric_id)

In [15]:
# Baseline decision tree ("dt"); return_train_score=True adds the CV-Train rows next to CV-Val in the output.
base_model = exp.create_model("dt", return_train_score=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC,Recall
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1
CV-Train,0,1.0,1.0
CV-Train,1,1.0,1.0
CV-Train,2,1.0,1.0
CV-Train,3,1.0,1.0
CV-Train,4,1.0,1.0
CV-Val,0,0.8286,0.8283
CV-Val,1,0.8282,0.8263
CV-Val,2,0.8274,0.8249
CV-Val,3,0.8286,0.8249
CV-Val,4,0.8263,0.8218


## 2. 하이퍼파라미터 튜닝 진행

In [16]:
# Search space for the decision-tree hyper-parameters.
# NOTE(review): no later cell visible here references this grid — the tuning run itself appears to live elsewhere.
custom_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": np.arange(2, 20),
    "min_samples_split": np.arange(2, 100),
    "min_samples_leaf": np.arange(1, 100),
    "max_features": [
        0.1, 0.15, 0.2, 0.25, 0.3, 0.35,
        0.4, 0.45, 0.5, 0.55, 0.6, 0.65,
        0.7, 0.75, 0.8, 0.85, 0.9, 0.95,
        "sqrt", "log2", None,
    ],
}

In [17]:
# Decision tree with the selected hyper-parameter values.
tune_model = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=7,
    min_samples_split=26,
    min_samples_leaf=18,
    random_state=6,
)

## 3. 다른 모델과의 비교

In [18]:
# Cross-validate the tuned tree against the listed ensemble models and rank the results by AUC.
exp.compare_models(include=[tune_model, "rf", "ada", "gbc", "et", "xgboost", "lightgbm", "catboost"], sort="AUC")

Unnamed: 0,Model,AUC,Recall,TT (Sec)
7,CatBoost Classifier,0.9516,0.9135,24.128
6,Light Gradient Boosting Machine,0.9506,0.9143,1.208
5,Extreme Gradient Boosting,0.9505,0.9131,1.812
3,Gradient Boosting Classifier,0.9494,0.9142,29.27
2,Ada Boost Classifier,0.9473,0.9155,8.746
4,Extra Trees Classifier,0.9471,0.907,12.292
1,Random Forest Classifier,0.9469,0.9117,17.204
0,Decision Tree Classifier,0.9463,0.9115,1.824


<catboost.core.CatBoostClassifier at 0x293a5da5bd0>

### 결론

* 앙상블 모델에 비해 성능이 비교적 떨어진다.

## 4. 앙상블 진행 (모델링과정 (sklearn).ipynb 에서 최적화된 모델 사용)

### 4-1. 앙상블 모델 생성

In [19]:
class FeatureSelect(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes the ``addr_state`` column."""

    def __init__(self):
        # No hyper-parameters.
        pass

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame without ``addr_state``; ``X`` itself is left untouched."""
        return X.drop(columns=["addr_state"])

class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives loan features from the raw columns.

    ``transform`` reads the columns ``term``, ``open_acc``, ``total_acc``,
    ``last_fico_range_low``, ``last_fico_range_high``, ``loan_amnt`` and
    ``int_rate`` from the incoming DataFrame.
    """

    def __init__(self):
        # No hyper-parameters.
        pass

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with engineered columns; ``X`` is not modified.

        * ``term``: the digits found in the original string, as float64.
        * ``open_acc_rate``: ``open_acc / total_acc * 100`` rounded to 1 decimal.
        * ``last_fico_score``: rounded mean of the low/high FICO range columns.
        * ``installment``: monthly payment from the annuity formula
          ``P * r * (1 + r) ** n / ((1 + r) ** n - 1)`` with
          ``r = int_rate * 0.01 / 12`` and ``n = term``, rounded to 2 decimals.

        The columns ``open_acc``, ``total_acc``, ``last_fico_range_low`` and
        ``last_fico_range_high`` are dropped from the result.
        """
        X_copy = X.copy()
        # Raw string: "\d" inside a normal literal is an invalid escape sequence.
        X_copy["term"] = X_copy.term.str.extract(r"(\d+)", expand=False).astype(np.float64)
        X_copy["open_acc_rate"] = (X_copy.open_acc / X_copy.total_acc * 100).round(1)
        X_copy["last_fico_score"] = X_copy[["last_fico_range_low", "last_fico_range_high"]].mean(axis=1).round()

        # Column-wise arithmetic instead of a per-row np.vectorize lambda: same
        # operation order, but no Python-level loop and it also accepts empty frames.
        # A zero interest rate gives 0 / 0 and therefore NaN here.
        monthly_rate = X_copy.int_rate * 0.01 / 12
        growth = (1 + monthly_rate) ** X_copy.term
        first_month_interest = X_copy.loan_amnt * X_copy.int_rate * 0.01 / 12
        X_copy["installment"] = (first_month_interest * growth / (growth - 1)).round(2)

        X_copy = X_copy.drop(["open_acc", "total_acc", "last_fico_range_low", "last_fico_range_high"], axis=1)
        return X_copy

In [20]:
# Ordinal scale for sub_grade: "F5" is the lowest rank (0) and "A1" the highest (29).
# NOTE(review): sub-grades outside A-F are not part of this mapping — confirm how the encoder should treat them.
sub_grade_labels = [f"{grade}{level}" for grade in "FEDCBA" for level in range(5, 0, -1)]
sub_grade_mapping = {label: rank for rank, label in enumerate(sub_grade_labels)}

# Same flow as before, except addr_state is dropped up front instead of being count-encoded.
preprocessing = Pipeline(
    [
        ("feature_select", FeatureSelect()),
        (
            "ordinal_encoder",
            OrdinalEncoder(
                mapping=[{"col": "sub_grade", "mapping": sub_grade_mapping}],
                cols=["sub_grade"],
            ),
        ),
        ("onehot_encoder", OneHotEncoder(cols=["verification_status"], use_cat_names=True)),
        ("feature_engineer", FeatureEngineer()),
        ("under_resample", RandomUnderSampler(random_state=6)),
    ]
)

# Hyper-parameters of the decision tree used as base estimator by both ensembles.
base_tree_params = {
    "criterion": "entropy",
    "max_depth": 7,
    "min_samples_split": 26,
    "min_samples_leaf": 18,
    "random_state": 6,
}

# Each ensemble receives its own tree instance.
boosting_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(**base_tree_params),
    algorithm="SAMME",
    random_state=6,
)

bagging_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(**base_tree_params),
    n_jobs=-1,
    random_state=6,
)

In [21]:
# Fit every pipeline step on the training split and under-sample it.
X_train_preprocessed, y_train_preprocessed = preprocessing.fit_resample(X_train, y_train_transformed)

# The test split only passes through the fitted transformers: the trailing
# sampler step is sliced off so the test rows are not resampled.
X_test_preprocessed = preprocessing[:-1].transform(X_test)
y_test_preprocessed = y_test_transformed

In [22]:
# Re-attach the encoded label as the last column of each frame.
data = pd.concat([X_train_preprocessed, y_train_preprocessed], axis="columns")
test_data = pd.concat([X_test_preprocessed, y_test_preprocessed], axis="columns")

In [23]:
# Start a new experiment object for the ensemble comparison; this rebinds `exp`.
exp = ClassificationExperiment()

In [24]:
# Configure the experiment with the re-built train/test frames, 5 shuffled folds and a fixed session id.
# The target is given as a literal here; it has the same value as the `target` variable defined earlier.
exp.setup(data=data, 
          target="loan_status", 
          test_data=test_data, 
          preprocess=False, 
          fold=5, 
          fold_shuffle=True, 
          session_id=6)

Unnamed: 0,Description,Value
0,Session id,6
1,Target,loan_status
2,Target type,Binary
3,Original data shape,"(517684, 17)"
4,Transformed data shape,"(517684, 17)"
5,Transformed train set shape,"(396596, 17)"
6,Transformed test set shape,"(121088, 17)"
7,Numeric features,16


<pycaret.classification.oop.ClassificationExperiment at 0x2938119e1d0>

In [25]:
# Drop the metrics that are not reported, leaving the AUC and Recall columns seen in the result tables.
for metric_id in ("acc", "precision", "f1", "kappa", "mcc"):
    exp.remove_metric(metric_id)

### 4-2. 성능향상을 위한 하이퍼파라미터 튜닝

In [26]:
# Search space for the AdaBoost ensemble: 50 to 300 estimators in steps of 10.
boosting_grid = {
    "n_estimators": list(range(50, 301, 10)),
    "learning_rate": [
        0.0001, 0.0003, 0.0005,
        0.001, 0.003, 0.005,
        0.01, 0.03, 0.05,
        0.1, 0.3, 0.5,
        1,
    ],
}

# Search space for the bagging ensemble; both fraction parameters share the same candidates.
sampling_fractions = [0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]
bagging_grid = {
    "n_estimators": list(range(10, 101, 10)),
    "max_samples": list(sampling_fractions),
    "max_features": list(sampling_fractions),
}

In [27]:
# AdaBoost ensemble with the selected n_estimators / learning_rate on top of the tuned tree.
boosting_tune_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(
        criterion="entropy",
        max_depth=7,
        min_samples_split=26,
        min_samples_leaf=18,
        random_state=6,
    ),
    n_estimators=260,
    learning_rate=0.05,
    algorithm="SAMME",
    random_state=6,
)

In [28]:
# Bagging ensemble with the selected n_estimators / max_samples on top of the tuned tree.
bagging_tune_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(
        criterion="entropy",
        max_depth=7,
        min_samples_split=26,
        min_samples_leaf=18,
        random_state=6,
    ),
    n_estimators=90,
    max_samples=0.6,
    n_jobs=-1,
    random_state=6,
)

## 5. 다른 앙상블모델과 성능지표 비교

In [29]:
# Cross-validate the two tuned ensembles against the listed library models and rank the results by AUC.
exp.compare_models(include=[boosting_tune_model, bagging_tune_model, "rf", "gbc", "et", "xgboost", "lightgbm", "catboost"], sort="AUC")

Unnamed: 0,Model,AUC,Recall,TT (Sec)
7,CatBoost Classifier,0.9514,0.9136,19.548
6,Light Gradient Boosting Machine,0.9506,0.9141,0.796
5,Extreme Gradient Boosting,0.9503,0.9128,1.27
0,Ada Boost Classifier,0.9495,0.9118,185.758
3,Gradient Boosting Classifier,0.9494,0.9142,19.51
1,Bagging Classifier,0.9484,0.9152,13.338
4,Extra Trees Classifier,0.9471,0.9072,10.138
2,Random Forest Classifier,0.9469,0.9116,14.534


<catboost.core.CatBoostClassifier at 0x293a6288370>

## 6. 최종모델 선정

### BaggingClassifier

#### 이유
1. 해석가능성 : 여러개의 기본 추정기를 평균화하는 방식으로 작동하여 모델의 동작이 상대적으로 간단하고 직관적이다.
2. 성능지표 : 다른 앙상블모델과 비교하였을때, 성능면에서 큰 차이가 나지 않는다.
3. 계산비용 : 계산비용이 적어 효율적인 메모리관리가 가능하다.

In [30]:
# Cross-validate the tuned bagging ensemble inside the experiment and keep the returned model as the final one.
best_model = exp.create_model(bagging_tune_model)

Unnamed: 0_level_0,AUC,Recall
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.9473,0.9143
1,0.9486,0.9175
2,0.9491,0.9166
3,0.9485,0.9134
4,0.9484,0.9143
Mean,0.9484,0.9152
Std,0.0006,0.0016


### 6-1. 성능점수 확인

In [31]:
# Score the final model on `data`, the resampled training frame that was passed to setup.
exp.predict_model(best_model, data)

Unnamed: 0,Model,AUC,Recall
0,Bagging Classifier,0.9495,0.9161


Unnamed: 0,loan_amnt,term,int_rate,sub_grade,verification_status_Source Verified,verification_status_Verified,verification_status_Not Verified,dti,revol_util,avg_cur_bal,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,pct_tl_nvr_dlq,open_acc_rate,last_fico_score,installment,loan_status,prediction_label,prediction_score
1123198,35000.0,60.0,12.99,18.0,0,1,0,17.430000,67.500000,28274.0,165.0,16.0,100.000000,50.000000,667.0,796.179993,0,0,0.6619
1163528,4000.0,36.0,14.99,15.0,0,0,1,27.900000,60.500000,2346.0,64.0,6.0,100.000000,93.300003,567.0,138.639999,0,1,0.8738
1138582,12000.0,36.0,8.19,25.0,0,0,1,10.300000,45.700001,17839.0,193.0,4.0,97.900002,17.000000,742.0,377.089996,0,0,0.9823
1285218,13600.0,36.0,14.16,18.0,1,0,0,19.400000,50.000000,3389.0,79.0,14.0,90.900002,83.300003,717.0,465.869995,0,0,0.9788
1111585,21600.0,36.0,7.91,25.0,0,1,0,34.290001,73.000000,18514.0,163.0,6.0,100.000000,48.500000,707.0,675.969971,0,0,0.9557
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2152218,20000.0,36.0,7.35,26.0,1,0,0,1.460000,9.300000,13835.0,478.0,30.0,80.000000,50.000000,657.0,620.750000,1,0,0.6222
51128,10000.0,36.0,8.18,24.0,0,0,1,5.500000,65.699997,12330.0,335.0,10.0,100.000000,45.500000,632.0,314.190002,1,1,0.5106
2184276,15000.0,36.0,13.49,18.0,0,1,0,14.570000,17.000000,15735.0,42.0,10.0,95.500000,45.500000,502.0,508.959991,1,1,0.9421
1107960,20000.0,60.0,14.85,15.0,0,0,1,10.750000,32.700001,1806.0,213.0,28.0,83.599998,23.600000,632.0,474.230011,1,1,0.7443


In [32]:
# Score the final model without passing data explicitly.
# NOTE(review): presumably this evaluates on the test_data given to setup — confirm.
exp.predict_model(best_model)

Unnamed: 0,Model,AUC,Recall
0,Bagging Classifier,0.9482,0.9142


Unnamed: 0,loan_amnt,term,int_rate,sub_grade,verification_status_Source Verified,verification_status_Verified,verification_status_Not Verified,dti,revol_util,avg_cur_bal,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,pct_tl_nvr_dlq,open_acc_rate,last_fico_score,installment,loan_status,prediction_label,prediction_score
1869363,24000.0,60.0,8.90,25.0,0,1,0,19.820000,16.799999,26412.0,421.0,26.0,100.000000,66.699997,762.0,497.040009,0,0,0.9282
1090225,20000.0,36.0,5.32,29.0,1,0,0,11.810000,19.299999,2033.0,148.0,2.0,97.199997,44.400002,762.0,602.299988,0,0,0.9846
1096985,20000.0,36.0,5.32,29.0,1,0,0,10.020000,53.299999,7235.0,151.0,44.0,100.000000,86.699997,787.0,602.299988,0,0,0.9864
314319,20000.0,36.0,13.33,17.0,1,0,0,16.530001,16.400000,3311.0,211.0,1.0,96.300003,26.100000,652.0,677.059998,1,0,0.5784
368351,18400.0,36.0,6.68,27.0,0,0,1,13.710000,46.799999,2930.0,186.0,75.0,100.000000,30.799999,742.0,565.450012,0,0,0.9828
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2114881,15000.0,36.0,5.32,29.0,0,0,1,9.410000,22.600000,32764.0,139.0,38.0,100.000000,36.799999,792.0,451.720001,0,0,0.9861
1714755,7000.0,36.0,26.24,5.0,1,0,0,3.180000,55.500000,2872.0,38.0,14.0,100.000000,83.300003,587.0,282.929993,1,1,0.7706
30704,20000.0,60.0,7.89,25.0,0,0,1,20.980000,49.500000,7934.0,246.0,39.0,92.500000,45.000000,742.0,404.480011,0,0,0.9697
111043,1800.0,36.0,13.99,16.0,0,1,0,18.059999,52.900002,4076.0,111.0,35.0,100.000000,35.299999,602.0,61.509998,0,1,0.7131


In [33]:
# Open the interactive evaluation widget.
# NOTE(review): best_model[1] indexes into the model; if best_model is the bagging ensemble this likely
# selects a single fitted base estimator rather than the whole ensemble — confirm this is intended.
exp.evaluate_model(best_model[1])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…