In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from category_encoders import BinaryEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer 
from sklearn.metrics import fbeta_score, roc_auc_score
import time
from pycaret.classification import *
import shap


In [2]:
# Load the raw loan table, then keep only the rows whose application_type
# is "Individual".
df = pd.read_csv('loan_data.csv', low_memory=False)
df_ind = df.loc[df['application_type'] == "Individual"]

In [3]:
# Name of the label column; it is also the last entry of the selection below.
target = "loan_status"

# Columns pulled from the raw table: 21 model inputs plus the target.
# annual_inc is only used for feature engineering and is dropped afterwards.
select_features = [
    "loan_amnt", "term", "int_rate", "installment", "sub_grade",
    "emp_length", "verification_status", "addr_state", "dti",
    "fico_range_low", "fico_range_high",
    "last_fico_range_high", "last_fico_range_low",
    "avg_cur_bal", "open_acc", "revol_util", "total_acc", "annual_inc",
    "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", "pct_tl_nvr_dlq",
    target,
]

In [4]:
# Keep the selected columns and discard every row that has a missing value.
df = df_ind[select_features].dropna()

# Separate the predictors from the label column.
X = df.drop(columns=[target])
y = df["loan_status"]

# Encode the label as an integer: Charged Off -> 1, Fully Paid -> 0.
binary_y = y.replace({'Charged Off': 1, 'Fully Paid': 0})

# 80 % train; the remaining 20 % is halved into validation and test.
# Both splits are stratified on the label and use the same seed.
X_train, X_1, label_y_train, label_y_1 = train_test_split(
    X,
    binary_y,
    test_size=0.2,
    random_state=6,
    stratify=binary_y,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_1,
    label_y_1,
    test_size=0.5,
    random_state=6,
    stratify=label_y_1,
)

In [5]:
# Re-attach each label series to its feature frame, column-wise.
# `data` is the training split; `test` is built from the validation split
# (X_val, y_val), not from X_test / y_test.
data = pd.concat([X_train, label_y_train], axis=1)
test = pd.concat([X_val, y_val], axis=1)

In [6]:
def _quantile_levels(values, labels, fit_values=None):
    """Bin ``values`` into ``len(labels)`` equal-frequency levels.

    When ``fit_values`` is None the quantile edges come from ``values`` itself.
    Otherwise the edges are computed on ``fit_values`` and applied to ``values``;
    the outer edges are opened to +/- infinity so values outside the fitted
    range still fall into the first or last level.
    """
    n_levels = len(labels)
    if fit_values is None:
        return pd.qcut(values, n_levels, labels=labels)
    _, edges = pd.qcut(fit_values, n_levels, retbins=True)
    edges = np.asarray(edges, dtype=float).copy()
    edges[0], edges[-1] = -np.inf, np.inf
    return pd.cut(values, bins=edges, labels=labels)


def _tie_tolerant_levels(values, labels, fit_values=None):
    """Bin a heavily tied column into ``len(labels)`` levels.

    When ``fit_values`` is None, ties are broken by row order
    (``rank(method='first')``) before the quantile cut, so the cut never fails
    on duplicate edges. Otherwise each value is placed by the share of
    ``fit_values`` strictly below it, so equal values always get the same level.
    """
    n_levels = len(labels)
    if fit_values is None:
        return pd.qcut(values.rank(method='first'), n_levels, labels=labels)
    sorted_fit = np.sort(fit_values.to_numpy())
    n_below = np.searchsorted(sorted_fit, values.to_numpy(), side='left')
    # Integer floor of (share below * n_levels), capped at the top level.
    codes = np.minimum(n_below * n_levels // len(sorted_fit), n_levels - 1)
    levels = pd.Categorical.from_codes(codes, categories=labels, ordered=True)
    return pd.Series(levels, index=values.index)


def new_features(df, reference=None):
    """Return a feature-engineered copy of ``df``; the input is not modified.

    Steps: clip ``dti`` to [0, 40] and the percentage columns to [0, 100], add
    ``installment_ratio`` and ``loan_vs_inc``, replace ``addr_state`` by a count
    encoding, and replace ``avg_cur_bal``, ``mo_sin_rcnt_rev_tl_op`` and
    ``pct_tl_nvr_dlq`` by quantile levels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform.
    reference : pandas.DataFrame, optional
        Untransformed frame from which the state counts and quantile bins are
        learned (typically the training split). With the default ``None`` they
        are learned from ``df`` itself, which is the original behaviour. Pass
        the same reference for every split to put them on one scale.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.

    Raises
    ------
    ValueError
        If ``reference`` is given but has no rows.
    """
    if reference is not None and len(reference) == 0:
        raise ValueError("reference must contain at least one row")

    transformed_df = df.copy()

    # Limit dti to the 0-40 range.
    transformed_df['dti'] = transformed_df['dti'].clip(lower=0, upper=40)

    # Limit the percentage features to the 0-100 range.
    for col in ['revol_util', 'pct_tl_nvr_dlq']:
        transformed_df[col] = transformed_df[col].clip(lower=0, upper=100)

    # installment_ratio: monthly installment relative to the total loan amount.
    transformed_df['installment_ratio'] = transformed_df['installment'] / transformed_df['loan_amnt']

    # loan_vs_inc: loan amount relative to annual income. annual_inc can be 0,
    # so 1 is added to the denominator to avoid division by zero.
    transformed_df['loan_vs_inc'] = transformed_df['loan_amnt'] / (transformed_df['annual_inc'] + 1)
    transformed_df = transformed_df.drop(columns=['loan_amnt', 'annual_inc'])

    # Count encoding of addr_state.
    if reference is None:
        state_counts = transformed_df['addr_state'].value_counts()
        transformed_df['addr_state_count'] = transformed_df['addr_state'].map(state_counts)
    else:
        state_counts = reference['addr_state'].value_counts()
        # A state that never occurs in the reference has a count of 0.
        transformed_df['addr_state_count'] = (
            transformed_df['addr_state'].map(state_counts).fillna(0).astype('int64')
        )
    transformed_df = transformed_df.drop(columns=['addr_state'])

    # avg_cur_bal: five equal-frequency levels.
    fit_bal = None if reference is None else reference['avg_cur_bal']
    transformed_df['avg_cur_bal_level'] = _quantile_levels(
        transformed_df['avg_cur_bal'], [1, 2, 3, 4, 5], fit_bal
    )
    transformed_df = transformed_df.drop(columns=['avg_cur_bal'])

    # mo_sin_rcnt_rev_tl_op: four equal-frequency levels.
    fit_recent = None if reference is None else reference['mo_sin_rcnt_rev_tl_op']
    transformed_df['mo_sin_rcnt_rev_tl_op_level'] = _quantile_levels(
        transformed_df['mo_sin_rcnt_rev_tl_op'], [1, 2, 3, 4], fit_recent
    )
    transformed_df = transformed_df.drop(columns=['mo_sin_rcnt_rev_tl_op'])

    # pct_tl_nvr_dlq: three levels; the reference gets the same 0-100 clipping
    # as the transformed column before it is used for binning.
    fit_pct = None if reference is None else reference['pct_tl_nvr_dlq'].clip(lower=0, upper=100)
    transformed_df['pct_tl_nvr_dlq_level'] = _tie_tolerant_levels(
        transformed_df['pct_tl_nvr_dlq'], [1, 2, 3], fit_pct
    )
    transformed_df = transformed_df.drop(columns=['pct_tl_nvr_dlq'])

    return transformed_df

In [7]:
# Run the feature engineering on the training frame and on the validation frame.
# NOTE(review): each frame is transformed on its own, so the state counts and
# quantile bins computed inside new_features come from that frame alone —
# confirm that the two frames end up on comparable scales.
pre_data, pre_test = (new_features(frame) for frame in (data, test))

In [8]:
# Display the column names of the feature-engineered training frame.
pre_data.columns

Index(['term', 'int_rate', 'installment', 'sub_grade', 'emp_length',
       'verification_status', 'dti', 'fico_range_low', 'fico_range_high',
       'last_fico_range_high', 'last_fico_range_low', 'open_acc', 'revol_util',
       'total_acc', 'mo_sin_old_rev_tl_op', 'loan_status', 'installment_ratio',
       'loan_vs_inc', 'addr_state_count', 'avg_cur_bal_level',
       'mo_sin_rcnt_rev_tl_op_level', 'pct_tl_nvr_dlq_level'],
      dtype='object')

In [9]:
# Column roles passed to the experiment setup: 21 features here, 22 columns
# once the target is included.

# Categorical columns without an inherent order.
nominal_features = ["term", "sub_grade", "verification_status"]

# Ordered categorical columns mapped to their levels, lowest first.
# NOTE(review): new_features creates the *_level columns with integer labels,
# whereas the levels listed here are strings — confirm the encoder matches them.
ordinal_features = {
    'mo_sin_rcnt_rev_tl_op_level': [str(level) for level in range(1, 5)],
    'avg_cur_bal_level': [str(level) for level in range(1, 6)],
    'pct_tl_nvr_dlq_level': [str(level) for level in range(1, 4)],
    "emp_length": [
        "< 1 year", "1 year", "2 years", "3 years", "4 years", "5 years",
        "6 years", "7 years", "8 years", "9 years", "10+ years",
    ],
}

# Continuous columns, including the engineered ratios and the state count.
numeric_features = [
    "int_rate", "installment", "dti", "fico_range_low", "fico_range_high",
    "revol_util", "total_acc", 'open_acc',
    "last_fico_range_high", "last_fico_range_low",
    "mo_sin_old_rev_tl_op", "addr_state_count",
    'installment_ratio', 'loan_vs_inc',
]

In [10]:
# Create the PyCaret classification experiment object used by the cells below.
exp = ClassificationExperiment()

##  **Best** 이상치 remove_outliers = False / fold_shuffle=True / robust / all OneHot
catboost AUC : 0.9573, Recall : 0.9251

In [31]:
# Configure the experiment. The call is the last expression of the cell, so
# the returned experiment object is what the cell displays.
exp.setup(
    # Data: engineered training frame, with the engineered validation frame
    # supplied as the hold-out set.
    data=pre_data,
    test_data=pre_test,
    target=target,
    # Column roles.
    numeric_features=numeric_features,
    categorical_features=nominal_features,
    ordinal_features=ordinal_features,
    max_encoding_ohe=36,
    # Class imbalance handling.
    fix_imbalance=True,
    fix_imbalance_method="RandomUnderSampler",
    # Preprocessing.
    remove_multicollinearity=True,
    multicollinearity_threshold=0.9,
    normalize=True,
    normalize_method='robust',
    # Cross-validation and seed.
    fold=3,
    fold_shuffle=True,
    session_id=6,
)

Unnamed: 0,Description,Value
0,Session id,6
1,Target,loan_status
2,Target type,Binary
3,Original data shape,"(1060568, 22)"
4,Transformed data shape,"(489477, 57)"
5,Transformed train set shape,"(371636, 57)"
6,Transformed test set shape,"(117841, 57)"
7,Ordinal features,5
8,Numeric features,14
9,Categorical features,3


<pycaret.classification.oop.ClassificationExperiment at 0x1decf2b0160>

In [19]:
# Best run: compare every available classifier except KNN with 5-fold CV,
# ranked by AUC, and report how long the comparison took.
# perf_counter is a monotonic clock, so the elapsed time cannot be distorted
# by system clock adjustments during the long run.
cell_start_time = time.perf_counter()
# Keep the returned top-ranked model so it can be reused later without
# repeating the comparison.
best_model = exp.compare_models(exclude=["knn"], fold=5, round=4, sort='AUC')
cell_end_time = time.perf_counter()
print("CELL RUN TIME : ", cell_end_time - cell_start_time)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.8888,0.9576,0.9249,0.6541,0.7663,0.6961,0.7141,39.808
lightgbm,Light Gradient Boosting Machine,0.888,0.9566,0.926,0.6519,0.7652,0.6945,0.7129,9.794
xgboost,Extreme Gradient Boosting,0.8879,0.9564,0.9242,0.6522,0.7647,0.694,0.7122,12.772
gbc,Gradient Boosting Classifier,0.8884,0.9553,0.924,0.6535,0.7655,0.6951,0.7131,40.582
rf,Random Forest Classifier,0.8879,0.9537,0.9236,0.6522,0.7645,0.6938,0.7118,22.83
et,Extra Trees Classifier,0.8891,0.9537,0.9175,0.6565,0.7653,0.6953,0.712,24.972
ada,Ada Boost Classifier,0.883,0.9536,0.9265,0.6406,0.7575,0.6837,0.704,16.078
lr,Logistic Regression,0.8975,0.9531,0.9017,0.6814,0.7763,0.7115,0.7232,17.99
lda,Linear Discriminant Analysis,0.8953,0.9526,0.908,0.6739,0.7737,0.7075,0.7208,6.616
dt,Decision Tree Classifier,0.8421,0.8404,0.8375,0.5674,0.6765,0.5771,0.5962,14.606


CELL RUN TIME :  1290.7405123710632


In [47]:
# Tune the `catboost` estimator over the custom parameter grid, also reporting
# training-fold scores. choose_better=True keeps the original model whenever
# tuning does not improve on it.
# NOTE(review): return_tuner=True presumably makes the call return the tuner
# alongside the model, so `best_catboost1` may be a pair — confirm.
# NOTE(review): `catboost` and `custom_param` are not defined in any cell shown
# in this notebook — confirm they exist before running on a fresh kernel.
best_catboost1 = exp.tune_model(
    catboost,
    custom_grid=custom_param,
    choose_better=True,
    return_tuner=True,
    return_train_score=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8883,0.956,0.9265,0.6524,0.7657,0.6952,0.7136
CV-Train,1,0.8893,0.956,0.9251,0.6551,0.767,0.6972,0.715
CV-Train,2,0.8883,0.9558,0.9249,0.6528,0.7654,0.6949,0.7131
CV-Val,0,0.8881,0.9553,0.9246,0.6524,0.765,0.6944,0.7126
CV-Val,1,0.8878,0.9552,0.9245,0.6517,0.7645,0.6937,0.712
CV-Val,2,0.8892,0.956,0.9261,0.6546,0.767,0.6971,0.7152
CV-Train,Mean,0.8886,0.9559,0.9255,0.6534,0.766,0.6958,0.7139
CV-Train,Std,0.0005,0.0001,0.0007,0.0012,0.0007,0.001,0.0008
CV-Val,Mean,0.8883,0.9555,0.9251,0.6529,0.7655,0.6951,0.7132
CV-Val,Std,0.0006,0.0004,0.0007,0.0012,0.0011,0.0015,0.0014


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


## bagging

In [27]:
# Ensemble of the `catboost0` estimator evaluated with 5-fold CV
# (mean AUC 0.9577).
# NOTE(review): `catboost0` is not defined in any cell shown in this notebook.
bag_cat = exp.ensemble_model(
    estimator=catboost0,
    fold=5,
    optimize='AUC',
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8879,0.9575,0.925,0.6521,0.7649,0.6942,0.7125
1,0.889,0.9581,0.9274,0.654,0.7671,0.6971,0.7154
2,0.8876,0.957,0.9244,0.6514,0.7642,0.6933,0.7116
3,0.8895,0.9582,0.9261,0.6555,0.7676,0.6979,0.7158
4,0.89,0.9579,0.9247,0.657,0.7682,0.6988,0.7163
Mean,0.8888,0.9577,0.9255,0.654,0.7664,0.6962,0.7143
Std,0.0009,0.0004,0.0011,0.0021,0.0016,0.0021,0.0019


In [29]:
# Same ensemble call with the metrics rounded to two decimals (mean AUC 0.96).
# NOTE(review): this builds the ensemble again and overwrites `bag_cat` from
# the previous cell; `catboost0` is not defined in any cell shown here.
bag_cat = exp.ensemble_model(
    estimator=catboost0,
    fold=5,
    round=2,
    optimize='AUC',
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.89,0.96,0.92,0.65,0.76,0.69,0.71
1,0.89,0.96,0.93,0.65,0.77,0.7,0.72
2,0.89,0.96,0.92,0.65,0.76,0.69,0.71
3,0.89,0.96,0.93,0.66,0.77,0.7,0.72
4,0.89,0.96,0.92,0.66,0.77,0.7,0.72
Mean,0.89,0.96,0.93,0.65,0.77,0.7,0.71
Std,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 최종 예측 모델

In [None]:
# Produce the final prediction model from the ensembled CatBoost model.
# NOTE(review): presumably this refits on all data including the hold-out —
# confirm against the PyCaret documentation.
final_model = exp.finalize_model(bag_cat)