Feature Engineering에서 가장 좋은 성능을 보였던 데이터 (10개 컬럼) 를 가지고

RFE를 사용하여 최적의 특성 집합을 뽑아 모델링 해보자

In [50]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE, RFECV
warnings.filterwarnings('ignore')

In [51]:
train = pd.read_csv('./train_new.csv')

train.drop(['ID', '근로기간', '연간소득', '주택소유상태', '최근_2년간_연체_횟수', 
            '부채_대비_소득_비율', '총계좌수', '대출목적', '총연체금액', '연체계좌수'], axis=1, inplace=True)

train['대출금액*대출기간'] = train['대출금액'] * train['대출기간']
train['대출금액/대출기간'] = train['대출금액'] / train['대출기간']

train['총상환원금*대출기간'] = train['총상환원금'] * train['대출기간']
train['총상환이자*대출기간'] = train['총상환이자'] * train['대출기간']

train['총상환원금/대출기간'] = train['총상환원금'] / train['대출기간']
train['총상환이자/대출기간'] = train['총상환이자'] / train['대출기간']

le = LabelEncoder()

train['대출등급'] = le.fit_transform(train['대출등급'])

X = train.loc[:, train.columns != '대출등급']
y = train.loc[:, '대출등급']

ss = StandardScaler()

X_ss = ss.fit_transform(X)

X_train, X_val, y_train, y_val = train_test_split(X_ss, y, test_size=0.3, stratify=y, random_state=42)

In [52]:
X

Unnamed: 0,대출금액,대출기간,총상환원금,총상환이자,대출금액*대출기간,대출금액/대출기간,총상환원금*대출기간,총상환이자*대출기간,총상환원금/대출기간,총상환이자/대출기간
0,12480000,3,0,0.0,37440000,4160000.0,0,0.0,0.0,0.0
1,14400000,5,373572,234060.0,72000000,2880000.0,1867860,1170300.0,74714.4,46812.0
2,12000000,3,928644,151944.0,36000000,4000000.0,2785932,455832.0,309548.0,50648.0
3,14400000,3,325824,153108.0,43200000,4800000.0,977472,459324.0,108608.0,51036.0
4,4800000,3,240216,55428.0,14400000,1600000.0,720648,166284.0,80072.0,18476.0
...,...,...,...,...,...,...,...,...,...,...
90617,14400000,3,974580,492168.0,43200000,4800000.0,2923740,1476504.0,324860.0,164056.0
90618,28800000,5,583728,855084.0,144000000,5760000.0,2918640,4275420.0,116745.6,171016.8
90619,14400000,3,1489128,241236.0,43200000,4800000.0,4467384,723708.0,496376.0,80412.0
90620,15600000,3,1378368,818076.0,46800000,5200000.0,4135104,2454228.0,459456.0,272692.0


In [19]:
# 최종데이터 저장
train[['총상환원금','총상환원금*대출기간','총상환이자*대출기간','총상환원금/대출기간','총상환이자/대출기간','대출등급']].to_csv('train_최종.csv', index=False)

In [12]:
model = XGBClassifier(objective='multi:softmax', n_estimators=50, max_depth=40, 
          learning_rate=0.4, n_jobs=-1, random_state=42)

rfe = RFE(model)

rfe.fit(X_train, y_train)

selected_features = rfe.support_
print(selected_features)

[False  True False False False False  True  True  True  True]


1,6,7,8,9번째 컬럼이 선택되었음, 적용해서 SKFold go

In [17]:
indices_of_true = list(np.where(selected_features)[0])
X = train.loc[:, train.columns[indices_of_true]]
X_ss = ss.fit_transform(X)
X_train, X_val, y_train, y_val = train_test_split(X_ss, y, test_size=0.3, stratify=y, random_state=42)

In [53]:
def grid_search(model, params, random=False):
    clf = model
    if not random:
        grid = GridSearchCV(clf, params,
                                scoring='f1_macro', cv=5,
                                n_jobs=-1)
    else:
        grid = RandomizedSearchCV(clf, params, n_iter=10,
                                scoring='f1_macro', cv=5,
                                n_jobs=-1, random_state=42)
        
    grid.fit(X_train, y_train)
    
    
    best_model = grid.best_estimator_
    
    best_params = grid.best_params_
    print("최상의 매개변수:", best_params)
    
    best_score = grid.best_score_
    print("훈련 점수: {:.3f}".format(best_score))
    
    y_pred = best_model.predict(X_val)
    macro_f1_val = f1_score(y_val, y_pred, average='macro')
    print('테스트 세트 점수: {:.3f}'.format(macro_f1_val))

In [29]:
params = {'n_estimators' : [50,100,200,300,400,500],
          'learning_rate' : [0.01,0.05,0.1,0.2,0.3,0.4],
          'max_depth' : [6,10,20,30,40,50,100],
          'objective' : ['multi:softmax']}

grid_search(XGBClassifier(random_state=42, n_jobs=-1), params, random=True)

최상의 매개변수: {'objective': 'multi:softprob', 'n_estimators': 50, 'max_depth': 40, 'learning_rate': 0.4}
훈련 점수: 0.829
테스트 세트 점수: 0.825


최상의 매개변수: {'objective': 'multi:softprob', 'n_estimators': 50, 'max_depth': 40, 'learning_rate': 0.4}
훈련 점수: 0.829
테스트 세트 점수: 0.825

In [45]:
X_ss = pd.DataFrame(X_ss, columns=X.columns)
y = pd.DataFrame(y)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

f1_macro_scores = []

def skf_score(model):
    for train_idx, valid_idx in skf.split(X_ss, y):
        X_train = X_ss.iloc[train_idx]
        X_val = y.iloc[train_idx]

        y_train = X_ss.iloc[valid_idx]
        y_val = y.iloc[valid_idx]

        model.fit(X_train, X_val)

        pred = model.predict(y_train)

        f1_macro = f1_score(y_val, pred, average='macro')
        f1_macro_scores.append(f1_macro)
    
    average_f1_macro = np.mean(f1_macro_scores)

    print("Average F1-macro score:", average_f1_macro)
    try:
        if model.feature_importances_.any():
            feature_importances = model.feature_importances_
            print("\n",'-'*10,'특성중요도','-'*10)
            for feature, importance in zip(X_ss.columns, feature_importances):
                print(f"{feature}: {importance}")
    except:
        None

In [31]:
skf_score(XGBClassifier(objective='multi:softmax', n_estimators=50, max_depth=40, 
          learning_rate=0.4, n_jobs=-1, random_state=42))

Average F1-macro score: 0.8489262679454228

 ---------- 특성중요도 ----------
대출기간: 0.661698043346405
대출금액/대출기간: 0.01999281533062458
총상환원금*대출기간: 0.08542182296514511
총상환이자*대출기간: 0.11432438343763351
총상환원금/대출기간: 0.11856291443109512


* DecisionTree RFE

In [4]:
model = DecisionTreeClassifier(min_samples_leaf=19, min_impurity_decrease= 0, max_features=0.68,
                                 max_depth=17, class_weight=None, random_state=42)
rfe = RFE(model)

rfe.fit(X_train, y_train)

selected_features = rfe.support_
print(selected_features)

[False False False  True False False  True  True  True  True]


In [4]:
indices_of_true = list(np.where(selected_features)[0])
X = train.loc[:, train.columns[indices_of_true]]
X_ss = ss.fit_transform(X)
X_train, X_val, y_train, y_val = train_test_split(X_ss, y, test_size=0.3, stratify=y, random_state=42)

In [10]:
params = {'min_samples_leaf':[18,19,20,21,22],
          'min_impurity_decrease':[0.0],
          'max_features':['auto',0.6,0.61,0.62,0.63,0.64,0.65,0.66,0.67,0.68,0.69,0.70],
          'max_depth':[None,11,12,13,14,15,16,17,18],
          'class_weight' : [None, 'balanced']}

grid_search(DecisionTreeClassifier(random_state=42), params, random=True)

최상의 매개변수: {'min_samples_leaf': 18, 'min_impurity_decrease': 0.0, 'max_features': 0.65, 'max_depth': 15, 'class_weight': None}
훈련 점수: 0.752
테스트 세트 점수: 0.747


In [12]:
X_ss = pd.DataFrame(X_ss, columns=X.columns)
y = pd.DataFrame(y)

In [13]:
skf_score(DecisionTreeClassifier(min_samples_leaf=18, min_impurity_decrease=0, max_features=0.65,
                                 max_depth=15, class_weight=None, random_state=42))

Average F1-macro score: 0.7777502879496405

 ---------- 특성중요도 ----------
총상환이자: 0.23804910992262585
대출금액/대출기간: 0.037187671370198565
총상환원금*대출기간: 0.2532472581864127
총상환이자*대출기간: 0.24442337301125633
총상환원금/대출기간: 0.22709258750950667


* RandomForest RFE

In [54]:
model = RandomForestClassifier(n_estimators=770, min_samples_leaf=1, min_impurity_decrease=0,
                               max_depth=95, random_state=42, n_jobs=-1)

rfe = RFE(model)

rfe.fit(X_train, y_train)

selected_features = rfe.support_
print(selected_features)

[False False  True False False False  True  True  True  True]


* RandomForest RFECV

In [22]:
model = RandomForestClassifier(n_estimators=770, min_samples_leaf=1, min_impurity_decrease=0, 
                               max_depth=95, random_state=42, n_jobs=-1)

rfecv = RFECV(model, step=1, cv=5, min_features_to_select=5)

rfecv.fit(X_train, y_train)

selected_features = rfecv.support_
print(selected_features)

[ True False  True  True  True  True  True  True  True  True]


In [55]:
indices_of_true = list(np.where(selected_features)[0])
X = X.loc[:, X.columns[indices_of_true]]
X_ss = ss.fit_transform(X)
X_train, X_val, y_train, y_val = train_test_split(X_ss, y, test_size=0.3, stratify=y, random_state=42)

In [56]:
X

Unnamed: 0,총상환원금,총상환원금*대출기간,총상환이자*대출기간,총상환원금/대출기간,총상환이자/대출기간
0,0,0,0.0,0.0,0.0
1,373572,1867860,1170300.0,74714.4,46812.0
2,928644,2785932,455832.0,309548.0,50648.0
3,325824,977472,459324.0,108608.0,51036.0
4,240216,720648,166284.0,80072.0,18476.0
...,...,...,...,...,...
90617,974580,2923740,1476504.0,324860.0,164056.0
90618,583728,2918640,4275420.0,116745.6,171016.8
90619,1489128,4467384,723708.0,496376.0,80412.0
90620,1378368,4135104,2454228.0,459456.0,272692.0


In [57]:
X_ss = pd.DataFrame(X_ss, columns=X.columns)
y = pd.DataFrame(y)

* RFE 결과 (5개 컬럼)

In [59]:
skf_score(RandomForestClassifier(n_estimators=770, min_samples_leaf=1, min_impurity_decrease=0,
                                 max_features=0.7, max_depth=99, random_state=42, n_jobs=-1))

Average F1-macro score: 0.8331198292611628

 ---------- 특성중요도 ----------
총상환원금: 0.1707860211868218
총상환원금*대출기간: 0.18923458565216014
총상환이자*대출기간: 0.23530360829164357
총상환원금/대출기간: 0.16499245610770408
총상환이자/대출기간: 0.2396833287616705


--> 결과 날아가서 다음 날 다시했더니 0.8499651809840966에서 0.8331198292611628로 바뀌었다.. Why? 랜덤시드 이슈????

어쨌든 결과를 이미 기록해두었으니 그대로 이용하자..

RFECV를 적용한결과 RFE보다 0.0004점 낮게 나옴

* RFECV 결과 (9개 컬럼)

Average F1-macro score: 0.849547912699314

 ---------- 특성중요도 ----------

대출금액: 0.03616528765301695

총상환원금: 0.15239655350743023

총상환이자: 0.13747149025595118

대출금액*대출기간: 0.04060973134659964

대출금액/대출기간: 0.038496830939239265

총상환원금*대출기간: 0.1678831708944553

총상환이자*대출기간: 0.14018528448684664

총상환원금/대출기간: 0.1520173273525923

총상환이자/대출기간: 0.1347743235638686

* LGBM RFE

In [8]:
model = LGBMClassifier(objective='multiclass', num_leaves=25, n_estimators=500, max_depth=10, 
               learning_rate=0.1, n_jobs=-1, random_state=42)

rfe = RFE(model)

rfe.fit(X_train, y_train)

selected_features = rfe.support_
print(selected_features)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002827 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2298
[LightGBM] [Info] Number of data points in the train set: 63435, number of used features: 10
[LightGBM] [Info] Start training from score -1.744243
[LightGBM] [Info] Start training from score -1.208106
[LightGBM] [Info] Start training from score -1.248814
[LightGBM] [Info] Start training from score -1.982449
[LightGBM] [Info] Start training from score -2.564256
[LightGBM] [Info] Start training from score -3.885346
[LightGBM] [Info] Start training from score -5.433754
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002584 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 63435, number of used features: 9
[LightGBM] [Info] Start training from score -

In [9]:
indices_of_true = list(np.where(selected_features)[0])
X = train.loc[:, train.columns[indices_of_true]]
X_ss = ss.fit_transform(X)
X_train, X_val, y_train, y_val = train_test_split(X_ss, y, test_size=0.3, stratify=y, random_state=42)

In [10]:
params = {'n_estimators' : [200,300,400,500],
          'learning_rate' : [0.01,0.05,0.1,0.15],
          'max_depth' : [-1,10,20,30,40,50],
          'num_leaves' : [1,5,10,15,20,25,31,40,50]}

grid_search(LGBMClassifier(objective='multiclass', random_state=42, n_jobs=-1), params, random=True)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001446 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 63435, number of used features: 5
[LightGBM] [Info] Start training from score -1.744243
[LightGBM] [Info] Start training from score -1.208106
[LightGBM] [Info] Start training from score -1.248814
[LightGBM] [Info] Start training from score -1.982449
[LightGBM] [Info] Start training from score -2.564256
[LightGBM] [Info] Start training from score -3.885346
[LightGBM] [Info] Start training from score -5.433754
최상의 매개변수: {'num_leaves': 50, 'n_estimators': 300, 'max_depth': 40, 'learning_rate': 0.1}
훈련 점수: 0.819
테스트 세트 점수: 0.819


In [11]:
X_ss = pd.DataFrame(X_ss, columns=X.columns)
y = pd.DataFrame(y)

In [12]:
skf_score(LGBMClassifier(objective='multiclass', num_leaves=50, n_estimators=300, max_depth=40, 
               learning_rate=0.1, n_jobs=-1, random_state=42))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003594 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 81559, number of used features: 5
[LightGBM] [Info] Start training from score -1.744289
[LightGBM] [Info] Start training from score -1.208056
[LightGBM] [Info] Start training from score -1.248847
[LightGBM] [Info] Start training from score -1.982382
[LightGBM] [Info] Start training from score -2.564275
[LightGBM] [Info] Start training from score -3.885514
[LightGBM] [Info] Start training from score -5.434151
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001201 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 81559, number of used features: 5
[LightGBM] [Info] Start training from score -1

model|k-Fold|Sk-Fold
-|-|-
DecisionTree Classifier|0.747|0.7777502879496405
RandomForest Classifier|0.839|0.8499651809840966 -> 최고성능
XGBoost Classifier|0.825|0.8489262679454228
Light GBM Classifier|0.819|0.8310357025653075