### 고객만족 데이터 세트를 이용항 앙상블[실습]

In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, Binarizer
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import missingno as ns

import warnings
warnings.filterwarnings('ignore')

In [2]:
from xgboost import XGBClassifier

In [3]:
# 학습 데이터
customer_train = pd.read_csv('./data/train.csv')
customer_train.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [4]:
# 테스트 데이터
customer_test = pd.read_csv('./data/test.csv')
customer_test.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,2,2,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40532.1
1,5,2,35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45486.72
2,6,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46993.95
3,7,2,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,187898.61
4,9,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,73649.73


In [5]:
# 데이터 전처리 없이 XGBM을 이용한 예측
X_train, X_test, y_train, y_test = train_test_split(customer_train.iloc[:,:-1] ,customer_train.loc[:,'TARGET'], test_size=0.2, random_state=100)

In [6]:
# 클래스에 대한 분포 확인
print(customer_train.TARGET.value_counts())
# 비율
unsati_cnt = customer_train[customer_train['TARGET']== 1]['TARGET'].count()/ customer_train['TARGET'].count()
print('불만족 비율: ',unsati_cnt)

0    73012
1     3008
Name: TARGET, dtype: int64
불만족 비율:  0.0395685345961589


In [7]:
# 데이터 전처리 없이 XGBM을 이용한 예측

# 성능평가를 조기 중단 파라미터를 설정하고 학습/예측/평가
# 하이퍼 파라미터 튜닝을 해보자 - GridSearchCV - 교차검증
# XGBM을 이용한 예측
# 피처 임포턴트 시각화

# stacking model 로 변화하여 성능평가

In [8]:
customer_train.isna().sum()

ID                               0
var3                             0
var15                            0
imp_ent_var16_ult1               0
imp_op_var39_comer_ult1          0
imp_op_var39_comer_ult3          0
imp_op_var40_comer_ult1          0
imp_op_var40_comer_ult3          0
imp_op_var40_efect_ult1          0
imp_op_var40_efect_ult3          0
imp_op_var40_ult1                0
imp_op_var41_comer_ult1          0
imp_op_var41_comer_ult3          0
imp_op_var41_efect_ult1          0
imp_op_var41_efect_ult3          0
imp_op_var41_ult1                0
imp_op_var39_efect_ult1          0
imp_op_var39_efect_ult3          0
imp_op_var39_ult1                0
imp_sal_var16_ult1               0
ind_var1_0                       0
ind_var1                         0
ind_var2_0                       0
ind_var2                         0
ind_var5_0                       0
ind_var5                         0
ind_var6_0                       0
ind_var6                         0
ind_var8_0          

In [9]:
def classifier_eval(y_test , y_pred) :
    print('오차행렬 : \n' , confusion_matrix(y_test, y_pred))
    print('정확도   : ' , accuracy_score(y_test, y_pred))
    print('정밀도   : ' , precision_score(y_test, y_pred))
    print('재현율   : ' , recall_score(y_test, y_pred))
    print('F1       : ' , f1_score(y_test, y_pred))
    print('AUC      : ' , roc_auc_score(y_test, y_pred))

In [10]:
# 성능평가를 조기 중단 파라미터를 설정하고 학습/예측/평가
sklearn_xgboost_model = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
sklearn_xgboost_model.fit(X_train, y_train,
                         early_stopping_rounds=100, eval_metric='logloss',eval_set=[(X_test, y_test)], verbose=True)

[0]	validation_0-logloss:0.611757
Will train until validation_0-logloss hasn't improved in 100 rounds.
[1]	validation_0-logloss:0.54517
[2]	validation_0-logloss:0.489864
[3]	validation_0-logloss:0.443297
[4]	validation_0-logloss:0.403783
[5]	validation_0-logloss:0.369954
[6]	validation_0-logloss:0.340852
[7]	validation_0-logloss:0.315703
[8]	validation_0-logloss:0.293898
[9]	validation_0-logloss:0.274906
[10]	validation_0-logloss:0.258339
[11]	validation_0-logloss:0.243854
[12]	validation_0-logloss:0.231179
[13]	validation_0-logloss:0.219991
[14]	validation_0-logloss:0.210212
[15]	validation_0-logloss:0.201666
[16]	validation_0-logloss:0.194055
[17]	validation_0-logloss:0.187412
[18]	validation_0-logloss:0.181546
[19]	validation_0-logloss:0.176361
[20]	validation_0-logloss:0.171859
[21]	validation_0-logloss:0.167852
[22]	validation_0-logloss:0.164302
[23]	validation_0-logloss:0.161165
[24]	validation_0-logloss:0.158455
[25]	validation_0-logloss:0.156096
[26]	validation_0-logloss:0.1539

[230]	validation_0-logloss:0.135182
[231]	validation_0-logloss:0.135182
[232]	validation_0-logloss:0.135179
[233]	validation_0-logloss:0.135186
[234]	validation_0-logloss:0.135187
[235]	validation_0-logloss:0.135198
[236]	validation_0-logloss:0.135213
[237]	validation_0-logloss:0.135218
[238]	validation_0-logloss:0.135242
[239]	validation_0-logloss:0.135244
[240]	validation_0-logloss:0.135253
[241]	validation_0-logloss:0.135246
[242]	validation_0-logloss:0.135237
[243]	validation_0-logloss:0.135235
[244]	validation_0-logloss:0.135237
[245]	validation_0-logloss:0.135243
[246]	validation_0-logloss:0.135271
[247]	validation_0-logloss:0.135276
[248]	validation_0-logloss:0.135277
[249]	validation_0-logloss:0.135267
[250]	validation_0-logloss:0.135265
[251]	validation_0-logloss:0.135272
[252]	validation_0-logloss:0.135286
[253]	validation_0-logloss:0.135283
[254]	validation_0-logloss:0.135282
[255]	validation_0-logloss:0.135283
[256]	validation_0-logloss:0.13529
[257]	validation_0-logloss:0.

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=400, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [11]:
y_pred184 = sklearn_xgboost_model.predict(X_test)
classifier_eval(y_test, y_pred184)

오차행렬 : 
 [[14584     1]
 [  616     3]]
정확도   :  0.9594185740594581
정밀도   :  0.75
재현율   :  0.004846526655896607
F1       :  0.009630818619582664
AUC      :  0.5023889815315822


In [None]:
# 하이퍼 파라미터 튜닝을 해보자 - GridSearchCV - 교차검증

params = {
    'n_estimators': [100,200,300,400],
    'max_depth':[6,8,10,12],
    'min_samples_leaf': [8,12,18],
    'min_samples_split' : [8, 16, 20],
    'learning_rate' : [0.1]
}
sklearn_xgboost_model = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
grid_cv = GridSearchCV(sklearn_xgboost_model, param_grid=params, cv=5)
grid_cv.fit(X_train, y_train)

In [None]:
# 피처 임포턴트 시각화
feature_importance = pd.Series(grid_cv.best_importance_, index=X_train.columns)
feature_top20 = feature_importance.sort_values(asending=False)[:20]

df = pd.DataFrame(feature_top20, columns=['importance'])
df.index.name = 'feature'
df.