In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc

# 경고 메시지가 뜨지 않게 설정
import warnings 
warnings.filterwarnings('ignore')

# 그래프 설정
sns.set()

# 그래프 기본 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
plt.rcParams['axes.unicode_minus'] = False

# 인코딩
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# 변수선택
from sklearn.model_selection import cross_val_score, train_test_split

# 검정
from scipy import stats
# 로지스틱 모델 만들기
from statsmodels.formula.api import logit

# 다중공선성 확인
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier 
from catboost import CatBoostClassifier

In [17]:
df = pd.read_csv('data/성과정보_전처리.csv')
df_segment = pd.read_csv('data/회원정보_전처리.csv', usecols = ['Segment'])

In [18]:
df = pd.concat([df, df_segment['Segment']], axis = 1)

In [19]:
df['기준년월'] = df['기준년월'] - 201800

In [20]:
train = df.loc[df['Segment'].notna()]
test = df.loc[df['Segment'].isna()]

In [21]:
train.drop('ID', axis = 1, inplace = True)
test.drop('ID', axis = 1, inplace = True)

In [22]:
y_train = train.pop('Segment')

In [23]:
# target 변수 인코딩
le = LabelEncoder()
y_train = le.fit_transform(y_train)

In [24]:
test.drop('Segment', axis = 1, inplace = True)

In [25]:
x_tr, x_val, y_tr, y_val = train_test_split(train,
                                            y_train,
                                            stratify = y_train,
                                            test_size = 0.25,
                                           random_state = 10)

In [26]:
print(x_tr.shape, x_val.shape, y_tr.shape, y_val.shape)

(1800000, 48) (600000, 48) (1800000,) (600000,)


In [27]:
model1 = XGBClassifier(tree_method = 'gpu_hist', predictor = 'gpu_predictor')
model1.fit(x_tr, y_tr)
model1.feature_importances_

array([0.00764164, 0.00446571, 0.00440124, 0.01032922, 0.03155197,
       0.00237637, 0.0291171 , 0.00090864, 0.01789328, 0.01136605,
       0.02501952, 0.01355611, 0.04184053, 0.06519347, 0.        ,
       0.02112328, 0.00538334, 0.00715586, 0.00841217, 0.01268709,
       0.0105859 , 0.00259774, 0.00537389, 0.00380851, 0.00660105,
       0.00539818, 0.01633826, 0.00936595, 0.00404688, 0.0108612 ,
       0.0062181 , 0.00686573, 0.00888911, 0.01108271, 0.11083513,
       0.00821398, 0.16179016, 0.03293507, 0.01144798, 0.01299269,
       0.04215891, 0.04137246, 0.0065816 , 0.00385112, 0.00441088,
       0.04328559, 0.02200812, 0.06966051], dtype=float32)

In [28]:
select_feature = pd.DataFrame({
    '변수' : list(x_tr.columns),
    'Feature_Importances' : model1.feature_importances_
})

In [15]:
select_feature.loc[select_feature['Feature_Importances'] >= 0.01, ]

Unnamed: 0,변수,Feature_Importances
3,증감율_이용건수_일시불_전월,0.010329
4,증감율_이용건수_할부_전월,0.031552
6,증감율_이용건수_체크_전월,0.029117
8,증감율_이용금액_신용_전월,0.017893
9,증감율_이용금액_신판_전월,0.011366
10,증감율_이용금액_일시불_전월,0.02502
11,증감율_이용금액_할부_전월,0.013556
12,증감율_이용금액_CA_전월,0.041841
13,증감율_이용금액_체크_전월,0.065193
15,증감율_이용건수_신용_분기,0.021123


In [29]:
select_features = list(select_feature.loc[select_feature['Feature_Importances'] >= 0.01, '변수'].values)

In [30]:
select_features.append('ID')

In [31]:
select_features.append('기준년월')

In [34]:
df = df[select_features].loc[df['기준년월'] == 12]

In [35]:
df.drop('기준년월', axis = 1, inplace = True)

In [36]:
df

Unnamed: 0,증감율_이용건수_일시불_전월,증감율_이용건수_할부_전월,증감율_이용건수_체크_전월,증감율_이용금액_신용_전월,증감율_이용금액_신판_전월,증감율_이용금액_일시불_전월,증감율_이용금액_할부_전월,증감율_이용금액_CA_전월,증감율_이용금액_체크_전월,증감율_이용건수_신용_분기,...,잔액_신판ca최대한도소진율_r3m,변동률_일시불평잔,변동률_RV일시불평잔,변동률_할부평잔,변동률_CA평잔,변동률_RVCA평잔,변동률_잔액_CA_B1M,혜택수혜율_R3M,혜택수혜율_B0M,ID
2000000,-0.048162,0.0,0.000000,0.238954,0.238954,0.238954,0.000000,0.000000,0.000000,-0.260577,...,0.831348,0.829721,0.999998,0.591302,1.001020,0.999998,0.000000,0.878859,1.398627,TRAIN_000000
2000001,-0.107727,0.0,0.000000,0.215049,0.215049,0.215049,0.000000,0.000000,0.000000,-0.288178,...,0.596413,0.874079,0.965251,0.901252,0.999998,0.999998,0.000000,0.000000,0.000000,TRAIN_000001
2000002,-0.130251,0.0,0.000000,-0.283922,0.267546,-0.812276,0.200758,-0.030864,0.000000,-0.277053,...,0.880607,1.082143,1.005795,0.585823,0.997353,0.000000,-0.115879,0.187467,-1.198788,TRAIN_000002
2000003,0.007924,0.0,0.000000,0.339386,0.339386,0.269813,-0.170094,0.000000,0.000000,0.252782,...,1.041138,1.019556,0.999998,0.774731,1.003519,0.999998,0.000000,0.781401,1.282494,TRAIN_000003
2000004,1.999996,0.0,-0.056901,1.999725,1.999725,1.999725,0.000000,0.000000,-0.011604,1.999996,...,0.005538,0.999998,0.999998,0.999998,0.999998,0.999998,0.000000,0.762016,0.986860,TRAIN_000004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999995,0.000000,0.0,0.000000,-0.000026,-0.000026,-0.000026,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.999998,0.999998,0.999998,0.999998,0.999998,0.000000,0.762016,0.986860,TEST_99995
2999996,0.000000,0.0,0.000000,0.300200,0.300200,0.300200,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.894539,0.999998,0.999998,0.999998,0.999998,0.000000,8.564683,11.379632,TEST_99996
2999997,0.000000,0.0,0.000000,0.000049,0.000049,0.000049,0.000000,0.000000,0.000000,0.000000,...,0.013292,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.762016,0.986860,TEST_99997
2999998,-0.048050,0.0,-0.166447,0.196886,0.196886,0.196886,0.000000,0.000000,-0.136399,-0.293027,...,0.379777,0.809328,0.999998,0.333183,0.999998,0.999998,0.000000,-0.241530,0.499079,TEST_99998


In [37]:
df.to_csv('성과정보12월_feat_select.csv', index = False)

In [17]:
select_feature['Feature_Importances'].sum()

1.0

In [15]:
model = CatBoostClassifier(
    task_type='GPU', 
    devices='0'
)

model.fit(train, y_train, eval_set=(x_val, y_val))

Learning rate set to 0.230798
0:	learn: 1.0972686	test: 1.0974755	best: 1.0974755 (0)	total: 63.1ms	remaining: 1m 3s
1:	learn: 0.9069359	test: 0.9072536	best: 0.9072536 (1)	total: 126ms	remaining: 1m 3s
2:	learn: 0.7914876	test: 0.7918001	best: 0.7918001 (2)	total: 186ms	remaining: 1m 1s
3:	learn: 0.7139087	test: 0.7142611	best: 0.7142611 (3)	total: 230ms	remaining: 57.3s
4:	learn: 0.6593446	test: 0.6596779	best: 0.6596779 (4)	total: 268ms	remaining: 53.2s
5:	learn: 0.6201968	test: 0.6204979	best: 0.6204979 (5)	total: 304ms	remaining: 50.3s
6:	learn: 0.5909359	test: 0.5912494	best: 0.5912494 (6)	total: 348ms	remaining: 49.4s
7:	learn: 0.5678683	test: 0.5681417	best: 0.5681417 (7)	total: 393ms	remaining: 48.7s
8:	learn: 0.5504981	test: 0.5507137	best: 0.5507137 (8)	total: 432ms	remaining: 47.6s
9:	learn: 0.5362008	test: 0.5363892	best: 0.5363892 (9)	total: 471ms	remaining: 46.6s
10:	learn: 0.5258111	test: 0.5259967	best: 0.5259967 (10)	total: 501ms	remaining: 45s
11:	learn: 0.5168469	te

<catboost.core.CatBoostClassifier at 0x157cbe82170>

In [16]:
pd.DataFrame({
    '변수' : list(train.columns),
    'Feature_Importances' : list(model.feature_importances_)
}).sort_values('Feature_Importances', ascending = False).head(30)

Unnamed: 0,변수,Feature_Importances
47,혜택수혜율_B0M,15.255615
46,혜택수혜율_R3M,7.079966
34,잔액_신판ca최대한도소진율_r6m,5.296791
37,변동률_일시불평잔,4.551859
39,변동률_할부평잔,4.099979
29,잔액_신판평균한도소진율_r6m,3.606571
36,잔액_신판ca최대한도소진율_r3m,3.422788
32,잔액_신판최대한도소진율_r3m,2.995829
13,증감율_이용금액_체크_전월,2.825223
10,증감율_이용금액_일시불_전월,2.802168
