In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc

# 경고 메시지가 뜨지 않게 설정
import warnings 
warnings.filterwarnings('ignore')

# 그래프 설정
sns.set()

# 그래프 기본 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
plt.rcParams['axes.unicode_minus'] = False

# 인코딩
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# 변수선택
from sklearn.model_selection import cross_val_score, train_test_split

# 검정
from scipy import stats
# 로지스틱 모델 만들기
from statsmodels.formula.api import logit

# 다중공선성 확인
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier 
from catboost import CatBoostClassifier

In [4]:
df = pd.read_csv('data/회원정보_전처리2.csv')

In [5]:
df.select_dtypes(include = 'O')

Unnamed: 0,ID,연령,Segment,가입통신회사코드,거주시도명,_1순위신용체크구분,연회비발생카드수_B0M,Life_Stage
0,TRAIN_000000,40대,D,L사,서울,신용,0개,자녀성장(2)
1,TRAIN_000001,30대,E,K사,경기,신용,0개,자녀성장(1)
2,TRAIN_000002,30대,C,S사,서울,신용,0개,자녀출산기
3,TRAIN_000003,40대,D,L사,부산,신용,0개,자녀성장(2)
4,TRAIN_000004,40대,E,K사,광주,신용,0개,자녀성장(1)
...,...,...,...,...,...,...,...,...
2999995,TEST_99995,60대,,K사,경기,신용,0개,노년생활
2999996,TEST_99996,30대,,S사,서울,신용,0개,자녀출산기
2999997,TEST_99997,30대,,S사,경남,신용,0개,자녀성장(1)
2999998,TEST_99998,30대,,S사,경남,신용,0개,가족구축기


In [8]:
df['연회비발생카드수_B0M'] = df['연회비발생카드수_B0M'].str.replace('개', '').str.replace('이상', '').astype(int)

In [10]:
train = df.loc[df['Segment'].notna()]
test = df.loc[df['Segment'].isna()]

In [11]:
train.drop('ID', axis = 1, inplace = True)
test_id = test.pop('ID')

In [12]:
y_train = train.pop('Segment')

In [13]:
# target 변수 인코딩
le = LabelEncoder()
y_train = le.fit_transform(y_train)

In [14]:
test.drop('Segment', axis = 1, inplace = True)

In [15]:
x_tr, x_val, y_tr, y_val = train_test_split(train,
                                            y_train,
                                            stratify = y_train,
                                            test_size = 0.25,
                                           random_state = 10)

In [16]:
print(x_tr.shape, x_val.shape, y_tr.shape, y_val.shape)

(1800000, 61) (600000, 61) (1800000,) (600000,)


In [20]:
cat_cols = ['연령', '가입통신회사코드', '거주시도명', '_1순위신용체크구분', 'Life_Stage']

model = CatBoostClassifier(
    task_type='GPU', 
    devices='0'
)

model.fit(train, y_train, cat_features=cat_cols, eval_set=(x_val, y_val))

Learning rate set to 0.230798
0:	learn: 1.0399204	test: 1.0399934	best: 1.0399934 (0)	total: 70.7ms	remaining: 1m 10s
1:	learn: 0.8341512	test: 0.8342701	best: 0.8342701 (1)	total: 134ms	remaining: 1m 6s
2:	learn: 0.7107095	test: 0.7109156	best: 0.7109156 (2)	total: 196ms	remaining: 1m 5s
3:	learn: 0.6280498	test: 0.6282602	best: 0.6282602 (3)	total: 256ms	remaining: 1m 3s
4:	learn: 0.5695139	test: 0.5697495	best: 0.5697495 (4)	total: 318ms	remaining: 1m 3s
5:	learn: 0.5264509	test: 0.5267207	best: 0.5267207 (5)	total: 380ms	remaining: 1m 2s
6:	learn: 0.4919317	test: 0.4922090	best: 0.4922090 (6)	total: 436ms	remaining: 1m 1s
7:	learn: 0.4662832	test: 0.4665685	best: 0.4665685 (7)	total: 487ms	remaining: 1m
8:	learn: 0.4479526	test: 0.4482948	best: 0.4482948 (8)	total: 533ms	remaining: 58.7s
9:	learn: 0.4338992	test: 0.4342880	best: 0.4342880 (9)	total: 584ms	remaining: 57.8s
10:	learn: 0.4218977	test: 0.4222808	best: 0.4222808 (10)	total: 635ms	remaining: 57.1s
11:	learn: 0.4128354	te

<catboost.core.CatBoostClassifier at 0x22495659f30>

In [26]:
pd.DataFrame({
    '변수' : list(train.columns),
    'Feature_Importances' : list(model.feature_importances_)
}).sort_values('Feature_Importances', ascending = False).head(30)

Unnamed: 0,변수,Feature_Importances
9,입회경과개월수_신용,13.984158
60,최종카드발급경과월,6.854082
38,이용금액_R3M_신용체크,6.358371
42,_1순위카드이용금액,6.312145
22,거주시도명,6.189659
5,회원여부_이용가능_카드론,4.808738
59,Life_Stage,4.602741
43,_1순위카드이용건수,4.445551
17,가입통신회사코드,4.089055
2,연령,3.241428
