In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc

# Suppress all warning messages for the rest of the session.
import warnings
warnings.filterwarnings(action='ignore')

# Plot settings: apply seaborn's defaults first so that the matplotlib
# rcParams set afterwards are the values that end up in effect.
sns.set()

# Default figure settings. 'Malgun Gothic' is the font chosen by the original
# author; 'AppleGothic' was left as a commented-out alternative.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'figure.figsize': (12, 6),
    'font.size': 14,
    'axes.unicode_minus': False,
})

# Encoding
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Variable selection
from sklearn.model_selection import cross_val_score, train_test_split

# Hypothesis testing
from scipy import stats
# Building logistic models
from statsmodels.formula.api import logit

# Checking multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

# Classifier models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier 
from catboost import CatBoostClassifier

In [4]:
# NOTE(review): this cell displays `df`, but the only visible assignment of `df`
# is the read_csv cell further down (execution count In [3] vs. In [4] here).
# On a fresh kernel run top-to-bottom this would presumably raise NameError —
# move it below the load cell (and consider df.head()) when tidying up.
df

Unnamed: 0,기준년월,ID,남녀구분코드,연령,Segment,회원여부_이용가능,회원여부_이용가능_CA,회원여부_이용가능_카드론,소지여부_신용,소지카드수_유효_신용,...,이용가능여부_해외겸용_신용_본인,이용여부_3M_해외겸용_신용_본인,연회비발생카드수_B0M,기본연회비_B0M,제휴연회비_B0M,청구금액_기본연회비_B0M,청구금액_제휴연회비_B0M,카드신청건수,Life_Stage,최종카드발급경과월
0,201807,TRAIN_000000,2,2,D,1,1,0,1,1,...,0,0,0,0,0,0,0,0,5,22
1,201807,TRAIN_000001,1,1,E,1,1,1,1,1,...,0,0,0,0,0,0,0,0,4,18
2,201807,TRAIN_000002,1,1,C,1,1,0,1,1,...,0,0,0,0,0,0,0,0,6,20
3,201807,TRAIN_000003,2,2,D,1,1,0,1,2,...,1,1,0,0,0,0,0,1,5,17
4,201807,TRAIN_000004,2,2,E,1,1,1,1,1,...,0,0,0,0,0,0,0,1,4,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999995,201812,TEST_99995,2,4,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2999996,201812,TEST_99996,1,1,,1,1,1,1,1,...,0,0,0,0,0,0,0,0,6,12
2999997,201812,TEST_99997,2,1,,1,1,1,1,1,...,0,0,0,0,0,0,0,0,4,11
2999998,201812,TEST_99998,1,1,,1,1,1,1,3,...,1,1,0,0,0,0,0,0,0,1


In [3]:
# Read the member-information CSV (the file name suggests it is already
# preprocessed); the path is relative to the working directory.
member_info_csv = 'data/회원정보_전처리.csv'
df = pd.read_csv(member_info_csv)

In [4]:
# Collect, in column order, the name of every column of `df` that has at
# least one missing value.
missing_per_column = df.isna().sum()
na_box = [column for column, n_missing in missing_per_column.items() if n_missing > 0]

In [5]:
# Take the target column out of the drop list so that 'Segment' stays in `df`.
# It is looked up by name rather than assumed to be the first entry: a
# positional pop(0) silently removes a different column if the column order
# changes or if this cell is executed a second time. When 'Segment' is no
# longer in the list (e.g. on a re-run) this raises ValueError instead of
# quietly altering which columns get dropped.
na_box.pop(na_box.index('Segment'))

'Segment'

In [6]:
# Show the columns with missing values that remain in the drop list.
na_box

['최종유효년월_신용_이용가능', '최종유효년월_신용_이용', '최종카드발급일자']

In [7]:
# Remove the columns collected in `na_box` from the full table.
df = df.drop(columns=na_box)

In [8]:
# Rows that carry a 'Segment' label become the training set; rows whose
# 'Segment' is missing become the test set.
has_segment = df['Segment'].notna()
train = df.loc[has_segment]
test = df.loc[~has_segment]

In [9]:
# Remove the 'ID' column from both splits before modelling.
# `train` and `test` are row-filtered selections of `df`; calling drop with
# inplace=True on such a selection triggers pandas' SettingWithCopyWarning in
# pandas versions without Copy-on-Write (hidden here because warnings are
# globally ignored). Rebinding to the frame returned by drop() gives the same
# result without mutating a derived selection in place.
train = train.drop(columns='ID')
test = test.drop(columns='ID')

In [10]:
# Separate the target from the features: 'Segment' becomes `y_train`, and
# `train` keeps every other column.
y_train = train['Segment']
train = train.drop(columns='Segment')

In [11]:
# Encode the 'Segment' labels as integer class ids. The fitted encoder is
# kept in `le` so the label <-> id mapping stays available.
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)

In [12]:
# The test rows were selected because 'Segment' is missing, so the column
# carries no information there and is removed.
# `test` is a row-filtered selection of `df`; dropping with inplace=True on
# such a selection triggers pandas' SettingWithCopyWarning in pandas versions
# without Copy-on-Write (hidden here because warnings are globally ignored),
# so rebind to the returned frame instead.
test = test.drop(columns='Segment')

In [13]:
# Hold out 25% of the labelled rows for validation. The split is stratified
# on the encoded target and uses a fixed seed so it can be repeated.
x_tr, x_val, y_tr, y_val = train_test_split(
    train,
    y_train,
    test_size=0.25,
    stratify=y_train,
    random_state=10,
)

In [14]:
# Sanity check: shapes of the two feature frames, then of the two label arrays.
print(*(part.shape for part in (x_tr, x_val, y_tr, y_val)))

(1800000, 62) (600000, 62) (1800000,) (600000,)


In [15]:
# Fit an XGBoost classifier on the training split with GPU tree building and
# GPU prediction requested; all other hyper-parameters are left at their
# defaults. The cell ends by showing the per-feature importances.
# NOTE(review): newer XGBoost releases deprecate 'gpu_hist' / 'gpu_predictor'
# in favour of device='cuda' with tree_method='hist' — confirm against the
# installed version before changing.
gpu_options = {'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor'}
model1 = XGBClassifier(**gpu_options)
model1.fit(x_tr, y_tr)
model1.feature_importances_

array([0.00637133, 0.00323102, 0.00369884, 0.00413332, 0.01181677,
       0.02733041, 0.00418081, 0.03002553, 0.01246405, 0.02101517,
       0.07554956, 0.004504  , 0.04076469, 0.00896554, 0.00795302,
       0.00383201, 0.00504102, 0.005867  , 0.00405341, 0.0033125 ,
       0.00363755, 0.00459432, 0.00421772, 0.00338583, 0.00483427,
       0.01477608, 0.01132775, 0.00564133, 0.00516889, 0.        ,
       0.00617746, 0.0129966 , 0.00857356, 0.00436028, 0.        ,
       0.01385608, 0.00413895, 0.0043085 , 0.00269176, 0.43925178,
       0.00371687, 0.00252631, 0.01208148, 0.02476328, 0.00513085,
       0.00395516, 0.01374435, 0.00599715, 0.00589265, 0.00534048,
       0.01762141, 0.01005452, 0.00695402, 0.01087519, 0.00303768,
       0.00246048, 0.00066907, 0.        , 0.        , 0.01297113,
       0.00380452, 0.01035468], dtype=float32)

In [16]:
# Pair every training column name with the importance reported by `model1`.
# Column '변수' holds the variable (feature) name.
feature_names = x_tr.columns.tolist()
importances = model1.feature_importances_
select_feature = pd.DataFrame({'변수': feature_names, 'Feature_Importances': importances})

In [17]:
# Show the feature-name / importance table.
select_feature

Unnamed: 0,변수,Feature_Importances
0,기준년월,0.006371
1,남녀구분코드,0.003231
2,연령,0.003699
3,회원여부_이용가능,0.004133
4,회원여부_이용가능_CA,0.011817
...,...,...
57,청구금액_기본연회비_B0M,0.000000
58,청구금액_제휴연회비_B0M,0.000000
59,카드신청건수,0.012971
60,Life_Stage,0.003805


In [18]:
# Keep the names of the features whose importance is at least 0.01.
is_important = select_feature['Feature_Importances'] >= 0.01
select_features = select_feature.loc[is_important, '변수'].tolist()

In [19]:
# 'ID' and '기준년월' (base year-month) are needed later to select columns
# from `df` and to filter its rows, so make sure both are in the list.
# Each name is added only when absent: an unconditional append duplicates the
# entries when this cell is re-run, or when '기준년월' already passed the
# importance threshold, which would produce duplicated columns in the
# selection taken from `df`.
for required_column in ('ID', '기준년월'):
    if required_column not in select_features:
        select_features.append(required_column)

In [20]:
# Number of columns kept (the selected features plus the two appended names).
len(select_features)

22

In [25]:
# Keep only the rows whose '기준년월' (base year-month) is 201812 and only
# the selected columns, in a single .loc call.
# NOTE(review): this overwrites `df`; once '기준년월' has been dropped by the
# following cell, re-running this cell fails on the missing column.
is_december = df['기준년월'] == 201812
df = df.loc[is_december, select_features]

In [26]:
# After filtering to a single month, '기준년월' holds one constant value, so
# it is removed.
# `df` is at this point a filtered selection of the original table; dropping
# with inplace=True on such a selection triggers pandas'
# SettingWithCopyWarning in pandas versions without Copy-on-Write (hidden here
# because warnings are globally ignored), so rebind to the returned frame.
df = df.drop(columns='기준년월')

In [28]:
# Renumber the rows from 0 after the row filter; drop=True discards the old
# index instead of keeping it as a column.
df = df.reset_index(drop = True)

In [30]:
# Write the December, feature-selected table to CSV without the index column.
# The file goes to the working directory (not the 'data/' folder the input
# was read from).
feat_select_csv = '회원정보12월_feat_select.csv'
df.to_csv(feat_select_csv, index=False)

In [16]:
# Look up the importances of the three columns that contain missing values.
# NOTE(review): these are the same names the notebook drops from `df` before
# training, so rows for them only exist in `select_feature` when the model was
# fitted without that drop — the recorded output appears to come from such a run.
select_feature[
    select_feature['변수'].isin([
        '최종유효년월_신용_이용가능',
        '최종유효년월_신용_이용',
        '최종카드발급일자',
    ])
]

Unnamed: 0,변수,Feature_Importances
48,최종유효년월_신용_이용가능,0.005102
49,최종유효년월_신용_이용,0.01322
50,최종카드발급일자,0.005716


In [13]:
# The 30 features with the highest importance, highest first.
select_feature.nlargest(30, 'Feature_Importances')

Unnamed: 0,변수,Feature_Importances
39,이용금액_R3M_신용체크,0.431149
10,입회경과개월수_신용,0.078621
12,이용거절여부_카드론,0.038736
7,소지카드수_유효_신용,0.026608
5,회원여부_이용가능_카드론,0.026415
43,_1순위카드이용금액,0.023483
9,입회일자_신용,0.021034
53,이용여부_3M_해외겸용_본인,0.01804
31,이용가능카드수_신용,0.014123
46,_2순위카드이용금액,0.014028


In [14]:
model2 = LGBMClassifier()