In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc

# Suppress warning messages for the whole notebook.
# NOTE(review): filterwarnings('ignore') silences every Python warning, including
# pandas/library diagnostics raised by later cells — consider narrowing the filter.
import warnings 
warnings.filterwarnings('ignore')

# Plot styling: apply seaborn's default theme first so that the explicit
# matplotlib settings below take precedence over it.
sns.set()

# Matplotlib defaults for every figure in this notebook.
# (Alternative font previously kept as a commented-out option: 'AppleGothic'.)
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'figure.figsize': (12, 6),
    'font.size': 14,
    # Render minus signs with the ASCII hyphen instead of the Unicode minus glyph.
    'axes.unicode_minus': False,
})

# 인코딩
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# 변수선택
from sklearn.model_selection import cross_val_score, train_test_split

# 검정
from scipy import stats
# 로지스틱 모델 만들기
from statsmodels.formula.api import logit

# 다중공선성 확인
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier 
from catboost import CatBoostClassifier

In [2]:
# Read the preprocessed credit-information table into `df`.
df = pd.read_csv(filepath_or_buffer='data/신용정보_전처리.csv')

In [3]:
# Show the object/category-typed columns of `df`.
df.select_dtypes(['object', 'category'])

Unnamed: 0,ID,한도증액횟수_R12M,카드론동의여부,RV전환가능여부,한도심사요청건수
0,TRAIN_000000,0회,Y,N,0회
1,TRAIN_000001,0회,Y,Z,0회
2,TRAIN_000002,0회,Y,N,0회
3,TRAIN_000003,0회,Y,N,0회
4,TRAIN_000004,0회,Y,Z,0회
...,...,...,...,...,...
2999995,TEST_99995,0회,Y,Z,0회
2999996,TEST_99996,0회,Y,Z,0회
2999997,TEST_99997,0회,Y,Z,0회
2999998,TEST_99998,0회,Y,Z,0회


In [4]:
# Names of the object/category columns from '한도증액횟수_R12M' onward
# (label-based slice, so any such column positioned before it is left out).
cat_cols = (
    df.select_dtypes(include=['object', 'category'])
      .loc[:, '한도증액횟수_R12M':]
      .columns
      .tolist()
)

In [5]:
# Replace each categorical column with integer codes, using a fresh
# LabelEncoder per column.
for col in cat_cols:
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])

In [6]:
# Load only the Segment column from the second preprocessed file and attach
# it to `df` as a new column.
df_segment = pd.read_csv('data/회원정보_전처리.csv', usecols = ['Segment'])

# pd.concat(axis=1) aligns rows on the index. If the two files had different
# row counts the result would be silently padded with NaN rows, and the
# Segment.isna() split further down would then treat those rows as test data.
if len(df_segment) != len(df):
    raise ValueError(
        f'Row count mismatch: df has {len(df)} rows but the Segment file has '
        f'{len(df_segment)} rows'
    )

df = pd.concat([df, df_segment['Segment']], axis = 1)

In [7]:
# Rows that carry a Segment value go to `train`; rows without one go to `test`.
has_segment = df['Segment'].notna()
train = df.loc[has_segment]
test = df.loc[~has_segment]

In [8]:
# Remove the identifier column from both frames.
# Rebinding (instead of inplace=True) avoids mutating frames that were sliced
# out of `df`, which is the pattern pandas flags with SettingWithCopyWarning.
train = train.drop(columns='ID')
test = test.drop(columns='ID')

In [9]:
# Extract the target column; pop() also removes 'Segment' from `train` in place,
# so this cell cannot be re-run without rebuilding `train`.
y_train = train.pop('Segment')

In [10]:
# Encode the target variable as integer class labels.
# `le` is kept so the fitted encoder remains available after this cell.
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)

In [11]:
# The test rows have no Segment value, so remove the column.
# Rebinding (instead of inplace=True) avoids mutating a frame that was sliced
# out of `df`, which is the pattern pandas flags with SettingWithCopyWarning.
test = test.drop(columns='Segment')

In [12]:
# Hold out 25% of the labelled rows for validation, stratified on the target
# and seeded for reproducibility.
x_tr, x_val, y_tr, y_val = train_test_split(
    train,
    y_train,
    test_size=0.25,
    random_state=10,
    stratify=y_train,
)

In [13]:
# Print the shapes of the four split parts on one line.
print(*(part.shape for part in (x_tr, x_val, y_tr, y_val)))

(1800000, 39) (600000, 39) (1800000,) (600000,)


In [14]:
# Fit an XGBoost classifier on all labelled rows to obtain per-feature
# importance scores.
#
# `task_type='GPU'` / `devices='0'` are CatBoost options; XGBoost does not use
# them, so the original call did not request GPU training. XGBoost (2.0+)
# selects the GPU through `device`; 'cuda:0' targets GPU ordinal 0.
# NOTE(review): requires a CUDA-enabled XGBoost build — confirm the installed version.
model = XGBClassifier(
    tree_method='hist',
    device='cuda:0',
    random_state=10,  # same seed as the train/validation split
)

model.fit(train, y_train)
model.feature_importances_

array([0.00894951, 0.01991502, 0.10668384, 0.01627161, 0.04564415,
       0.0893188 , 0.07883863, 0.01042493, 0.0175675 , 0.0318308 ,
       0.01648631, 0.02969253, 0.00967176, 0.02125982, 0.00123216,
       0.03726348, 0.0234516 , 0.01595568, 0.0424957 , 0.03229212,
       0.00627919, 0.02903566, 0.08608183, 0.01141839, 0.01121546,
       0.01413328, 0.01671662, 0.0809307 , 0.0622232 , 0.00285833,
       0.00836025, 0.        , 0.00283686, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.01266431], dtype=float32)

In [15]:
# Pair each feature name ('변수') with its importance score.
# The names are taken from `train`, the frame the model was actually fitted
# on, so labels and scores are guaranteed to be in the same column order.
select_feature = pd.DataFrame({
    '변수' : list(train.columns),
    'Feature_Importances' : model.feature_importances_
})

In [16]:
# Names of the features whose importance is at least 0.01.
select_features = select_feature.loc[
    select_feature['Feature_Importances'].ge(0.01), '변수'
].tolist()

In [17]:
# Keep the row identifier alongside the selected features.
# Guarded so that re-running this cell does not append a duplicate 'ID' entry.
if 'ID' not in select_features:
    select_features.append('ID')

In [18]:
# '기준년월' is needed for the row filter applied to `df` afterwards.
# Guarded so the name is not duplicated when it already passed the importance
# threshold or when this cell is re-run.
if '기준년월' not in select_features:
    select_features.append('기준년월')

In [22]:
# Keep the selected columns for the rows whose '기준년월' equals 12.
# This overwrites `df`, so the full table is no longer available afterwards.
df = df.loc[df['기준년월'] == 12, select_features]

In [23]:
# '기준년월' is constant after the == 12 filter, so remove it before export.
# Rebinding (instead of inplace=True) avoids mutating a frame that was sliced
# from a larger one, which is the pattern pandas flags with SettingWithCopyWarning.
df = df.drop(columns='기준년월')

In [24]:
# Write the filtered, feature-selected table to CSV without the index column.
df.to_csv(path_or_buf='신용정보12월_feat_select.csv', index=False)

In [30]:
# Rank all features by importance and keep the 30 highest.
# This rebinds `select_features` (a list in the earlier cells) to a DataFrame.
# NOTE(review): the displayed importances sum to ~100, whereas the XGBoost
# scores printed earlier are fractions; `model` was presumably refit in cells
# that are not in this notebook (execution counts 25-29 are missing) — confirm.
select_features = (
    pd.DataFrame({
        '변수': list(train.columns),
        'Feature_Importances': list(model.feature_importances_),
    })
    .sort_values(by='Feature_Importances', ascending=False)
    .iloc[:30]
)

In [32]:
# Total importance covered by the 30 retained features.
select_features.loc[:, 'Feature_Importances'].sum()

99.99944327488363

In [33]:
# Display the ranked importance table built above (top 30 rows).
select_features

Unnamed: 0,변수,Feature_Importances
5,월상환론한도금액,18.619551
2,카드이용한도금액,8.631094
6,CA이자율_할인전,7.765943
9,RV현금서비스이자율_할인전,7.015985
4,일시상환론한도금액,6.054063
1,최초한도금액,4.949352
27,카드이용한도금액_B1M,4.724356
21,상향가능한도금액,4.668782
3,CA한도금액,4.5929
28,카드이용한도금액_B2M,4.587558
