In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc

# Suppress all warning messages for the rest of the notebook
import warnings 
warnings.filterwarnings('ignore')

# Plot theme: apply seaborn's defaults first. The theme touches several
# rcParams (e.g. font settings), so the matplotlib overrides must come after it.
sns.set()

# Matplotlib defaults used by every figure in this notebook.
plt.rcParams.update({
    # Font able to render Korean labels (alternative tried by the author: 'AppleGothic').
    'font.family': 'Malgun Gothic',
    'figure.figsize': (12, 6),
    'font.size': 14,
    # Draw minus signs with the ASCII hyphen so they render with this font.
    'axes.unicode_minus': False,
})

# 인코딩
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# 검정
from scipy import stats
# 로지스틱 모델 만들기
from statsmodels.formula.api import logit

# 다중공선성 확인
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

In [2]:
# Load the 2018 credit-information table from the parquet file.
df = pd.read_parquet(path='open/concat/2018_신용정보.parquet')

In [3]:
# Preview the loaded frame as the cell's output.
df

Unnamed: 0,기준년월,ID,최초한도금액,카드이용한도금액,CA한도금액,일시상환론한도금액,월상환론한도금액,CA이자율_할인전,CL이자율_할인전,RV일시불이자율_할인전,...,연체감액여부_R3M,한도심사요청건수,한도요청거절건수,한도심사요청후경과월,한도심사거절후경과월,시장단기연체여부_R6M,시장단기연체여부_R3M,시장연체상환여부_R6M,시장연체상환여부_R3M,rv최초시작후경과일
0,201807,TRAIN_000000,0,19354,7270,0,0,22.995207,18.254978,17.264967,...,0,0회,0,3,3,0,0,0,0,99999999
1,201807,TRAIN_000001,0,9996,5718,41996,90611,14.793821,14.834873,10.622446,...,0,0회,0,3,3,0,0,0,0,322
2,201807,TRAIN_000002,0,88193,35207,0,0,22.014276,17.875321,17.155829,...,0,0회,0,3,3,0,0,0,0,2378
3,201807,TRAIN_000003,0,19062,6531,0,0,22.998014,22.999453,19.293674,...,0,0회,0,3,3,0,0,0,0,99999999
4,201807,TRAIN_000004,0,177222,47149,48000,155020,14.661948,10.897410,10.654587,...,0,0회,0,3,3,0,0,0,0,99999999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999995,201812,TEST_99995,0,0,0,0,0,14.642972,11.898260,10.674317,...,0,0회,0,3,3,0,0,0,0,99999999
2999996,201812,TEST_99996,0,49025,17876,48022,152797,14.221370,14.901302,10.324015,...,0,0회,0,3,3,0,0,0,0,99999999
2999997,201812,TEST_99997,0,29996,13332,89997,156313,15.207480,11.902432,11.495748,...,0,0회,0,3,3,0,0,0,0,99999999
2999998,201812,TEST_99998,0,42610,17362,90003,204480,15.268772,15.299862,11.264263,...,0,0회,0,3,3,0,0,0,0,99999999


In [4]:
# Collect the names of the columns that hold exactly one distinct value
# (nunique() does not count NaN by default).
cols_same_value = df.columns[df.nunique().eq(1)]

In [5]:
# Show which columns were detected as single-valued.
cols_same_value

Index(['시장연체상환여부_R3M'], dtype='object')

In [6]:
# Drop every single-valued column collected in `cols_same_value` instead of
# hard-coding one column name, so this cell cannot drift from the detection step.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
df.drop(columns=cols_same_value, inplace=True, errors='ignore')

### 결측치 처리

In [7]:
# Names of the columns that contain at least one missing value,
# kept in the frame's column order.
na_box = df.columns[df.isna().any()].tolist()

In [8]:
# Fraction of rows that are missing, for each column listed in na_box.
df[na_box].isna().sum().div(len(df))

RV신청일자      0.812918
RV전환가능여부    0.012369
dtype: float64

- 결측 비율이 미미한(약 1.2%) RV전환가능여부 컬럼은 최빈값으로 단순 대체하고, 대부분(약 81%)이 결측치인 RV신청일자 컬럼은 삭제한다.

In [9]:
# Remove the RV신청일자 column from the frame in place.
df.drop(columns=['RV신청일자'], inplace=True)

In [10]:
# Fill the missing RV전환가능여부 entries with the column's most frequent value
# (first entry of mode()); all other columns are left untouched.
df.fillna(
    value={'RV전환가능여부': df['RV전환가능여부'].mode().iloc[0]},
    inplace=True,
)

In [11]:
# List the non-numeric (object / category) columns that still need encoding or conversion.
df.select_dtypes(include = ['object', 'category'])

Unnamed: 0,ID,자발한도감액횟수_R12M,한도증액횟수_R12M,카드론동의여부,RV전환가능여부,한도심사요청건수
0,TRAIN_000000,0회,0회,Y,N,0회
1,TRAIN_000001,0회,0회,Y,Z,0회
2,TRAIN_000002,0회,0회,Y,N,0회
3,TRAIN_000003,0회,0회,Y,N,0회
4,TRAIN_000004,0회,0회,Y,Z,0회
...,...,...,...,...,...,...
2999995,TEST_99995,0회,0회,Y,Z,0회
2999996,TEST_99996,0회,0회,Y,Z,0회
2999997,TEST_99997,0회,0회,Y,Z,0회
2999998,TEST_99998,0회,0회,Y,Z,0회


In [15]:
# Strip the '회' suffix from the count labels and store the column as integers.
# Casting to str first makes the cell safe to re-run: once the column is already
# int, calling `.str` on it would raise AttributeError.
# regex=False states explicitly that '회' is a literal, not a pattern.
df['자발한도감액횟수_R12M'] = (
    df['자발한도감액횟수_R12M']
    .astype(str)
    .str.replace('회', '', regex=False)
    .astype(int)
)

In [17]:
# Frequency of each label in 한도증액횟수_R12M.
df['한도증액횟수_R12M'].value_counts()

한도증액횟수_R12M
0회      2644960
1회이상     355040
Name: count, dtype: int64

In [19]:
# Frequency of each label in 카드론동의여부.
df['카드론동의여부'].value_counts()

카드론동의여부
Y    2406657
N     593343
Name: count, dtype: int64

In [20]:
# Frequency of each label in RV전환가능여부.
df['RV전환가능여부'].value_counts()

RV전환가능여부
Z    2491789
N     508211
Name: count, dtype: int64

In [23]:
# Frequency of each label in 한도심사요청건수.
df['한도심사요청건수'].value_counts()

한도심사요청건수
0회      2999886
1회이상        114
Name: count, dtype: int64

In [26]:
# Reduce the YYYYMM value to its month number (e.g. 201807 -> 7).
# For 2018 values `% 100` gives the same result as subtracting 201800, but it is
# idempotent: re-running the cell leaves 7 as 7 instead of turning it into -201793.
# NOTE(review): assumes the frame only holds 2018 months, as the file name suggests — confirm.
df['기준년월'] = df['기준년월'] % 100

In [28]:
# Persist the preprocessed frame as CSV, leaving the row index out of the file.
df.to_csv(path_or_buf='data/신용정보_전처리.csv', index=False)