# DataLoader

In [1]:
import pandas as pd
import numpy as np

import os

In [5]:
# Directory layout: raw competition files live under "open/", previously
# preprocessed files under "preprocess/".
data_dir = "../../kaggle_data/creditcard_overdue/"
Raw_data_dir = os.path.join(data_dir, "open/")
pre_data_dir = os.path.join(data_dir, "preprocess/")


def _load_csv(folder, filename):
    """Read `filename` located in `folder` into a DataFrame."""
    return pd.read_csv(os.path.join(folder, filename))


# Raw train / test splits.
Raw_train = _load_csv(Raw_data_dir, "train.csv")
Raw_test = _load_csv(Raw_data_dir, "test.csv")

# Preprocessed train / test splits.
pre_train = _load_csv(pre_data_dir, "train_preprocess.csv")
pre_test = _load_csv(pre_data_dir, "test_preprocess.csv")

# Sample submission file, stored alongside the raw data.
submission = _load_csv(Raw_data_dir, "sample_submission.csv")

In [6]:
# Preview the raw training frame; the rendered output is a truncated view
# (first and last rows only).
Raw_train

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,26452,F,N,N,2,225000.0,State servant,Secondary / secondary special,Married,House / apartment,-12079,-1984,1,0,0,0,Core staff,4.0,-2.0,1.0
26453,26453,F,N,Y,1,180000.0,Working,Higher education,Separated,House / apartment,-15291,-2475,1,0,0,0,,2.0,-47.0,2.0
26454,26454,F,Y,N,0,292500.0,Working,Secondary / secondary special,Civil marriage,With parents,-10082,-2015,1,0,0,0,Core staff,2.0,-25.0,2.0
26455,26455,M,N,Y,0,171000.0,Working,Incomplete higher,Single / not married,House / apartment,-10145,-107,1,0,0,0,Laborers,1.0,-59.0,2.0


# Preprocessing

## 불필요한 Feature 제거

In [7]:
# Remove the `index` and `FLAG_MOBIL` columns from both splits.
# NOTE(review): in the preview `index` mirrors the row label and FLAG_MOBIL
# is 1 on every visible row -- presumably why they are dropped; confirm.
drop_cols = ['index', 'FLAG_MOBIL']
train = Raw_train.drop(columns=drop_cols)
test = Raw_test.drop(columns=drop_cols)

## 범주형 Feature 수치화 (정수 인코딩)

In [10]:
# Integer encoding for each categorical feature: column -> (labels, codes).
# The same mapping is applied to train and test; labels not listed here are
# left untouched by `replace`.
# NOTE(review): 'Incomplete higher' (1) is ranked below
# 'Secondary / secondary special' (2) -- confirm this ordinal order is intended.
category_codes = {
    'gender': (['F', 'M'], [0, 1]),
    'car': (['N', 'Y'], [0, 1]),
    'reality': (['N', 'Y'], [0, 1]),
    'edu_type': (
        ['Academic degree', 'Higher education', 'Secondary / secondary special',
         'Incomplete higher', 'Lower secondary'],
        [4, 3, 2, 1, 0],
    ),
}

for column, (labels, codes) in category_codes.items():
    train[column] = train[column].replace(labels, codes)
    test[column] = test[column].replace(labels, codes)

    # Report the encoded class balance on the training split.
    print(f'{column} : ')
    print(train[column].value_counts())
    print("----------------------------")

gender : 
0    17697
1     8760
Name: gender, dtype: int64
----------------------------
car : 
0    16410
1    10047
Name: car, dtype: int64
----------------------------
reality : 
1    17830
0     8627
Name: reality, dtype: int64
----------------------------
edu_type : 
2    17995
3     7162
1     1020
0      257
4       23
Name: edu_type, dtype: int64
----------------------------


## Income total

In [76]:
# income total 전처리

import matplotlib.pyplot as plt
import seaborn as sns

In [41]:
# Frequency table of income_total: one row per distinct income value,
# ordered from most to least frequent (value_counts default ordering).
income_freq = train['income_total'].value_counts()
unique_income = income_freq.index

# Assemble the two-column table directly on a fresh 0..n-1 index.
income_counts = pd.DataFrame({
    'counts': income_freq.to_numpy(),
    'income_values': unique_income.to_numpy(),
})

income_counts

Unnamed: 0,counts,income_values
0,3164,135000.0
1,2233,157500.0
2,2225,180000.0
3,2178,112500.0
4,2170,225000.0
...,...,...
244,1,227250.0
245,1,91530.0
246,1,39600.0
247,1,432000.0


In [42]:
# Range of the distinct income values seen in the training data.
income_values = income_counts['income_values']
max_income, min_income = income_values.max(), income_values.min()

print(max_income, min_income)

1575000.0 27000.0


## 단순히 분할 구간 수를 늘리는 것만으로는 소득 구간별 데이터 양의 불균형을 해결할 수 없음

In [80]:
# Equal-width binning of income_total at several bin counts, printing the
# per-bin sample counts for train and test at each setting.
num_bins = [5, 7, 10, 14]

# Single-column frames holding income_total divided by 10,000.
train_ = train[['income_total']] / 10000
test_ = test[['income_total']] / 10000

# Scratch copies: their `income_total` column is overwritten with the bin
# label on every pass of the loop, while train_/test_ keep the numeric values.
train_tmp = train_.copy()
test_tmp = test_.copy()


for num_bins_ in num_bins:
    # Equal-width bin edges are derived from the TRAIN distribution only.
    count, bin_dividers = np.histogram(train_['income_total'], bins=num_bins_)
    bin_names = ['소득' + str(i) for i in range(num_bins_)]

    # The train-based edges are reused for test. Test incomes outside the
    # train range fall in no bin and become NaN, which value_counts() below
    # silently leaves out.
    train_tmp['income_total'] = pd.cut(x=train_['income_total'], bins=bin_dividers, labels=bin_names, include_lowest=True)
    test_tmp['income_total'] = pd.cut(x=test_['income_total'], bins=bin_dividers, labels=bin_names, include_lowest=True)

    print(f"========[num_bins : {num_bins_}]==========")
    print("train bin counts : ")
    print(train_tmp['income_total'].value_counts())
    print("---------------------------------")

    print("test bin counts : ")
    print(test_tmp['income_total'].value_counts())
    print("=================================")


train bin counts : 
소득0    24542
소득1     1736
소득2      165
소득4        9
소득3        5
Name: income_total, dtype: int64
---------------------------------
test bin counts : 
소득0    9298
소득1     637
소득2      58
소득4       5
소득3       2
Name: income_total, dtype: int64
train bin counts : 
소득0    21601
소득1     4462
소득2      305
소득3       71
소득4        9
소득6        5
소득5        4
Name: income_total, dtype: int64
---------------------------------
test bin counts : 
소득0    8251
소득1    1609
소득2     108
소득3      25
소득6       3
소득4       2
소득5       2
Name: income_total, dtype: int64
train bin counts : 
소득0    16212
소득1     8330
소득2     1530
소득3      206
소득4      121
소득5       44
소득9        5
소득8        4
소득6        3
소득7        2
Name: income_total, dtype: int64
---------------------------------
test bin counts : 
소득0    6248
소득1    3050
소득2     569
소득3      68
소득4      44
소득5      14
소득9       3
소득8       2
소득6       1
소득7       1
Name: income_total, dtype: int64
train bin counts : 
소득1     11123