# DataLoader

In [36]:
import pandas as pd
import numpy as np

import os

np.random.seed(42)

pd.set_option('display.max_columns', 500)

In [37]:
# Locate the raw competition files relative to this notebook.
data_dir = "../../../kaggle_data/creditcard_overdue/"
Raw_data_dir = os.path.join(data_dir, "open/")

# Read the raw train/test splits and the sample submission, in that order.
Raw_train, Raw_test, submission = (
    pd.read_csv(os.path.join(Raw_data_dir, csv_name))
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [38]:
Raw_train

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,26452,F,N,N,2,225000.0,State servant,Secondary / secondary special,Married,House / apartment,-12079,-1984,1,0,0,0,Core staff,4.0,-2.0,1.0
26453,26453,F,N,Y,1,180000.0,Working,Higher education,Separated,House / apartment,-15291,-2475,1,0,0,0,,2.0,-47.0,2.0
26454,26454,F,Y,N,0,292500.0,Working,Secondary / secondary special,Civil marriage,With parents,-10082,-2015,1,0,0,0,Core staff,2.0,-25.0,2.0
26455,26455,M,N,Y,0,171000.0,Working,Incomplete higher,Single / not married,House / apartment,-10145,-107,1,0,0,0,Laborers,1.0,-59.0,2.0


# Preprocessing

## 불필요한 Feature 제거

In [39]:
# Drop the index and FLAG_MOBIL columns from both splits; the raw frames are
# left untouched so this cell can be re-run safely.
unused_columns = ['index', 'FLAG_MOBIL']
train = Raw_train.drop(columns=unused_columns)
test = Raw_test.drop(columns=unused_columns)

## 음수 feature 구간화

- 팀원 시각화 결과 500개로 구간화 하는 것으로 결정

In [40]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

def make_bin(variable, n, train_df=None, test_df=None):
    """Flip the sign of a column and replace it with equal-width bin labels.

    The column is negated in both frames, ``n`` equal-width bin edges are
    computed from the train values only, and both frames are then labelled
    with the bin index as a string ("0" .. str(n - 1)).

    Parameters
    ----------
    variable : str
        Name of the column to bin. It must exist in both frames.
    n : int
        Number of equal-width bins.
    train_df, test_df : pandas.DataFrame, optional
        Frames to modify in place. When omitted, the notebook-level ``train``
        and ``test`` frames are used, matching the original behaviour.

    Returns
    -------
    None
        Both frames are modified in place.

    Notes
    -----
    The column is overwritten, so calling this twice for the same column
    operates on already-binned data.
    """
    if train_df is None:
        train_df = train
    if test_df is None:
        test_df = test

    # Flip the sign first.
    train_df[variable] = -train_df[variable]
    test_df[variable] = -test_df[variable]

    # Bin edges are derived from train only and then reused for test.
    count, bin_dividers = np.histogram(train_df[variable], bins=n)

    bin_names = [str(i) for i in range(n)]

    train_df[variable] = pd.cut(x=train_df[variable], bins=bin_dividers, labels=bin_names, include_lowest=True)
    test_binned = pd.cut(x=test_df[variable], bins=bin_dividers, labels=bin_names, include_lowest=True)

    # Test values that fall outside the train range come back as NaN; they are
    # filled with the arbitrary label "0". The result is assigned back
    # explicitly: a chained ``test[col].fillna(..., inplace=True)`` does not
    # update the frame when pandas Copy-on-Write is active.
    # NOTE(review): values above the train maximum also end up in bin "0" —
    # confirm this is acceptable for the downstream model.
    test_df[variable] = test_binned.fillna(str(0))

In [41]:
# Bin each of the three day/month features into 500 bins.
# make_bin overwrites the column it is given, so re-running this cell would
# operate on already-binned data.
for binned_feature in ('DAYS_BIRTH', 'DAYS_EMPLOYED', 'begin_month'):
    make_bin(binned_feature, n=500)

In [42]:
train["DAYS_BIRTH"].value_counts()

174    156
78     132
184    127
202    125
213    123
      ... 
2        0
495      0
496      0
497      0
3        0
Name: DAYS_BIRTH, Length: 500, dtype: int64

In [43]:
train["DAYS_EMPLOYED"].value_counts()

480    4484
0      4438
481    3940
482    3036
479    2668
       ... 
176       0
175       0
174       0
173       0
250       0
Name: DAYS_EMPLOYED, Length: 500, dtype: int64

In [44]:
train["begin_month"].value_counts()

58     662
91     617
66     612
24     593
83     591
      ... 
203      0
202      0
201      0
200      0
250      0
Name: begin_month, Length: 500, dtype: int64

## Occyp_type

- 많이 출현한 값부터 차례로 가장 높은 숫자 부여 or LabelEncoder
    - case가 19가지 이므로 0~18까지 feature값이 크게 차이남.
- One hot Encoding 으로 시도!

In [45]:
train['occyp_type'].unique()

array([nan, 'Laborers', 'Managers', 'Sales staff',
       'High skill tech staff', 'Core staff', 'Drivers', 'Medicine staff',
       'Accountants', 'Realty agents', 'Security staff', 'Cleaning staff',
       'Private service staff', 'Cooking staff', 'Secretaries',
       'HR staff', 'IT staff', 'Low-skill Laborers',
       'Waiters/barmen staff'], dtype=object)

In [46]:
train['occyp_type'].value_counts()

Laborers                 4512
Core staff               2646
Sales staff              2539
Managers                 2167
Drivers                  1575
High skill tech staff    1040
Accountants               902
Medicine staff            864
Cooking staff             457
Security staff            424
Cleaning staff            403
Private service staff     243
Low-skill Laborers        127
Waiters/barmen staff      124
Secretaries                97
Realty agents              63
HR staff                   62
IT staff                   41
Name: occyp_type, dtype: int64

In [47]:
print(f"nan rate : {np.sum(train['occyp_type'].isnull()) / len(train) * 100}")

nan rate : 30.884076047926822


In [48]:
# Treat a missing occupation as its own "Unknown" category, then confirm that
# no nulls remain in train.
train['occyp_type'] = train['occyp_type'].fillna("Unknown")
print(train['occyp_type'].isnull().sum())

0


In [49]:
print(f"nan rate : {np.sum(test['occyp_type'].isnull()) / len(test) * 100}")

nan rate : 31.52


In [50]:
# Apply the same "Unknown" filler to test, then confirm that no nulls remain.
test['occyp_type'] = test['occyp_type'].fillna("Unknown")
print(test['occyp_type'].isnull().sum())

0


In [51]:
# Frequency of each occupation label in train, most common first.
occyp = train['occyp_type'].value_counts()
print(occyp)

# Keep the counts as a one-column frame and the labels as a plain list.
occyp = occyp.to_frame()
occyp_list = occyp.index.tolist()

occyp_list

Unknown                  8171
Laborers                 4512
Core staff               2646
Sales staff              2539
Managers                 2167
Drivers                  1575
High skill tech staff    1040
Accountants               902
Medicine staff            864
Cooking staff             457
Security staff            424
Cleaning staff            403
Private service staff     243
Low-skill Laborers        127
Waiters/barmen staff      124
Secretaries                97
Realty agents              63
HR staff                   62
IT staff                   41
Name: occyp_type, dtype: int64


['Unknown',
 'Laborers',
 'Core staff',
 'Sales staff',
 'Managers',
 'Drivers',
 'High skill tech staff',
 'Accountants',
 'Medicine staff',
 'Cooking staff',
 'Security staff',
 'Cleaning staff',
 'Private service staff',
 'Low-skill Laborers',
 'Waiters/barmen staff',
 'Secretaries',
 'Realty agents',
 'HR staff',
 'IT staff']

In [52]:
# One-hot encode occupation. The dtype is pinned so the indicators are written
# as 0/1 regardless of pandas version (newer releases default to bool).
train = pd.get_dummies(train, columns=['occyp_type'], dtype='uint8')
test = pd.get_dummies(test, columns=['occyp_type'], dtype='uint8')

# get_dummies only emits a column for each category present in the frame it is
# given, so train and test can end up with different indicator columns.
# Force test to carry exactly train's occupation indicators, in train's order:
# indicators missing from test are added as all-zero, extras are dropped.
occyp_dummy_cols = [col for col in train.columns if col.startswith('occyp_type_')]
test_other_cols = [col for col in test.columns if not col.startswith('occyp_type_')]
test = test.reindex(columns=test_other_cols + occyp_dummy_cols, fill_value=0)

In [53]:
train

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,family_size,begin_month,credit,occyp_type_Accountants,occyp_type_Cleaning staff,occyp_type_Cooking staff,occyp_type_Core staff,occyp_type_Drivers,occyp_type_HR staff,occyp_type_High skill tech staff,occyp_type_IT staff,occyp_type_Laborers,occyp_type_Low-skill Laborers,occyp_type_Managers,occyp_type_Medicine staff,occyp_type_Private service staff,occyp_type_Realty agents,occyp_type_Sales staff,occyp_type_Secretaries,occyp_type_Security staff,occyp_type_Unknown,occyp_type_Waiters/barmen staff
0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,177,485,0,0,0,2.0,49,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,105,481,0,0,1,3.0,41,1.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,326,485,0,1,0,2.0,183,2.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,211,482,0,1,0,2.0,308,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,210,482,0,0,0,2.0,216,2.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,F,N,N,2,225000.0,State servant,Secondary / secondary special,Married,House / apartment,125,481,0,0,0,4.0,16,1.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
26453,F,N,Y,1,180000.0,Working,Higher education,Separated,House / apartment,217,482,0,0,0,2.0,391,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
26454,F,Y,N,0,292500.0,Working,Secondary / secondary special,Civil marriage,With parents,68,482,0,0,0,2.0,208,2.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
26455,M,N,Y,0,171000.0,Working,Incomplete higher,Single / not married,House / apartment,69,479,0,0,0,1.0,491,2.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


## child_num

- 5명 초과는 모두 6명으로 처리.

In [54]:
train['child_num'].value_counts()

0     18340
1      5386
2      2362
3       306
4        47
5        10
14        3
7         2
19        1
Name: child_num, dtype: int64

In [55]:
train['child_num'].unique()

array([ 0,  1,  2,  3,  4,  5, 14, 19,  7], dtype=int64)

In [56]:
# Collapse every child count above 5 into a single value of 6.
# The cap is applied to test as well; previously only train was capped, so the
# two splits could carry different value ranges for this feature.
train["child_num"] = train["child_num"].mask(train["child_num"] > 5, 6)
test["child_num"] = test["child_num"].mask(test["child_num"] > 5, 6)

In [57]:
train

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,family_size,begin_month,credit,occyp_type_Accountants,occyp_type_Cleaning staff,occyp_type_Cooking staff,occyp_type_Core staff,occyp_type_Drivers,occyp_type_HR staff,occyp_type_High skill tech staff,occyp_type_IT staff,occyp_type_Laborers,occyp_type_Low-skill Laborers,occyp_type_Managers,occyp_type_Medicine staff,occyp_type_Private service staff,occyp_type_Realty agents,occyp_type_Sales staff,occyp_type_Secretaries,occyp_type_Security staff,occyp_type_Unknown,occyp_type_Waiters/barmen staff
0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,177,485,0,0,0,2.0,49,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,105,481,0,0,1,3.0,41,1.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,326,485,0,1,0,2.0,183,2.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,211,482,0,1,0,2.0,308,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,210,482,0,0,0,2.0,216,2.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,F,N,N,2,225000.0,State servant,Secondary / secondary special,Married,House / apartment,125,481,0,0,0,4.0,16,1.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
26453,F,N,Y,1,180000.0,Working,Higher education,Separated,House / apartment,217,482,0,0,0,2.0,391,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
26454,F,Y,N,0,292500.0,Working,Secondary / secondary special,Civil marriage,With parents,68,482,0,0,0,2.0,208,2.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
26455,M,N,Y,0,171000.0,Working,Incomplete higher,Single / not married,House / apartment,69,479,0,0,0,1.0,491,2.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


## 범주형 Feature 실수화

In [58]:
# Column -> (original labels, integer codes). The same mapping is applied to
# train and test, and the resulting train distribution is printed per column.
# NOTE(review): edu_type ranks 'Incomplete higher' (1) below
# 'Secondary / secondary special' (2) — confirm this ordering is intended.
replacement_table = {
    'gender': (['F', 'M'], [0, 1]),
    'car': (['N', 'Y'], [0, 1]),
    'reality': (['N', 'Y'], [0, 1]),
    'edu_type': (
        ['Academic degree', 'Higher education', 'Secondary / secondary special', 'Incomplete higher', 'Lower secondary'],
        [4, 3, 2, 1, 0],
    ),
}

for feature_name, (raw_labels, int_codes) in replacement_table.items():
    train[feature_name] = train[feature_name].replace(raw_labels, int_codes)
    test[feature_name] = test[feature_name].replace(raw_labels, int_codes)
    print(f'{feature_name} : ')
    print(train[feature_name].value_counts())
    print("----------------------------")

gender : 
0    17697
1     8760
Name: gender, dtype: int64
----------------------------
car : 
0    16410
1    10047
Name: car, dtype: int64
----------------------------
reality : 
1    17830
0     8627
Name: reality, dtype: int64
----------------------------
edu_type : 
2    17995
3     7162
1     1020
0      257
4       23
Name: edu_type, dtype: int64
----------------------------


## income_type

In [59]:
train['income_type'].unique()

array(['Commercial associate', 'Working', 'State servant', 'Pensioner',
       'Student'], dtype=object)

In [60]:
from sklearn import preprocessing

# Integer-encode the remaining nominal columns. The encoder is re-fitted on
# train for each column, and that fitted mapping is then applied to test.
label_encoder = preprocessing.LabelEncoder()

for nominal_col in ('income_type', 'family_type', 'house_type'):
    train[nominal_col] = label_encoder.fit_transform(train[nominal_col])
    test[nominal_col] = label_encoder.transform(test[nominal_col])

## Income total

In [61]:
# income total 전처리

import matplotlib.pyplot as plt
import seaborn as sns

In [62]:
# Frequency table of the distinct income_total values in train:
# one row per value, ordered as returned by value_counts.
income_freq = train['income_total'].value_counts()
unique_income = income_freq.index

income_counts = pd.DataFrame({
    'counts': income_freq.to_numpy(),
    'income_values': unique_income.to_numpy(),
})

income_counts

Unnamed: 0,counts,income_values
0,3164,135000.0
1,2233,157500.0
2,2225,180000.0
3,2178,112500.0
4,2170,225000.0
...,...,...
244,1,227250.0
245,1,91530.0
246,1,39600.0
247,1,432000.0


In [63]:
# Largest and smallest income values observed in train.
observed_incomes = income_counts['income_values']
max_income, min_income = observed_incomes.max(), observed_incomes.min()

print(max_income, min_income)

1575000.0 27000.0


### bins=5

In [64]:
num_bins_ = 5

# Work on copies so the shared train/test frames keep their income values.
train_bin5 = train.copy()
test_bin5 = test.copy()

# Rescale income to units of 10,000.
train_bin5['income_total'] = train_bin5['income_total']/10000
test_bin5['income_total'] = test_bin5['income_total']/10000

# Equal-width bin edges are computed from train only.
count, bin_dividers = np.histogram(train_bin5['income_total'], bins=num_bins_)
bin_names = ['소득'+str(i) for i in range(num_bins_)]

train_bin5['income_total'] = pd.cut(x=train_bin5['income_total'], bins=bin_dividers, labels=bin_names, include_lowest=True)

# pd.cut returns NaN for values outside the edges, so a test income beyond the
# train range would become NaN. Clip test to the outermost train edges so such
# rows land in the first/last bin instead.
test_income_clipped = test_bin5['income_total'].clip(lower=bin_dividers[0], upper=bin_dividers[-1])
test_bin5['income_total'] = pd.cut(x=test_income_clipped, bins=bin_dividers, labels=bin_names, include_lowest=True)

print(f"========[num_bins : {num_bins_}]==========")
print("train bin counts : ")
print(train_bin5['income_total'].value_counts())

print("shape : ", train_bin5.shape)
print("=================================")

train bin counts : 
소득0    24542
소득1     1736
소득2      165
소득4        9
소득3        5
Name: income_total, dtype: int64
shape :  (26457, 36)


### bins=7

In [65]:
num_bins_ = 7

# Work on copies so the shared train/test frames keep their income values.
train_bin7 = train.copy()
test_bin7 = test.copy()

# Rescale income to units of 10,000.
train_bin7['income_total'] = train_bin7['income_total']/10000
test_bin7['income_total'] = test_bin7['income_total']/10000

# Equal-width bin edges are computed from train only.
count, bin_dividers = np.histogram(train_bin7['income_total'], bins=num_bins_)
bin_names = ['소득'+str(i) for i in range(num_bins_)]

train_bin7['income_total'] = pd.cut(x=train_bin7['income_total'], bins=bin_dividers, labels=bin_names, include_lowest=True)

# pd.cut returns NaN for values outside the edges, so a test income beyond the
# train range would become NaN. Clip test to the outermost train edges so such
# rows land in the first/last bin instead.
test_income_clipped = test_bin7['income_total'].clip(lower=bin_dividers[0], upper=bin_dividers[-1])
test_bin7['income_total'] = pd.cut(x=test_income_clipped, bins=bin_dividers, labels=bin_names, include_lowest=True)

print(f"========[num_bins : {num_bins_}]==========")
print("train bin counts : ")
print(train_bin7['income_total'].value_counts())

print("shape : ", train_bin7.shape)
print("=================================")

train bin counts : 
소득0    21601
소득1     4462
소득2      305
소득3       71
소득4        9
소득6        5
소득5        4
Name: income_total, dtype: int64
shape :  (26457, 36)


### bins=10

In [66]:
num_bins_ = 10

# Work on copies so the shared train/test frames keep their income values.
train_bin10 = train.copy()
test_bin10 = test.copy()

# Rescale income to units of 10,000.
train_bin10['income_total'] = train_bin10['income_total']/10000
test_bin10['income_total'] = test_bin10['income_total']/10000

# Equal-width bin edges are computed from train only.
count, bin_dividers = np.histogram(train_bin10['income_total'], bins=num_bins_)
bin_names = ['소득'+str(i) for i in range(num_bins_)]

train_bin10['income_total'] = pd.cut(x=train_bin10['income_total'], bins=bin_dividers, labels=bin_names, include_lowest=True)

# pd.cut returns NaN for values outside the edges, so a test income beyond the
# train range would become NaN. Clip test to the outermost train edges so such
# rows land in the first/last bin instead.
test_income_clipped = test_bin10['income_total'].clip(lower=bin_dividers[0], upper=bin_dividers[-1])
test_bin10['income_total'] = pd.cut(x=test_income_clipped, bins=bin_dividers, labels=bin_names, include_lowest=True)

print(f"========[num_bins : {num_bins_}]==========")
print("train bin counts : ")
print(train_bin10['income_total'].value_counts())

print("shape : ", train_bin10.shape)
print("=================================")

train bin counts : 
소득0    16212
소득1     8330
소득2     1530
소득3      206
소득4      121
소득5       44
소득9        5
소득8        4
소득6        3
소득7        2
Name: income_total, dtype: int64
shape :  (26457, 36)


### income_total Labeling

#### bins=5

In [67]:
# Turn the 5-bin income labels into integer codes: learn the label set from
# train, then apply that same mapping to both splits.
label_encoder = LabelEncoder()
label_encoder.fit(train_bin5['income_total'])

train_bin5['income_total'] = label_encoder.transform(train_bin5['income_total'])
test_bin5['income_total'] = label_encoder.transform(test_bin5['income_total'])

train_bin5

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,family_size,begin_month,credit,occyp_type_Accountants,occyp_type_Cleaning staff,occyp_type_Cooking staff,occyp_type_Core staff,occyp_type_Drivers,occyp_type_HR staff,occyp_type_High skill tech staff,occyp_type_IT staff,occyp_type_Laborers,occyp_type_Low-skill Laborers,occyp_type_Managers,occyp_type_Medicine staff,occyp_type_Private service staff,occyp_type_Realty agents,occyp_type_Sales staff,occyp_type_Secretaries,occyp_type_Security staff,occyp_type_Unknown,occyp_type_Waiters/barmen staff
0,0,0,0,0,0,0,3,1,2,177,485,0,0,0,2.0,49,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,1,1,0,0,2,0,1,105,481,0,0,1,3.0,41,1.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,1,4,3,1,1,326,485,0,1,0,2.0,183,2.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,2,1,1,211,482,0,1,0,2.0,308,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,1,1,0,0,2,3,1,1,210,482,0,0,0,2.0,216,2.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,0,0,0,2,0,2,2,1,1,125,481,0,0,0,4.0,16,1.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
26453,0,0,1,1,0,4,3,2,1,217,482,0,0,0,2.0,391,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
26454,0,1,0,0,0,4,2,0,5,68,482,0,0,0,2.0,208,2.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
26455,1,0,1,0,0,4,1,3,1,69,479,0,0,0,1.0,491,2.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


#### bins=7

In [68]:
# Turn the 7-bin income labels into integer codes: learn the label set from
# train, then apply that same mapping to both splits.
label_encoder = LabelEncoder()
label_encoder.fit(train_bin7['income_total'])

train_bin7['income_total'] = label_encoder.transform(train_bin7['income_total'])
test_bin7['income_total'] = label_encoder.transform(test_bin7['income_total'])

train_bin7

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,family_size,begin_month,credit,occyp_type_Accountants,occyp_type_Cleaning staff,occyp_type_Cooking staff,occyp_type_Core staff,occyp_type_Drivers,occyp_type_HR staff,occyp_type_High skill tech staff,occyp_type_IT staff,occyp_type_Laborers,occyp_type_Low-skill Laborers,occyp_type_Managers,occyp_type_Medicine staff,occyp_type_Private service staff,occyp_type_Realty agents,occyp_type_Sales staff,occyp_type_Secretaries,occyp_type_Security staff,occyp_type_Unknown,occyp_type_Waiters/barmen staff
0,0,0,0,0,0,0,3,1,2,177,485,0,0,0,2.0,49,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,1,1,0,0,2,0,1,105,481,0,0,1,3.0,41,1.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,1,4,3,1,1,326,485,0,1,0,2.0,183,2.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,2,1,1,211,482,0,1,0,2.0,308,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,1,1,0,0,2,3,1,1,210,482,0,0,0,2.0,216,2.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,0,0,0,2,0,2,2,1,1,125,481,0,0,0,4.0,16,1.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
26453,0,0,1,1,0,4,3,2,1,217,482,0,0,0,2.0,391,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
26454,0,1,0,0,1,4,2,0,5,68,482,0,0,0,2.0,208,2.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
26455,1,0,1,0,0,4,1,3,1,69,479,0,0,0,1.0,491,2.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


#### bins=10

In [69]:
# Turn the 10-bin income labels into integer codes: learn the label set from
# train, then apply that same mapping to both splits.
label_encoder = LabelEncoder()
label_encoder.fit(train_bin10['income_total'])

train_bin10['income_total'] = label_encoder.transform(train_bin10['income_total'])
test_bin10['income_total'] = label_encoder.transform(test_bin10['income_total'])

train_bin10

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,family_size,begin_month,credit,occyp_type_Accountants,occyp_type_Cleaning staff,occyp_type_Cooking staff,occyp_type_Core staff,occyp_type_Drivers,occyp_type_HR staff,occyp_type_High skill tech staff,occyp_type_IT staff,occyp_type_Laborers,occyp_type_Low-skill Laborers,occyp_type_Managers,occyp_type_Medicine staff,occyp_type_Private service staff,occyp_type_Realty agents,occyp_type_Sales staff,occyp_type_Secretaries,occyp_type_Security staff,occyp_type_Unknown,occyp_type_Waiters/barmen staff
0,0,0,0,0,1,0,3,1,2,177,485,0,0,0,2.0,49,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,1,1,1,0,2,0,1,105,481,0,0,1,3.0,41,1.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,2,4,3,1,1,326,485,0,1,0,2.0,183,2.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,1,0,1,0,2,1,1,211,482,0,1,0,2.0,308,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,1,1,0,0,2,3,1,1,210,482,0,0,0,2.0,216,2.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,0,0,0,2,1,2,2,1,1,125,481,0,0,0,4.0,16,1.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
26453,0,0,1,1,0,4,3,2,1,217,482,0,0,0,2.0,391,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
26454,0,1,0,0,1,4,2,0,5,68,482,0,0,0,2.0,208,2.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
26455,1,0,1,0,0,4,1,3,1,69,479,0,0,0,1.0,491,2.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


# Make preprocessed files

In [70]:
process_dir = "../result_file/preprocess_results/"

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs(process_dir, exist_ok=True)

# Output file name -> frame to write (train variants first, then test).
processed_frames = {
    "train_income_bin5.csv": train_bin5,
    "train_income_bin7.csv": train_bin7,
    "train_income_bin10.csv": train_bin10,
    "test_income_bin5.csv": test_bin5,
    "test_income_bin7.csv": test_bin7,
    "test_income_bin10.csv": test_bin10,
}

for file_name, frame in processed_frames.items():
    frame.to_csv(os.path.join(process_dir, file_name), index=False)