# Library

In [41]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import random

# Data Load & Preprocessing
- 훈련에 필요없는 index 컬럼 삭제.
- missing value를 모두 NAN 문자열로 대체
- dtype object 인 컬럼들을 onehot encoding

In [42]:
train = pd.read_csv('dataset/train.csv')
train = train.drop(['index'], axis=1)
train.fillna('NAN', inplace=True) 


test = pd.read_csv('dataset/test.csv')
test = test.drop(['index'], axis=1)
test.fillna('NAN', inplace=True)

submit = pd.read_csv('dataset/sample_submission.csv')

In [43]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         10000 non-null  object 
 1   car            10000 non-null  object 
 2   reality        10000 non-null  object 
 3   child_num      10000 non-null  int64  
 4   income_total   10000 non-null  float64
 5   income_type    10000 non-null  object 
 6   edu_type       10000 non-null  object 
 7   family_type    10000 non-null  object 
 8   house_type     10000 non-null  object 
 9   DAYS_BIRTH     10000 non-null  int64  
 10  DAYS_EMPLOYED  10000 non-null  int64  
 11  FLAG_MOBIL     10000 non-null  int64  
 12  work_phone     10000 non-null  int64  
 13  phone          10000 non-null  int64  
 14  email          10000 non-null  int64  
 15  occyp_type     10000 non-null  object 
 16  family_size    10000 non-null  float64
 17  begin_month    10000 non-null  float64
dtypes: floa

In [44]:
train.dtypes

gender            object
car               object
reality           object
child_num          int64
income_total     float64
income_type       object
edu_type          object
family_type       object
house_type        object
DAYS_BIRTH         int64
DAYS_EMPLOYED      int64
FLAG_MOBIL         int64
work_phone         int64
phone              int64
email              int64
occyp_type        object
family_size      float64
begin_month      float64
credit           float64
dtype: object

In [45]:
object_col = []
for col in train.columns:
    if train[col].dtype == 'object':
        object_col.append(col)

In [46]:
enc = OneHotEncoder()
enc.fit(train.loc[:,object_col])


train_onehot_df = pd.DataFrame(enc.transform(train.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
train.drop(object_col, axis=1, inplace=True)
train = pd.concat([train, train_onehot_df], axis=1)
train.dtypes

child_num                                   int64
income_total                              float64
DAYS_BIRTH                                  int64
DAYS_EMPLOYED                               int64
FLAG_MOBIL                                  int64
work_phone                                  int64
phone                                       int64
email                                       int64
family_size                               float64
begin_month                               float64
credit                                    float64
gender_F                                  float64
gender_M                                  float64
car_N                                     float64
car_Y                                     float64
reality_N                                 float64
reality_Y                                 float64
income_type_Commercial associate          float64
income_type_Pensioner                     float64
income_type_State servant                 float64


In [47]:
test_onehot_df = pd.DataFrame(enc.transform(test.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
test.drop(object_col, axis=1, inplace=True)
test = pd.concat([test, test_onehot_df], axis=1)
test.dtypes

child_num                                   int64
income_total                              float64
DAYS_BIRTH                                  int64
DAYS_EMPLOYED                               int64
FLAG_MOBIL                                  int64
work_phone                                  int64
phone                                       int64
email                                       int64
family_size                               float64
begin_month                               float64
gender_F                                  float64
gender_M                                  float64
car_N                                     float64
car_Y                                     float64
reality_N                                 float64
reality_Y                                 float64
income_type_Commercial associate          float64
income_type_Pensioner                     float64
income_type_State servant                 float64
income_type_Student                       float64


In [50]:
train[train['child_num']>=7]
train

Unnamed: 0,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month,...,occyp_type_Low-skill Laborers,occyp_type_Managers,occyp_type_Medicine staff,occyp_type_NAN,occyp_type_Private service staff,occyp_type_Realty agents,occyp_type_Sales staff,occyp_type_Secretaries,occyp_type_Security staff,occyp_type_Waiters/barmen staff
0,0,202500.0,-13899,-4709,1,0,0,0,2.0,-6.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,247500.0,-11380,-1540,1,0,0,1,3.0,-5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,450000.0,-19087,-4434,1,0,1,0,2.0,-22.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,202500.0,-15088,-2092,1,0,1,0,2.0,-37.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0,157500.0,-15037,-2105,1,0,0,0,2.0,-26.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,2,225000.0,-12079,-1984,1,0,0,0,4.0,-2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26453,1,180000.0,-15291,-2475,1,0,0,0,2.0,-47.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
26454,0,292500.0,-10082,-2015,1,0,0,0,2.0,-25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26455,0,171000.0,-10145,-107,1,0,0,0,1.0,-59.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
test[test['child_num']>=7]

Unnamed: 0,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month,...,occyp_type_Low-skill Laborers,occyp_type_Managers,occyp_type_Medicine staff,occyp_type_NAN,occyp_type_Private service staff,occyp_type_Realty agents,occyp_type_Sales staff,occyp_type_Secretaries,occyp_type_Security staff,occyp_type_Waiters/barmen staff


In [51]:
train_index=train[train['child_num']>=7].index
train = train.drop(train_index)
train

Unnamed: 0,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month,...,occyp_type_Low-skill Laborers,occyp_type_Managers,occyp_type_Medicine staff,occyp_type_NAN,occyp_type_Private service staff,occyp_type_Realty agents,occyp_type_Sales staff,occyp_type_Secretaries,occyp_type_Security staff,occyp_type_Waiters/barmen staff
0,0,202500.0,-13899,-4709,1,0,0,0,2.0,-6.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,247500.0,-11380,-1540,1,0,0,1,3.0,-5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,450000.0,-19087,-4434,1,0,1,0,2.0,-22.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,202500.0,-15088,-2092,1,0,1,0,2.0,-37.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0,157500.0,-15037,-2105,1,0,0,0,2.0,-26.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,2,225000.0,-12079,-1984,1,0,0,0,4.0,-2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26453,1,180000.0,-15291,-2475,1,0,0,0,2.0,-47.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
26454,0,292500.0,-10082,-2015,1,0,0,0,2.0,-25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26455,0,171000.0,-10145,-107,1,0,0,0,1.0,-59.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Training
- 데이터 분리는 StratifiedKFold 를 사용하여 y값 분포를 비슷하게 분리시킴. -> 5-fold
- lightgbm의 default parameter로 훈련.
- 30번 이상 개선 없을 경우 중단.
- 각 5개의 fold를 훈련하여 저장

In [55]:
train.iloc[9021,:]

child_num                                      1.0
income_total                              112500.0
DAYS_BIRTH                                -14499.0
DAYS_EMPLOYED                              -2860.0
FLAG_MOBIL                                     1.0
work_phone                                     0.0
phone                                          0.0
email                                          0.0
family_size                                    3.0
begin_month                                  -14.0
credit                                         2.0
gender_F                                       1.0
gender_M                                       0.0
car_N                                          0.0
car_Y                                          1.0
reality_N                                      1.0
reality_Y                                      0.0
income_type_Commercial associate               1.0
income_type_Pensioner                          0.0
income_type_State servant      

In [52]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds=[]
for train_idx, valid_idx in skf.split(train, train['credit']):
    folds.append((train_idx, valid_idx))

In [57]:
train_idx

array([    0,     1,     2, ..., 26447, 26448, 26450])

In [58]:
valid_idx

array([    4,    19,    21, ..., 26440, 26446, 26449])

In [69]:
len(folds[0][0])
#folds[i][j][k]

21160

In [74]:
#9021, 10731, 25313, 25390, 25638
for i in range(len(folds)):
    for j in range(2):
        for k in range(len(folds[i][j])):
            if(folds[i][j][k] == 9021 or folds[i][j][k] ==10731 or folds[i][j][k] ==25313 or folds[i][j][k] == 25390 or folds[i][j][k] == 25638 or folds[i][j][k] == 8462):
                folds[i][j][k]+=1

In [75]:
params={}
params['boosting_type']='gbdt'
params['learning_rate']='0.01'
params['max_depth']='16'
params['n_estimators']='1000'
params['objective']='regression'
params['metric']='multi_logloss'
params['is_training_metric']='True'
params['num_leaves']='31' #중요
params['feature_fraction']='0.7'#몇프로feature랜덤하게 학습 열 샘플링
params['bagging_fraction']='0.7'#행 샘플링
params['seed']='2020'
params['early_stopping_round']='50'
params['min_data_in_leaf']='20' #중요



In [76]:
random.seed(42)
lgb_models={}
for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values
    
    lgb = LGBMClassifier(boosting_type='gbdt',
                        learning_rate=0.1,
                         max_depth=30,
                         n_estimators=1500,
                         objective='regression',
                         metric='multi_logloss',
                         is_training_metric=True,
                         num_leaves=150, #중요
                         feature_fraction=0.7,#몇프로feature랜덤하게 학습 열 샘플링
                         bagging_fraction=0.7, #행 샘플링
                         seed=2020,
                         early_stopping_round=100,
                         min_data_in_leaf=5, #중요
                         tree_learner='feature',
                         extra_trees='False'
                        )
    lgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], early_stopping_rounds=50,
           verbose=100) #verbose 횟수마다 출력
    lgb_models[fold]=lgb
    print(f'================================================================================\n\n')


Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.532798	valid_1's multi_logloss: 0.928692
Early stopping, best iteration is:
[9]	training's multi_logloss: 0.812688	valid_1's multi_logloss: 0.876356


Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.534151	valid_1's multi_logloss: 0.909092
Early stopping, best iteration is:
[6]	training's multi_logloss: 0.835543	valid_1's multi_logloss: 0.866058


Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.52424	valid_1's multi_logloss: 0.937644
Early stopping, best iteration is:
[4]	training's multi_logloss: 0.844906	valid_1's multi_logloss: 0.885799


Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.527392	valid_1's multi_logloss: 0.937698
Early stopping, best iteration is:
[6]	training's multi_logloss: 0.830456	valid_1's multi_logloss: 0.884405


Training unti

# Test inference
- 각 fold를 훈련시킨 lightgbm model로 predict.
- 해당 대회는 logloss score를 겨루는 것이기 때문에 각 class의 probability를 얻어야함.
- 대부분의 머신러닝 모델에서 predict, predict_proba를 구분하여 사용함.
- predict는 class 출력을 해주고 predict_proba는 class별 probability를 출력해줌.
- predict_proba를 사용하여 예측한 것을 5-fold 더하여 평균내어 앙상블.

In [77]:
submit.iloc[:,1:]=0
for fold in range(5):
    submit.iloc[:,1:] += lgb_models[fold].predict_proba(test)/5

In [80]:
submit.to_csv('submit/20210504_test_submit_ensemble.csv', index=False) # 0.7272812144

In [81]:
submit.head(20)

Unnamed: 0,index,0,1,2
0,26457,0.102501,0.23838,0.659118
1,26458,0.120039,0.234304,0.645657
2,26459,0.104504,0.244848,0.650649
3,26460,0.118661,0.215382,0.665956
4,26461,0.127595,0.22361,0.648795
5,26462,0.123651,0.240451,0.635898
6,26463,0.136855,0.310834,0.552312
7,26464,0.123434,0.237181,0.639385
8,26465,0.113889,0.222611,0.663499
9,26466,0.122671,0.239682,0.637647
