# Library

In [1]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import random

# Data Load & Preprocessing
- 훈련에 필요없는 index 컬럼 삭제.
- missing value를 모두 NAN 문자열로 대체
- dtype object 인 컬럼들을 onehot encoding

In [2]:
# Load the labelled training data: drop the `index` identifier column (not a
# feature) and replace every missing value with the literal string 'NAN'.
# fillna is applied to the whole frame, not only to the text columns.
train = (
    pd.read_csv('train.csv')
    .drop(['index'], axis=1)
    .fillna('NAN')
)

# The unlabelled test data goes through exactly the same cleaning steps.
test = (
    pd.read_csv('test.csv')
    .drop(['index'], axis=1)
    .fillna('NAN')
)

# Submission template; later cells overwrite every column after the first.
submit = pd.read_csv('sample_submission.csv')

In [3]:
# Names of the `train` columns stored with dtype `object`, in column order.
# These are the columns that get one-hot encoded further down.
object_col = [name for name, dtype in train.dtypes.items() if dtype == 'object']

In [4]:
# Fit the one-hot encoder on the training categoricals only.
# handle_unknown='ignore' makes a category that appears only at transform time
# encode as an all-zero row instead of raising inside enc.transform.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(train.loc[:, object_col])

# scikit-learn 1.0 introduced get_feature_names_out and 1.2 removed
# get_feature_names; use whichever this installation provides.
_onehot_names = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names

# Reuse train's own index so the column-wise concat below aligns row by row
# even if `train` does not carry a default RangeIndex.
train_onehot_df = pd.DataFrame(
    enc.transform(train.loc[:, object_col]).toarray(),
    columns=_onehot_names(object_col),
    index=train.index,
)

# Swap the raw object columns for their one-hot expansion (no in-place mutation).
train = pd.concat([train.drop(object_col, axis=1), train_onehot_df], axis=1)

In [5]:
# Encode the test categoricals with the encoder that was fitted on train, so
# both frames end up with the same one-hot columns in the same order.
# scikit-learn 1.0 introduced get_feature_names_out and 1.2 removed
# get_feature_names; use whichever this installation provides.
_onehot_names = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names

# Reuse test's own index so the column-wise concat below aligns row by row.
test_onehot_df = pd.DataFrame(
    enc.transform(test.loc[:, object_col]).toarray(),
    columns=_onehot_names(object_col),
    index=test.index,
)

# Swap the raw object columns for their one-hot expansion (no in-place mutation).
test = pd.concat([test.drop(object_col, axis=1), test_onehot_df], axis=1)

# Training
- 데이터 분리는 StratifiedKFold 를 사용하여 y값 분포를 비슷하게 분리시킴. -> 5-fold
- lightgbm을 직접 지정한 하이퍼파라미터(learning_rate=0.1, num_leaves=150 등)로 훈련.
- validation multi_logloss가 100번 연속 개선되지 않으면 조기 종료(early stopping).
- 각 5개의 fold를 훈련하여 저장

In [6]:
# Five shuffled, stratified splits on `credit`, seeded for reproducibility.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Materialise the splits once as (train positions, validation positions) pairs
# so the training loop can look each fold up by number.
folds = [(fit_rows, holdout_rows) for fit_rows, holdout_rows in skf.split(train, train['credit'])]

In [7]:
# Baseline LightGBM hyperparameters.
# NOTE: the training cell visible in this notebook passes its own keyword
# arguments and does not read this dict.
# Values are real numbers/booleans rather than strings so the dict can be
# splatted into an estimator (e.g. n_estimators must be an int there).
params = {
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'max_depth': 16,
    'n_estimators': 1000,
    # Three-class target scored with multi_logloss, so the objective is
    # multiclass (it was 'regression', which contradicts the metric).
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'is_training_metric': True,
    'num_leaves': 31,  # important
    'feature_fraction': 0.7,  # fraction of features randomly sampled per tree (column sampling)
    # Row sampling. NOTE(review): LightGBM documents that bagging only takes
    # effect when bagging_freq is non-zero, and none is set here — confirm intent.
    'bagging_fraction': 0.7,
    'seed': 2020,
    'early_stopping_round': 50,
    'min_data_in_leaf': 20,  # important
}



In [115]:
# Seeds Python's `random` module only; LightGBM's randomness is controlled by
# the `seed` argument passed to the classifier below.
random.seed(42)

# Split features and target once instead of re-dropping `credit` twice per fold.
X_all = train.drop(['credit'], axis=1)
y_all = train['credit']

# fold number -> fitted classifier; the inference cell averages over these.
lgb_models = {}
for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f'===================================={fold+1}============================================')
    # The fold arrays hold row positions, so select with .iloc for both the
    # features and the labels (label-based lookup only works on a RangeIndex).
    X_train, X_valid = X_all.iloc[train_idx].values, X_all.iloc[valid_idx].values
    y_train, y_valid = y_all.iloc[train_idx].values, y_all.iloc[valid_idx].values

    lgb = LGBMClassifier(
        boosting_type='gbdt',
        learning_rate=0.1,
        max_depth=30,
        n_estimators=1500,
        # Was 'regression'. The recorded log reports multi_logloss, i.e. the
        # run was already multiclass; name the objective that is really used.
        objective='multiclass',
        metric='multi_logloss',
        is_training_metric=True,
        num_leaves=150,  # important
        feature_fraction=0.7,  # fraction of features randomly sampled per tree (column sampling)
        # Row sampling. NOTE(review): LightGBM documents that bagging only
        # takes effect when bagging_freq is non-zero — confirm intent.
        bagging_fraction=0.7,
        seed=2020,
        # Single source of truth for early stopping. The original also passed
        # early_stopping_rounds=50 to fit(), but the recorded log
        # ("don't improve for 100 rounds") shows this value is the one applied.
        early_stopping_round=100,
        min_data_in_leaf=5,  # important
        tree_learner='feature',
        extra_trees=False,  # real boolean; was the string 'False'
    )
    # Track both the training and validation loss; log every 100 iterations.
    lgb.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        verbose=100,
    )
    lgb_models[fold] = lgb
    print(f'================================================================================\n\n')


Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.392433	valid_1's multi_logloss: 0.709202
Early stopping, best iteration is:
[99]	training's multi_logloss: 0.394399	valid_1's multi_logloss: 0.708948


Training until validation scores don't improve for 100 rounds


KeyboardInterrupt: 

# Test inference
- 각 fold를 훈련시킨 lightgbm model로 predict.
- 해당 대회는 logloss score를 겨루는 것이기 때문에 각 class의 probability를 얻어야함.
- 대부분의 머신러닝 모델에서 predict, predict_proba를 구분하여 사용함.
- predict는 class 출력을 해주고 predict_proba는 class별 probability를 출력해줌.
- predict_proba를 사용하여 예측한 것을 5-fold 더하여 평균내어 앙상블.

In [108]:
# Soft-voting ensemble: average the class probabilities of every fold model
# that was actually trained, instead of assuming exactly five exist.
if not lgb_models:
    raise RuntimeError('lgb_models is empty; run the training cell before inference')

# The models were fitted on bare NumPy arrays built from `train` minus `credit`,
# so feed the test rows as an array with the same columns in the same order.
# A column missing from `test` raises KeyError here rather than silently
# shifting features.
feature_cols = train.columns.drop('credit')
X_test = test[feature_cols].values

fold_probas = [model.predict_proba(X_test) for model in lgb_models.values()]

# Every column after the first (the row identifier) receives one class probability.
proba_cols = list(submit.columns[1:])
submit[proba_cols] = sum(fold_probas) / len(fold_probas)

In [109]:
# Write the ensembled class probabilities to the submission CSV, omitting the
# DataFrame's own row index.
# NOTE(review): the trailing number is presumably the score this submission
# received — confirm.
submit.to_csv('20210503_test_submit_ensemble.csv', index=False) # 0.7272812144

In [110]:
# Preview the first rows of the submission frame as a sanity check on the
# averaged probabilities.
submit.head(20)

Unnamed: 0,index,0,1,2
0,26457,0.042863,0.085888,0.871249
1,26458,0.195574,0.224078,0.580348
2,26459,0.049339,0.10076,0.8499
3,26460,0.106042,0.117352,0.776606
4,26461,0.082628,0.156054,0.761318
5,26462,0.064476,0.134357,0.801167
6,26463,0.436128,0.555757,0.008115
7,26464,0.100803,0.147238,0.751959
8,26465,0.061283,0.130517,0.8082
9,26466,0.065026,0.271179,0.663795
