### Bayesian Optimization을 이용하여 application과 previous로 만들어진 집합의 하이퍼 파라미터 튜닝

#### 라이브러리 및 데이터 세트 로딩. 이전 application 데이터의 FE 함수 복사

In [29]:
import numpy as np
import pandas as pd
import gc  # 가비지 컬렉션
import time
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)

##### 코랩 버전은 Google Drive에서 데이터 세트를 로딩

In [30]:
import os, sys
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [31]:
def get_dataset():
  """
  현재 대출신청정보와 이전 대출이력정보 리턴 
  """
  default_dir = "/content/gdrive/My Drive"
  app_train = pd.read_csv(os.path.join(default_dir, 'application_train.csv'))
  app_test = pd.read_csv(os.path.join(default_dir, 'application_test.csv'))
  apps = pd.concat([app_train, app_test])
  prev = pd.read_csv(os.path.join(default_dir, 'previous_application.csv'))
  
  return apps, prev

apps, prev = get_dataset()

#### 이전 application 데이터의 feature engineering 함수 복사

In [None]:
def get_apps_processed(apps):
    """
    현재 대출이력(훈련용, 테스트용 병합)
    apps의 주요 피처 가공값 생성

    1.EXT_SOURCE_1,EXT_SOURCE_2, EXT_SOURCE_3(평균, 표준편차)
    2.APPS_EXT_SOURCE_STD(평균값)
    3.APPS_ANNUITY_CREDIT_RATIO : 대출금액 대비 매월 갚아야 할 하는 납부금액 비율(AMT_ANNUITY / AMT_CREDIT)
    4.APPS_GOODS_CREDIT_RATIO   : 대출금액 대비 고객상품대출금액 비율(AMT_GOODS_PRICE / AMT_CREDIT)
    5.APPS_ANNUITY_INCOME_RATIO : 소득대비 매월 갚아야 하는 금액의 비율(AMT_ANNUITY / AMT_INCOME_TOTAL)
    6.APPS_CREDIT_INCOME_RATIO :  소득대비 대출금액의 비율(AMT_CREDIT / AMT_INCOME_TOTAL)
    7.APPS_GOODS_INCOME_RATIO   : 소득대비 대출상품금액의 비율(AMT_GOODS_PRICE / AMT_INCOME_TOTAL)
    8.APPS_CNT_FAM_INCOME_RATIO : 가족수 대비 소득금액비율(AMT_INCOME_TOTAL / CNT_FAM_MEMBERS)
    9.APPS_EMPLOYED_BIRTH_RATIO : 연령대비 재직기간 비율(DAYS_EMPLOYEED / DAYS_BIRTH)
    10.APPS_INCOME_EMPLOYED_RATIO : 재직기간 대비 소득비율(AMT_INCOME_TOTAL / DAYS_EMPLOYEED)
    11.APPS_INCOME_BIRTH_RATIO    : 연령대비 소득비율(AMT_INCOME_TOTAL / DAYS_BIRTH)
    12.APPS_CAR_BIRTH_RATIO    : 연령대비 소유차의 연식( OWN_CAR_AGE / DAYS_BIRTH)
    13.APPS_CAR_EMPLOYED_RATIO : 재직기간대비 소유차의 연식(OWN_CAR_AGE / DAYS_EMPLOYEED)
    """
    # EXT_SOURCE_X FEATURE 가공
    apps['APPS_EXT_SOURCE_MEAN'] = apps[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
    apps['APPS_EXT_SOURCE_STD'] = apps[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
    apps['APPS_EXT_SOURCE_STD'] = apps['APPS_EXT_SOURCE_STD'].fillna(apps['APPS_EXT_SOURCE_STD'].mean())
    
    # AMT_CREDIT 비율로 Feature 가공
    apps['APPS_ANNUITY_CREDIT_RATIO'] = apps['AMT_ANNUITY']/apps['AMT_CREDIT']
    apps['APPS_GOODS_CREDIT_RATIO'] = apps['AMT_GOODS_PRICE']/apps['AMT_CREDIT']
    
    # AMT_INCOME_TOTAL 비율로 Feature 가공
    apps['APPS_ANNUITY_INCOME_RATIO'] = apps['AMT_ANNUITY']/apps['AMT_INCOME_TOTAL']
    apps['APPS_CREDIT_INCOME_RATIO'] = apps['AMT_CREDIT']/apps['AMT_INCOME_TOTAL']
    apps['APPS_GOODS_INCOME_RATIO'] = apps['AMT_GOODS_PRICE']/apps['AMT_INCOME_TOTAL']
    apps['APPS_CNT_FAM_INCOME_RATIO'] = apps['AMT_INCOME_TOTAL']/apps['CNT_FAM_MEMBERS']
    
    # DAYS_BIRTH, DAYS_EMPLOYED 비율로 Feature 가공
    apps['APPS_EMPLOYED_BIRTH_RATIO'] = apps['DAYS_EMPLOYED']/apps['DAYS_BIRTH']
    apps['APPS_INCOME_EMPLOYED_RATIO'] = apps['AMT_INCOME_TOTAL']/apps['DAYS_EMPLOYED']
    apps['APPS_INCOME_BIRTH_RATIO'] = apps['AMT_INCOME_TOTAL']/apps['DAYS_BIRTH']
    apps['APPS_CAR_BIRTH_RATIO'] = apps['OWN_CAR_AGE'] / apps['DAYS_BIRTH']
    apps['APPS_CAR_EMPLOYED_RATIO'] = apps['OWN_CAR_AGE'] / apps['DAYS_EMPLOYED']
    
    return apps

#### previous 데이터 가공후 인코딩 및 최종 데이터 집합 생성하는 함수 선언

In [None]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier

def get_prev_processed(prev):
    # 대출 신청 금액과 실제 대출액/대출 상품금액 차이 및 비율
    """
    과거대출이력 데이터 
    1.PREV_CREDIT_DIFF : 대출금액과 대출신청금액의 차이(AMT_APPLICATION - AMT_CREDIT)
    2.PREV_GOODS_DIFF : 대출신청금액과 상품대출금액의 차(AMT_APPLICATION - AMT_GOODS_PRICE)
    3.PREV_CREDIT_APPL_RATIO : 대출금액 / 대출신청금액(AMT_CREDIT / AMT_APPLICATION)
    4.PREV_GOODS_APPL_RATIO : 고객상품대출금액 / 대출신청금액(AMT_GOODS_PRICE / AMT_APPLICATION)
    """
    prev['PREV_CREDIT_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_CREDIT']
    prev['PREV_GOODS_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_GOODS_PRICE']
    prev['PREV_CREDIT_APPL_RATIO'] = prev['AMT_CREDIT']/prev['AMT_APPLICATION']
    # prev['PREV_ANNUITY_APPL_RATIO'] = prev['AMT_ANNUITY']/prev['AMT_APPLICATION']
    prev['PREV_GOODS_APPL_RATIO'] = prev['AMT_GOODS_PRICE']/prev['AMT_APPLICATION']
    
    # 널값 정제 
    prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
    prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
    prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
    prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
    prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
    # 첫번째 만기일과 마지막 만기일까지의 기간

    prev['PREV_DAYS_LAST_DUE_DIFF'] = prev['DAYS_LAST_DUE_1ST_VERSION'] - prev['DAYS_LAST_DUE']
    
    # 매월 납부 금액과 납부 횟수 곱해서 전체 납부 금액 구함. 
    # 이자율 계산
    all_pay = prev['AMT_ANNUITY'] * prev['CNT_PAYMENT']
    # 전체 납부 금액 대비 AMT_CREDIT 비율을 구하고 여기에 다시 납부횟수로 나누어서 이자율 계산. 
    prev['PREV_INTERESTS_RATE'] = (all_pay/prev['AMT_CREDIT'] - 1)/prev['CNT_PAYMENT']
        
    return prev
    
    
def get_prev_amt_agg(prev):
    # 새롭게 생성된 대출 신청액 대비 다른 금액 차이 및 비율로 aggregation 수행. 
    """
    과거대출이력의 데이터에 대한 일부 컬럼에 대한 집계 파생변수 생성
    """
    agg_dict = {
         # 기존 컬럼. 
        'SK_ID_CURR':['count'],
        'AMT_CREDIT':['mean', 'max', 'sum'],
        'AMT_ANNUITY':['mean', 'max', 'sum'], 
        'AMT_APPLICATION':['mean', 'max', 'sum'],
        'AMT_DOWN_PAYMENT':['mean', 'max', 'sum'],
        'AMT_GOODS_PRICE':['mean', 'max', 'sum'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
        # 가공 컬럼
        'PREV_CREDIT_DIFF':['mean', 'max', 'sum'], 
        'PREV_CREDIT_APPL_RATIO':['mean', 'max'],
        'PREV_GOODS_DIFF':['mean', 'max', 'sum'],
        'PREV_GOODS_APPL_RATIO':['mean', 'max'],
        'PREV_DAYS_LAST_DUE_DIFF':['mean', 'max', 'sum'],
        'PREV_INTERESTS_RATE':['mean', 'max']
    }
    prev_group = prev.groupby('SK_ID_CURR')
    prev_amt_agg = prev_group.agg(agg_dict)

    # multi index 컬럼을 '_'로 연결하여 컬럼명 변경
    prev_amt_agg.columns = ["PREV_"+ "_".join(x).upper() for x in prev_amt_agg.columns.ravel()]
    
    return prev_amt_agg

def get_prev_refused_appr_agg(prev):
  """
  과거 대출건중에서 승인된건 , 거부된 건수 
  """
  # 원래 groupby 컬럼 + 세부 기준 컬럼으로 groupby 수행. 세분화된 레벨로 aggregation 수행 한 뒤에 unstack()으로 컬럼레벨로 변형. 
  prev_refused_appr_group = prev[prev['NAME_CONTRACT_STATUS'].isin(['Approved', 'Refused'])].groupby(['SK_ID_CURR', 'NAME_CONTRACT_STATUS'])
  prev_refused_appr_agg = prev_refused_appr_group['SK_ID_CURR'].count().unstack()
  # 컬럼명 변경. 
  prev_refused_appr_agg.columns = ['PREV_APPROVED_COUNT', 'PREV_REFUSED_COUNT' ]
  # NaN값은 모두 0으로 변경. 
  prev_refused_appr_agg = prev_refused_appr_agg.fillna(0)  
  return prev_refused_appr_agg

def get_prev_agg(prev):
  """
  1.과거대출 이력건에 대한 파생변수 생성
  2.과거 대출건의 승인 및 거부건수 비율 집합 생성
  3.1)번과 2번 LEFT조인해서 리턴
  """
  prev = get_prev_processed(prev)
  prev_amt_agg = get_prev_amt_agg(prev)
  prev_refused_appr_agg = get_prev_refused_appr_agg(prev)
  
  # prev_amt_agg와 조인. 
  prev_agg = prev_amt_agg.merge(prev_refused_appr_agg, on='SK_ID_CURR', how='left')
  # SK_ID_CURR별 과거 대출건수 대비 APPROVED_COUNT 및 REFUSED_COUNT 비율 생성. 
  prev_agg['PREV_REFUSED_RATIO'] = prev_agg['PREV_REFUSED_COUNT']/prev_agg['PREV_SK_ID_CURR_COUNT']
  prev_agg['PREV_APPROVED_RATIO'] = prev_agg['PREV_APPROVED_COUNT']/prev_agg['PREV_SK_ID_CURR_COUNT']
  # 'PREV_REFUSED_COUNT', 'PREV_APPROVED_COUNT' 컬럼 drop 
  prev_agg = prev_agg.drop(['PREV_REFUSED_COUNT', 'PREV_APPROVED_COUNT'], axis=1)
  
  return prev_agg

def get_apps_all_with_prev_agg(apps, prev):
  """
  집계된 데이터 리넡
  """
  apps_all =  get_apps_processed(apps)
  prev_agg = get_prev_agg(prev)
  print('prev_agg shape:', prev_agg.shape)
  print('apps_all before merge shape:', apps_all.shape)
  apps_all = apps_all.merge(prev_agg, on='SK_ID_CURR', how='left')
  print('apps_all after merge with prev_agg shape:', apps_all.shape)
  
  return apps_all

def get_apps_all_encoded(apps_all):
  """
  범주형변수의 인코딩
  """
  object_columns = apps_all.dtypes[apps_all.dtypes == 'object'].index.tolist()
  for column in object_columns:
      apps_all[column] = pd.factorize(apps_all[column])[0]
  
  return apps_all

def get_apps_all_train_test(apps_all):
  """
  train, test셋 분리 
  """
  apps_all_train = apps_all[~apps_all['TARGET'].isnull()]
  apps_all_test = apps_all[apps_all['TARGET'].isnull()]

  apps_all_test = apps_all_test.drop('TARGET', axis=1)
  
  return apps_all_train, apps_all_test
    
def train_apps_all(apps_all_train):
  """
  LGBMClassifier로 fit
  """
  ftr_app = apps_all_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
  target_app = apps_all_train['TARGET']

  train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=2020)
  print('train shape:', train_x.shape, 'valid shape:', valid_x.shape)
  clf = LGBMClassifier(
              nthread=4,
              n_estimators=2000,
              learning_rate=0.01,
              num_leaves=32,
              colsample_bytree=0.8,
              subsample=0.8,
              max_depth=8,
              reg_alpha=0.04,
              reg_lambda=0.07,
              min_child_weight=40,
              silent=-1,
              verbose=-1,
              )

  clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'auc', verbose= 100, 
              early_stopping_rounds= 100)
  
  return clf

##### 최종 집합 생성 및 인코딩, 학습/테스트 데이터 분리, 학습/검증 피처와 타겟 데이터 분리

In [None]:
apps_all = get_apps_all_with_prev_agg(apps, prev)
apps_all = get_apps_all_encoded(apps_all)
apps_all_train, apps_all_test = get_apps_all_train_test(apps_all)
ftr_app = apps_all_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
target_app = apps_all_train['TARGET']
train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=2020)

prev_agg shape: (338857, 41)
apps_all before merge shape: (356255, 135)
apps_all after merge with prev_agg shape: (356255, 176)


#### Bayesian Optimization 

In [None]:
# bayesian optimization 패키지 설치
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/bb/7a/fd8059a3881d3ab37ac8f72f56b73937a14e8bb14a9733e68cc8b17dbe3c/bayesian-optimization-1.2.0.tar.gz
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-cp37-none-any.whl size=11687 sha256=9d3b82d37b112d1f736d912afd9c0b7c8dd893f79821f076caa34f44ab21d4c5
  Stored in directory: /root/.cache/pip/wheels/5a/56/ae/e0e3c1fc1954dc3ec712e2df547235ed072b448094d8f94aec
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0


### Bayesian Optimization

In [None]:
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

##### **함수의 입력값 search 범위(하이퍼 파라미터 별 입력 범위) 를 설정**

In [None]:
bayesian_params = {
    'max_depth': (6, 16),
    'num_leaves':(24,  64),
    'min_child_samples': (10, 200),
    'min_child_weight': (1, 50),
    'subsamples': (0.5, 1),  # 0부터 1까지의 범위를 가짐
    'colsample_bytree': (0.5, 1),
    'max_bin': (10, 500), 
    'reg_lambda': (0.001, 10),
    'reg_alpha': (0.01, 50)
    }

##### **최대 값을 구할 함수 선언.**
* iteration 시 마다 hyperparameter를 입력받아 classifier 학습하고 roc_auc_score값을 반환 

In [None]:
def lgb_roc_eval(max_depth, num_leaves, min_child_samples, min_child_weight, 
                 subsamples, colsample_bytree, max_bin, reg_lambda, reg_alpha):
  params = {
      "n_estimators":500, "learning_rate":0.02,
      'max_depth': int(round(max_depth)),
      'num_leaves': int(round(num_leaves)), 
      'min_child_samples': int(round(min_child_samples)),
      'min_child_weight': int(round(min_child_weight)),
      'subsamples':max(min(subsamples, 1), 0),
      'colsample_bytree':max(min(colsample_bytree, 1), 0),
      'max_bin':  max(int(round(max_bin)),10),
      'reg_lambda': max(reg_lambda,0),
      'reg_alpha': max(reg_alpha, 0)
  }
  lgb_model = LGBMClassifier(**params)
  lgb_model.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'auc', verbose= 100, 
                early_stopping_rounds= 100)
  valid_proba = lgb_model.predict_proba(valid_x)[:, 1]
  roc_auc = roc_auc_score(valid_y, valid_proba)
  
  return roc_auc

##### BayesianOptimization 객체 생성 후 **함수 반환값이 최대가 되는 입력값 search**를 위한 iteration 수행

In [None]:
# BayesianOptimization객체를 수행할 함수와 search할 parameter 범위를 설정하여 생성. 
lgbBO = BayesianOptimization(lgb_roc_eval,bayesian_params , random_state=0)
# 함수 반환값이 최대가 되는 입력값 유추를 위한 iteration 수행. 
lgbBO.maximize(init_points=5, n_iter=25)

|   iter    |  target   | colsam... |  max_bin  | max_depth | min_ch... | min_ch... | num_le... | reg_alpha | reg_la... | subsam... |
-------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.769441	training's binary_logloss: 0.246055	valid_1's auc: 0.755295	valid_1's binary_logloss: 0.248931
[200]	training's auc: 0.787337	training's binary_logloss: 0.238455	valid_1's auc: 0.766208	valid_1's binary_logloss: 0.244205
[300]	training's auc: 0.798843	training's binary_logloss: 0.234009	valid_1's auc: 0.771365	valid_1's binary_logloss: 0.242347
[400]	training's auc: 0.807905	training's binary_logloss: 0.230658	valid_1's auc: 0.773881	valid_1's binary_logloss: 0.241451
[500]	training's auc: 0.816051	training's binary_logloss: 0.227681	valid_1's auc: 0.775474	valid_1's binary_logloss: 0.240888
Did not meet early stopping. Best itera

##### Iteration 수행 결과 출력

In [None]:
# BayesianOptimization객체의 res는 iteration 수행 시마다 모든 함수 반환결과와 그때의 파라미터 결과값을 가지고 있음. 
lgbBO.res

[{'params': {'colsample_bytree': 0.7744067519636624,
   'max_bin': 360.44278952248555,
   'max_depth': 12.027633760716439,
   'min_child_samples': 113.52780476941041,
   'min_child_weight': 21.75908516760633,
   'num_leaves': 49.835764522666246,
   'reg_alpha': 21.884984691022,
   'reg_lambda': 8.917838234820016,
   'subsamples': 0.9818313802505146},
  'target': 0.7754739702428558},
 {'params': {'colsample_bytree': 0.6917207594128889,
   'max_bin': 397.94526866050563,
   'max_depth': 11.288949197529044,
   'min_child_samples': 117.92846660784714,
   'min_child_weight': 46.35423527634039,
   'num_leaves': 26.841442327915477,
   'reg_alpha': 4.36559369208002,
   'reg_lambda': 0.20316375600581688,
   'subsamples': 0.916309922773969},
  'target': 0.7753573432834208},
 {'params': {'colsample_bytree': 0.8890783754749252,
   'max_bin': 436.30595264094137,
   'max_depth': 15.78618342232764,
   'min_child_samples': 161.8401272011775,
   'min_child_weight': 23.61248875039366,
   'num_leaves': 55

##### Iteration 결과 Dictionary에서 최대 target값을 가지는 index 추출하고 그때의 parameter 값을 추출.  

In [None]:
# dictionary에 있는 target값을 모두 추출
target_list = []
for result in lgbBO.res:
  target = result['target']
  target_list.append(target)
print(target_list)

# 가장 큰 target값을 가지는 순번(index)를 추출
print('maximum target index :', np.argmax(target_list))

[0.7754739702428558, 0.7753573432834208, 0.7773320021744632, 0.7749940765618106, 0.7741980660329925, 0.7743139273609485, 0.7720062711953142, 0.7756544353008543, 0.7727560197921964, 0.7764914515903136, 0.7768626430742653, 0.777577672983937, 0.7769813334264206, 0.7762975625662807, 0.7774589032094291, 0.7771821464911522, 0.7780482186540827, 0.7777908139862375, 0.7779424074306232, 0.7755680269581322, 0.7766501692197265, 0.7764958579424354, 0.7758560235277473, 0.7766452895103834, 0.7759978146653764, 0.7773965376012826, 0.7771233405928487, 0.7765192494137196, 0.7768137856198469, 0.775890440410018]
maximum target index : 16


In [None]:
np.array(target_list)

array([0.77547397, 0.77535734, 0.777332  , 0.77499408, 0.77419807,
       0.77431393, 0.77200627, 0.77565444, 0.77275602, 0.77649145,
       0.77686264, 0.77757767, 0.77698133, 0.77629756, 0.7774589 ,
       0.77718215, 0.77804822, 0.77779081, 0.77794241, 0.77556803,
       0.77665017, 0.77649586, 0.77585602, 0.77664529, 0.77599781,
       0.77739654, 0.77712334, 0.77651925, 0.77681379, 0.77589044])

In [None]:
# 가장 큰 target값을 가지는 index값을 기준으로 res에서 해당 parameter 추출. 
max_dict = lgbBO.res[np.argmax(np.array(target_list))]
print(max_dict)

{'target': 0.7780482186540827, 'params': {'colsample_bytree': 0.7436762853652641, 'max_bin': 494.6519941194128, 'max_depth': 14.273550154831288, 'min_child_samples': 25.784295891426886, 'min_child_weight': 2.6913774511864754, 'num_leaves': 57.965834800533756, 'reg_alpha': 2.4699153309230764, 'reg_lambda': 9.797178761741076, 'subsamples': 0.5823503102854222}}


##### 최적화된 하이퍼 파라미터를 기반으로 재 테스트 
{'target': 0.7780482186540827, 'params': {'colsample_bytree': 0.7436762853652641, 'max_bin': 494.6519941194128, 'max_depth': 14.273550154831288, 'min_child_samples': 25.784295891426886, 'min_child_weight': 2.6913774511864754, 'num_leaves': 57.965834800533756, 'reg_alpha': 2.4699153309230764, 'reg_lambda': 9.797178761741076, 'subsample': 0.5823503102854222}}

In [None]:
def train_apps_all(apps_all_train):
  """
  위에서 나온 최적 파라미터 기반으로 다시 테스트 
  """
  ftr_app = apps_all_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
  target_app = apps_all_train['TARGET']

  train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=2020)
  print('train shape:', train_x.shape, 'valid shape:', valid_x.shape)

  clf = LGBMClassifier(
                      nthread=4,
                      n_estimators=1000,
                      learning_rate=0.02,
                      max_depth = 14,
                      num_leaves=58,
                      colsample_bytree=0.743,
                      subsample=0.582,
                      max_bin=495,
                      reg_alpha=2.469,
                      reg_lambda=9.797,
                      min_child_weight=3,
                      min_child_samples=26,
                      silent=-1,
                      verbose=-1)
  clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'auc', verbose= 100, early_stopping_rounds= 100)
      
  return clf

In [None]:
apps_all = get_apps_all_with_prev_agg(apps, prev)
apps_all = get_apps_all_encoded(apps_all)
apps_all_train, apps_all_test = get_apps_all_train_test(apps_all)
clf = train_apps_all(apps_all_train)

prev_agg shape: (338857, 41)
apps_all before merge shape: (356255, 135)
apps_all after merge with prev_agg shape: (356255, 176)
train shape: (215257, 174) valid shape: (92254, 174)
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.778505	training's binary_logloss: 0.243199	valid_1's auc: 0.759267	valid_1's binary_logloss: 0.247651
[200]	training's auc: 0.800126	training's binary_logloss: 0.233974	valid_1's auc: 0.769733	valid_1's binary_logloss: 0.242908
[300]	training's auc: 0.816202	training's binary_logloss: 0.227823	valid_1's auc: 0.77418	valid_1's binary_logloss: 0.241285
[400]	training's auc: 0.829678	training's binary_logloss: 0.222716	valid_1's auc: 0.77678	valid_1's binary_logloss: 0.240349
[500]	training's auc: 0.840759	training's binary_logloss: 0.218456	valid_1's auc: 0.777721	valid_1's binary_logloss: 0.239991
[600]	training's auc: 0.850681	training's binary_logloss: 0.214684	valid_1's auc: 0.778337	valid_1's binary_logloss: 0.239793
[7

In [None]:
preds = clf.predict_proba(apps_all_test.drop('SK_ID_CURR', axis=1))[:, 1 ]
apps_all_test['TARGET'] = preds
# SK_ID_CURR과 TARGET 값만 csv 형태로 생성. 코랩 버전은 구글 드라이브 절대 경로로 입력  
default_dir = "/content/gdrive/My Drive"
apps_all_test[['SK_ID_CURR', 'TARGET']].to_csv(os.path.join(default_dir,'prev_baseline_tuning_01.csv'), index=False)

### cross validation 으로 hyper parameter 재 tuning
* 오히려 성능이 떨어질수도 있음.
* 파이썬용으로 만들어진 LightGBM이므로 파라미터명은 lightGBM과는 약간 다름.

In [32]:
bayesian_params = {
    'max_depth': (6, 16), # 트리의 깊이
    'num_leaves': (24, 64), # 리프노드의 갯수 
    'min_data_in_leaf': (10, 200), # min_child_samples, 리프노드에 최소한의 필요한 데이터 갯수 (과적합제어 목표)
    'min_child_weight':(1, 50),
    'bagging_fraction':(0.5, 1.0), # subsample - 트리가 커져서 과적합되는 것을 제어하기 위해 데이터샘플링 비율 지정
    'feature_fraction': (0.5, 1.0), # colsample_bytree, 피처의 선택비율 지정
    'max_bin':(10, 500),
    'lambda_l2':(0.001, 10), # reg_lambda
    'lambda_l1': (0.01, 50) # reg_alpha
}

In [37]:
import lightgbm as lgb

# 파이썬 Wrapper용 lightgbm
train_data = lgb.Dataset(data=ftr_app, label=target_app, free_raw_data=False)
def lgb_roc_eval_cv(max_depth, num_leaves, min_data_in_leaf, min_child_weight, bagging_fraction, 
                 feature_fraction,  max_bin, lambda_l2, lambda_l1):   
    params = {
        "num_iterations":500, "learning_rate":0.02,
        'early_stopping_rounds':100, 'metric':'auc',
        'max_depth': int(round(max_depth)), #  호출 시 실수형 값이 들어오므로 실수형 하이퍼 파라미터는 정수형으로 변경 
        'num_leaves': int(round(num_leaves)), 
        'min_data_in_leaf': int(round(min_data_in_leaf)),
        'min_child_weight': int(round(min_child_weight)),
        'bagging_fraction': max(min(bagging_fraction, 1), 0), 
        'feature_fraction': max(min(feature_fraction, 1), 0),
        'max_bin':  max(int(round(max_bin)),10),
        'lambda_l2': max(lambda_l2,0),
        'lambda_l1': max(lambda_l1, 0)
    }
    # 파이썬 lightgbm의 cv 메소드를 사용- cross validation
    # cross_val_score도 쓸수도있지만.
    cv_reuslt = lgb.cv(params, train_data, nfold = 3, metics = ['auc'], verbose_eval=100, seed = 0, early_stopping_rounds=50)
    return max(cv_result['auc-mean'])   

In [38]:
max_dict = lgbBO.res[np.argmax(np.array(target_list))]
print(max_dict)

{'target': 0.7780482186540827, 'params': {'colsample_bytree': 0.7436762853652641, 'max_bin': 494.6519941194128, 'max_depth': 14.273550154831288, 'min_child_samples': 25.784295891426886, 'min_child_weight': 2.6913774511864754, 'num_leaves': 57.965834800533756, 'reg_alpha': 2.4699153309230764, 'reg_lambda': 9.797178761741076, 'subsamples': 0.5823503102854222}}


In [39]:
def train_apps_all(apps_all_train):
    ftr_app = apps_all_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
    target_app = apps_all_train['TARGET']

    train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=2020)
    print('train shape:', train_x.shape, 'valid shape:', valid_x.shape)
    clf = LGBMClassifier(
                nthread=4,
                n_estimators=1000,
                learning_rate=0.02,
                max_depth = 14,
                num_leaves=58,
                colsample_bytree=0.743,
                subsample= 0.582,
                max_bin=495,
                reg_alpha=2.47,
                reg_lambda = 9.797,
                min_child_weight=3,
                min_child_samples=26,
                silent=-1,
                verbose=-1,
                )

    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'auc', verbose= 100, 
                early_stopping_rounds= 100)
    
    return clf

In [47]:
#1.파생변수 생성
#2.카테고리형을 레이블 인코딩
#3.훈련, 테스트용 데이터셋 분리 
#4.LightGBM 훈련
apps_all = get_apps_all_with_prev_agg(apps, prev)
apps_all = get_apps_all_encoded(apps_all)
apps_all_train, apps_all_test = get_apps_all_train_test(apps_all)
clf = train_apps_all(apps_all_train)

prev_agg shape: (338857, 41)
apps_all before merge shape: (356255, 135)
apps_all after merge with prev_agg shape: (356255, 176)
train shape: (215257, 174) valid shape: (92254, 174)
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.778553	training's binary_logloss: 0.243214	valid_1's auc: 0.759151	valid_1's binary_logloss: 0.247681
[200]	training's auc: 0.800338	training's binary_logloss: 0.233945	valid_1's auc: 0.769981	valid_1's binary_logloss: 0.242867
[300]	training's auc: 0.816293	training's binary_logloss: 0.227839	valid_1's auc: 0.774466	valid_1's binary_logloss: 0.241216
[400]	training's auc: 0.829769	training's binary_logloss: 0.222764	valid_1's auc: 0.77684	valid_1's binary_logloss: 0.240344
[500]	training's auc: 0.84098	training's binary_logloss: 0.218495	valid_1's auc: 0.778048	valid_1's binary_logloss: 0.239917
[600]	training's auc: 0.85073	training's binary_logloss: 0.214754	valid_1's auc: 0.778701	valid_1's binary_logloss: 0.239708
[70

In [48]:
preds = clf.predict_proba(apps_all_test.drop('SK_ID_CURR', axis = 1))[:, 1]
apps_all_test['TARGET'] = preds

default_dir = "/content/gdrive/My Drive"
apps_all_test[['SK_ID_CURR', 'TARGET']].to_csv(os.path.join(default_dir, 'prev_baseline_tuning_02.csv'), index = False)