# Hyperparameter

- 복잡한 모형일 수록 결정해야 할 파라미터값이 많아짐
- 기본적으로 후보들을 선정해 두고 결과값을 보고 방향성을 결정함
- 이를 편하게 해 줄 라이브러리를 소개하도록 함
    - 책에서 소개: GridSearchCV
    - kaggle에서 본 방법: BayseOpt
    
# GridSearchCV
- 이름에서 알 수 있듯이 grid로 파라미터를 두고 CV도 한번에 하게 해줌
- 절차:
    1. grdi_param 설정
    2. k-fold CV를 위한 k 설정
    3. 모형선정 및 customize할 값 적용  
\#### 현재 early_stopping이 적용되지 않아 파악중..

# Bayesian Optimization
- 이 패키지는 파라미터를 grid로 설정하는 것이 아니라 범위로 설정
- 범위내에 값들을 랜덤하게 돌아줌
- tree를 위한 패키지가 아니라 general purpose목적임
    - 따라서 직접 설정해줘야 할 부분이 많음(CV전략 등)


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

data = pd.read_csv('../1.clustering/titanic.csv')

target = data['Survived']
data = data.drop(['PassengerId', 'Survived'], axis=1)

def sex(a):
    if a == 'male':
        return 0
    else:
        return 1
    
def emb(a):
    if a == 'S':
        return 0
    elif a == 'Q':
        return 1
    else:
        return 2
    
data['Sex'] = data['Sex'].map(sex)
data['Embarked'] = data['Embarked'].map(emb)

train_X, test_X, train_y, test_y = train_test_split(data, target, test_size=0.3, random_state=2019, shuffle=True)

In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# params = {
#     'booster': 'gbtree',
#     'tree_method': 'hist',
#     'random_state': 0,
#     'n_jobs': -1,
# #     'max_depth': 9
# }
# grid_params= {
#     'n_estimators': [10, 20, 50],
#     'max_depth': [3, 6, 9, 12],
#     'subsample': [0.7, 0.8, 0.9],
#     'colsample_bytree': [0.7, 0.8, 0.9],
# }

params= {
    'boosting': 'gbdt',
    'learning_rate': 0.1,
    'metric': 'auc',
    'random_state': 0,
}

grid_params= {
    'n_estimators': [10, 20, 50],
    'max_depth': [6, 9, 12],
    'min_child_samples': [10, 20, 50], 
    'num_leaves': [20, 30, 40], 
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
}

# clf = XGBClassifier(**params)
clf = LGBMClassifier(**params)

grid_clf = GridSearchCV(clf, grid_params,verbose=1, cv=3)


In [None]:
# fit_params = {
#     'eval_set': [[test_X, test_y]],
#     'early_stopping_rounds': 100, 
#     'eval_metric': 'mae', 
# }
grid_clf.fit(train_X, train_y)


In [None]:
print(grid_clf.best_score_)
print(grid_clf.best_estimator_)

In [None]:
print(accuracy_score(test_y, grid_clf.predict(test_X)))

In [None]:
train_X = train_X.reset_index()
train_y = train_y.reset_index()
test_X = test_X.reset_index()
test_y = test_y.reset_index()

In [28]:
from bayes_opt import BayesianOptimization
from lightgbm import LGBMClassifier
bounds_LGB = {
#     'n_estimators': (10, 100),
    'max_depth': (3, 15),
    'min_child_samples': (5, 30),
    'num_leaves': (10, 50),
    'subsample': (0.5, 0.9),
    'colsample_bytree': (0.5, 0.9),
}

init_points = 10
n_iter = 20


In [29]:
def LGB_bayesian(
#     n_estimators,
    max_depth,
    min_child_samples,
    num_leaves,
    subsample,
    colsample_bytree
):
    # LightGBM expects next three parameters need to be integer. 
#     n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    min_child_samples = int(min_child_samples)
    num_leaves = int(num_leaves)

#     assert type(num_leaves) == int
#     assert type(max_depth) == int
    

    params = {
        'boosting': 'gbdt',
        'n_estimators': 400,
        'learning_rate': 0.1,
        'max_depth': max_depth,
        'min_child_samples': min_child_samples,
        'num_leaves': num_leaves,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'random_state': 0,
    }    
    
    ## set clf options
    clf = LGBMClassifier(**params).fit(train_X, train_y, early_stopping_rounds=100,eval_set=[(test_X, test_y)], eval_metric='auc', verbose=0)
    
    score = accuracy_score(test_y, clf.predict(test_X))

    return score

In [30]:
optimizer = BayesianOptimization(LGB_bayesian, bounds_LGB, random_state=0)


In [31]:
optimizer.maximize(init_points=init_points, n_iter=n_iter)

|   iter    |  target   | colsam... | max_depth | min_ch... | num_le... | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.7897  [0m | [0m 0.7195  [0m | [0m 11.58   [0m | [0m 20.07   [0m | [0m 31.8    [0m | [0m 0.6695  [0m |
| [0m 2       [0m | [0m 0.785   [0m | [0m 0.7584  [0m | [0m 8.251   [0m | [0m 27.29   [0m | [0m 48.55   [0m | [0m 0.6534  [0m |
| [95m 3       [0m | [95m 0.8037  [0m | [95m 0.8167  [0m | [95m 9.347   [0m | [95m 19.2    [0m | [95m 47.02   [0m | [95m 0.5284  [0m |
| [0m 4       [0m | [0m 0.7757  [0m | [0m 0.5349  [0m | [0m 3.243   [0m | [0m 25.82   [0m | [0m 41.13   [0m | [0m 0.848   [0m |
| [0m 5       [0m | [0m 0.7757  [0m | [0m 0.8914  [0m | [0m 12.59   [0m | [0m 16.54   [0m | [0m 41.22   [0m | [0m 0.5473  [0m |
| [0m 6       [0m | [0m 0.8037  [0m | [0m 0.756   [0m | [0m 4.72    [0m | [0m 28.62   [0m | [0m 30.8

In [32]:
optimizer.max

{'target': 0.8271028037383178,
 'params': {'colsample_bytree': 0.6659763018365787,
  'max_depth': 3.4332290366901885,
  'min_child_samples': 5.020233588443391,
  'num_leaves': 10.354120982723458,
  'subsample': 0.5659155483353682}}