## GridSearchでXGBoostのパラメタを決める

In [12]:
import pandas as pd
import numpy as np
# GridSearch
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics.scorer import make_scorer
# XGBoost
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import XGBClassifier
# Time
import time

In [24]:
# Use the evaluation metric specified by the contest: Mean Absolute Percentage Error.
def mape(y_train: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Computes ``100 * mean(|y_train - y_pred| / |y_train|)``.

    Parameters
    ----------
    y_train : array-like
        Ground-truth target values. Any array-like is accepted (NumPy array,
        list, pandas Series regardless of its index); it is flattened to 1-D.
    y_pred : array-like
        Predicted values, with the same number of elements as ``y_train``.

    Returns
    -------
    float
        The MAPE in percent (e.g. ``10.0`` means a 10 % average error).
        If any target is exactly zero the result is ``inf`` or ``nan``,
        following NumPy division semantics.

    Raises
    ------
    ValueError
        If the inputs are empty or do not contain the same number of elements.
    """
    # np.asarray strips any pandas index so the comparison is purely positional.
    actual = np.asarray(y_train, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_train and y_pred must have the same length, "
            f"got {actual.size} and {predicted.size}"
        )
    if actual.size == 0:
        raise ValueError("mape is undefined for empty input")

    # Take the absolute value of the whole ratio so that negative targets
    # still contribute a non-negative percentage error.
    ratios = np.abs((actual - predicted) / actual)
    return float(100.0 * ratios.mean())

In [17]:
# Read the training feature table and remove the 'id' and 'pj_no' columns.
train_x = pd.read_csv("data/processed_train_goto_x_v7.csv").drop(columns=['id', 'pj_no'])
# Read the training target table and remove the 'id' column.
train_y = pd.read_csv("data/processed_train_goto_y_v7.csv").drop(columns=['id'])

In [29]:
# Hyperparameters held fixed during this search round.
params = {'gamma': 0.1}

# Hyperparameters explored by the grid search in this round.
target_params = {
    'n_estimators': [300, 500, 700, 900],
}

# Candidate grids kept for reference (not searched in this cell):
#   'max_depth': [3, 4, 5]
#   'min_child_weight': [1, 2, 3]
#   'gamma': [0.0, 0.1, 0.2]
#   'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
#   'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0]
#   'learning_rate': [0.5, 0.2, 0.1, 0.05]

In [31]:
# MAPE is an error, so it is wrapped with greater_is_better=False; the
# reported scores are therefore the negated MAPE values.
my_scorer = make_scorer(mape, greater_is_better=False)
gs = GridSearchCV(
    estimator=XGBRegressor(**params, seed=42),
    param_grid=target_params,
    scoring=my_scorer,
    cv=5,
    n_jobs=-1,
    verbose=3,
)

# Time the search, including the result printing below.
s = time.perf_counter()
gs.fit(train_x, train_y.values.ravel())  # targets flattened to 1-D
print(f"Best Parameters: {gs.best_params_}")
print(f"Best Score: {gs.best_score_}")
e = time.perf_counter() - s
print(f"elapsed time:  {e}")

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] n_estimators=300 ................................................
[CV] n_estimators=300 ................................................
[CV] n_estimators=300 ................................................
[CV] n_estimators=300 ................................................
len = 1292,   score = 10.897652659275787
len = 1293,   score = 10.321258166846661
len = 1292,   score = 10.553984789337013
len = 5169,   score = 7.724359170756811
[CV] ...... n_estimators=300, score=-10.897652659275787, total=  32.8s
len = 5168,   score = 7.828255352860683
[CV] ...... n_estimators=300, score=-10.321258166846661, total=  32.8s
[CV] n_estimators=300 ................................................
[CV] n_estimators=500 ................................................
len = 1292,   score = 12.888711824183753
len = 5169,   score = 7.582416291936954
[CV] ...... n_estimators=300, score=-10.553984789337013, total=  32.8s
[CV] n_estimators

[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  5.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  5.6min finished


Best Parameters: {'n_estimators': 500}
Best Score: -11.148181933631413
elapsed time:  372.0426614349999


In [33]:
# Hyperparameters held fixed during this search round.
params = {
    'n_estimators': 500,
    'gamma': 0.1,
}

# Hyperparameters explored by the grid search in this round.
target_params = {
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 2, 3],
}

# Candidate grids kept for reference (not searched in this cell):
#   'gamma': [0.0, 0.1, 0.2]
#   'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
#   'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0]
#   'learning_rate': [0.5, 0.2, 0.1, 0.05]

# MAPE is an error, so it is wrapped with greater_is_better=False; the
# reported scores are therefore the negated MAPE values.
my_scorer = make_scorer(mape, greater_is_better=False)
gs = GridSearchCV(
    estimator=XGBRegressor(**params, seed=42),
    param_grid=target_params,
    scoring=my_scorer,
    cv=5,
    n_jobs=-1,
    verbose=3,
)

# Time the search, including the result printing below.
s = time.perf_counter()
gs.fit(train_x, train_y.values.ravel())  # targets flattened to 1-D
print(f"Best Parameters: {gs.best_params_}")
print(f"Best Score: {gs.best_score_}")
e = time.perf_counter() - s
print(f"elapsed time:  {e}")

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] max_depth=3, min_child_weight=1 .................................
[CV] max_depth=3, min_child_weight=1 .................................
[CV] max_depth=3, min_child_weight=1 .................................
[CV] max_depth=3, min_child_weight=1 .................................
len = 1292,   score = 10.539767343171834
len = 1293,   score = 10.28051184135713
len = 1292,   score = 10.89324457607987
len = 1292,   score = 12.63728793533475
len = 5168,   score = 6.927473364991198
len = 5169,   score = 6.6271634416317164
[CV]  max_depth=3, min_child_weight=1, score=-10.539767343171834, total=  46.3s
[CV]  max_depth=3, min_child_weight=1, score=-10.28051184135713, total=  46.4s
[CV] max_depth=3, min_child_weight=1 .................................
[CV] max_depth=3, min_child_weight=2 .................................
len = 5169,   score = 6.836337426154289
[CV]  max_depth=3, min_child_weight=1, score=-10.89324457607987, total=  

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  5.8min


len = 1292,   score = 11.220689205856862
len = 5169,   score = 5.010060327603177
[CV]  max_depth=4, min_child_weight=2, score=-11.220689205856862, total= 1.2min
[CV] max_depth=4, min_child_weight=3 .................................
len = 1292,   score = 10.711149817375151
len = 1293,   score = 10.708706637584399
len = 5169,   score = 5.288778438588936
[CV]  max_depth=4, min_child_weight=3, score=-10.711149817375151, total= 1.2min
[CV] max_depth=4, min_child_weight=3 .................................
len = 5168,   score = 5.153738652879293
[CV]  max_depth=4, min_child_weight=3, score=-10.708706637584399, total= 1.2min
[CV] max_depth=5, min_child_weight=1 .................................
len = 1292,   score = 10.702782078337865
len = 5169,   score = 4.999154524041156
[CV]  max_depth=4, min_child_weight=3, score=-10.702782078337865, total= 1.2min
[CV] max_depth=5, min_child_weight=1 .................................
len = 1292,   score = 12.395446388770232
len = 5169,   score = 4.9316738

[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed: 12.8min finished


Best Parameters: {'max_depth': 3, 'min_child_weight': 2}
Best Score: -11.097483760616656
elapsed time:  803.2149406080007


In [35]:
# Hyperparameters held fixed during this search round.
params = {
    'n_estimators': 500,
    'max_depth': 3,
    'min_child_weight': 2,
}

# Hyperparameters explored by the grid search in this round.
target_params = {
    'gamma': [0.0, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9, 1.0],
}

# Candidate grids kept for reference:
#   'gamma': [0.0, 0.1, 0.2]
#   'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
#   'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0]
#   'learning_rate': [0.5, 0.2, 0.1, 0.05]

# MAPE is an error, so it is wrapped with greater_is_better=False; the
# reported scores are therefore the negated MAPE values.
my_scorer = make_scorer(mape, greater_is_better=False)
gs = GridSearchCV(
    estimator=XGBRegressor(**params, seed=42),
    param_grid=target_params,
    scoring=my_scorer,
    cv=5,
    n_jobs=-1,
    verbose=3,
)

# Time the search, including the result printing below.
s = time.perf_counter()
gs.fit(train_x, train_y.values.ravel())  # targets flattened to 1-D
print(f"Best Parameters: {gs.best_params_}")
print(f"Best Score: {gs.best_score_}")
e = time.perf_counter() - s
print(f"elapsed time:  {e}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] gamma=0.0, subsample=0.7 ........................................
[CV] gamma=0.0, subsample=0.7 ........................................
[CV] gamma=0.0, subsample=0.7 ........................................
[CV] gamma=0.0, subsample=0.7 ........................................
len = 1293,   score = 10.540508342664294
len = 1292,   score = 10.8075566859318
len = 1292,   score = 10.611851343419808
len = 1292,   score = 12.510752389219393
len = 5168,   score = 6.779081393366777
[CV]  gamma=0.0, subsample=0.7, score=-10.540508342664294, total=  43.0s
len = 5169,   score = 6.722464658632171
[CV]  gamma=0.0, subsample=0.7, score=-10.8075566859318, total=  42.9s
[CV] gamma=0.0, subsample=0.7 ........................................
[CV] gamma=0.0, subsample=0.8 ........................................
len = 5169,   score = 6.662120915564046
[CV]  gamma=0.0, subsample=0.7, score=-10.611851343419808, total=  43.0s
[CV] gamma=0.0

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  4.7min


len = 1292,   score = 11.353300707883001
len = 5169,   score = 6.6187859232578585
[CV]  gamma=0.1, subsample=0.7, score=-11.353300707883001, total=  45.7s
[CV] gamma=0.1, subsample=0.8 ........................................
len = 1293,   score = 10.149318203621855
len = 5168,   score = 6.674212924130337
[CV]  gamma=0.1, subsample=0.8, score=-10.149318203621855, total=  46.9s
[CV] gamma=0.1, subsample=0.8 ........................................
len = 1292,   score = 10.807437031102854
len = 5169,   score = 6.693669076798389
[CV]  gamma=0.1, subsample=0.8, score=-10.807437031102854, total=  47.0s
[CV] gamma=0.1, subsample=0.9 ........................................
len = 1292,   score = 10.630957361994883
len = 5169,   score = 6.652431709071278
[CV]  gamma=0.1, subsample=0.8, score=-10.630957361994883, total=  47.1s
[CV] gamma=0.1, subsample=0.9 ........................................
len = 1292,   score = 12.532922638495142
len = 5169,   score = 6.268801713518611
[CV]  gamma=0.1, s

[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 12.1min finished


Best Parameters: {'gamma': 0.0, 'subsample': 0.9}
Best Score: -11.03699029004363
elapsed time:  764.6176496519984


In [36]:
# Hyperparameters held fixed during this search round.
params = {
    'n_estimators': 500,
    'max_depth': 3,
    'min_child_weight': 2,
    'gamma': 0.0,
    'subsample': 0.9,
}

# Hyperparameters explored by the grid search in this round.
target_params = {
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'learning_rate': [0.5, 0.2, 0.1, 0.05],
}

# Candidate grids kept for reference:
#   'gamma': [0.0, 0.1, 0.2]
#   'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]


# MAPE is an error, so it is wrapped with greater_is_better=False; the
# reported scores are therefore the negated MAPE values.
my_scorer = make_scorer(mape, greater_is_better=False)
gs = GridSearchCV(
    estimator=XGBRegressor(**params, seed=42),
    param_grid=target_params,
    scoring=my_scorer,
    cv=5,
    n_jobs=-1,
    verbose=3,
)

# Time the search, including the result printing below.
s = time.perf_counter()
gs.fit(train_x, train_y.values.ravel())  # targets flattened to 1-D
print(f"Best Parameters: {gs.best_params_}")
print(f"Best Score: {gs.best_score_}")
e = time.perf_counter() - s
print(f"elapsed time:  {e}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] colsample_bytree=0.6, learning_rate=0.5 .........................
[CV] colsample_bytree=0.6, learning_rate=0.5 .........................
[CV] colsample_bytree=0.6, learning_rate=0.5 .........................
[CV] colsample_bytree=0.6, learning_rate=0.5 .........................
len = 1293,   score = 12.215149192871975
len = 1292,   score = 12.11466145862557
len = 1292,   score = 13.757768497249035
len = 1292,   score = 12.233814231007987
len = 5168,   score = 3.7737548027039063
[CV]  colsample_bytree=0.6, learning_rate=0.5, score=-12.215149192871975, total=  28.0s
[CV] colsample_bytree=0.6, learning_rate=0.5 .........................
len = 5169,   score = 3.6353517157957818
[CV]  colsample_bytree=0.6, learning_rate=0.5, score=-12.11466145862557, total=  28.0s
len = 5169,   score = 3.663752448242737
[CV]  colsample_bytree=0.6, learning_rate=0.5, score=-13.757768497249035, total=  28.0s
[CV] colsample_bytree=0.6, learning

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  3.1min


len = 1292,   score = 12.880071728286413
len = 1292,   score = 11.165624583327915
len = 1293,   score = 10.718433879358043
len = 5169,   score = 3.6587880902724987
len = 5168,   score = 5.445323750611631
[CV]  colsample_bytree=0.7, learning_rate=0.2, score=-10.718433879358043, total=  35.2s
[CV]  colsample_bytree=0.7, learning_rate=0.5, score=-12.880071728286413, total=  35.3s
len = 5169,   score = 5.51665396265316
[CV]  colsample_bytree=0.7, learning_rate=0.2, score=-11.165624583327915, total=  34.9s
[CV] colsample_bytree=0.7, learning_rate=0.2 .........................
[CV] colsample_bytree=0.7, learning_rate=0.2 .........................
[CV] colsample_bytree=0.7, learning_rate=0.1 .........................
len = 1292,   score = 10.854579615767745
len = 5169,   score = 5.343253568040646
[CV]  colsample_bytree=0.7, learning_rate=0.2, score=-10.854579615767745, total=  35.3s
[CV] colsample_bytree=0.7, learning_rate=0.1 .........................
len = 1292,   score = 11.23810585430318


len = 1292,   score = 12.793006783874926
len = 5169,   score = 7.80515614126181
[CV]  colsample_bytree=0.8, learning_rate=0.05, score=-11.247813170698256, total=  44.5s
[CV] colsample_bytree=0.9, learning_rate=0.5 .........................
len = 5169,   score = 7.527458547978169
[CV]  colsample_bytree=0.8, learning_rate=0.05, score=-12.793006783874926, total=  44.7s
[CV] colsample_bytree=0.9, learning_rate=0.5 .........................
len = 1293,   score = 11.89818586971114
len = 1292,   score = 12.230050257862517
len = 5168,   score = 3.8029872359775654
[CV]  colsample_bytree=0.9, learning_rate=0.5, score=-11.89818586971114, total=  51.4s
[CV] colsample_bytree=0.9, learning_rate=0.5 .........................
len = 5169,   score = 3.8995946946472495
[CV]  colsample_bytree=0.9, learning_rate=0.5, score=-12.230050257862517, total=  51.4s
[CV] colsample_bytree=0.9, learning_rate=0.2 .........................
len = 1292,   score = 11.831561810594009
len = 1292,   score = 13.03431203277547

[CV]  colsample_bytree=1.0, learning_rate=0.1, score=-10.721194890053104, total=  46.8s
len = 1292,   score = 12.356423250385728
[CV] colsample_bytree=1.0, learning_rate=0.05 ........................
len = 5169,   score = 6.357529378736886
[CV]  colsample_bytree=1.0, learning_rate=0.1, score=-12.356423250385728, total=  46.7s
[CV] colsample_bytree=1.0, learning_rate=0.05 ........................
len = 1292,   score = 11.108216854559036
len = 5169,   score = 6.545687264946997
[CV]  colsample_bytree=1.0, learning_rate=0.1, score=-11.108216854559036, total=  46.7s
[CV] colsample_bytree=1.0, learning_rate=0.05 ........................
len = 1293,   score = 10.361952833966756
len = 5168,   score = 7.906709202180556
[CV]  colsample_bytree=1.0, learning_rate=0.05, score=-10.361952833966756, total=  46.8s
[CV] colsample_bytree=1.0, learning_rate=0.05 ........................
len = 1292,   score = 10.75734494287164
len = 5169,   score = 7.902140926165771
[CV]  colsample_bytree=1.0, learning_rat

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 17.7min finished


Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1}
Best Score: -11.030268335919628
elapsed time:  1089.884131502


In [None]:
# Full hyperparameter set assembled from the values fixed and selected above.
params = {
    'n_estimators': 500,
    'max_depth': 3,
    'min_child_weight': 2,
    'gamma': 0.0,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
}