## GridSearchでXGBoostのパラメタを決める

In [1]:
import pandas as pd
import numpy as np
# GridSearch
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics.scorer import make_scorer
# XGBoost
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import XGBClassifier
# Time
import time

In [2]:
# Evaluation metric: the Mean Absolute Percentage Error (MAPE) specified by the competition.
def mape(y_train: np.array, y_pred: np.array) -> float:
    """Return the mean absolute percentage error between targets and predictions.

    Computes ``100 * mean(|y_train - y_pred| / |y_train|)``.

    Parameters
    ----------
    y_train : array-like
        Ground-truth target values. Lists, NumPy arrays and pandas objects are
        accepted; the values are flattened to one dimension.
    y_pred : array-like
        Predicted values, with the same number of elements as ``y_train``.

    Returns
    -------
    float
        The error in percent (e.g. ``10.0`` means 10 %). Targets equal to zero
        produce ``inf``/``nan`` because the relative error is undefined there.

    Raises
    ------
    ValueError
        If the inputs are empty or do not contain the same number of elements.
    """
    # Convert up front so indexing is positional: label-based indexing on a
    # pandas Series with a non-default index would otherwise fail.
    actual = np.asarray(y_train, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    if actual.size != predicted.size:
        raise ValueError(
            f"y_train and y_pred must have the same length, "
            f"got {actual.size} and {predicted.size}"
        )
    if actual.size == 0:
        raise ValueError("mape is undefined for empty input")

    # Divide by |actual| so negative targets cannot flip the sign of the error.
    relative_error = np.abs(actual - predicted) / np.abs(actual)
    return float(100 * relative_error.mean())

In [3]:
# Read the preprocessed training features and drop the 'id' and 'pj_no' columns.
train_x = (
    pd.read_csv("data/processed_train_goto_x_v12.csv")
    .drop(columns=['id', 'pj_no'])
)
# Read the preprocessed training target and drop the 'id' column.
train_y = (
    pd.read_csv("data/processed_train_goto_y_v12.csv")
    .drop(columns=['id'])
)

In [4]:
# Settings held fixed on the estimator while the grid below is searched.
params = dict(gamma=0.1)

# Grid searched in this step: only the number of boosting rounds.
target_params = dict(n_estimators=[300, 500, 700, 900])

# Candidate grids noted for other hyperparameters (not searched in this step):
#   max_depth:        [3, 4, 5]
#   min_child_weight: [1, 2, 3]
#   gamma:            [0.0, 0.1, 0.2]
#   subsample:        [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
#   colsample_bytree: [0.6, 0.7, 0.8, 0.9, 1.0]
#   learning_rate:    [0.5, 0.2, 0.1, 0.05]

In [5]:
# MAPE is an error, so greater_is_better=False makes the scorer report its
# negative; GridSearchCV then selects the candidate with the smallest MAPE.
my_scorer = make_scorer(mape, greater_is_better=False)

# random_state is the supported name for the RNG seed in the XGBoost
# scikit-learn wrapper (seed is its deprecated alias).
gs = GridSearchCV(
    estimator=XGBRegressor(**params, random_state=42),
    param_grid=target_params,
    cv=5,
    n_jobs=-1,
    scoring=my_scorer,
    verbose=3,
)

s = time.perf_counter()
# GridSearchCV is given a 1-D target, so flatten the single-column frame.
gs.fit(train_x, train_y.values.ravel())
# Stop the clock right after fitting so the prints are not included in the timing.
e = time.perf_counter()-s

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Score: {gs.best_score_}")
print(f"elapsed time:  {e}")

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 14.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 14.5min finished


Best Parameters: {'n_estimators': 500}
Best Score: -10.777443128365404
elapsed time:  949.778719721


In [6]:
# Settings held fixed on the estimator while the grid below is searched.
params = {
    'n_estimators': 500,
    'gamma': 0.1,
}

# Grid searched in this step: tree depth and minimum child weight.
target_params = {
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 2, 3],
}

# Candidate grids noted for other hyperparameters (not searched in this step):
#   gamma:            [0.0, 0.1, 0.2]
#   subsample:        [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
#   colsample_bytree: [0.6, 0.7, 0.8, 0.9, 1.0]
#   learning_rate:    [0.5, 0.2, 0.1, 0.05]

# MAPE is an error, so greater_is_better=False makes the scorer report its
# negative; GridSearchCV then selects the candidate with the smallest MAPE.
my_scorer = make_scorer(mape, greater_is_better=False)

# random_state is the supported name for the RNG seed in the XGBoost
# scikit-learn wrapper (seed is its deprecated alias).
gs = GridSearchCV(
    estimator=XGBRegressor(**params, random_state=42),
    param_grid=target_params,
    cv=5,
    n_jobs=-1,
    scoring=my_scorer,
    verbose=3,
)

s = time.perf_counter()
# GridSearchCV is given a 1-D target, so flatten the single-column frame.
gs.fit(train_x, train_y.values.ravel())
# Stop the clock right after fitting so the prints are not included in the timing.
e = time.perf_counter()-s

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Score: {gs.best_score_}")
print(f"elapsed time:  {e}")

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 19.6min
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed: 36.0min finished


Best Parameters: {'max_depth': 3, 'min_child_weight': 1}
Best Score: -10.777443128365404
elapsed time:  2236.5697002139996


In [None]:
# Settings held fixed on the estimator while the grid below is searched.
# NOTE(review): the recorded output of the preceding search reports
# min_child_weight=1 as best, but 2 is pinned here -- confirm which is intended.
params = {
    'n_estimators': 500,
    'max_depth': 3,
    'min_child_weight': 2,
}

# Grid searched in this step: gamma and row subsampling ratio.
target_params = {
    'gamma': [0.0, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9, 1.0],
}

# Candidate grids noted for reference:
#   gamma:            [0.0, 0.1, 0.2]
#   subsample:        [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
#   colsample_bytree: [0.6, 0.7, 0.8, 0.9, 1.0]
#   learning_rate:    [0.5, 0.2, 0.1, 0.05]

# MAPE is an error, so greater_is_better=False makes the scorer report its
# negative; GridSearchCV then selects the candidate with the smallest MAPE.
my_scorer = make_scorer(mape, greater_is_better=False)

# random_state is the supported name for the RNG seed in the XGBoost
# scikit-learn wrapper (seed is its deprecated alias).
gs = GridSearchCV(
    estimator=XGBRegressor(**params, random_state=42),
    param_grid=target_params,
    cv=5,
    n_jobs=-1,
    scoring=my_scorer,
    verbose=3,
)

s = time.perf_counter()
# GridSearchCV is given a 1-D target, so flatten the single-column frame.
gs.fit(train_x, train_y.values.ravel())
# Stop the clock right after fitting so the prints are not included in the timing.
e = time.perf_counter()-s

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Score: {gs.best_score_}")
print(f"elapsed time:  {e}")

In [None]:
# Settings held fixed on the estimator while the grid below is searched.
# NOTE(review): no recorded output in this notebook backs gamma=0.0 / subsample=0.9,
# and the recorded depth search reports min_child_weight=1 rather than 2 -- confirm.
params = {
    'n_estimators': 500,
    'max_depth': 3,
    'min_child_weight': 2,
    'gamma': 0.0,
    'subsample': 0.9,
}

# Grid searched in this step: column subsampling ratio and learning rate.
target_params = {
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'learning_rate': [0.5, 0.2, 0.1, 0.05],
}

# Candidate grids noted for reference:
#   gamma:     [0.0, 0.1, 0.2]
#   subsample: [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# MAPE is an error, so greater_is_better=False makes the scorer report its
# negative; GridSearchCV then selects the candidate with the smallest MAPE.
my_scorer = make_scorer(mape, greater_is_better=False)

# random_state is the supported name for the RNG seed in the XGBoost
# scikit-learn wrapper (seed is its deprecated alias).
gs = GridSearchCV(
    estimator=XGBRegressor(**params, random_state=42),
    param_grid=target_params,
    cv=5,
    n_jobs=-1,
    scoring=my_scorer,
    verbose=3,
)

s = time.perf_counter()
# GridSearchCV is given a 1-D target, so flatten the single-column frame.
gs.fit(train_x, train_y.values.ravel())
# Stop the clock right after fitting so the prints are not included in the timing.
e = time.perf_counter()-s

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Score: {gs.best_score_}")
print(f"elapsed time:  {e}")

In [None]:
# Hyperparameter set with every tuned value pinned.
# NOTE(review): the recorded depth search reports min_child_weight=1, and the
# other values have no recorded search output in this notebook -- confirm.
params = dict(
    n_estimators=500,
    max_depth=3,
    min_child_weight=2,
    gamma=0.0,
    subsample=0.9,
    colsample_bytree=0.8,
    learning_rate=0.1,
)