### XGBoost パラメータチューニング

In [1]:
import numpy as np
import pandas as pd

import xgboost
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV


In [2]:
# pickle ファイルから読み込み
train_pkl = pd.read_pickle('./train.pk3')

In [3]:
train_pkl.shape

(55323, 15)

In [4]:
train_pkl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55323 entries, 0 to 55582
Data columns (total 15 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   id                                   55323 non-null  int64  
 1   accommodates                         55323 non-null  int8   
 2   bathrooms                            55323 non-null  int8   
 3   bedrooms                             55323 non-null  int8   
 4   beds                                 55323 non-null  int8   
 5   host_response_rate                   55323 non-null  float64
 6   number_of_reviews                    55323 non-null  int8   
 7   y                                    55323 non-null  float64
 8   cancellation_policy_moderate         55323 non-null  uint8  
 9   cancellation_policy_strict           55323 non-null  uint8  
 10  cancellation_policy_super_strict_30  55323 non-null  uint8  
 11  cancellation_policy_super_st

In [5]:
# 訓練とテストデータに分割
train, test = train_test_split(train_pkl, test_size=0.2, random_state=42)

In [6]:
# ターゲットと特徴量の分割
train_X_tmp = train.copy()
train_X_tmp.drop(['id', 'y'], axis=1, inplace=True)
train_X = train_X_tmp.iloc[:].values
train_y = train.y.values

### 検証するパラメータの設定

In [7]:
grid_param1 = {
    'max_depth': [i for i in range(3, 11, 2)],
#     'min_child_weight': [1, 2, 3, 4, 6, 10],
#     'gamma': [0.0, 0.1, 0.2, 2.0, 10.0],
#     'subsample': [0.5, 0.6, 0.7, 0.8, 1.0],
#     'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 1.0],
#     'reg_alpha': [1, 0.1, 0.01, 0.001],
#     'reg_lambda': [1, 0.1, 0.01, 0.001],
    'random_state': [42]
}


### グリッドサーチ

In [8]:
gs = GridSearchCV(estimator=XGBRegressor({'verbosity':0}), param_grid=grid_param1, scoring=make_scorer(mean_squared_error, greater_is_better=False), cv=5, return_train_score=False)

In [None]:
gs.best_params_

In [9]:
gs.fit(train_X, train_y)



GridSearchCV(cv=5, error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0,
                                    max_depth={'verbosity': 0},
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=0,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [3, 5, 7, 9], 'random_state': [42]},
      

In [10]:
gs.best_params_

{'max_depth': 3, 'random_state': 42}

In [11]:
grid_param2 = {
    'max_depth': [3],
    'min_child_weight': [i for i in range(1, 11)],
#     'gamma': [0.0, 0.1, 0.2, 2.0, 10.0],
#     'subsample': [0.5, 0.6, 0.7, 0.8, 1.0],
#     'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 1.0],
#     'reg_alpha': [1, 0.1, 0.01, 0.001],
#     'reg_lambda': [1, 0.1, 0.01, 0.001],
    'random_state': [42]
}


In [12]:
gs = GridSearchCV(estimator=XGBRegressor({'verbosity':0}), param_grid=grid_param2, scoring=make_scorer(mean_squared_error, greater_is_better=False), cv=5, return_train_score=False)

In [13]:
gs.fit(train_X, train_y)



GridSearchCV(cv=5, error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0,
                                    max_depth={'verbosity': 0},
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=0,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [3],
                         'min_child_we

In [14]:
gs.best_score_

-16058.269773271113

In [15]:
gs.best_params_

{'max_depth': 3, 'min_child_weight': 1, 'random_state': 42}

In [None]:
gs.cv_results_['mean_test_score']

In [27]:
grid_param3 = {
    'max_depth': [3],
    'min_child_weight': [1],
    'gamma': [0.0, 0.1, 0.2, 2.0, 5.0, 10.0],
#     'subsample': [0.5, 0.6, 0.7, 0.8, 1.0],
#     'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 1.0],
#     'reg_alpha': [1, 0.1, 0.01, 0.001],
#     'reg_lambda': [1, 0.1, 0.01, 0.001],
    'random_state': [42]
}


In [28]:
gs = GridSearchCV(estimator=XGBRegressor({'verbosity':0}), param_grid=grid_param3, scoring=make_scorer(mean_squared_error, greater_is_better=False), cv=5, return_train_score=False)

In [29]:
gs.fit(train_X, train_y)



GridSearchCV(cv=5, error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0,
                                    max_depth={'verbosity': 0},
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=0,
                                    re..., reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'gamma': [0.0, 0.1, 0.2, 2.0, 5.0, 10.0],
                    

In [30]:
gs.best_score_

-16058.269773271113

In [31]:
gs.best_params_

{'gamma': 0.0, 'max_depth': 3, 'min_child_weight': 1, 'random_state': 42}

In [32]:
gs.cv_results_['mean_test_score']

array([-16058.26977327, -16058.26977327, -16058.26977327, -16058.26977327,
       -16058.26977327, -16058.26977327])

In [46]:
grid_param4 = {
    'max_depth': [3],
    'min_child_weight': [1],
    'gamma': [0.0],
    'subsample': [0.5, 0.6, 0.7, 0.8, 1.0],
#     'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 1.0],
#     'reg_alpha': [1, 0.1, 0.01, 0.001],
#     'reg_lambda': [1, 0.1, 0.01, 0.001],
    'random_state': [42]
}


In [47]:
gs = GridSearchCV(estimator=XGBRegressor({'verbosity':0}), param_grid=grid_param4, scoring=make_scorer(mean_squared_error, greater_is_better=False), cv=5, return_train_score=False)

In [48]:
gs.fit(train_X, train_y)



GridSearchCV(cv=5, error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0,
                                    max_depth={'verbosity': 0},
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=0,
                                    re...,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'gamma': [0.0], 'max_depth': [3],
                         'min_child_weight

In [49]:
gs.best_score_

-16034.114189469377

In [50]:
gs.best_params_

{'gamma': 0.0,
 'max_depth': 3,
 'min_child_weight': 1,
 'random_state': 42,
 'subsample': 0.8}

In [51]:
gs.cv_results_['mean_test_score']

array([-16062.34004881, -16076.84934172, -16051.13196865, -16034.11418947,
       -16058.26977327])

In [52]:
grid_param5 = {
    'max_depth': [3],
    'min_child_weight': [1],
    'gamma': [0.0],
    'subsample': [0.8],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 1.0],
#     'reg_alpha': [1, 0.1, 0.01, 0.001],
#     'reg_lambda': [1, 0.1, 0.01, 0.001],
    'random_state': [42]
}


In [53]:
gs = GridSearchCV(estimator=XGBRegressor({'verbosity':0}), param_grid=grid_param5, scoring=make_scorer(mean_squared_error, greater_is_better=False), cv=5, return_train_score=False)

In [54]:
gs.fit(train_X, train_y)



GridSearchCV(cv=5, error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0,
                                    max_depth={'verbosity': 0},
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=0,
                                    re...
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 1.0],
                         'gamm

In [55]:
gs.best_score_

-16032.867963036126

In [56]:
gs.best_params_

{'colsample_bytree': 0.7,
 'gamma': 0.0,
 'max_depth': 3,
 'min_child_weight': 1,
 'random_state': 42,
 'subsample': 0.8}

In [58]:
gs.cv_results_['mean_test_score']

array([-16095.56054424, -16059.08475562, -16032.86796304, -16047.10451295,
       -16034.11418947])

In [59]:
grid_param6 = {
    'max_depth': [3],
    'min_child_weight': [1],
    'gamma': [0.0],
    'subsample': [0.8],
    'colsample_bytree': [0.7],
    'reg_alpha': [1, 0.1, 0.01, 0.001, 0.0001],
#     'reg_lambda': [1, 0.1, 0.01, 0.001],
    'random_state': [42]
}


In [60]:
gs = GridSearchCV(estimator=XGBRegressor({'verbosity':0}), param_grid=grid_param6, scoring=make_scorer(mean_squared_error, greater_is_better=False), cv=5, return_train_score=False)

In [61]:
gs.fit(train_X, train_y)



GridSearchCV(cv=5, error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0,
                                    max_depth={'verbosity': 0},
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=0,
                                    re...
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'colsample_bytree': [0.7], 'gamma': [0.0],
                         'max_depth': [3], 'min_child_weight': [1],
                         'random_state': [42],

In [62]:
gs.best_score_

-16032.867963921355

In [63]:
gs.best_params_

{'colsample_bytree': 0.7,
 'gamma': 0.0,
 'max_depth': 3,
 'min_child_weight': 1,
 'random_state': 42,
 'reg_alpha': 0.0001,
 'subsample': 0.8}

In [64]:
gs.cv_results_['mean_test_score']

array([-16034.9957791 , -16032.86932177, -16032.86810986, -16032.86797631,
       -16032.86796392])

In [65]:
grid_param7 = {
    'max_depth': [3],
    'min_child_weight': [1],
    'gamma': [0.0],
    'subsample': [0.8],
    'colsample_bytree': [0.7],
    'reg_alpha': [0.0001],
    'reg_lambda': [1, 0.1, 0.01, 0.001, 0.0001],
    'random_state': [42]
}


In [66]:
gs = GridSearchCV(estimator=XGBRegressor({'verbosity':0}), param_grid=grid_param7, scoring=make_scorer(mean_squared_error, greater_is_better=False), cv=5, return_train_score=False)

In [67]:
gs.fit(train_X, train_y)



GridSearchCV(cv=5, error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0,
                                    max_depth={'verbosity': 0},
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=0,
                                    re...
             iid='deprecated', n_jobs=None,
             param_grid={'colsample_bytree': [0.7], 'gamma': [0.0],
                         'max_depth': [3], 'min_child_weight': [1],
                         'random_state': [42], 'reg_alpha': [0.0001],
                         'reg_lambda': 

In [68]:
gs.best_score_

-16032.867963921355

In [69]:
gs.best_params_

{'colsample_bytree': 0.7,
 'gamma': 0.0,
 'max_depth': 3,
 'min_child_weight': 1,
 'random_state': 42,
 'reg_alpha': 0.0001,
 'reg_lambda': 1,
 'subsample': 0.8}

In [70]:
gs.cv_results_['mean_test_score']

array([-16032.86796392, -16035.47366247, -16046.68611061, -16041.4903475 ,
       -16041.50569681])

In [71]:
XGB = XGBRegressor(**gs.best_params_)

In [72]:
XGB = XGB.fit(train_X, train_y)



In [73]:
XGB.feature_importances_

array([0.16535676, 0.14066575, 0.28692836, 0.01158463, 0.04725209,
       0.0148497 , 0.01217799, 0.00422753, 0.00350342, 0.01086116,
       0.20995809, 0.08027383, 0.0123607 ], dtype=float32)

In [76]:
sorted(
    zip(map(lambda x: round(x, 3), XGB.feature_importances_), train.iloc[:, [2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14]].columns),
    reverse=True)

[(0.287, 'beds'),
 (0.21, 'room_type_Shared room'),
 (0.165, 'bathrooms'),
 (0.141, 'bedrooms'),
 (0.08, 'property_type_num'),
 (0.047, 'number_of_reviews'),
 (0.015, 'cancellation_policy_moderate'),
 (0.012, 'host_response_rate'),
 (0.012, 'cancellation_policy_strict'),
 (0.011, 'room_type_Private room'),
 (0.004, 'cancellation_policy_super_strict_60'),
 (0.004, 'cancellation_policy_super_strict_30')]

In [77]:
# ターゲットと特徴量の分割
# test_x = test.iloc[:, 1:].values
# test_y = test.Survived.values

### テストデータで予測

In [78]:
# ターゲットと特徴量の分割
test_X_tmp = test.copy()
test_X_tmp.drop(['id', 'y'], axis=1, inplace=True)
test_x = test_X_tmp.iloc[:].values
test_y = test.y.values

In [79]:
test_x.shape, test_y.shape

((11065, 13), (11065,))

In [80]:
pred_y = XGB.predict(test_x)

In [81]:
np.sqrt(mean_squared_error(test_y, pred_y))

127.93039620586933

### 検証データで予測

In [82]:
# 検証データ読み込み
valid = pd.read_pickle('./test.pk3')

In [83]:
valid.shape

(18528, 14)

In [84]:
# ID の保存
valid_pass = valid.id.values

In [85]:
valid_X = valid.iloc[:, 1:].values

In [86]:
valid_X.shape, train_X.shape

((18528, 13), (44258, 13))

In [87]:
pred_valid_y = XGB.predict(valid_X)

In [88]:
pred_valid_y.shape

(18528,)

In [89]:
type(valid_pass), type(pred_valid_y)

(numpy.ndarray, numpy.ndarray)

In [90]:
result_df = pd.DataFrame(pred_valid_y, valid_pass, columns=['y'])

In [91]:
result_df.to_csv("./XGB_9.csv",  header=False)