### XGBoost パラメータチューニング

In [1]:
import numpy as np
import pandas as pd

import xgboost
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV


In [5]:
# Load the training data from a pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
train_pkl = pd.read_pickle('./train.pk2')

In [6]:
# Show the (rows, columns) shape of the loaded training frame.
train_pkl.shape

(55175, 12)

In [7]:
# Print each column's name, non-null count and dtype, plus total memory usage.
train_pkl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55175 entries, 0 to 55582
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      55175 non-null  int64  
 1   accommodates            55175 non-null  float16
 2   bathrooms               55175 non-null  float16
 3   bedrooms                55175 non-null  float16
 4   beds                    55175 non-null  float16
 5   cleaning_fee            55175 non-null  int8   
 6   host_has_profile_pic    55175 non-null  int8   
 7   host_identity_verified  55175 non-null  int8   
 8   instant_bookable        55175 non-null  int8   
 9   number_of_reviews       55175 non-null  float16
 10  review_scores_rating    55175 non-null  float16
 11  y                       55175 non-null  float64
dtypes: float16(6), float64(1), int64(1), int8(4)
memory usage: 2.1 MB


In [10]:
# Hold out 20% of the rows as an evaluation split; the fixed seed keeps the
# partition reproducible across runs.
train, test = train_test_split(
    train_pkl,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Features: every column except the first (`id`) and the last (the target `y`).
train_X = train.iloc[:, slice(1, -1)]
# Target: the `y` column.
train_y = train['y']

### 検証するパラメータの設定

In [13]:
# Candidate values for each XGBoost hyper-parameter explored by the grid search.
# The full Cartesian product is 4 * 6 * 5 * 5 * 5 * 4 * 4 = 48,000 combinations,
# and every combination is fitted once per cross-validation fold.
grid_param = dict(
    max_depth=[5, 6, 7, 9],
    min_child_weight=[1, 2, 3, 4, 6, 10],
    gamma=[0.0, 0.1, 0.2, 2.0, 10.0],
    subsample=[0.5, 0.6, 0.7, 0.8, 1.0],
    colsample_bytree=[0.5, 0.6, 0.7, 0.8, 1.0],
    reg_alpha=[1, 0.1, 0.01, 0.001],
    reg_lambda=[1, 0.1, 0.01, 0.001],
    # Single value: pins the seed for every candidate rather than searching over it.
    random_state=[42],
)


### グリッドサーチ

In [14]:
# Exhaustive grid search over `grid_param` with 5-fold cross-validation.
# Estimator options must be passed to XGBRegressor as keyword arguments: a dict
# given positionally is not unpacked into parameters (depending on the xgboost
# version it is bound to the first positional parameter or rejected outright).
# The scorer negates MSE (greater_is_better=False), so a higher score means a
# lower error and the reported scores are <= 0.
gs = GridSearchCV(
    estimator=XGBRegressor(verbosity=0),
    param_grid=grid_param,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    cv=5,
    return_train_score=False,
)

In [None]:
# Run the grid search: one fit per parameter combination per CV fold.
# NOTE(review): with the grid defined above this is a very large number of fits
# and may run for a long time.
gs.fit(train_X, train_y)

### 一番良かったパラメータで学習

In [None]:
# Mean cross-validated score of the best combination. The scorer is negated MSE,
# so the value is <= 0 and closer to 0 is better.
gs.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score.
gs.best_params_

In [None]:
# Mean cross-validated score (negated MSE) for every parameter combination tried.
gs.cv_results_['mean_test_score']

In [None]:
# Build the final model from the best hyper-parameters found by the grid search.
# The target `y` is continuous (float64) and the model is evaluated with RMSE, so
# this must be a regressor: XGBRegressor is the estimator type that was tuned and
# the one imported at the top of the notebook.
XGB = XGBRegressor(**gs.best_params_)

In [None]:
# Train the final model on the training split and keep the fitted estimator.
XGB = XGB.fit(train_X, train_y)

In [None]:
# Importance scores of the fitted model, one per feature column.
XGB.feature_importances_

In [None]:
# Pair each importance (rounded to 3 decimals) with its feature name and list the
# pairs from most to least important. The names are taken from train_X — the frame
# the model was fitted on — so they line up one-to-one with the importances and
# never include the target column `y`.
sorted(
    zip((round(score, 3) for score in XGB.feature_importances_), train_X.columns),
    reverse=True)

In [None]:
# Split into target and features
# NOTE(review): the two lines below are disabled leftovers — they reference a
# `Survived` column, which is not among the columns listed by train_pkl.info().
# test_x = test.iloc[:, 1:].values
# test_y = test.Survived.values

### テストデータで予測

In [None]:
# Hold-out features: every column except the first (`id`) and the last (the target `y`).
test_x = test.iloc[:, slice(1, -1)]
# Hold-out target: the `y` column.
test_y = test['y']

In [None]:
# Sanity-check the shapes of the hold-out features and target.
test_x.shape, test_y.shape

In [None]:
# Predict the target for the hold-out features.
pred_y = XGB.predict(test_x)

In [None]:
# Root mean squared error of the predictions on the hold-out split.
np.sqrt(mean_squared_error(test_y, pred_y))

### 検証データで予測

In [None]:
# Load the validation data (the rows predicted for the output CSV) from a pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
valid = pd.read_pickle('./test.pk3')

In [None]:
# Show the (rows, columns) shape of the validation frame.
valid.shape

In [None]:
# Keep the row ids as an array; they become the index of the output file.
valid_pass = valid['id'].values

In [None]:
# Use every column after the first as model input.
# NOTE(review): assumes the first column is `id` and the remaining columns are
# exactly the training features in the same order (no target column) — confirm.
valid_X = valid.iloc[:, 1:]

In [None]:
# Compare shapes: the column counts of valid_X and train_X should match.
valid_X.shape, train_X.shape

In [None]:
# Predict the target for the validation features.
pred_valid_y = XGB.predict(valid_X)

In [None]:
# Show the shape of the validation predictions.
pred_valid_y.shape

In [None]:
# Inspect the container types of the ids and predictions before building the result frame.
type(valid_pass), type(pred_valid_y)

In [None]:
# Assemble the output table: predictions in a single `y` column, indexed by row id.
result_df = pd.DataFrame(
    data=pred_valid_y,
    index=valid_pass,
    columns=['y'],
)

In [None]:
# Write the predictions to CSV without a header row; the index (the ids) is
# written as the first column because to_csv keeps the index by default.
result_df.to_csv("./XGB_3.csv",  header=False)