### XGBoost パラメータチューニング

In [1]:
import numpy as np
import pandas as pd

import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV


In [2]:
# pickle ファイルから読み込み
train_pkl = pd.read_pickle('./pd_train.pk2')

In [3]:
train_pkl.shape

(891, 15)

In [4]:
# Age, Fare は削除
# ダミー変数の先頭は削除
train_pkl.drop(['Age', 'Fare', 'Sex_female', 'Embarked_C', 'Pclass_1'], inplace=True, axis=1)

In [5]:
# 学習が終わらないので、特徴量を上位５つにする
train_pkl.drop(['Parch', 'Embarked_S', 'Embarked_Q'], inplace=True, axis=1)

In [6]:
train_pkl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   SibSp     891 non-null    int64  
 2   Age_bin   891 non-null    float64
 3   Fare_bin  891 non-null    float64
 4   Sex_male  891 non-null    uint8  
 5   Pclass_2  891 non-null    uint8  
 6   Pclass_3  891 non-null    uint8  
dtypes: float64(2), int64(2), uint8(3)
memory usage: 30.6 KB


In [7]:
# 訓練とテストデータに分割
train, test = train_test_split(train_pkl, test_size=0.2, random_state=42)

In [8]:
# ターゲットと特徴量の分割
train_X = train.iloc[:, 1:]
train_y = train.Survived

### 検証するパラメータの設定

In [9]:
grid_param = {
    'n_estimators':[50, 100, 300],
    'max_depth': [2, 3, 4 , 5, 6],
    'min_child_weight': [1, 2, 3, 4, 6, 10],
    'gamma': [0.0, 0.1, 0.2, 2.0, 10.0],
    'subsample': [0.5, 0.6, 0.7, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 1.0],
    'reg_alpha': [1, 0.1, 0.01, 0.001],
    'reg_lambda': [1, 0.1, 0.01, 0.001],
    'random_state': [42]
}


### グリッドサーチ

In [10]:
gs = GridSearchCV(estimator=XGBClassifier({'verbosity':0}), param_grid=grid_param, scoring='accuracy', cv=5, return_train_score=False)

In [11]:
gs.fit(train_X, train_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth={'verbosity': 0},
                                     min_child_weight=1, missing=None,
                                     n_estimators=100, n_jobs=1, nthread=None,
                                     objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lam...
             param_grid={'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 1.0],
                         'gamma': [0.0, 0.1, 0.2, 2.0, 10.0],
                         'max_depth': [2, 3, 4, 5, 6],
                         'min_child_weight': [1, 2, 3, 4, 6, 10],
                         'n_estimators': [50, 100, 300], 'rand

### 一番良かったパラメータで学習

In [12]:
gs.best_score_

0.8356544863587118

In [13]:
gs.best_params_

{'colsample_bytree': 1.0,
 'gamma': 0.0,
 'max_depth': 3,
 'min_child_weight': 3,
 'n_estimators': 50,
 'random_state': 42,
 'reg_alpha': 1,
 'reg_lambda': 1,
 'subsample': 1.0}

In [14]:
gs.cv_results_['mean_test_score']

array([0.81172067, 0.82156013, 0.82016153, ..., 0.7794445 , 0.78085295,
       0.78366   ])

In [15]:
XGB = XGBClassifier(**gs.best_params_)

In [16]:
XGB = XGB.fit(train_X, train_y)

In [17]:
XGB.feature_importances_

array([0.04129649, 0.05680508, 0.04787331, 0.65819424, 0.02006559,
       0.17576528], dtype=float32)

In [18]:
sorted(
    zip(map(lambda x: round(x, 3), XGB.feature_importances_), train.iloc[:, 1:].columns),
    reverse=True)

[(0.658, 'Sex_male'),
 (0.176, 'Pclass_3'),
 (0.057, 'Age_bin'),
 (0.048, 'Fare_bin'),
 (0.041, 'SibSp'),
 (0.02, 'Pclass_2')]

In [19]:
# ターゲットと特徴量の分割
# test_x = test.iloc[:, 1:].values
# test_y = test.Survived.values

### テストデータで予測

In [20]:
# ターゲットと特徴量の分割
test_x = test.iloc[:, 1:]
test_y = test.Survived

In [21]:
test_x.shape, test_y.shape

((179, 6), (179,))

In [22]:
pred_y = XGB.predict(test_x)

In [23]:
confusion_matrix(test_y, pred_y)

array([[97,  8],
       [25, 49]], dtype=int64)

In [24]:
accuracy_score(test_y, pred_y)

0.8156424581005587

### 検証データで予測

In [25]:
# 検証データ読み込み
valid = pd.read_pickle('./pd_test.pk2')

In [26]:
valid.shape

(418, 15)

In [27]:
# ID の保存
valid_pass = valid.PassengerId.values

In [28]:
valid_X = valid.iloc[:, 1:]

In [29]:
valid_X.drop(['Age', 'Fare', 'Sex_female', 'Embarked_C', 'Pclass_1'], inplace=True, axis=1)

In [30]:
valid_X.drop(['Parch', 'Embarked_S', 'Embarked_Q'], inplace=True, axis=1)

In [31]:
valid_X.shape, train_X.shape

((418, 6), (712, 6))

In [32]:
# valid_X_2 = valid_X.rename(columns={'SibSp':'f0'})

In [33]:
# valid_X_2.rename(columns={'Parch':'f1', 'Age_bin':'f2', 'Fare_bin':'f3', 'Sex_male':'f4', 'Embarked_Q':'f5', 'Embarked_S':'f6', 'Pclass_2':'f7', 'Pclass_3':'f8'}, inplace=True)

In [34]:
pred_valid_y = XGB.predict(valid_X)

In [35]:
pred_valid_y.shape

(418,)

In [36]:
type(valid_pass), type(pred_valid_y)

(numpy.ndarray, numpy.ndarray)

In [37]:
result_df = pd.DataFrame(pred_valid_y, valid_pass, columns=['Survived'])

In [38]:
result_df.to_csv("./XGB_3.csv", index_label='PassengerId')