### XGBoost パラメータチューニング

In [2]:
import numpy as np
import pandas as pd

import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error, make_scorer, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV


In [3]:
# pickle ファイルから読み込み
train_pkl = pd.read_pickle('./train.pk1')

In [4]:
train_pkl.shape

(850, 11)

In [5]:
train_pkl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          850 non-null    int16  
 1   T_Bil        850 non-null    float16
 2   D_Bil        850 non-null    float16
 3   ALP          850 non-null    float32
 4   ALT_GPT      850 non-null    float32
 5   AST_GOT      850 non-null    float32
 6   TP           850 non-null    float16
 7   Alb          850 non-null    float16
 8   AG_ratio     850 non-null    float16
 9   Gender_Male  850 non-null    uint8  
 10  disease      850 non-null    int64  
dtypes: float16(5), float32(3), int16(1), int64(1), uint8(1)
memory usage: 27.5 KB


In [6]:
# 訓練とテストデータに分割
train, test = train_test_split(train_pkl, test_size=0.2, random_state=42)

In [8]:
# ターゲットと特徴量の分割
train_X = train.iloc[:, :-1]
train_y = train.disease.values

### 検証するパラメータの設定

In [9]:
grid_param1 = {
    'max_depth': [i for i in range(3, 11, 2)],
#     'min_child_weight': [1, 2, 3, 4, 6, 10],
#     'gamma': [0.0, 0.1, 0.2, 2.0, 10.0],
#     'subsample': [0.5, 0.6, 0.7, 0.8, 1.0],
#     'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 1.0],
#     'reg_alpha': [1, 0.1, 0.01, 0.001],
#     'reg_lambda': [1, 0.1, 0.01, 0.001],
    'random_state': [42]
}


### グリッドサーチ

In [11]:
gs = GridSearchCV(estimator=XGBClassifier({'verbosity':0}), param_grid=grid_param1, scoring='accuracy', cv=5, return_train_score=False)

In [12]:
gs.best_params_

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [13]:
gs.fit(train_X, train_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth={'verbosity': 0},
                                     min_child_weight=1, missing=None,
                                     n_estimators=100, n_jobs=1, nthread=None,
                                     objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [3, 5, 7, 9], 'random_state': [42]},
             pre_dispatch='2*n_jobs', refit=True, 

In [14]:
gs.best_params_

{'max_depth': 5, 'random_state': 42}

In [15]:
grid_param2 = {
    'max_depth': [5],
    'min_child_weight': [i for i in range(1, 11)],
#     'gamma': [0.0, 0.1, 0.2, 2.0, 10.0],
#     'subsample': [0.5, 0.6, 0.7, 0.8, 1.0],
#     'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 1.0],
#     'reg_alpha': [1, 0.1, 0.01, 0.001],
#     'reg_lambda': [1, 0.1, 0.01, 0.001],
    'random_state': [42]
}


In [20]:
gs = GridSearchCV(estimator=XGBClassifier({'verbosity':0}), param_grid=grid_param2, scoring='accuracy', cv=5, return_train_score=False)

In [21]:
gs.fit(train_X, train_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth={'verbosity': 0},
                                     min_child_weight=1, missing=None,
                                     n_estimators=100, n_jobs=1, nthread=None,
                                     objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [5],
                         'min_child_weight': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    

In [22]:
gs.best_score_

0.8544117647058824

In [23]:
gs.best_params_

{'max_depth': 5, 'min_child_weight': 4, 'random_state': 42}

In [None]:
gs.cv_results_['mean_test_score']

In [36]:
grid_param3 = {
    'max_depth': [5],
    'min_child_weight': [4],
    'gamma': [0.0, 0.1, 0.2, 2.0, 5.0, 10.0],
#     'subsample': [0.5, 0.6, 0.7, 0.8, 1.0],
#     'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 1.0],
#     'reg_alpha': [1, 0.1, 0.01, 0.001],
#     'reg_lambda': [1, 0.1, 0.01, 0.001],
    'random_state': [42]
}


In [37]:
gs = GridSearchCV(estimator=XGBClassifier({'verbosity':0}), param_grid=grid_param3, scoring='accuracy', cv=5, return_train_score=False)

In [38]:
gs.fit(train_X, train_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth={'verbosity': 0},
                                     min_child_weight=1, missing=None,
                                     n_estimators=100, n_jobs=1, nthread=None,
                                     objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'gamma': [0.0, 0.1, 0.2, 2.0, 5.0, 10.0],
                         'max_depth': [5], 'min_child_weig

In [39]:
gs.best_score_

0.8544117647058824

In [40]:
gs.best_params_

{'gamma': 0.0, 'max_depth': 5, 'min_child_weight': 4, 'random_state': 42}

In [35]:
gs.cv_results_['mean_test_score']

array([0.85441176, 0.84852941, 0.84411765, 0.83823529, 0.83970588,
       0.83382353])

In [41]:
grid_param4 = {
    'max_depth': [5],
    'min_child_weight': [4],
    'gamma': [0.0],
    'subsample': [0.5, 0.6, 0.7, 0.8, 1.0],
#     'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 1.0],
#     'reg_alpha': [1, 0.1, 0.01, 0.001],
#     'reg_lambda': [1, 0.1, 0.01, 0.001],
    'random_state': [42]
}


In [42]:
gs = GridSearchCV(estimator=XGBClassifier({'verbosity':0}), param_grid=grid_param4, scoring='accuracy', cv=5, return_train_score=False)

In [43]:
gs.fit(train_X, train_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth={'verbosity': 0},
                                     min_child_weight=1, missing=None,
                                     n_estimators=100, n_jobs=1, nthread=None,
                                     objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'gamma': [0.0], 'max_depth': [5],
                         'min_child_weight': [4], 'random_state': 

In [44]:
gs.best_score_

0.8544117647058824

In [45]:
gs.best_params_

{'gamma': 0.0,
 'max_depth': 5,
 'min_child_weight': 4,
 'random_state': 42,
 'subsample': 1.0}

In [46]:
gs.cv_results_['mean_test_score']

array([0.83529412, 0.84264706, 0.84558824, 0.85      , 0.85441176])

In [47]:
grid_param5 = {
    'max_depth': [5],
    'min_child_weight': [4],
    'gamma': [0.0],
    'subsample': [1.0],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 1.0],
#     'reg_alpha': [1, 0.1, 0.01, 0.001],
#     'reg_lambda': [1, 0.1, 0.01, 0.001],
    'random_state': [42]
}


In [48]:
gs = GridSearchCV(estimator=XGBClassifier({'verbosity':0}), param_grid=grid_param5, scoring='accuracy', cv=5, return_train_score=False)

In [49]:
gs.fit(train_X, train_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth={'verbosity': 0},
                                     min_child_weight=1, missing=None,
                                     n_estimators=100, n_jobs=1, nthread=None,
                                     objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 1.0],
                         'gamma': [0.0], 'max_depth':

In [50]:
gs.best_score_

0.8588235294117647

In [51]:
gs.best_params_

{'colsample_bytree': 0.5,
 'gamma': 0.0,
 'max_depth': 5,
 'min_child_weight': 4,
 'random_state': 42,
 'subsample': 1.0}

In [52]:
gs.cv_results_['mean_test_score']

array([0.85882353, 0.85294118, 0.84852941, 0.84558824, 0.85441176])

In [53]:
grid_param6 = {
    'max_depth': [5],
    'min_child_weight': [4],
    'gamma': [0.0],
    'subsample': [1.0],
    'colsample_bytree': [0.5],
    'reg_alpha': [1, 0.1, 0.01, 0.001, 0.0001],
#     'reg_lambda': [1, 0.1, 0.01, 0.001],
    'random_state': [42]
}


In [54]:
gs = GridSearchCV(estimator=XGBClassifier({'verbosity':0}), param_grid=grid_param6, scoring='accuracy', cv=5, return_train_score=False)

In [55]:
gs.fit(train_X, train_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth={'verbosity': 0},
                                     min_child_weight=1, missing=None,
                                     n_estimators=100, n_jobs=1, nthread=None,
                                     objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lam...
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'colsample_bytree': [0.5], 'gamma': [0.0],
                         'max_depth': [5], 'min_child_weight

In [56]:
gs.best_score_

0.8588235294117647

In [57]:
gs.best_params_

{'colsample_bytree': 0.5,
 'gamma': 0.0,
 'max_depth': 5,
 'min_child_weight': 4,
 'random_state': 42,
 'reg_alpha': 0.0001,
 'subsample': 1.0}

In [58]:
gs.cv_results_['mean_test_score']

array([0.85147059, 0.85294118, 0.85147059, 0.85588235, 0.85882353])

In [59]:
grid_param7 = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [3, 4, 5],
    'gamma': [0.0, 0.1, 1.0],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0,4, 0,5, 0.6, 0.7],
    'reg_alpha': [1, 0.1, 0.01, 0.0001],
    'reg_lambda': [0.1, 1],
    'random_state': [42]
}


In [60]:
gs = GridSearchCV(estimator=XGBClassifier({'verbosity':0}), param_grid=grid_param7, scoring='accuracy', cv=5, return_train_score=False)

In [61]:
gs.fit(train_X, train_y)

xgboost.core.XGBoostError: value 4 for Parameter colsample_bytree exceed bound [0,1]

xgboost.core.XGBoostError: value 5 for Parameter colsample_bytree exceed bound [0,1]



GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth={'verbosity': 0},
                                     min_child_weight=1, missing=None,
                                     n_estimators=100, n_jobs=1, nthread=None,
                                     objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lam...
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'colsample_bytree': [0, 4, 0, 5, 0.6, 0.7],
                         'gamma': [0.0, 0.1, 1.0], 'max_depth': [3, 5, 7],
                         'min_child_weight': [3, 4, 5], 'random_s

In [62]:
gs.best_score_

0.8691176470588236

In [63]:
gs.best_params_

{'colsample_bytree': 0.6,
 'gamma': 0.1,
 'max_depth': 5,
 'min_child_weight': 3,
 'random_state': 42,
 'reg_alpha': 0.01,
 'reg_lambda': 0.1,
 'subsample': 0.8}

In [None]:
# gs.cv_results_['mean_test_score']

In [64]:
XGB = XGBClassifier(**gs.best_params_)

In [65]:
XGB = XGB.fit(train_X, train_y)

In [66]:
XGB.feature_importances_

array([0.04241877, 0.19481507, 0.09560716, 0.07967296, 0.17115228,
       0.1332022 , 0.06207384, 0.05149817, 0.08740786, 0.08215169],
      dtype=float32)

In [67]:
sorted(
    zip(map(lambda x: round(x, 3), XGB.feature_importances_), train.iloc[:, :-1].columns),
    reverse=True)

[(0.195, 'T_Bil'),
 (0.171, 'ALT_GPT'),
 (0.133, 'AST_GOT'),
 (0.096, 'D_Bil'),
 (0.087, 'AG_ratio'),
 (0.082, 'Gender_Male'),
 (0.08, 'ALP'),
 (0.062, 'TP'),
 (0.051, 'Alb'),
 (0.042, 'Age')]

### テストデータで予測

In [69]:
# ターゲットと特徴量の分割
test_x = test.iloc[:, :-1]
test_y = test.disease.values

In [70]:
test_x.shape, test_y.shape

((170, 10), (170,))

In [71]:
pred_y = XGB.predict(test_x)

In [72]:
confusion_matrix(test_y, pred_y)

array([[88,  5],
       [15, 62]], dtype=int64)

In [73]:
accuracy_score(test_y, pred_y)

0.8823529411764706

### 検証データで予測

In [74]:
# 検証データ読み込み
valid = pd.read_pickle('./test.pk1')

In [75]:
valid.shape

(350, 10)

In [76]:
# ID の保存
# valid_pass = valid.id.values

In [77]:
# valid_X = valid.iloc[:, 1:].values
valid_X = valid.copy()

In [78]:
valid_X.shape, train_X.shape

((350, 10), (680, 10))

In [79]:
pred_valid_y = XGB.predict(valid_X)

In [80]:
pred_valid_y.shape

(350,)

In [None]:
# type(valid_pass), type(pred_valid_y)

In [81]:
result_df = pd.DataFrame(pred_valid_y)

In [82]:
result_df.to_csv("./XGB_2.csv",  header=False)