ハイパーパラメータ : 各アルゴリズムに付随して、アルゴリズムの挙動を制御するための値

ハイパーパラメータの調整

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the breast-cancer dataset that ships with scikit-learn
from sklearn.datasets import load_breast_cancer

dataset = load_breast_cancer()

# Input features (x) and target labels (t)
x, t = dataset.data, dataset.target

In [3]:
# Confirm the array shapes of the inputs and the targets
x.shape, t.shape

((569, 30), (569,))

In [4]:
from sklearn.model_selection import train_test_split

# Hold out the test set first — test : remainder = 20 : 80
x_train_val, x_test, t_train_val, t_test = train_test_split(x, t, test_size=0.2, random_state=1)

In [5]:
# Split the remainder again — validation : training = 30 : 70
x_train, x_val, t_train, t_val = train_test_split(x_train_val, t_train_val, test_size=0.3, random_state=1)

In [6]:
# Check the size of each of the three splits (training, validation, test)
x_train.shape, x_val.shape, x_test.shape

((318, 30), (137, 30), (114, 30))

In [7]:
from sklearn.tree import DecisionTreeClassifier
# Baseline decision tree: only the random seed is set, no other hyperparameter is passed
dtree = DecisionTreeClassifier(random_state=0)

In [8]:
# Fit the baseline tree on the training split
dtree.fit(x_train, t_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [9]:
# Score of the baseline tree on the training and validation splits
print('train score : ', dtree.score(x_train, t_train))
print('validation score : ', dtree.score(x_val, t_val))

train score :  1.0
validation score :  0.927007299270073


In [10]:
# Define the model with explicitly chosen hyperparameters
dtree = DecisionTreeClassifier(max_depth=10, min_samples_split=30, random_state=0)

# Fit the re-defined tree on the training split
dtree.fit(x_train, t_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=10, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=30,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [11]:
# Score of the hand-tuned tree on the training, validation and test splits
print('train score : ', dtree.score(x_train, t_train))
print('validation score : ', dtree.score(x_val, t_val))
print('test score :', dtree.score(x_test, t_test))

train score :  0.9308176100628931
validation score :  0.9562043795620438
test score : 0.9298245614035088


グリッドサーチ
メリット：指定した範囲を網羅するため、ある程度漏れがなくハイパーパラメータの探索を行うことができる
デメリット：場合によっては、数十～数百パターンの組合せを計算するため学習に時間を要する

In [12]:
# GridSearchCV クラスのインポート
from sklearn.model_selection import GridSearchCV

In [13]:
# Algorithm used for training (handed to the search as its base estimator)
estimator = DecisionTreeClassifier(random_state=0)

In [14]:
# Hyperparameters to explore and the candidate values for each one
param_grid = [
    {'max_depth': [3, 20, 50], 'min_samples_split': [3, 20, 30]},
]

In [15]:
# Number of dataset splits (passed to the search as its cv argument)
cv = 25

In [16]:
# Define the model with the GridSearchCV class
tuned_model = GridSearchCV(estimator=estimator,
                           param_grid=param_grid,
                           cv=cv, return_train_score=False)

In [17]:
# Train and validate the model on the combined train+validation data
tuned_model.fit(x_train_val, t_train_val)

GridSearchCV(cv=25, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=0, splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid=[{'max_depth': [3, 20, 50],
                          'min_samples_split': [3, 20, 30]}],
            

In [18]:
# Inspect the validation results (transposed for display)
pd.DataFrame(tuned_model.cv_results_).T

Unnamed: 0,0,1,2,3,4,5,6,7,8
mean_fit_time,0.00288964,0.00266539,0.002615,0.00350039,0.00332966,0.00329563,0.00330612,0.00359442,0.00334761
std_fit_time,0.00017267,0.000100564,0.000135717,0.000327662,0.000248534,0.000217132,0.000215581,0.000782221,0.000277153
mean_score_time,0.000224857,0.000193357,0.000188313,0.000213146,0.000187654,0.000209341,0.000179033,0.00024189,0.000192709
std_score_time,3.61064e-05,1.8227e-05,1.20772e-05,5.01853e-05,1.10375e-05,3.92483e-05,6.54707e-06,8.02257e-05,1.8028e-05
param_max_depth,3,3,3,20,20,20,50,50,50
param_min_samples_split,3,20,30,3,20,30,3,20,30
params,"{'max_depth': 3, 'min_samples_split': 3}","{'max_depth': 3, 'min_samples_split': 20}","{'max_depth': 3, 'min_samples_split': 30}","{'max_depth': 20, 'min_samples_split': 3}","{'max_depth': 20, 'min_samples_split': 20}","{'max_depth': 20, 'min_samples_split': 30}","{'max_depth': 50, 'min_samples_split': 3}","{'max_depth': 50, 'min_samples_split': 20}","{'max_depth': 50, 'min_samples_split': 30}"
split0_test_score,0.947368,0.894737,0.894737,1,0.894737,0.894737,1,0.894737,0.894737
split1_test_score,0.842105,0.842105,0.842105,0.947368,0.842105,0.842105,0.947368,0.842105,0.842105
split2_test_score,0.894737,0.894737,0.894737,0.947368,0.947368,0.947368,0.947368,0.947368,0.947368


In [19]:
# Second grid search: same estimator and number of splits, different candidate values
estimator = DecisionTreeClassifier(random_state=0)
cv = 25
param_grid = [{
    'max_depth': [5, 10, 15] ,
    'min_samples_split': [10, 12, 15]
}]

In [20]:
# Define the model
tuned_model = GridSearchCV(estimator=estimator,
                           param_grid=param_grid,
                           cv=cv, return_train_score=False)

# Train the model
tuned_model.fit(x_train_val, t_train_val)

GridSearchCV(cv=25, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=0, splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid=[{'max_depth': [5, 10, 15],
                          'min_samples_split': [10, 12, 15]}],
           

In [21]:
# Inspect the training results (transposed for display)
pd.DataFrame(tuned_model.cv_results_).T

Unnamed: 0,0,1,2,3,4,5,6,7,8
mean_fit_time,0.00347357,0.00338617,0.00332185,0.00331402,0.00333806,0.00330737,0.00328679,0.00334264,0.00338163
std_fit_time,0.0002479,0.000266077,0.000193795,0.000215884,0.000274147,0.000231278,0.000172157,0.000290247,0.000339899
mean_score_time,0.000232983,0.000207396,0.000193748,0.000190582,0.000206251,0.00019228,0.000189247,0.000197105,0.000192022
std_score_time,6.79836e-05,6.66459e-05,2.83562e-05,3.0874e-05,5.41231e-05,2.62789e-05,2.46959e-05,2.86676e-05,1.47294e-05
param_max_depth,5,5,5,10,10,10,15,15,15
param_min_samples_split,10,12,15,10,12,15,10,12,15
params,"{'max_depth': 5, 'min_samples_split': 10}","{'max_depth': 5, 'min_samples_split': 12}","{'max_depth': 5, 'min_samples_split': 15}","{'max_depth': 10, 'min_samples_split': 10}","{'max_depth': 10, 'min_samples_split': 12}","{'max_depth': 10, 'min_samples_split': 15}","{'max_depth': 15, 'min_samples_split': 10}","{'max_depth': 15, 'min_samples_split': 12}","{'max_depth': 15, 'min_samples_split': 15}"
split0_test_score,1,1,1,1,1,1,1,1,1
split1_test_score,0.947368,0.947368,0.842105,0.947368,0.947368,0.842105,0.947368,0.947368,0.842105
split2_test_score,0.947368,0.947368,0.947368,0.947368,0.947368,0.947368,0.947368,0.947368,0.947368


In [22]:
# Show the hyperparameters that gave the best score
tuned_model.best_params_

{'max_depth': 5, 'min_samples_split': 10}

In [23]:
# Take over the best-scoring model from the search
best_model = tuned_model.best_estimator_

# Evaluate the model: train+validation score first, then test score
print(best_model.score(x_train_val, t_train_val))
print(best_model.score(x_test, t_test))

0.9934065934065934
0.956140350877193


In [24]:
# RandomizedSearchCV クラスのインポート
from sklearn.model_selection import RandomizedSearchCV

In [25]:
# Algorithm used for training
estimator = DecisionTreeClassifier(random_state=0)

In [26]:
# Demo of range(start, stop, step): every second integer from 1 up to, but not including, 10
list(range(1, 10, 2))

[1, 3, 5, 7, 9]

In [27]:
# Candidate values the search may pick for each hyperparameter
param_distributions = {
    'max_depth': [*range(5, 100, 2)],        # odd depths 5, 7, ..., 99
    'min_samples_split': [*range(2, 50)],    # every integer 2, 3, ..., 49
}

In [28]:
# Number of trials (passed to the search as its n_iter argument)
n_iter = 1000

In [29]:
# Number of dataset splits (passed to the search as its cv argument)
cv = 5

In [30]:
# Define the model with the RandomizedSearchCV class (random_state fixes the sampling seed)
tuned_model = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=param_distributions,
    n_iter=n_iter, cv=cv,
    random_state=0, return_train_score=False
)

In [31]:
# Train and validate the model on the combined train+validation data
tuned_model.fit(x_train_val, t_train_val)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort='deprecated',
                                                    random_state=0,
             

In [32]:
# Inspect the training results, ordered by rank_test_score and transposed for display
pd.DataFrame(tuned_model.cv_results_).sort_values('rank_test_score').T

Unnamed: 0,213,136,127,566,936,330,563,314,309,77,...,567,147,555,557,146,560,142,141,554,0
mean_fit_time,0.00259638,0.002981,0.00298429,0.00261312,0.00264835,0.002668,0.00276165,0.00266428,0.00282421,0.00316815,...,0.00258722,0.00252461,0.0025219,0.00259938,0.00262876,0.00259876,0.00268626,0.00266767,0.00260191,0.00299253
std_fit_time,9.51902e-05,0.000368247,0.00039495,0.000207031,0.000179954,0.00023454,0.00018656,7.87332e-05,0.000169325,0.000205277,...,0.000211705,0.000124455,9.87075e-05,0.000209376,0.000110817,0.000132888,0.000329797,0.000250636,0.00012269,0.000267622
mean_score_time,0.000200653,0.000220633,0.000223064,0.000195599,0.000186777,0.000190783,0.000197172,0.000193548,0.000210142,0.000263596,...,0.000188351,0.000184631,0.000203991,0.000196218,0.000209236,0.000218201,0.000190258,0.000190592,0.000192261,0.000342131
std_score_time,3.2237e-05,3.84251e-05,1.3738e-05,1.85505e-05,1.62125e-06,9.65098e-06,7.61597e-06,7.02004e-06,1.51756e-05,3.35585e-05,...,3.59688e-06,1.92869e-06,3.47357e-05,1.31083e-05,4.10356e-05,6.13779e-05,4.30212e-06,3.84911e-06,5.77071e-06,0.000127267
param_min_samples_split,10,10,10,10,10,10,10,10,10,10,...,28,36,30,47,48,27,48,45,44,30
param_max_depth,83,27,99,57,31,25,79,17,13,65,...,55,43,41,57,43,85,61,33,23,9
params,"{'min_samples_split': 10, 'max_depth': 83}","{'min_samples_split': 10, 'max_depth': 27}","{'min_samples_split': 10, 'max_depth': 99}","{'min_samples_split': 10, 'max_depth': 57}","{'min_samples_split': 10, 'max_depth': 31}","{'min_samples_split': 10, 'max_depth': 25}","{'min_samples_split': 10, 'max_depth': 79}","{'min_samples_split': 10, 'max_depth': 17}","{'min_samples_split': 10, 'max_depth': 13}","{'min_samples_split': 10, 'max_depth': 65}",...,"{'min_samples_split': 28, 'max_depth': 55}","{'min_samples_split': 36, 'max_depth': 43}","{'min_samples_split': 30, 'max_depth': 41}","{'min_samples_split': 47, 'max_depth': 57}","{'min_samples_split': 48, 'max_depth': 43}","{'min_samples_split': 27, 'max_depth': 85}","{'min_samples_split': 48, 'max_depth': 61}","{'min_samples_split': 45, 'max_depth': 33}","{'min_samples_split': 44, 'max_depth': 23}","{'min_samples_split': 30, 'max_depth': 9}"
split0_test_score,0.967033,0.967033,0.967033,0.967033,0.967033,0.967033,0.967033,0.967033,0.967033,0.967033,...,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088
split1_test_score,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,...,0.901099,0.901099,0.901099,0.901099,0.901099,0.901099,0.901099,0.901099,0.901099,0.901099
split2_test_score,0.923077,0.923077,0.923077,0.923077,0.923077,0.923077,0.923077,0.923077,0.923077,0.923077,...,0.934066,0.945055,0.934066,0.945055,0.945055,0.934066,0.945055,0.945055,0.945055,0.934066


In [33]:
# Show the hyperparameters that gave the best score
tuned_model.best_params_

{'min_samples_split': 10, 'max_depth': 23}

In [34]:
# Take over the best-scoring model from the search
best_model = tuned_model.best_estimator_

In [35]:
# Evaluate the model: train+validation score first, then test score
print(best_model.score(x_train_val, t_train_val))
print(best_model.score(x_test, t_test))

0.9934065934065934
0.956140350877193


ベイズ最適化
探索：まだ試していない値の範囲でハイパーパラメータを更新して、予測精度がどう変化するか情報を得る
活用：探索で得られた情報をもとに、予測精度が高まる可能性が高い範囲にハイパーパラメータを更新する


In [36]:
import optuna

Optuna
最初に関数 objective を定義
1. ハイパーパラメータごとに探索範囲を指定
2. 学習に使用するアルゴリズムを指定
3. 現在のハイパーパラメータを表示し、交差検証で学習・検証した平均スコアを返す

In [37]:
from sklearn.model_selection import cross_val_score

In [38]:
def objective(trial, x, t, cv, random_state=0):
    """Optuna objective: mean cross-validated score of a decision tree.

    Args:
        trial: Optuna trial used to draw the hyperparameter values.
        x: Input data forwarded to ``cross_val_score``.
        t: Target data forwarded to ``cross_val_score``.
        cv: Cross-validation setting forwarded to ``cross_val_score``.
        random_state: Seed given to the decision tree so that its
            construction does not vary from one call to the next.

    Returns:
        The mean of the scores returned by ``cross_val_score``.
    """
    # 1. Search range for each hyperparameter
    max_depth = trial.suggest_int('max_depth', 2, 100)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 100)

    # 2. Algorithm used for training
    estimator = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=random_state,
    )

    # 3. Show the drawn parameters, then run training and validation
    print('Current_params : ', trial.params)
    accuracy = cross_val_score(estimator, x, t, cv=cv).mean()
    return accuracy

In [39]:
# Create the study object (the objective value is maximized).
# The sampler is seeded so that its random draws are repeatable from run
# to run, in line with the fixed seeds used by the other searches.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=0),
)

In [40]:
# K for K-fold cross-validation
cv = 5
# Optimize the objective function; the lambda binds the data and cv so that
# only the trial is left as an argument
study.optimize(lambda trial: objective(trial, x_train_val, t_train_val, cv), n_trials=10)

# Show the best trial found
print(study.best_trial)

Current_params :  {'max_depth': 81, 'min_samples_split': 98}


[I 2020-06-22 19:14:06,123] Finished trial#0 with value: 0.9208791208791209 with parameters: {'max_depth': 81, 'min_samples_split': 98}. Best is trial#0 with value: 0.9208791208791209.


Current_params :  {'max_depth': 3, 'min_samples_split': 34}


[I 2020-06-22 19:14:06,184] Finished trial#1 with value: 0.9208791208791209 with parameters: {'max_depth': 3, 'min_samples_split': 34}. Best is trial#0 with value: 0.9208791208791209.


Current_params :  {'max_depth': 73, 'min_samples_split': 55}


[I 2020-06-22 19:14:06,249] Finished trial#2 with value: 0.9186813186813187 with parameters: {'max_depth': 73, 'min_samples_split': 55}. Best is trial#0 with value: 0.9208791208791209.


Current_params :  {'max_depth': 5, 'min_samples_split': 96}


[I 2020-06-22 19:14:06,313] Finished trial#3 with value: 0.9186813186813187 with parameters: {'max_depth': 5, 'min_samples_split': 96}. Best is trial#0 with value: 0.9208791208791209.


Current_params :  {'max_depth': 40, 'min_samples_split': 73}


[I 2020-06-22 19:14:06,374] Finished trial#4 with value: 0.9208791208791209 with parameters: {'max_depth': 40, 'min_samples_split': 73}. Best is trial#0 with value: 0.9208791208791209.


Current_params :  {'max_depth': 11, 'min_samples_split': 28}


[I 2020-06-22 19:14:06,438] Finished trial#5 with value: 0.9186813186813187 with parameters: {'max_depth': 11, 'min_samples_split': 28}. Best is trial#0 with value: 0.9208791208791209.


Current_params :  {'max_depth': 34, 'min_samples_split': 9}


[I 2020-06-22 19:14:06,500] Finished trial#6 with value: 0.9428571428571428 with parameters: {'max_depth': 34, 'min_samples_split': 9}. Best is trial#6 with value: 0.9428571428571428.


Current_params :  {'max_depth': 32, 'min_samples_split': 18}


[I 2020-06-22 19:14:06,560] Finished trial#7 with value: 0.9274725274725275 with parameters: {'max_depth': 32, 'min_samples_split': 18}. Best is trial#6 with value: 0.9428571428571428.


Current_params :  {'max_depth': 85, 'min_samples_split': 45}


[I 2020-06-22 19:14:06,621] Finished trial#8 with value: 0.9186813186813187 with parameters: {'max_depth': 85, 'min_samples_split': 45}. Best is trial#6 with value: 0.9428571428571428.


Current_params :  {'max_depth': 94, 'min_samples_split': 31}


[I 2020-06-22 19:14:06,684] Finished trial#9 with value: 0.9186813186813187 with parameters: {'max_depth': 94, 'min_samples_split': 31}. Best is trial#6 with value: 0.9428571428571428.


FrozenTrial(number=6, value=0.9428571428571428, datetime_start=datetime.datetime(2020, 6, 22, 19, 14, 6, 440206), datetime_complete=datetime.datetime(2020, 6, 22, 19, 14, 6, 499758), params={'max_depth': 34, 'min_samples_split': 9}, distributions={'max_depth': IntUniformDistribution(high=100, low=2, step=1), 'min_samples_split': IntUniformDistribution(high=100, low=2, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=6, state=TrialState.COMPLETE)


In [41]:
# Show the hyperparameters that gave the best score
study.best_params

{'max_depth': 34, 'min_samples_split': 9}

In [42]:
# Define the model with the best hyperparameters found by the study.
# The ** prefix unpacks the dict so that each entry is passed as a keyword
# argument when the model is instantiated. random_state is fixed so that
# refitting on the same data builds the same tree.
best_model = DecisionTreeClassifier(**study.best_params, random_state=0)

# Train the model
best_model.fit(x_train_val, t_train_val)

# Evaluate the model: train+validation score first, then test score
print(best_model.score(x_train_val, t_train_val))
print(best_model.score(x_test, t_test))

0.9934065934065934
0.956140350877193
