このノートブックではOptunaの使い方を確認します.

In [1]:
from typing import Any, Dict, Tuple, List
import category_encoders as ce
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier
from pandas import DataFrame
import optuna

df = sns.load_dataset('titanic')
df.head()

# 必要な特徴量を抽出
feature_names = [
    'class',
    'sex',
    'age',
    'sibsp',
    'parch',
    'fare',
    'embark_town',
    'deck',
]
df_x = df[feature_names]
df_y = df['survived']

class IntOrdEncoder(ce.OrdinalEncoder):
    def __init__(self, cols, mapping, handle_unknown):
        super().__init__(cols=cols, mapping=mapping, handle_unknown=handle_unknown)
        self.cols = cols

    def transform(self, *args, **kwargs):
        """xはpd.DataFrameです.
        """
        x = super().transform(*args, **kwargs)
        for col in self.cols:
            x[col] = x[col].astype(int)

        return x

    def fit_transform(self, *args, **kwargs):
        """xはpd.DataFrameです.
        """
        x = super().fit_transform(*args, **kwargs)
        for col in self.cols:
            x[col] = x[col].astype(int)

        return x

  from .autonotebook import tqdm as notebook_tqdm


データを訓練データとテストデータに分割します. テストデータはハイパーパラメータ最適化には使用せず, 最適なハイパーパラメータで訓練されたモデルを評価するために使用されます.

In [2]:
x_tr, x_te, y_tr, y_te = train_test_split(df_x, df_y, test_size=0.33, shuffle=True, random_state=42)

Optunaは目的関数に対してベイズ最適化を適用します.
目的関数はハイパーパラメータを受け取り, そのハイパーパラメータで訓練されたモデルの性能指標を返します.
まず, Catboostのハイパーパラメータの探索範囲を決める関数を作成します.

In [3]:
def suggest_params(trial: optuna.Trial) -> Dict:
    return {
        "depth": trial.suggest_int("depth", 1, 12),
        "learning_rate": trial.suggest_loguniform("learning_rate", np.exp(-7.0), 1.0),
        "random_strength": trial.suggest_int("random_strength", 1, 20),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 10),
    }

次に, 与えられたデータに対して目的関数を返す関数を定義します.

In [4]:
def titanic_cat_encoder() -> Tuple[List[str], ce.OrdinalEncoder]:  # 戻り値の型は後で書く
    cols = ['class', 'sex', 'embark_town', 'deck']
    mapping = [
        {"col": "class", "mapping": {"First": 0, "Second": 1, "Third": 2}},
        {"col": "sex", "mapping": {"male": 0, "female": 1}},
        {"col": "embark_town", "mapping": {"Southampton": 0, "Cherbourg": 1, "Queenstown": 2}},
        {"col": "deck", "mapping": {"A": 0, "B": 1, "C": 2, "D": 3, "E": 4, "F": 5, "G": 6}},
    ]
    return (cols, IntOrdEncoder(cols=cols, mapping=mapping, handle_unknown='value'))


def create_objective(x: DataFrame, y: DataFrame) -> Any:  # 戻り値の型は後で書く
    (cols, encoder) = titanic_cat_encoder()

    def objective(trial: optuna.Trial) -> Any:  # 戻り値の型は後で書く
        params = suggest_params(trial)

        # # cat_featuresを正しい値に設定
        # cat_features = None if params["cat_features"] == "none" else cols
        # params["cat_features"] = cat_features
        params["cat_features"] = cols

        # パイプラインを構成
        clf = CatBoostClassifier(
            **params,
            verbose=False
        )
        pipe = make_pipeline(encoder, clf)

        # スコアを計算
        score = cross_val_score(pipe, x, y, cv=5).mean()

        return score

    return objective


ベイズ最適化を実行します.

In [5]:
objective = create_objective(df_x, df_y)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10, show_progress_bar=True)


[32m[I 2022-08-21 13:34:57,470][0m A new study created in memory with name: no-name-1da426af-8ee8-459c-a8ae-1af935f85155[0m
  self._init_valid()
 10%|█         | 1/10 [00:10<01:35, 10.56s/it]

[32m[I 2022-08-21 13:35:08,045][0m Trial 0 finished with value: 0.8204569706860838 and parameters: {'depth': 8, 'learning_rate': 0.03555373248189771, 'random_strength': 11, 'l2_leaf_reg': 1.2997824258612722}. Best is trial 0 with value: 0.8204569706860838.[0m


 20%|██        | 2/10 [01:09<05:11, 38.89s/it]

[32m[I 2022-08-21 13:36:06,773][0m Trial 1 finished with value: 0.8058502291130502 and parameters: {'depth': 11, 'learning_rate': 0.5908369949288038, 'random_strength': 15, 'l2_leaf_reg': 1.387353752438766}. Best is trial 0 with value: 0.8204569706860838.[0m


 30%|███       | 3/10 [01:31<03:40, 31.46s/it]

[32m[I 2022-08-21 13:36:29,392][0m Trial 2 finished with value: 0.8125792480070304 and parameters: {'depth': 10, 'learning_rate': 0.011867932161248292, 'random_strength': 5, 'l2_leaf_reg': 1.7718168065122994}. Best is trial 0 with value: 0.8204569706860838.[0m


 40%|████      | 4/10 [01:38<02:08, 21.47s/it]

[32m[I 2022-08-21 13:36:35,550][0m Trial 3 finished with value: 0.8114430983616849 and parameters: {'depth': 4, 'learning_rate': 0.7387742749641937, 'random_strength': 1, 'l2_leaf_reg': 2.680346723946653}. Best is trial 0 with value: 0.8204569706860838.[0m


 50%|█████     | 5/10 [01:41<01:14, 14.93s/it]

[32m[I 2022-08-21 13:36:38,873][0m Trial 4 finished with value: 0.8025045508756511 and parameters: {'depth': 1, 'learning_rate': 0.2474464652320711, 'random_strength': 11, 'l2_leaf_reg': 1.3902794817016133}. Best is trial 0 with value: 0.8204569706860838.[0m


 60%|██████    | 6/10 [02:35<01:52, 28.16s/it]

[32m[I 2022-08-21 13:37:32,729][0m Trial 5 finished with value: 0.8069801016885318 and parameters: {'depth': 11, 'learning_rate': 0.22499663260270944, 'random_strength': 1, 'l2_leaf_reg': 7.397452635032828}. Best is trial 0 with value: 0.8204569706860838.[0m


 70%|███████   | 7/10 [02:39<01:00, 20.30s/it]

[32m[I 2022-08-21 13:37:36,830][0m Trial 6 finished with value: 0.8305442219571904 and parameters: {'depth': 2, 'learning_rate': 0.22758695601379678, 'random_strength': 11, 'l2_leaf_reg': 4.747348088599291}. Best is trial 6 with value: 0.8305442219571904.[0m


 80%|████████  | 8/10 [02:43<00:30, 15.30s/it]

[32m[I 2022-08-21 13:37:41,419][0m Trial 7 finished with value: 0.8193208210407381 and parameters: {'depth': 3, 'learning_rate': 0.051900176818467494, 'random_strength': 11, 'l2_leaf_reg': 8.467877145676594}. Best is trial 6 with value: 0.8305442219571904.[0m


 90%|█████████ | 9/10 [02:50<00:12, 12.56s/it]

[32m[I 2022-08-21 13:37:47,961][0m Trial 8 finished with value: 0.8193019898311468 and parameters: {'depth': 5, 'learning_rate': 0.03338091846911143, 'random_strength': 7, 'l2_leaf_reg': 7.826509560521461}. Best is trial 6 with value: 0.8305442219571904.[0m


100%|██████████| 10/10 [04:23<00:00, 26.35s/it]

[32m[I 2022-08-21 13:39:20,951][0m Trial 9 finished with value: 0.8092398468394955 and parameters: {'depth': 12, 'learning_rate': 0.04370546436205171, 'random_strength': 13, 'l2_leaf_reg': 1.2477177451657504}. Best is trial 6 with value: 0.8305442219571904.[0m



