このノートブックでは, これまでに定義した関数を用いて, Optunaによるハイパーパラメータ最適化の各試行の結果をMLflow Trackingで記録します.

In [1]:
from typing import Any, Callable, Dict, List, Tuple

import category_encoders as ce
import mlflow
import optuna
import seaborn as sns
from catboost import CatBoostClassifier
from pandas import DataFrame
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline

# Load seaborn's 'titanic' example dataset.
df = sns.load_dataset('titanic')
df.head()  # peek at the first rows; the result is not stored

# Columns kept as model inputs; 'survived' is the prediction target.
feature_names = ['class', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embark_town', 'deck']
df_x, df_y = df[feature_names], df['survived']

class IntOrdEncoder(ce.OrdinalEncoder):
    """Ordinal encoder whose encoded columns are cast to ``int``.

    Delegates the encoding to ``ce.OrdinalEncoder`` and, after every
    ``transform`` / ``fit_transform``, converts each column listed in
    ``cols`` with ``astype(int)``.
    """

    def __init__(self, cols, mapping, handle_unknown):
        """Forward all arguments to the parent encoder and remember ``cols``."""
        super().__init__(cols=cols, mapping=mapping, handle_unknown=handle_unknown)
        self.cols = cols

    def _cast_cols_to_int(self, frame):
        """Cast every column named in ``self.cols`` to ``int`` in place; return ``frame``."""
        for name in self.cols:
            frame[name] = frame[name].astype(int)
        return frame

    def transform(self, *args, **kwargs):
        """Run the parent ``transform`` and cast the encoded columns to int.

        The parent's result is expected to be a pandas DataFrame.
        """
        return self._cast_cols_to_int(super().transform(*args, **kwargs))

    def fit_transform(self, *args, **kwargs):
        """Run the parent ``fit_transform`` and cast the encoded columns to int.

        The parent's result is expected to be a pandas DataFrame.
        """
        return self._cast_cols_to_int(super().fit_transform(*args, **kwargs))


# Hold out 33% of the rows for testing; the fixed seed makes the shuffled
# split reproducible.
x_tr, x_te, y_tr, y_te = train_test_split(
    df_x,
    df_y,
    test_size=0.33,
    shuffle=True,
    random_state=42,
)


def titanic_cat_encoder() -> Tuple[List[str], ce.OrdinalEncoder]:
    """Build the ordinal encoder for the categorical Titanic columns.

    Returns:
        A ``(cols, encoder)`` pair: the list of categorical column names and
        an ``IntOrdEncoder`` configured with a fixed label-to-integer mapping
        for each of them.
    """
    # Labels per column, listed in the order of their integer codes (from 0).
    ordered_labels = {
        "class": ["First", "Second", "Third"],
        "sex": ["male", "female"],
        "embark_town": ["Southampton", "Cherbourg", "Queenstown"],
        "deck": ["A", "B", "C", "D", "E", "F", "G"],
    }
    cols = list(ordered_labels)
    mapping = [
        {"col": name, "mapping": {label: code for code, label in enumerate(labels)}}
        for name, labels in ordered_labels.items()
    ]
    encoder = IntOrdEncoder(cols=cols, mapping=mapping, handle_unknown='value')
    return (cols, encoder)


def fit_eval(x, y, encoder, params, cols, nested):
    """Cross-validate an encoder + CatBoost pipeline inside an MLflow run.

    Args:
        x: Feature data passed to ``cross_val_score``.
        y: Target data passed to ``cross_val_score``.
        encoder: First step of the pipeline, applied before the classifier.
        params: Keyword arguments for ``CatBoostClassifier``. Must contain
            ``"iterations"`` and ``"cat_features"``. ``"cat_features"`` is a
            placeholder: ``"none"`` means no categorical features, any other
            value means ``cols`` is used. The dict is not modified.
        cols: Column names given to CatBoost as ``cat_features`` when the
            placeholder is not ``"none"``.
        nested: Forwarded to ``mlflow.start_run(nested=...)``.

    Returns:
        The mean of the 5-fold ``cross_val_score`` results.
    """
    with mlflow.start_run(nested=nested):
        # Log the human-readable placeholder rather than the column list.
        mlflow.log_param("iterations", params["iterations"])
        mlflow.log_param("cat_features", params["cat_features"])

        # Substitute the real cat_features value on a copy, so the caller's
        # dict keeps its placeholder and can be passed in again safely.
        clf_params = dict(params)
        clf_params["cat_features"] = None if params["cat_features"] == "none" else cols

        # Build the pipeline: encoder followed by the classifier.
        clf = CatBoostClassifier(**clf_params, verbose=False)
        pipe = make_pipeline(encoder, clf)

        # Compute and log the cross-validation score.
        score = cross_val_score(pipe, x, y, cv=5).mean()
        mlflow.log_metric("cv_score", score)

        return score


def suggest_params(trial: optuna.Trial) -> Dict:
    """Sample the hyperparameters for one Optuna trial.

    Returns:
        A dict with ``"iterations"`` (the CatBoost iteration count, 200 or
        1000) and ``"cat_features"`` (``"none"`` or ``"given"``).
    """
    # "cat_features" is really a list of column names; a placeholder label is
    # sampled here instead so that the logged value stays easy to read.
    return {
        "iterations": trial.suggest_categorical("iterations", [200, 1000]),
        "cat_features": trial.suggest_categorical("cat_features", ["none", "given"]),
    }


Optunaで利用するための目的関数を定義します. 訓練と評価を行い下位のRunにログを記録する関数(fit_eval)は既に定義しているので, ここではtrialからハイパーパラメータを取得し, その評価値を返す関数を定義します.

In [2]:
def create_objective(x: DataFrame, y: DataFrame) -> Callable[[optuna.Trial], float]:
    """Build an Optuna objective that scores hyperparameters on ``(x, y)``.

    Args:
        x: Training features forwarded to ``fit_eval``.
        y: Training targets forwarded to ``fit_eval``.

    Returns:
        A function that takes an Optuna trial, samples hyperparameters with
        ``suggest_params`` and returns the score computed by ``fit_eval``,
        which is called with ``nested=True``.
    """
    # The encoder and its column list are built once and shared by all trials.
    cols, encoder = titanic_cat_encoder()

    def objective(trial: optuna.Trial) -> float:
        """Evaluate the hyperparameters sampled for a single trial."""
        params = suggest_params(trial)

        return fit_eval(x, y, encoder, params, cols, nested=True)

    return objective

あとはMLflowの上位のRunを開始し, その中でcreate_objectiveが返す目的関数をOptunaに与えます.

In [3]:
# Parent MLflow run: the objective calls fit_eval with nested=True, so each
# trial's run is started while this run is active.
with mlflow.start_run():
    objective = create_objective(x_tr, y_tr)
    # Maximize the value returned by the objective over 10 trials.
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=10, show_progress_bar=True)


[32m[I 2022-06-26 11:22:08,232][0m A new study created in memory with name: no-name-4726e414-719b-4477-a0e9-c6cac307e00c[0m
  self._init_valid()


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2022-06-26 11:22:13,617][0m Trial 0 finished with value: 0.82390756302521 and parameters: {'iterations': 1000, 'cat_features': 'given'}. Best is trial 0 with value: 0.82390756302521.[0m
[32m[I 2022-06-26 11:22:16,349][0m Trial 1 finished with value: 0.8155042016806722 and parameters: {'iterations': 1000, 'cat_features': 'none'}. Best is trial 0 with value: 0.82390756302521.[0m
[32m[I 2022-06-26 11:22:21,800][0m Trial 2 finished with value: 0.82390756302521 and parameters: {'iterations': 1000, 'cat_features': 'given'}. Best is trial 0 with value: 0.82390756302521.[0m
[32m[I 2022-06-26 11:22:27,471][0m Trial 3 finished with value: 0.82390756302521 and parameters: {'iterations': 1000, 'cat_features': 'given'}. Best is trial 0 with value: 0.82390756302521.[0m
[32m[I 2022-06-26 11:22:28,663][0m Trial 4 finished with value: 0.8205462184873948 and parameters: {'iterations': 200, 'cat_features': 'given'}. Best is trial 0 with value: 0.82390756302521.[0m
[32m[I 2022-06-26

先ほどのノートブックと同様にして, MLflow Tracking UIで結果を可視化できます.