# やること

- mlflowの使い方
- クラサバでやる

## mlflow install

In [4]:
# !pip install mlflow

## mlflow serverを起動

コンソールをたてて起動しておく

In [14]:
# mlflow server -h 0.0.0.0 -p 5000

## import

In [98]:
import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.datasets import load_digits
import numpy as np
import pandas as pd

import pickle
import os

## データの用意

In [99]:
d = load_digits()
data = d['data']
d.keys()

dict_keys(['data', 'target', 'target_names', 'images', 'DESCR'])

In [100]:
X = pd.DataFrame(data)
y = d['target']
y_target_names = [str(i) for i in d['target_names'].tolist()]

## 分析する

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=71)

In [102]:
X_train.shape, X_test.shape

((1257, 64), (540, 64))

In [103]:
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=71)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=3, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=71, verbose=0,
                       warm_start=False)

In [104]:
# 精度確認
y_pred = rf_clf.predict(X_test)
cr_print = classification_report(y_pred=y_pred, y_true=y_test, target_names=y_target_names)
print(cr_print)

              precision    recall  f1-score   support

           0       0.93      1.00      0.96        52
           1       0.93      0.63      0.75        63
           2       0.82      0.94      0.87        52
           3       0.81      0.78      0.79        59
           4       0.94      0.94      0.94        65
           5       0.96      0.82      0.88        56
           6       0.92      0.98      0.95        45
           7       0.81      0.94      0.87        49
           8       0.77      0.66      0.71        50
           9       0.65      0.84      0.73        49

    accuracy                           0.85       540
   macro avg       0.85      0.85      0.85       540
weighted avg       0.86      0.85      0.85       540



## mlflowへ保存

In [105]:
# mlflow serverのurlを指定する → 指定しない場合はカレントディレクトリにログが保存される
remote_server_uri = 'http://host.docker.internal:5000/'
mlflow.set_tracking_uri(remote_server_uri)

# 対象のexperimentを指定する
# experiment is 何？ → モデルのリポジトリ名的なものと思ってもらえれば
mlflow.set_experiment('mnist')

INFO: 'mnist' does not exist. Creating a new experiment


experimentが存在しない場合は自動で作成される

In [106]:
model_save_path = os.path.join(os.getcwd(), 'tmp.obj')
with open(model_save_path, mode='wb') as f:
    pickle.dumps(rf_clf)

In [109]:
model_save_path

'/app/notebooks/tmp.obj'

In [107]:
!ls -lht

total 128K
-rw-r--r-- 1 root root    0 Nov 30 02:38 tmp.obj
-rw-r--r-- 1 root root 123K Nov 30 02:37 1_チュートリアル.ipynb
drwxr-xr-x 3 root root   96 Nov 30 02:10 mlruns
-rw-r--r-- 1 root root  555 Nov 29 11:25 2_Kaggleデータ分析でMLFlowを実践する.ipynb


In [111]:
with mlflow.start_run() as run:
    # paramterの保存
    for k, v in rf_clf.get_params().items():
        mlflow.log_param(k, v)
    
    # mlflow.log_params(rf_clf.get_params())  でも良い
    # mlflow.xxxs 系はdictが渡せる

    # metrics(精度とかその辺)の保存
    mlflow.log_metric('accuracy', accuracy_score(y_pred=y_pred, y_true=y_test))
    mlflow.log_metric('precision', precision_score(y_pred=y_pred, y_true=y_test, average='micro'))
    mlflow.log_metric('recall', recall_score(y_pred=y_pred, y_true=y_test, average='micro'))
    mlflow.log_metric('f1_score', f1_score(y_pred=y_pred, y_true=y_test, average='micro'))
    
    # その他的な値を保存したい場合は、tagで保存可能
    mlflow.set_tag('example', 10)
    mlflow.set_tag('example2', 'test')
    
    # モデルの保存。。。はできない
    # mlflow.log_artifact(model_save_path)

## 保存されているか確認

In [112]:
# http://localhost:5000/ を参照

## 保存されているか、プログラマブルに確認

In [113]:
# client側で取得方法
client = mlflow.tracking.MlflowClient(tracking_uri=remote_server_uri)

# experiment_nameでexperimentを検索
target_experiment = client.get_experiment_by_name(experiment_name)
target_experiment

<Experiment: artifact_location='./mlruns/0', experiment_id='0', lifecycle_stage='active', name='mnist', tags={}>

In [86]:
run_ids = client.search_runs([target_experiment.experiment_id])

In [114]:
run_ids[0]

<Run: data=<RunData: metrics={'accuracy': 0.848148148148,
 'f1_score': 0.848148148148,
 'precision': 0.848148148148,
 'recall': 0.848148148148}, params={'bootstrap': 'True',
 'class_weight': 'None',
 'criterion': 'gini',
 'max_depth': '3',
 'max_features': 'auto',
 'max_leaf_nodes': 'None',
 'min_impurity_decrease': '0.0',
 'min_impurity_split': 'None',
 'min_samples_leaf': '1',
 'min_samples_split': '2',
 'min_weight_fraction_leaf': '0.0',
 'n_estimators': '100',
 'n_jobs': 'None',
 'oob_score': 'False',
 'random_state': '71',
 'verbose': '0',
 'warm_start': 'False'}, tags={'example': '10',
 'example2': 'test',
 'mlflow.source.name': '/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py',
 'mlflow.source.type': 'LOCAL',
 'mlflow.user': 'root'}>, info=<RunInfo: artifact_uri='./mlruns/1/191c732924bf4e7b912ca5afec6ab281/artifacts', end_time=1575079873092, experiment_id='1', lifecycle_stage='active', run_id='191c732924bf4e7b912ca5afec6ab281', run_uuid='191c732924bf4e7b912ca5afec6a

In [115]:
len(run_ids)

4