In [13]:
import os
from azureml.core import Workspace
from azureml.core import Environment
from azureml.core import ScriptRunConfig, Experiment
from azureml.widgets import RunDetails
from azureml.core import Model
from azureml.core.runconfig import RunConfiguration, DockerConfiguration
from azureml.train.hyperdrive import BayesianParameterSampling
from azureml.train.hyperdrive import HyperDriveConfig, PrimaryMetricGoal
from azureml.train.hyperdrive import choice, uniform


In [14]:
# Connect to the Azure ML workspace.
# Workspace.from_config() is called without arguments, so the workspace is
# resolved from a saved configuration rather than from values in this notebook
# (presumably a local config.json -- confirm where it lives for this project).
ws = Workspace.from_config()

In [15]:
# --- Training code -----------------------------------------------------------
# Directory that is submitted with the run, and the entry script inside it.
source_dir = "train"
script_name = "train_bert.py"

# Conda specification used to build the environment; it sits next to the
# training script, so the path is derived from source_dir.
environment_file = os.path.join(source_dir, "train_bert_env.yml")

# --- Azure ML resource names -------------------------------------------------
compute_name = "shuit-gpu-clus01"
experiment_name = "hp-tuning-bert-livedoor"
environment_name = "pl-env-lang"

In [16]:
# Run the training script inside a Docker container.
docker_config = DockerConfiguration(use_docker=True)

# Create the training environment from the conda specification file.
# Author's note: this step is slow on the first run only.
env = Environment.from_conda_specification(
    name=environment_name,
    file_path=environment_file,
)

# Base image for the container; the tag names OpenMPI 4.1.0, CUDA 11.0.3,
# cuDNN 8 and Ubuntu 18.04.
env.docker.base_image = "mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.0.3-cudnn8-ubuntu18.04"


In [18]:
# Hyperparameter search space. Each key is a command-line flag that HyperDrive
# appends (with the sampled value) to the training script's arguments; both
# learning rates are drawn from a continuous uniform range.
search_space = {
    "--bert_lr": uniform(0.0001, 0.001),
    "--output_lr": uniform(0.001, 0.01),
}

# Bayesian sampling chooses new candidates based on results of earlier runs.
ps = BayesianParameterSampling(search_space)


In [22]:
# Training run configuration.
# These arguments are the same for every trial; the sampled hyperparameters
# are added on top of them by HyperDrive.
fixed_args = [
    "--batch_size", 256,
    "--max_epochs", 40,
    "--gpus", 1,
]

src = ScriptRunConfig(
    source_directory=source_dir,
    script=script_name,
    arguments=fixed_args,
    compute_target=compute_name,
    environment=env,
    docker_runtime_config=docker_config,
)


In [24]:
# HyperDrive sweep: maximize the 'val_acc' metric logged by the training
# script, over at most 40 trials with 2 running at a time.
# No early-termination policy is passed: Bayesian sampling does not support
# one, so the default (no policy) is used.
hyperdrive_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=ps,
    primary_metric_name='val_acc',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=40,
    max_concurrent_runs=2,
)


In [25]:
# Submit the sweep to the experiment and block until it finishes,
# streaming the HyperDrive log into the cell output.
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(config=hyperdrive_config)
run.wait_for_completion(show_output=True)


RunId: HD_028a523c-7b7f-4513-9541-c9d056931a65
Web View: https://ml.azure.com/runs/HD_028a523c-7b7f-4513-9541-c9d056931a65?wsid=/subscriptions/902f236f-44df-463a-a5cb-1516ab2a9cd2/resourcegroups/shuit-common/workspaces/shuit-ml-workspace&tid=72f988bf-86f1-41af-91ab-2d7cd011db47

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-06-03T04:53:40.536495][API][INFO]Experiment created<END>\n""<START>[2021-06-03T04:53:41.347791][GENERATOR][INFO]Trying to sample '2' jobs from the hyperparameter space<END>\n""<START>[2021-06-03T04:53:41.606406][GENERATOR][INFO]Successfully sampled '2' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_028a523c-7b7f-4513-9541-c9d056931a65
Web View: https://ml.azure.com/runs/HD_028a523c-7b7f-4513-9541-c9d056931a65?wsid=/subscriptions/902f236f-44df-463a-a5cb-1516ab2a9cd2/resourcegroups/shuit-common/workspaces/shuit-ml-workspace&tid=72f988bf-86f1-41af-91ab-2d7cd011db47



{'runId': 'HD_028a523c-7b7f-4513-9541-c9d056931a65',
 'target': 'shuit-gpu-clus01',
 'status': 'Completed',
 'startTimeUtc': '2021-06-03T04:53:40.186913Z',
 'endTimeUtc': '2021-06-03T09:10:14.232317Z',
 'properties': {'primary_metric_config': '{"name": "val_acc", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '647eba09-2816-4657-baad-46cd3c5d611d',
  'score': '0.92578125',
  'best_child_run_id': 'HD_028a523c-7b7f-4513-9541-c9d056931a65_28',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://shuitmlstorage.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_028a523c-7b7f-4513-9541-c9d056931a65/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=PKfsAX4DeZb6QcbjFIFBCdDqc4Tg%2B%2FzKSd7owwxYA4M%3D&st=2021-06-03T09%3A01%3A11Z&se=2021-06-03T17%3A11%3A11Z&sp=r'},
 'submittedBy': 'Shunta Ito'}

In [27]:
# Select the child run with the best value of the sweep's primary metric.
best_run = run.get_best_run_by_primary_metric()

# get_best_run_by_primary_metric() returns None when no child run logged the
# primary metric (e.g. every trial failed). Fail here with a clear message
# instead of an AttributeError on the next line or in the registration cell.
if best_run is None:
    raise RuntimeError(
        "No child run reported the primary metric; "
        "cannot determine the best run of the HyperDrive sweep."
    )

# Show the full argument list of the winning trial: the fixed arguments
# followed by the sampled hyperparameter values.
print(best_run.get_details()['runDefinition']['arguments'])


['--batch_size', '256', '--max_epochs', '40', '--gpus', '1', '--bert_lr', '0.00016016274069493184', '--output_lr', '0.001027013969309762']


In [28]:
# Register the ONNX model produced by the best trial.
# model_path refers to a file in the run's remote artifact store, not on the
# local disk, so it is written with forward slashes. Building it with
# os.path.join would produce 'outputs\model.onnx' on a Windows client.
best_run.register_model(
    model_name="bert-livedoor-model",
    model_path="outputs/model.onnx",
    model_framework=Model.Framework.ONNX,
)

Model(workspace=Workspace.create(name='shuit-ml-workspace', subscription_id='902f236f-44df-463a-a5cb-1516ab2a9cd2', resource_group='shuit-common'), name=bert-livedoor-model, id=bert-livedoor-model:6, version=6, tags={}, properties={})