In [19]:
import os
from azureml.core import Workspace
from azureml.core import Environment
from azureml.core import ScriptRunConfig, Experiment
from azureml.widgets import RunDetails
from azureml.core import Model
from azureml.core.runconfig import PyTorchConfiguration
from azureml.core.runconfig import MpiConfiguration
from azureml.core.runconfig import RunConfiguration, DockerConfiguration


In [20]:
# Connect to the Azure ML (AML) workspace
ws = Workspace.from_config()

In [21]:
# Training script: the source directory and the entry-point file inside it.
source_dir = "train"
script_name = "dist_train_bert.py"

# Conda specification describing the training environment; it lives in the
# same directory as the training script.
environment_file = os.path.join(source_dir, "dist_train_bert_env.yml")

# Azure ML resource names: experiment, compute cluster and environment.
experiment_name = "dist-bert-livedoor"
compute_name = "shuit-gpu-clus04"
environment_name = "pl-env-lang"

In [25]:
# Create the training environment (this is slow only the first time).
# The run is executed inside a Docker container.
docker_config = DockerConfiguration(use_docker=True)

# Python dependencies come from the conda specification file; the container
# itself is based on an OpenMPI 4.1.0 / CUDA 11.1 / cuDNN 8 Ubuntu 18.04 image.
env = Environment.from_conda_specification(
    name=environment_name,
    file_path=environment_file,
)
env.docker.base_image = "mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.1-cudnn8-ubuntu18.04:20211221.v1"

In [32]:
# Training configuration.
#
# target_batch_size is the total batch size across the whole job; the value
# passed to the script as --batch_size is that total divided by the number of
# GPU workers (num_nodes * num_gpus).
num_nodes = 4            # number of nodes requested for the MPI job
num_gpus = 1             # value passed to the script as --gpus
target_batch_size = 256  # total batch size across all workers

# A batch size must be an integer. True division (`/`) would produce 64.0 and
# put "--batch_size 64.0" on the script's command line, so use integer
# division and fail early if the total cannot be split evenly.
world_size = num_nodes * num_gpus
per_worker_batch_size, remainder = divmod(target_batch_size, world_size)
if remainder:
    raise ValueError(
        f"target_batch_size ({target_batch_size}) must be divisible by "
        f"num_nodes * num_gpus ({world_size})"
    )

cluster = ws.compute_targets[compute_name]
dist_config = MpiConfiguration(node_count=num_nodes)

src = ScriptRunConfig(
    source_directory=source_dir,
    script=script_name,
    arguments=[
        "--batch_size", per_worker_batch_size,
        "--max_epochs", 20,
        "--gpus", num_gpus,
        "--accelerator", "ddp",
        "--num_nodes", num_nodes,
    ],
    compute_target=cluster,
    environment=env,
    docker_runtime_config=docker_config,
    distributed_job_config=dist_config,
)



In [33]:
# Submit the configured run to the experiment, then block until it finishes
# while streaming its logs into the notebook.
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(src)
run.wait_for_completion(show_output=True)

RunId: dist-bert-livedoor_1642040107_49a2312c
Web View: https://ml.azure.com/runs/dist-bert-livedoor_1642040107_49a2312c?wsid=/subscriptions/902f236f-44df-463a-a5cb-1516ab2a9cd2/resourcegroups/shuit-common/workspaces/shuit-ml-workspace&tid=72f988bf-86f1-41af-91ab-2d7cd011db47

Streaming azureml-logs/55_azureml-execution-tvmps_1c29e4c0a064c7b364af2b81c08db49b19c6305a96beb098631bdc279c306da9_p.txt

2022-01-13T02:23:05Z Successfully mounted a/an Blobfuse File System at /mnt/batch/tasks/shared/LS_root/jobs/shuit-ml-workspace/azureml/dist-bert-livedoor_1642040107_49a2312c/mounts/workspaceblobstore -- stdout/stderr: 
2022-01-13T02:23:06Z Failed to start nvidia-fabricmanager due to exit status 5 with output Failed to start nvidia-fabricmanager.service: Unit nvidia-fabricmanager.service not found.
. Please ignore this if the GPUs don't utilize NVIDIA® NVLink® switches.
2022-01-13T02:23:07Z Starting output-watcher...
2022-01-13T02:23:07Z IsDedicatedCompute == False, starting polling for Low-Pri

{'runId': 'dist-bert-livedoor_1642040107_49a2312c',
 'target': 'shuit-gpu-clus04',
 'status': 'Completed',
 'startTimeUtc': '2022-01-13T02:23:00.492081Z',
 'endTimeUtc': '2022-01-13T02:33:09.026733Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': '1643985d-7fb5-4f7f-9a9d-3981af313466',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json',
  'azureml.git.repository_uri': 'https://github.com/ShuntaIto/azureml-pl-sample.git',
  'mlflow.source.git.repoURL': 'https://github.com/ShuntaIto/azureml-pl-sample.git',
  'azureml.git.branch': 'main',
  'mlflow.source.git.branch': 'main',
  'azureml.git.commit': 'e5ae7b0a06d72f7b1371675f42ef9708cc8ea2c5',
  'mlflow.source.git.commit': 'e5ae7b0a06d72f7b1371675f42ef9708cc8ea2c5',
  'azureml.git.dirty': 'True'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'script': 'dist_train_bert.py',
  'command': '',
  'useAbsolutePat

In [13]:
# Register the trained checkpoint as a model.
#
# model_path refers to a file among those uploaded to the run, which are
# listed with "/" separators (e.g. 'outputs/model.ckpt'). It is therefore
# written literally instead of being built with os.path.join, which would
# yield a backslash-separated path when the notebook runs on Windows.
run.register_model(
    model_name="bert-livedoor-model",
    model_path="outputs/model.ckpt",
    model_framework=Model.Framework.PYTORCH,
)

ModelPathNotFoundException: ModelPathNotFoundException:
	Message: Could not locate the provided model_path outputs/model.onnx in the set of files uploaded to the run: ['azureml-logs/55_azureml-execution-tvmps_0f93e9792dd9ffbd80a0b0cbeb00a8fab0b14be474a32b815609bb1f4bffa494_d.txt', 'azureml-logs/55_azureml-execution-tvmps_6ad0f50cd79742bed158768f299d376ef743927df3983464d8dc993376693eb2_d.txt', 'azureml-logs/65_job_prep-tvmps_0f93e9792dd9ffbd80a0b0cbeb00a8fab0b14be474a32b815609bb1f4bffa494_d.txt', 'azureml-logs/65_job_prep-tvmps_6ad0f50cd79742bed158768f299d376ef743927df3983464d8dc993376693eb2_d.txt', 'azureml-logs/70_driver_log_0.txt', 'azureml-logs/70_driver_log_1.txt', 'azureml-logs/70_mpi_log.txt', 'azureml-logs/75_job_post-tvmps_0f93e9792dd9ffbd80a0b0cbeb00a8fab0b14be474a32b815609bb1f4bffa494_d.txt', 'azureml-logs/75_job_post-tvmps_6ad0f50cd79742bed158768f299d376ef743927df3983464d8dc993376693eb2_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/0_140_azureml.log', 'logs/azureml/1_119_azureml.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'outputs/model.ckpt']
                See https://aka.ms/run-logging for more details.
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Could not locate the provided model_path outputs/model.onnx in the set of files uploaded to the run: ['azureml-logs/55_azureml-execution-tvmps_0f93e9792dd9ffbd80a0b0cbeb00a8fab0b14be474a32b815609bb1f4bffa494_d.txt', 'azureml-logs/55_azureml-execution-tvmps_6ad0f50cd79742bed158768f299d376ef743927df3983464d8dc993376693eb2_d.txt', 'azureml-logs/65_job_prep-tvmps_0f93e9792dd9ffbd80a0b0cbeb00a8fab0b14be474a32b815609bb1f4bffa494_d.txt', 'azureml-logs/65_job_prep-tvmps_6ad0f50cd79742bed158768f299d376ef743927df3983464d8dc993376693eb2_d.txt', 'azureml-logs/70_driver_log_0.txt', 'azureml-logs/70_driver_log_1.txt', 'azureml-logs/70_mpi_log.txt', 'azureml-logs/75_job_post-tvmps_0f93e9792dd9ffbd80a0b0cbeb00a8fab0b14be474a32b815609bb1f4bffa494_d.txt', 'azureml-logs/75_job_post-tvmps_6ad0f50cd79742bed158768f299d376ef743927df3983464d8dc993376693eb2_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/0_140_azureml.log', 'logs/azureml/1_119_azureml.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'outputs/model.ckpt']\n                See https://aka.ms/run-logging for more details."
    }
}