# How to do distributed training on Azure ML service with Keras using Horovod

This notebook demonstrates how to perform distributed training using Keras with Horovod on Azure Machine Learning.

Let's import the required Azure ML packages and define the needed constants.

In [1]:
import azureml
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.dnn import TensorFlow
from azureml.widgets import RunDetails
from azureml.core.runconfig import MpiConfiguration


# Identifiers of the Azure ML workspace this notebook connects to.
SUBSCRIPTION_ID = "fe375bc2-9f1a-4909-ad0d-9319806d5e97"
RESOURCE_GROUP = "adb_rg"
WORKSPACE_NAME = "repro"

# Name of the compute target to train on, and the folder that is uploaded
# as the source directory of the training job.
CLUSTER_NAME = "gpucluster"
PROJECT_FOLDER = "./"

# Report which SDK version is installed in this kernel.
print("SDK version: {}".format(azureml.core.VERSION))

SDK version: 1.0.39


## Initialize Azure ML workspace

In [2]:
# Build a handle to the workspace identified by the three constants
# from the configuration cell.
ws = Workspace(
    subscription_id=SUBSCRIPTION_ID,
    resource_group=RESOURCE_GROUP,
    workspace_name=WORKSPACE_NAME,
)

# Write the workspace details out via write_config() so they can be reused.
ws.write_config()

## Initialize Azure ML compute

In [3]:
# Reuse the compute target named CLUSTER_NAME if the workspace already has
# one; otherwise provision a new AmlCompute cluster under that same name.
try:
    gpu_cluster = ComputeTarget(workspace=ws, name=CLUSTER_NAME)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # Lookup failed: describe a cluster of up to two STANDARD_NC6 nodes.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',
        max_nodes=2,
    )
    # Create it under CLUSTER_NAME -- the name must match the one used by the
    # lookup above so that a later run of this cell finds the cluster.
    gpu_cluster = ComputeTarget.create(ws, CLUSTER_NAME, compute_config)

# Wait for the cluster operation to finish, streaming status output.
gpu_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned


## Initialize TensorFlow estimator

In [36]:
# Describe the training job: run train.py from PROJECT_FOLDER on the GPU
# cluster, across two nodes, with an MpiConfiguration for distribution.
# NOTE: framework_version is not passed, so the SDK falls back to its
# default TensorFlow version.
estimator = TensorFlow(
    source_directory=PROJECT_FOLDER,
    entry_script='train.py',
    compute_target=gpu_cluster,
    node_count=2,
    use_gpu=True,
    distributed_training=MpiConfiguration(),
)

framework_version is not specified, defaulting to version 1.13.


## Create experiment and submit run for execution

In [37]:
# Submit the estimator as a run of the "keras_horovod" experiment, then
# display the RunDetails widget for that run.
experiment = Experiment(workspace=ws, name="keras_horovod")
run = experiment.submit(estimator)
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…