## Azure ML Experiment Submission

To use this notebook, you need to download the `config.json` file from your Azure ML Workspace and place it in this folder. This will allow us to get the workspace reference right away:

In [1]:
from azureml.core import Workspace

# Load the workspace described by the config.json placed next to this notebook.
try:
    ws = Workspace.from_config()
    # NOTE(review): ws.location is printed twice; kept unchanged so the output
    # stays consistent with the recorded cell output below.
    print(ws.name, ws.location, ws.resource_group, ws.location, sep='\t')
    print('Library configuration succeeded')
except Exception as err:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt/SystemExit still propagate, and report the underlying
    # cause instead of hiding it behind a generic message.
    print('Workspace not found')
    print('Reason: {}: {}'.format(type(err).__name__, err))

quick-starts-ws-133730	southcentralus	aml-quickstarts-133730	southcentralus
Library configuration succeeded


Then make sure we have the compute cluster. If the cluster does not exist - we will create it programmatically!

In [8]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the CPU cluster to reuse, or to create when it is missing.
cluster_name = "AUTOML-cluster"

# Try to attach to a compute target with that name; the except branch
# provisions a new cluster when the lookup raises ComputeTargetException.
try:
    cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    provisioning = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D12_V2',
        max_nodes=5,
    )
    cluster = ComputeTarget.create(ws, cluster_name, provisioning)
else:
    print('Found existing cluster, use it.')

# Called for both the reused and the freshly created cluster.
cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [3]:
# Create the dataset and save it to a local file
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
import os
import pickle
import numpy as np

print('fetching MNIST data...')
mnist = fetch_openml('mnist_784')

# Depending on the scikit-learn version, fetch_openml may return pandas objects
# instead of NumPy arrays. Coerce both fields to arrays so the positional
# indexing below works either way, and keep pixels as floats so the training
# script can rescale them.
mnist['data'] = np.asarray(mnist['data'], dtype=np.float64)
mnist['target'] = np.array([int(x) for x in mnist['target']])

# use a random subset of n records to reduce training time.
n = 20000
# Fixed seed so that re-running the notebook produces the same subset.
rng = np.random.RandomState(42)
# Permute the actual number of rows rather than a hard-coded 70000.
shuffle_index = rng.permutation(mnist['data'].shape[0])[:n]
X, y = mnist['data'][shuffle_index], mnist['target'][shuffle_index]

# Persist the (X, y) subset as a pickle under ./dataset.
os.makedirs('dataset',exist_ok=True)
with open('dataset/mnist.pkl','wb') as f:
    pickle.dump((X,y),f)

print('Done')


fetching MNIST data...
Done


Now upload the MNIST dataset into the Azure ML Workspace:

In [4]:
# Get the workspace's default datastore and upload the local ./dataset folder
# to the 'mnist_data' path in it, replacing any files already there.
ds = ws.get_default_datastore()
# Last expression of the cell: its return value is shown as the cell output.
ds.upload('./dataset', target_path='mnist_data', overwrite=True, show_progress=True)

Uploading an estimated of 1 files
Uploading ./dataset/mnist.pkl
Uploaded ./dataset/mnist.pkl, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_b86733eee09c4d619819a4ca9907b5b5

In [None]:
# # Alternative (not used): register a tabular dataset in the Azure ML workspace from a web path
# from azureml.core.dataset import Dataset

# web_path ='https://dprepdata.blob.core.windows.net/demo/Titanic.csv'
# titanic_ds = Dataset.Tabular.from_delimited_files(path=web_path)



Now let us create the training script:

In [5]:
%%writefile mytrain.py
import argparse
import json
import os
from azureml.core import Run
from azureml.core.model import Model
import pickle
import keras
from keras.layers import Dense,Dropout

# Command-line interface: data location plus the tunable hyperparameters.
parser = argparse.ArgumentParser(description='MNIST Train')
parser.add_argument('--data_folder', type=str, dest='data_folder', help='data folder mounting point')
parser.add_argument('--epochs', type=int, default=3)
parser.add_argument('--batch_size', type=int, default=128)
parser.add_argument('--dropout', type=float)
parser.add_argument('--hidden', type=int, default=100)

args = parser.parse_args()

# Resolve the dataset path. Prefer the pickle under --data_folder; fall back to
# the local 'dataset/mnist.pkl' copy when the argument is omitted or the file is
# not present there. The original script unconditionally overwrote the path
# with the local copy, so --data_folder was never honoured, and
# os.path.join(None, ...) raised a TypeError when the argument was missing.
local_fn = 'dataset/mnist.pkl'
mnist_fn = local_fn
if args.data_folder:
    mounted_fn = os.path.join(args.data_folder, 'mnist_data','mnist.pkl')
    if os.path.exists(mounted_fn):
        mnist_fn = mounted_fn
    else:
        print('Warning: {} not found, falling back to {}'.format(mounted_fn, local_fn))
print('Loading data from', mnist_fn)

# NOTE: pickle.load executes arbitrary code from the file; only use it on a
# dataset file produced by a trusted source.
with open(mnist_fn,'rb') as f:
    X,y = pickle.load(f)

# Scale pixel values to [0, 1]. Out-of-place division also works when the
# stored array has an integer dtype (in-place `/=` would raise there).
X = X / 255.0
# One-hot encode the 10 digit classes.
y = keras.utils.to_categorical(y,10)

# First 80% of the records for training, the remainder for validation/test.
n = int(0.8*X.shape[0])
x_train = X[0:n]
y_train = y[0:n]
x_test = X[n:]
y_test = y[n:]

# One hidden ReLU layer, optional dropout, softmax over the 10 classes.
model = keras.models.Sequential()
model.add(Dense(args.hidden,input_shape=(784,),activation='relu'))
# A dropout value of 1 (or an omitted --dropout) means "no dropout layer".
if args.dropout is not None and args.dropout<1:
    model.add(Dropout(args.dropout))
model.add(Dense(10,activation='softmax'))

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=args.batch_size,
          epochs=args.epochs,
          verbose=1,
          validation_data=(x_test, y_test))

# score is [loss, accuracy], matching the metrics passed to compile().
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

# Save the trained model under ./outputs.
os.makedirs('outputs',exist_ok=True)
model.save('outputs/mnist_model.hdf5')

# Log metrics
run = Run.get_context()
run.log('Test Loss', score[0])
run.log('Accuracy', score[1])

Overwriting mytrain.py


Now let's submit the experiment to run:

In [9]:
from azureml.core import Experiment
from azureml.train.estimator import Estimator

# Experiment under which the stand-alone training run is submitted.
experiment_name = 'Keras-MNIST'
exp = Experiment(name=experiment_name, workspace=ws)

# Arguments forwarded to mytrain.py: the workspace's default datastore is
# passed as the value of --data_folder.
script_params = {'--data_folder': ws.get_default_datastore()}

# Estimator describing how to run mytrain.py from the current directory on the
# cluster, with keras and tensorflow requested as pip packages.
est = Estimator(
    entry_script='mytrain.py',
    source_directory='.',
    compute_target=cluster,
    script_params=script_params,
    pip_packages=['keras', 'tensorflow'],
)

'Estimator' is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or an Azure ML curated environment.


In [10]:
# Submit the estimator to the 'Keras-MNIST' experiment; `run` is the handle to
# the submitted run.
run = exp.submit(est)

Submitting /mnt/batch/tasks/shared/LS_root/mounts/clusters/notebook133730/code/Users/odl_user_133730 directory for run. The size of the directory >= 25 MB, so it can take a few minutes.


### Hyperparameter optimization using Hyperdrive

In [11]:
# Import the HyperDrive names used in this notebook explicitly instead of
# `import *`, so it is clear where each name comes from.
from azureml.train.hyperdrive import (
    HyperDriveConfig,
    MedianStoppingPolicy,
    PrimaryMetricGoal,
    RandomParameterSampling,
    choice,
)

# Random search over the command-line arguments accepted by mytrain.py.
# A --dropout value of 1 makes the training script skip the dropout layer.
param_sampling = RandomParameterSampling({
         '--hidden': choice([50,100,200,300]),
         '--batch_size': choice([64,128]), 
         '--epochs': choice([5,10,50]),
         '--dropout': choice([0.5,0.8,1])
    })

In [12]:
# Early-termination policy for the sweep, configured with an evaluation
# interval of 1 and no evaluation delay.
early_termination_policy = MedianStoppingPolicy(evaluation_interval=1, delay_evaluation=0)
# Sweep configuration: reuse the estimator `est`, sample hyperparameters from
# `param_sampling`, and maximise the 'Accuracy' metric that mytrain.py logs.
# At most 16 runs in total, 4 of them concurrently.
hd_config = HyperDriveConfig(estimator=est,
                            hyperparameter_sampling=param_sampling,
                            policy=early_termination_policy,
                            primary_metric_name='Accuracy',
                            primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                            max_total_runs=16,
                            max_concurrent_runs=4)
# The sweep is submitted under its own experiment, 'keras-hyperdrive'.
experiment = Experiment(workspace=ws, name='keras-hyperdrive')
hyperdrive_run = experiment.submit(hd_config)



In [13]:
# Uncomment the next line to cancel the HyperDrive run.
# hyperdrive_run.cancel()
from azureml.widgets import RunDetails
# Show the run-details widget for the HyperDrive run.
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

### Registering the Best Model

In [23]:
# Wait for the sweep to finish before asking for the best run. The widget cell
# above does not block, so on a fresh "Run All" this cell would otherwise query
# a sweep that is still running.
hyperdrive_run.wait_for_completion(show_output=False)

best_run = hyperdrive_run.get_best_run_by_primary_metric()
# No child run may have logged the primary metric; fail with a clear message
# instead of an AttributeError on None.
if best_run is None:
    raise RuntimeError("No child run of the HyperDrive run logged the 'Accuracy' metric")

best_run_metrics = best_run.get_metrics()
print(best_run)
print('Best accuracy: {}'.format(best_run_metrics['Accuracy']))

Run(Experiment: keras-hyperdrive,
Id: keras-hyperdrive_1575928171879731_15,
Type: azureml.scriptrun,
Status: Completed)
Best accuracy: 0.9702500104904175


In [24]:
# Register the model file saved by the best run (the path mytrain.py writes to)
# under the name 'mnist_keras'; the returned Model is shown as the cell output.
best_run.register_model(model_name='mnist_keras', model_path='outputs/mnist_model.hdf5')

Model(workspace=Workspace.create(name='AzMLWorkspace', subscription_id='d04ba089-715a-45b1-b4a3-2ce0fd60316f', resource_group='AzureMLGroup'), name=mnist_keras, id=mnist_keras:1, version=1, tags={}, properties={})