# Hyperparameter Tuning using HyperDrive
Importing Dependencies. In the cell below, we import all the dependencies that will be needed to complete the project.

In [7]:
import logging
import os
import csv

from matplotlib import pyplot as pyplot
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

import joblib

from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute

from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice

from azureml.widgets import RunDetails

from azureml.data.dataset_factory import TabularDatasetFactory

# Print the azureml.core SDK version so the version this notebook ran against
# is visible in the cell output.
print("SDK Version:", azureml.core.VERSION)

SDK Version: 1.31.0


In [2]:
# Connect to the workspace and get a handle to the HyperDrive experiment.

# Initialize the workspace from the local Azure ML configuration
ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n')

# Name of the experiment and local project folder
experiment_name = 'HyperDrive-Exp007'
project_folder = './HyperDrive-pipeline-project'

# Get the experiment handle. No interactive logging session is started here:
# experiment.start_logging() would create an extra interactive run whose handle
# is discarded (so it could never be completed), and the sweep itself is
# launched later through experiment.submit(...).
experiment = Experiment(ws, experiment_name)

# Last expression: display the experiment summary as the cell output
experiment

quick-starts-ws-150619
aml-quickstarts-150619
southcentralus
f5091c60-1c3c-430f-8d81-d802f6bf2414


Name,Workspace,Report Page,Docs Page
HyperDrive-Exp007,quick-starts-ws-150619,Link to Azure Machine Learning studio,Link to Documentation


In [3]:
# Reuse the AmlCompute cluster if it already exists, otherwise provision it.

from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# Name of the CPU cluster used for the experiment
amlcompute_cluster_name = "CI-HyperDrive01"

try:
    # Look the cluster up by name; ComputeTargetException means it was not found
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
except ComputeTargetException:
    # No such cluster yet: provision a new one with up to 4 nodes
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS12_V2',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Wait (up to 10 minutes) until at least one node is provisioned
compute_target.wait_for_completion(
    show_output=True,
    min_node_count=1,
    timeout_in_minutes=10,
)

# Last expression: display the cluster status as the cell output
compute_target.get_status()

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


<azureml.core.compute.amlcompute.AmlComputeStatus at 0x7faa64c30198>

# Dataset
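Getting data. The compute cluster was checked or set up in the cell above; in the cell below, we access the data that will be used in this project. The dataset is external: it is read from a CSV file hosted on GitHub and registered in the workspace if it is not registered already.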

In [4]:
#ds = TabularDatasetFactory.from_delimited_files("https://raw.githubusercontent.com/eparamasari/ML_Engineer_ND_Capstone/main/data/heart_failure_clinical_records_dataset.csv")

# Name and description under which the dataset is registered in the workspace
key = "Heart Failure Prediction"
description_text = "Heart Failure Prediction DataSet"

# True when a dataset with this name is already registered in the workspace
found = key in ws.datasets.keys()

if found:
    # Reuse the registered dataset
    dataset = ws.datasets[key]
    print("The dataset is loaded")
else:
    # Build a tabular dataset from the external CSV file ...
    example_data = "https://raw.githubusercontent.com/RollyAngell/ML-Azure-Udacity/main/Project%203%20-%20Casptone%20Project%20AutoML%20vs%20HyperDrive/heart%20failure%20clinical%20records%20dataset.csv"
    dataset = Dataset.Tabular.from_delimited_files(example_data)
    # ... and register it in the workspace under `key`
    dataset = dataset.register(
        workspace=ws,
        name=key,
        description=description_text,
    )

The dataset is loaded


# Hyperdrive Configuration
Here we set the model used and the different hyperparameters, termination policy and config settings.

In [8]:
# Configure the HyperDrive sweep: early-termination policy, search space,
# training environment, script run configuration and run limits.
from azureml.core import ScriptRunConfig
from azureml.core.environment import Environment

# Bandit early-termination policy: evaluated at every interval, with a slack
# factor of 0.2, starting after the first 5 intervals.
early_termination_policy = BanditPolicy(
    slack_factor=0.2,
    evaluation_interval=1,
    delay_evaluation=5,
)

# Random sampling over a discrete search space. The keys are presumably the
# command-line arguments parsed by train.py - TODO confirm against the script.
# NOTE(review): the space has only 2 x 3 = 6 combinations while max_total_runs
# below is 50 - confirm that repeated configurations are intended.
param_sampling = RandomParameterSampling({
    '--n_estimators': choice(20, 40),
    '--min_samples_split': choice(2, 4, 6),
})

# Training environment built from the conda specification file
myenv = Environment.from_conda_specification(name="myenv", file_path="myenv.yml")

# Run train.py from the current directory on the compute cluster
src = ScriptRunConfig(
    source_directory='./',
    script='train.py',
    compute_target=compute_target,
    environment=myenv,
)

# Maximize the "Accuracy" metric over at most 50 runs, 4 at a time
hyperdrive_run_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name="Accuracy",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=50,
    max_concurrent_runs=4,
)

In [9]:
# Submit the HyperDrive configuration to the experiment. The returned run handle
# is kept in `hyperdrive_run` for monitoring and for retrieving the best run.
hyperdrive_run = experiment.submit(hyperdrive_run_config, show_output=True)

# Run Details

Several models are trained, each with a different combination of hyperparameters. However, the accuracy seems to be the same across the runs.

In the cell below, the RunDetails widget is used to show the different experiments.

In [10]:
# Display the RunDetails widget for the HyperDrive run.
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [11]:
# Wait for the HyperDrive run to complete, showing its output in the cell.
hyperdrive_run.wait_for_completion(show_output=True)

RunId: HD_6396c3fb-627a-4fbe-85b8-fa8645480c03
Web View: https://ml.azure.com/runs/HD_6396c3fb-627a-4fbe-85b8-fa8645480c03?wsid=/subscriptions/f5091c60-1c3c-430f-8d81-d802f6bf2414/resourcegroups/aml-quickstarts-150619/workspaces/quick-starts-ws-150619&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-07-12T16:29:33.704664][API][INFO]Experiment created<END>\n""<START>[2021-07-12T16:29:34.270713][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-07-12T16:29:35.716605][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_6396c3fb-627a-4fbe-85b8-fa8645480c03
Web View: https://ml.azure.com/runs/HD_6396c3fb-627a-4fbe-85b8-fa8645480c03?wsid=/subscriptions/f5091c60-1c3c-430f-8d81-d802f6bf2414/resourcegroups/aml-quickstarts-150619/workspaces/quick-starts-ws-150619&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254



{'runId': 'HD_6396c3fb-627a-4fbe-85b8-fa8645480c03',
 'target': 'CI-HyperDrive01',
 'status': 'Completed',
 'startTimeUtc': '2021-07-12T16:29:33.436647Z',
 'endTimeUtc': '2021-07-12T17:16:36.963537Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '69b041dc-f1cc-4920-8676-1afbf12d763b',
  'score': '0.7555555555555555',
  'best_child_run_id': 'HD_6396c3fb-627a-4fbe-85b8-fa8645480c03_0',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg150619.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_6396c3fb-627a-4fbe-85b8-fa8645480c03/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=HnEh2VVH6Z7m4X6e2rvpeuQHzgrMpeNAJ1z2uXVuJVI%3D&st=2021-07-12T17%3A06%3A55Z&se=2021-07-13T01%3A16%3A55Z&sp=r'},
 'submittedBy': 'ODL_User 150619'}

# Best Model
In the cells below, we get the best model from the hyperdrive experiments and display all the properties of the model.

In [12]:
# Retrieve the best child run (according to the primary metric) and its metrics.
best_hyperdrive_run = hyperdrive_run.get_best_run_by_primary_metric()

# get_best_run_by_primary_metric() returns None when no child run has logged the
# primary metric; fail with a clear message instead of an AttributeError on the
# next line.
if best_hyperdrive_run is None:
    raise RuntimeError(
        "No child run logged the primary metric; "
        "cannot determine the best HyperDrive run."
    )

best_hd_run_metrics = best_hyperdrive_run.get_metrics()

print('Best Run Id: ', best_hyperdrive_run.id)
print('\n Best Run Metrics: ', best_hd_run_metrics)

Best Run Id:  HD_6396c3fb-627a-4fbe-85b8-fa8645480c03_0

 Best Run Metrics:  {'The number of trees in the forest:': 20, 'The minimum number of samples required to split an internal node:': 2, 'Accuracy': 0.7555555555555555}


In [13]:
# Report the best run's accuracy and the hyperparameter values it logged.
# Each entry pairs the printed label with its key in the metrics dictionary.
reported_metrics = (
    ('\n Accuracy: ', 'Accuracy'),
    ('\n N Estimators: ', 'The number of trees in the forest:'),
    ('\n Min Samples Split: ', 'The minimum number of samples required to split an internal node:'),
)

for label, metric_key in reported_metrics:
    print(label, best_hd_run_metrics[metric_key])


 Accuracy:  0.7555555555555555

 N Estimators:  20

 Min Samples Split:  2


In [14]:
# Saving the best model
# The previous version pickled `best_hyperdrive_run.id` (a string), so the
# saved file contained only the run ID, not a model. Instead, download the
# artifacts stored under the best run's `outputs/` folder into the local
# ./outputs directory. NOTE(review): this assumes train.py writes the trained
# model under `outputs/` in the run - confirm against the training script.
os.makedirs("./outputs", exist_ok=True)
best_hyperdrive_run.download_files(
    prefix='outputs/',
    output_directory='./',
    append_prefix=True,  # keep the `outputs/` prefix, so files land in ./outputs/
)
print("Model has been successfully saved!")

Model has been successfully saved!


# Register the Best Model

In [15]:
# Register the best run's `outputs/` folder as a model, attaching the accuracy
# and the hyperparameter values logged by the run as model properties.
hd_model_properties = {
    'Accuracy': best_hd_run_metrics['Accuracy'],
    'N Estimators': best_hd_run_metrics['The number of trees in the forest:'],
    'Min Samples Split': best_hd_run_metrics['The minimum number of samples required to split an internal node:'],
}

model = best_hyperdrive_run.register_model(
    model_name='heart_failure_hyperdrive',
    model_path='outputs/',
    properties=hd_model_properties,
)

In [16]:
# Listing registered models to verify that the model has been saved.
# The loop variable is deliberately not named `model`: reusing that name would
# overwrite the handle to the model registered above with whichever model
# happens to be listed last.
for registered_model in model.list(ws):
    print(registered_model.name, 'version:', registered_model.version)
    # Tags and properties are printed as indented "name : value" lines
    for tag_name, tag in (registered_model.tags or {}).items():
        print ('\t',tag_name, ':', tag)
    for prop_name, prop in (registered_model.properties or {}).items():
        print ('\t',prop_name, ':', prop)
    print('\n')

heart_failure_hyperdrive version: 1
	 Accuracy : 0.7555555555555555
	 N Estimators : 20
	 Min Samples Split : 2


AutoML5f44cd97a50 version: 1


