# Hyperparameter Tuning using HyperDrive

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [1]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import helper
import azureml.core
from azureml.data.datapath import DataPath
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.core.dataset import Dataset
from azureml.core.run import Run
from azureml.core.model import Model
from azureml.core.environment import Environment
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice, normal
import os
import time 

# Print the installed Azure ML core SDK version so that the environment this
# notebook was executed against is recorded next to its outputs.
print("SDK version:", azureml.core.VERSION)

SDK version: 1.41.0


## Dataset

TODO: Get data. In the cell below, write code to access the data you will be using in this project. Remember that the dataset needs to be external.

In [2]:
# Connect to the Azure ML workspace described by the local config.json and
# echo its identifying properties, one per line.
ws = Workspace.from_config()
for workspace_detail in (ws.name, ws.resource_group, ws.location, ws.subscription_id):
    print(workspace_detail)

quick-starts-ws-197544
aml-quickstarts-197544
southcentralus
f5091c60-1c3c-430f-8d81-d802f6bf2414


In [3]:
# Names used for this HyperDrive experiment and its local project folder.
experiment_name = 'Titanic_hyperdrive'
project_folder = './titanic-project-hyperdrive'

# Create (or attach to) the experiment in the workspace and display it.
experiment = Experiment(workspace=ws, name=experiment_name)
experiment

Name,Workspace,Report Page,Docs Page
Titanic_hyperdrive,quick-starts-ws-197544,Link to Azure Machine Learning studio,Link to Documentation


In [12]:
# Load the registered Titanic dataset from the workspace if it exists;
# otherwise upload the local CSV, create the tabular dataset and register it.
key = "Titanic_dataset"

# Resolve the default datastore up front: it is needed for the first-time
# upload AND as the target when the filtered dataframe is registered below.
# Assigning it only inside the "not found" branch made this cell raise a
# NameError on a fresh kernel whenever the dataset was already registered.
datastore = ws.get_default_datastore()

found = key in ws.datasets.keys()

if found:
    dataset = ws.datasets[key]
else:
    # Create AML Dataset and register it into Workspace
    datastore.upload(src_dir='data', target_path='data')
    train_data = datastore.path('data/train_modified.csv')

    dataset = Dataset.Tabular.from_delimited_files(train_data, separator=';')
    dataset = dataset.register(workspace=ws,
                               name=key,
                               description="This is the complete dataset for the capstone project.")

# Keep only the target column and the features used for training.
dataset_filtered = dataset.keep_columns(["Survived","Pclass","Sex","SibSp","Parch","Fare","Embarked","Age"])
df = dataset_filtered.to_pandas_dataframe()

# One-hot encode the two categorical columns; each original column is dropped
# and its indicator columns are appended at the end of the frame.
df = pd.concat([df.drop(columns=['Sex']),pd.get_dummies(df.Sex, prefix='Sex')], axis=1)
df = pd.concat([df.drop(columns=['Embarked']),pd.get_dummies(df.Embarked, prefix='Embarked')], axis=1)

# Register the encoded frame as its own dataset ("<key>_filtered") on the
# default datastore. Note that dataset_filtered is rebound to this new dataset.
dataset_filtered = Dataset.Tabular.register_pandas_dataframe(df,
                                                            target=datastore,
                                                            name=key+"_filtered",
                                                            description="This is the filtered dataset for the capstone project "
                                                            "with only those features relevant for training."
                                                            )
df.describe()

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/897fa0cb-73f1-4022-8960-3232707b1c7f/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


Unnamed: 0,Survived,Pclass,SibSp,Parch,Fare,Age,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.523008,0.381594,32.204208,29.520623,0.352413,0.647587,0.188552,0.087542,0.723906
std,0.486592,0.836071,1.102743,0.806057,49.693429,13.399106,0.47799,0.47799,0.391372,0.282787,0.447315
min,0.0,1.0,0.0,0.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,0.0,7.9104,21.5,0.0,0.0,0.0,0.0,0.0
50%,0.0,3.0,0.0,0.0,14.4542,27.784794,0.0,1.0,0.0,0.0,1.0
75%,1.0,3.0,1.0,0.0,31.0,37.0,1.0,1.0,0.0,0.0,1.0
max,1.0,3.0,8.0,6.0,512.3292,80.0,1.0,1.0,1.0,1.0,1.0


## Hyperdrive Configuration

TODO: Explain the model you are using and the reason for choosing the different hyperparameters, termination policy and config settings.

In [13]:
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# Upper bound on the number of nodes the cluster may scale out to.
num_nodes = 5

amlcompute_cluster_name = "ComputeClusterCapstone"

# Reuse the cluster when it already exists in the workspace; only provision a
# new low-priority cluster when the lookup fails.
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',
        vm_priority='lowpriority',
        max_nodes=num_nodes,
    )
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Block until at least one node is available, giving up after ten minutes.
compute_target.wait_for_completion(
    show_output=True,
    min_node_count=1,
    timeout_in_minutes=10,
)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [14]:
# Fetch the named environment registered in the workspace.
# NOTE(review): `env` is not referenced by any later cell visible in this
# notebook (the estimator below is built without it) - confirm whether it
# should be passed to the training configuration or removed.
env = Environment.get(
    workspace=ws,
    name="AzureML-lightgbm-3.2-ubuntu18.04-py37-cpu",
)


**Hyperparameter selection**

The parameters selected for optimization are `learning_rate` and `max_depth`; both are sampled via `choice` from a provided list of values. `learning_rate` is a generally important hyperparameter for all ML experiments: it should be varied in order to reach the globally best metric, avoiding both the pitfall of getting stuck in local error minima when the rate is too low and the pitfall of overshooting when the rate is too high. `max_depth` was selected because of its high importance for reaching good differentiation, specifically for the gradient regression method. The maximum depth limits the number of nodes in the tree and should therefore be tuned for best performance.

In [18]:
# Early termination policy (not required when Bayesian sampling is used).
early_termination_policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

# Discrete search space sampled at random for every child run.
# NOTE(review): learning rates of 5.0 and 10.0 are unusually large for a
# tree-boosting model - confirm they are intended.
hyperparameter_space = {
    'max_depth': choice(range(1,8,1)),
    'learning_rate': choice(0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0),
}
param_sampling = RandomParameterSampling(hyperparameter_space)

# Estimator that runs train.py from the current directory on the cluster.
# NOTE(review): vm_size / vm_priority are passed although an existing
# compute_target is supplied - presumably redundant; verify against the SDK docs.
estimator = SKLearn(
    source_directory='.',
    compute_target=compute_target,
    vm_size='STANDARD_NC6',
    vm_priority='lowpriority',
    entry_script='train.py',
)

# Maximise the 'Accuracy' metric over at most 40 child runs, 4 at a time.
hyperdrive_run_config = HyperDriveConfig(
    estimator=estimator,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=40,
    max_concurrent_runs=4,
)



In [19]:
# Submit the HyperDrive configuration to the experiment; the returned run
# handle is what the following cells inspect.
remote_run = experiment.submit(
    config=hyperdrive_run_config,
    show_output=True,
)



## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [20]:
# Show the HyperDrive widget for the submitted run (RunDetails is already
# imported in the notebook's first cell, so it is not re-imported here).
RunDetails(remote_run).show()

# Block until every child run has finished. Without this, a Restart & Run All
# would move straight on to the "Best Model" cells while the sweep is still
# running, and they would query an unfinished run. Log streaming is disabled
# because the widget above already shows progress.
remote_run.wait_for_completion(show_output=False);

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

![image](https://user-images.githubusercontent.com/98894580/172343095-f8783ac4-4c10-4415-851b-e6538665ce1a.png)

## Best Model

TODO: In the cell below, get the best model from the hyperdrive experiments and display all the properties of the model.

In [23]:
# Ask the HyperDrive parent run for the child run with the best value of the
# configured primary metric, and display it.
best_run = remote_run.get_best_run_by_primary_metric()
best_run

Experiment,Id,Type,Status,Details Page,Docs Page
Titanic_hyperdrive,HD_60601d35-b576-40b3-a377-40db4d7e78ad_20,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [25]:
# Child runs ordered by primary metric; the first entry is the best one.
# Despite its name, `best_model` is the summary record of that child run
# (run id, hyperparameters, metric, status), not a trained model object.
ranked_children = remote_run.get_children_sorted_by_primary_metric()
best_model = ranked_children[0]
best_model

{'run_id': 'HD_60601d35-b576-40b3-a377-40db4d7e78ad_20',
 'hyperparameters': '{"learning_rate": 0.5, "max_depth": 2}',
 'best_primary_metric': 0.4441789203844553,
 'status': 'Completed'}

In [28]:
# Print the logged metrics of the best run first, then its full run details.
for fetch_report in (best_run.get_metrics, best_run.get_details):
    print(fetch_report())

{'Accuracy': 0.4441789203844553, 'Max depth:': 2.0, 'Learning rate:': 0}
{'runId': 'HD_60601d35-b576-40b3-a377-40db4d7e78ad_20', 'target': 'ComputeClusterCapstone', 'status': 'Completed', 'startTimeUtc': '2022-06-05T00:31:00.924632Z', 'endTimeUtc': '2022-06-05T00:32:30.382536Z', 'services': {}, 'properties': {'_azureml.ComputeTargetType': 'amlctrain', 'ContentSnapshotId': '7eac3cb8-9765-4236-93af-92ad11f9114a', 'ProcessInfoFile': 'azureml-logs/process_info.json', 'ProcessStatusFile': 'azureml-logs/process_status.json'}, 'inputDatasets': [{'dataset': {'id': '748e8b78-9015-4a94-9447-26a9b20ae502'}, 'consumptionDetails': {'type': 'Reference'}}], 'outputDatasets': [], 'runDefinition': {'script': 'train.py', 'command': '', 'useAbsolutePath': False, 'arguments': ['--learning_rate', '0.5', '--max_depth', '2'], 'sourceDirectoryDataStore': None, 'framework': 'Python', 'communicator': 'None', 'target': 'ComputeClusterCapstone', 'dataReferences': {}, 'data': {}, 'outputData': {}, 'datacaches': []

In [39]:
# Re-fetch the best child run so that this cell is self-contained, then
# collect its logged metrics and its run details.
best_run = remote_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()

print('Best run ID: ', best_run.id)
print('Best run accuracy:', best_run_metrics['Accuracy'])
# The hyperparameters chosen for this run are the command-line arguments
# recorded in the run definition; report them so the summary shows which
# settings produced the accuracy above (parameter_values was previously
# fetched but never used).
print('Best run arguments:', parameter_values['runDefinition']['arguments'])

Best run ID:  HD_60601d35-b576-40b3-a377-40db4d7e78ad_20
Best run accuracy: 0.4441789203844553


In [46]:
# List the files stored with the best run, e.g. to check which artifacts are
# available before registering a model from it.
best_run.get_file_names()

['azureml-logs/55_azureml-execution-tvmps_4bc6d1f2db6720fce6075fd62ecdd77a2bff8b6c77b5cd1fe2406ca084a33684_p.txt',
 'azureml-logs/65_job_prep-tvmps_4bc6d1f2db6720fce6075fd62ecdd77a2bff8b6c77b5cd1fe2406ca084a33684_p.txt',
 'azureml-logs/70_driver_log.txt',
 'azureml-logs/75_job_post-tvmps_4bc6d1f2db6720fce6075fd62ecdd77a2bff8b6c77b5cd1fe2406ca084a33684_p.txt',
 'azureml-logs/process_info.json',
 'azureml-logs/process_status.json',
 'logs/azureml/112_azureml.log',
 'logs/azureml/job_prep_azureml.log',
 'logs/azureml/job_release_azureml.log']

In [48]:
# Register the best run's output folder as the model "Titanic_Hyperdrive_model".
# NOTE(review): the file listing recorded for this run in the previous cell
# contains no entries under outputs/ - confirm that train.py actually saves the
# trained model there, and that './outputs/' (rather than a path such as
# 'outputs/<file>') is accepted as the run-relative model path.
best_run.register_model(model_name = "Titanic_Hyperdrive_model", model_path = './outputs/')

Registering model Titanic_Hyperdrive_model
