# Train and hyperparameter tune on Heart Failure Dataset

Importing dependencies

In [None]:
import azureml.core
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice
import os, shutil

# Report which Azure ML core SDK version this notebook is running against
print(f"SDK version: {azureml.core.VERSION}")

## Initialize Workspace

Initialize a workspace object from persisted configuration. 

In [None]:
# Build the workspace handle from the persisted configuration, then echo
# the details that identify it (one per line).
ws = Workspace.from_config()
workspace_details = (ws.name, ws.resource_group, ws.location, ws.subscription_id)
print(*workspace_details, sep="\n")

## Create an Azure ML experiment

Create an [Experiment](https://docs.microsoft.com/en-gb/azure/machine-learning/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace.

In [None]:
# Name of the run history container that groups the runs of this notebook
experiment_name = "hyperdrive-heart-failure"
experiment = Experiment(workspace=ws, name=experiment_name)

# Open an interactive run on the experiment.
# NOTE(review): `run` is not referenced again in the visible cells — confirm
# whether it should be completed explicitly or dropped.
run = experiment.start_logging()

## Create or Attach an AmlCompute cluster

Create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training and hyperparameter tuning.

In [None]:
# Name of the cluster to reuse or create.
# Compute names may contain only letters, digits and hyphens, and must be
# 2-16 characters long.
cluster_name = "aml-cluster"

try:
    # Attach to the cluster if the workspace already has one with this name
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # No such compute target: provision a new AmlCompute cluster
    print("Creating a new compute target...")
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2",
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

    # Block until provisioning finishes, streaming progress to the output
    compute_target.wait_for_completion(show_output=True)
else:
    print(f"{cluster_name} exists already")

# List every compute target in the workspace with its type and state
compute_targets = ws.compute_targets
for name, ct in compute_targets.items():
    print(name, ct.type, ct.provisioning_state)

## Hyperdrive Configuration

The model used here is an SVM classifier, since it is capable of generating non-linear decision boundaries and can achieve high accuracy. It is also more robust to outliers than Logistic Regression.

The hyperdrive settings include the following:
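1. A Bandit early termination policy based on a slack factor of 0.1. The policy is applied every 2 evaluation intervals, and its first application is delayed by 5 intervals to avoid terminating runs prematurely. After that, any run whose primary metric is not within the slack factor of the best performing run is terminated.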

2. The model uses Random Parameter Sampling to find the inverse regularization strength (`C`) and the kernel type.

3. The Hyperdrive configuration is created using SKLearn estimator with the train.py script, hyperparameter sampler, and policy.

In [None]:
# Early termination policy (not required when using Bayesian sampling).
# The Bandit policy is applied every 2 evaluation intervals, its first
# application is delayed by 5 intervals, and the allowed slack relative to
# the best performing run is 0.1.
early_termination_policy = BanditPolicy(
    evaluation_interval=2,
    delay_evaluation=5,
    slack_factor=0.1,
)

# Hyperparameter search space: every run draws one kernel type and one
# value of C at random from the discrete choices below. The keys are passed
# to train.py as command-line arguments.
param_sampling = RandomParameterSampling(
    {
        "--kernel": choice("linear", "rbf", "poly", "sigmoid"),
        "--C": choice(0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 0.7, 1.0, 1.3, 1.7, 2.0),
    }
)

# Make sure the ./training folder exists; exist_ok avoids listing the
# directory first and makes re-running this cell safe.
os.makedirs("./training", exist_ok=True)

# The training logic is in the train.py file.
shutil.copy("train.py", "training")

# Estimator that runs train.py on the compute cluster.
# NOTE(review): source_directory is "./", so the whole notebook folder is
# used as the source and the copy of train.py placed in ./training above is
# not what the estimator points at. If train.py needs no other local files,
# consider source_directory="./training" — confirm before changing.
estimator = SKLearn(
    source_directory="./",
    entry_script="train.py",
    compute_target=compute_target,
)

# HyperDrive configuration combining the estimator, the hyperparameter
# sampler and the early termination policy. Runs are ranked by the
# 'AUC_weighted' metric (to be maximized); at most 50 runs in total, with up
# to 4 running concurrently.
hyperdrive_run_config = HyperDriveConfig(
    estimator=estimator,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name="AUC_weighted",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=50,
    max_concurrent_runs=4,
)

In [None]:
# Submit the HyperDrive configuration to the experiment; the returned run
# object is used by the following cells to monitor the sweep and to pick the
# best child run.
hyperdrive_run = experiment.submit(config=hyperdrive_run_config)

## Run Details

In [None]:
# Show the run-details widget for the HyperDrive run, then block until the
# run completes while streaming its output.
details_widget = RunDetails(hyperdrive_run)
details_widget.show()
hyperdrive_run.wait_for_completion(show_output=True)

## Best Model

In [None]:
# Fetch the child run that scored best on the primary metric, plus the
# metrics it logged.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()

print("Best Run Id: ", best_run.id)

# (printed label, metric key) pairs, reported in this order
reported_metrics = (
    ("\n AUC_weighted:", "AUC_weighted"),
    ("\n Regularization Strength:", "Regularization Strength:"),
    ("\n Kernel:", "Kernel:"),
)
for label, metric_key in reported_metrics:
    print(label, best_run_metrics[metric_key])

In [None]:
# List the files stored with the best run
best_run_files = best_run.get_file_names()
print(best_run_files)

In [None]:
# Register the model
model = best_run.register_model(model_path='outputs/', model_name='hyperdrive_model',
                   tags={'Training context':'Parameterized SKLearn Estimator', 'type': 'Classification'},
                   properties={'AUC_weighted': best_run_metrics['AUC_weighted']},
                   description = 'Heart Failure Predictor')
model

In [None]:
# Clean up: delete the compute target now that the experiment is finished.
compute_target.delete()