# Hyperparameter Tuning using HyperDrive

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [None]:
from azureml.core import Workspace, Experiment, Environment

from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
from azureml.train.hyperdrive import choice

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import os
import joblib

import numpy as np
import pandas as pd


In [None]:
# Load the workspace via Workspace.from_config() and open the experiment
# that every run in this notebook is recorded under.
experiment_name = 'capstone-project'
ws = Workspace.from_config()
experiment = Experiment(ws, experiment_name)

# Report which workspace the notebook is attached to, one field per line.
workspace_summary = (
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
)
print(*workspace_summary, sep='\n')

# Build the training environment from the conda specification file.
sklearn_env = Environment.from_conda_specification(
    name='sklearn-env',
    file_path='./conda_dependencies.yml',
)

# Start an interactive logging run under the experiment.
run = experiment.start_logging()

### Creating the compute

In [None]:
# AmlCompute is required to provision a new cluster in the fallback branch
# below; the notebook's import cell only brings in ComputeTarget.
from azureml.core.compute import AmlCompute

cpu_cluster_name = "cpu-cluster"

# Reuse the cluster when it already exists in the workspace; otherwise
# provision a new one (STANDARD_D2_V2 VMs, at most 4 nodes).
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

# Wait for the compute target to finish its current operation before using it.
cpu_cluster.wait_for_completion(show_output=True)

## Dataset

TODO: Get data. In the cell below, write code to access the data you will be using in this project. Remember that the dataset needs to be external.

In [None]:
# Create the local working folders. exist_ok=True keeps the cell safe to
# re-run and avoids the check-then-create race of a separate existence test.
os.makedirs('data', exist_ok=True)
os.makedirs('project_folder', exist_ok=True)

# Directory later passed as the source directory of the training script.
project_folder = "./project_folder/"

In [None]:
# Load the raw train/test splits from the notebook's working directory.
train_df = pd.read_csv("Train.csv")
test_df = pd.read_csv("Test.csv")

# Stage copies (without the index column) under data/ for the upload step.
for frame, destination in (
    (train_df, 'data/train.csv'),
    (test_df, 'data/test.csv'),
):
    frame.to_csv(destination, index=False)

# Preview the first rows of the training data.
train_df.head()


In [None]:
# Use the workspace's default datastore as the upload destination.
datastore = ws.get_default_datastore()

In [None]:
# Upload the staged ./data folder to the "mental_health_clf" path on the datastore.
datastore.upload(
    src_dir="./data",
    target_path="mental_health_clf",
    show_progress=True,
)

## Hyperdrive Configuration

TODO: Explain the model you are using and the reason for choosing the different hyperparameters, termination policy and config settings.

In [None]:
# TODO: Create an early termination policy. This is not required if you are using Bayesian sampling.
# Bandit policy: slack factor 0.1, evaluated at every interval, with the
# first evaluation delayed by 5 intervals.
early_termination_policy = BanditPolicy(slack_factor=0.1, evaluation_interval=1, delay_evaluation=5)

# TODO: Create the different params that you will be using during training
# Random sampling over the number of estimators and the tree depth (3..7).
# NOTE(review): "data_path" passes a single datastore path object through
# choice(); confirm train.py accepts it as an argument and that HyperDrive
# can serialize it.
param_sampling = RandomParameterSampling({
    "n_estimators": choice(200, 400, 1000, 2000),
    "max_depth": choice(range(3, 8)),
    "data_path": choice(datastore.path("mental_health_clf/train.csv")),
})

if "training" not in os.listdir():
    os.mkdir("./training")

# TODO: Create your estimator and hyperdrive config - used ScriptRunConfig
from azureml.core import ScriptRunConfig

# Run train.py from the project folder on the CPU cluster inside the
# sklearn environment.
src = ScriptRunConfig(source_directory=project_folder,
                      script='train.py',
                      compute_target=cpu_cluster,
                      environment=sklearn_env)

# Wire the sampling space and termination policy defined above into the
# HyperDrive configuration; the run maximizes the logged 'Accuracy' metric
# over at most 12 runs, 4 at a time.
hyperdrive_run_config = HyperDriveConfig(run_config=src,
                                         hyperparameter_sampling=param_sampling,
                                         primary_metric_name='Accuracy',
                                         primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                         policy=early_termination_policy,
                                         max_total_runs=12,
                                         max_concurrent_runs=4)

In [None]:
# TODO: Submit your experiment
# Submit the HyperDrive configuration to the experiment object created in
# the workspace setup cell (it is named `experiment`).
hyperdrive_run = experiment.submit(config=hyperdrive_run_config)

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [None]:
# Build the RunDetails widget for the HyperDrive run and display it.
run_details_widget = RunDetails(hyperdrive_run)
run_details_widget.show()

In [None]:
# Wait for the HyperDrive run to complete, with output shown in the notebook
# (show_output=True).
hyperdrive_run.wait_for_completion(show_output=True)

In [None]:
# Stop here unless the HyperDrive run reports the "Completed" status.
assert hyperdrive_run.get_status() == "Completed"

## Best Model

TODO: In the cell below, get the best model from the hyperdrive experiments and display all the properties of the model.

In [None]:
# Select the child run that scored best on the primary metric.
best_run = hyperdrive_run.get_best_run_by_primary_metric()

# Show the arguments recorded in its run definition.
best_run_details = best_run.get_details()
print(best_run_details['runDefinition']['arguments'])

# List the file names associated with the best run.
best_run_files = best_run.get_file_names()
print(best_run_files)

In [None]:
# Read the "Accuracy" metric logged by the best run and report it.
best_run_metrics = best_run.get_metrics()
accuracy = best_run_metrics["Accuracy"]
print("Accuracy: ", accuracy)

In [None]:
# TODO: Save the best model
# Register the best run's 'outputs/model.joblib' file under the name
# 'mental-health-clf'.
model = best_run.register_model(
    model_name='mental-health-clf',
    model_path='outputs/model.joblib',
)

## Free resources

In [None]:
# Free the cloud resources used by this notebook: the compute cluster first,
# then the workspace.
# NOTE(review): ws.delete() targets the whole workspace, not just this
# experiment; confirm nothing else depends on it before running this cell.
# Delete the compute cluster.
cpu_cluster.delete()
# Delete the workspace.
ws.delete()
