# Hyperparameter Tuning using HyperDrive

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [32]:
from azureml.core import Workspace, Experiment, Environment

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
from azureml.train.hyperdrive import choice

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import os
import joblib

import numpy as np
import pandas as pd


In [2]:
import xgboost as xgb

In [4]:
ws = Workspace.from_config()
experiment_name = 'capstone-project'

experiment=Experiment(ws, experiment_name)

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')


run = experiment.start_logging()

Workspace name: quick-starts-ws-133759
Azure region: southcentralus
Subscription id: 2c48c51c-bd47-40d4-abbe-fb8eabd19c8c
Resource group: aml-quickstarts-133759


In [63]:
my_env = Environment.from_existing_conda_environment(name="myenv")


TypeError: from_existing_conda_environment() missing 1 required positional argument: 'conda_environment_name'

In [65]:
sklearn_env

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/intelmpi2018.3-ubuntu16.04:20201113.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": true,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "AzureML-AutoML",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
                "conda-forge

### Creating the compute

In [8]:
cpu_cluster_name = "cpu-cluster"

   # Verify that cluster does not exist already
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Dataset

TODO: Get data. In the cell below, write code to access the data you will be using in this project. Remember that the dataset needs to be external.

In [9]:
# Create a project_folder if it doesn't exist
if not os.path.isdir('data'):
    os.mkdir('data')
    
if not os.path.exists('project_folder'):
    os.makedirs('project_folder')
    
project_folder="./project_folder/"

In [10]:
train_df = pd.read_csv("Train.csv")
test_df = pd.read_csv("Test.csv")

train_df.head()


Unnamed: 0,ID,text,label
0,SUAVK39Z,I feel that it was better I dieAm happy,Depression
1,9JDAGUV3,Why do I get hallucinations?,Drugs
2,419WR1LQ,I am stresseed due to lack of financial suppor...,Depression
3,6UY7DX6Q,Why is life important?,Suicide
4,FYC0FTFB,How could I be helped to go through the depres...,Depression


In [41]:
le = LabelEncoder()
train_df["label"] = le.fit_transform(train_df["label"].tolist())
train_df.head()

Unnamed: 0,ID,text,label
0,SUAVK39Z,I feel that it was better I dieAm happy,1
1,9JDAGUV3,Why do I get hallucinations?,2
2,419WR1LQ,I am stresseed due to lack of financial suppor...,1
3,6UY7DX6Q,Why is life important?,3
4,FYC0FTFB,How could I be helped to go through the depres...,1


In [42]:
train_df.to_csv('data/train.csv', index=False)
test_df.to_csv('data/test.csv', index=False)

In [43]:
datastore=ws.get_default_datastore()

In [45]:
datastore.upload(src_dir="./data", target_path="mental_health", show_progress=True)

Uploading an estimated of 2 files
Uploading ./data/test.csv
Uploaded ./data/test.csv, 1 files out of an estimated total of 2
Uploading ./data/train.csv
Uploaded ./data/train.csv, 2 files out of an estimated total of 2
Uploaded 2 files


$AZUREML_DATAREFERENCE_dfb7d3a7c88a486e843df2dc1c4fe63a

In [29]:
from azureml.data.dataset_factory import TabularDatasetFactory


In [46]:
dataset = TabularDatasetFactory.from_delimited_files(path=datastore.path("mental_health/train.csv"), separator=",")

In [48]:
dataset.register(ws, "mental_health_clf", create_new_version=True)

{
  "source": [
    "('workspaceblobstore', 'mental_health/train.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "41881d46-1353-42a1-9f53-450bce338055",
    "name": "mental_health_clf",
    "version": 2,
    "workspace": "Workspace.create(name='quick-starts-ws-133759', subscription_id='2c48c51c-bd47-40d4-abbe-fb8eabd19c8c', resource_group='aml-quickstarts-133759')"
  }
}

## Hyperdrive Configuration

TODO: Explain the model you are using and the reason for chosing the different hyperparameters, termination policy and config settings.

In [13]:
datastore.path("mental_health_clf/train.csv")

$AZUREML_DATAREFERENCE_ae56145229a54becbb73b739aa6963b3

In [54]:
# TODO: Create an early termination policy. This is not required if you are using Bayesian sampling.
early_termination_policy = BanditPolicy(slack_factor = 0.1, evaluation_interval=1, delay_evaluation=5)

#TODO: Create the different params that you will be using during training
param_sampling = RandomParameterSampling({
    "n_estimators": choice(200, 400, 700, 1000),
    "max_depth": choice(range(2,7))                                      }
)

if "training" not in os.listdir():
    os.mkdir("./training")

#TODO: Create your estimator and hyperdrive config -used ScriptRunConfig
from azureml.core import ScriptRunConfig

src = ScriptRunConfig(source_directory=project_folder,
                      script='train.py',
                      compute_target=cpu_cluster,
                      environment=sklearn_env)

hyperdrive_run_config = HyperDriveConfig(run_config=src,
                                     hyperparameter_sampling=param_sampling, 
                                     primary_metric_name='Accuracy',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     policy=early_termination_policy,
                                     max_total_runs=12,
                                     max_concurrent_runs=4)

In [55]:
#TODO: Submit your experiment
hyperdrive_run = experiment.submit(config=hyperdrive_run_config)

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [56]:
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [57]:
hyperdrive_run.wait_for_completion(show_output=True)

RunId: HD_d0236654-866e-4791-a4da-7093aeb217c0
Web View: https://ml.azure.com/experiments/capstone-project/runs/HD_d0236654-866e-4791-a4da-7093aeb217c0?wsid=/subscriptions/2c48c51c-bd47-40d4-abbe-fb8eabd19c8c/resourcegroups/aml-quickstarts-133759/workspaces/quick-starts-ws-133759

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-01-07T06:48:56.643546][API][INFO]Experiment created<END>\n""<START>[2021-01-07T06:48:57.063396][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-01-07T06:48:57.241056][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2021-01-07T06:48:57.8082314Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>

Execution Summary
RunId: HD_d0236654-866e-4791-a4da-7093aeb217c0
Web View: https://ml.azure.com/experiments/capstone-project/runs/HD_d0236654-866e-4791-a4da-7093aeb217c0?wsid=/subscriptions/2c4

{'runId': 'HD_d0236654-866e-4791-a4da-7093aeb217c0',
 'target': 'cpu-cluster',
 'status': 'Canceled',
 'startTimeUtc': '2021-01-07T06:48:56.438525Z',
 'endTimeUtc': '2021-01-07T07:12:45.212009Z',
 'error': {'error': {'code': 'UserError',
   'message': 'User errors were found in at least one of the child runs.',
   'messageParameters': {},
   'details': []},
  'time': '0001-01-01T00:00:00.000Z'},
   'message': '{\n  "error": {\n    "code": "UserError",\n    "severity": null,\n    "message": "User errors were found in at least one of the child runs.",\n    "messageFormat": null,\n    "messageParameters": {},\n    "referenceCode": null,\n    "detailsUri": null,\n    "target": null,\n    "details": [],\n    "innerError": null,\n    "debugInfo": null\n  },\n  "correlation": null,\n  "environment": null,\n  "location": null,\n  "time": "0001-01-01T00:00:00+00:00",\n  "componentName": null\n}'}],
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_fr

In [58]:
assert(hyperdrive_run.get_status() == "Completed")

AssertionError: 

In [64]:
envs=Environment.list(workspace=ws)


for env in envs:
        print ("name: ", env)
        print("packages: ", envs[env].python.conda_dependencies.serialize_to_string())

name:  AzureML-VowpalWabbit-8.8.0
packages:  channels:
- conda-forge
dependencies:
- python=3.6.2
- pip:
  - azureml-core==1.19.0
  - azureml-defaults==1.19.0
  - azureml-dataset-runtime[fuse,pandas]
name: azureml_769be4b756b756954fa484d1287d5153

name:  AzureML-AutoML
packages:  channels:
- anaconda
- conda-forge
- pytorch
dependencies:
- python=3.6.2
- pip=20.2.4
- pip:
  - azureml-core==1.19.0
  - azureml-pipeline-core==1.19.0
  - azureml-telemetry==1.19.0
  - azureml-defaults==1.19.0
  - azureml-interpret==1.19.0
  - azureml-automl-core==1.19.0
  - azureml-automl-runtime==1.19.0
  - azureml-train-automl-client==1.19.0
  - azureml-train-automl-runtime==1.19.0
  - azureml-dataset-runtime==1.19.0
  - inference-schema
  - py-cpuinfo==5.0.0
  - boto3==1.15.18
  - botocore==1.18.18
- numpy~=1.18.0
- scikit-learn==0.22.1
- pandas~=0.25.0
- py-xgboost<=0.90
- fbprophet==0.5
- holidays==0.9.11
- setuptools-git
- psutil>5.0.0,<6.0.0
name: azureml_8eff28b157f42edcd2424a5aae6c8074

name:  Azur

## Best Model

TODO: In the cell below, get the best model from the hyperdrive experiments and display all the properties of the model.

In [None]:
best_run = hyperdrive_run.get_best_run_by_primary_metric()
print(best_run.get_details()['runDefinition']['arguments'])

print(best_run.get_file_names())

In [None]:
accuracy=best_run.get_metrics()["Accuracy"]
print("Accuracy: ",accuracy)

In [None]:
#TODO: Save the best model
model = best_run.register_model(model_name='mental-health-clf', model_path='outputs/model.joblib')

## Free resources

In [None]:
# delete the created compute
cpu_cluster.delete()
# delete the workspace
ws.delete()
