In [1]:
%load_ext autoreload
%autoreload 2

import os
import urllib
import urllib.request

from azureml.core import  (Workspace,Run,VERSION,
                           Experiment,Datastore)
from azureml.core.runconfig import (RunConfiguration,
                                    DEFAULT_GPU_IMAGE)
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.compute import (AmlCompute, ComputeTarget)
from azureml.exceptions import (AzureMLException,
                                ComputeTargetException)
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import (Pipeline, 
                                   PipelineData)
from azureml.pipeline.steps import (HyperDriveStep,PythonScriptStep)
from azureml.train.dnn import PyTorch
from azureml.train.hyperdrive import *
from azureml.widgets import RunDetails


# Report the installed Azure ML SDK version (label typo "verison" corrected).
print('SDK version', VERSION)

SDK verison 1.0.23


In [2]:
# Azure ML workspace coordinates. Each value can be overridden through an
# environment variable so the notebook can target another workspace without
# being edited; the previously hard-coded values remain the defaults.
SUBSCRIPTION_ID = (os.environ.get('AML_SUBSCRIPTION_ID')
                   or 'fe375bc2-9f1a-4909-ad0d-9319806d5e97')
RESOURCE_GROUP = os.environ.get('AML_RESOURCE_GROUP') or 'amlenv_rg'
WORKSPACE_NAME = os.environ.get('AML_WORKSPACE_NAME') or 'vienna'

# Local layout, resolved relative to the directory the notebook runs in.
PROJECT_DIR = os.getcwd()
EXPERIMENT_NAME = "customer_churn"
CLUSTER_NAME = "gpu-cluster"
DATA_DIR = os.path.join(PROJECT_DIR, 'data')     # the sample CSV is downloaded here
SCRIPT_DIR = os.path.join(PROJECT_DIR, 'train')  # source directory given to the estimator

# Remote location and file name of the sample dataset.
SOURCE_URL = 'https://amlgitsamples.blob.core.windows.net/churn'
FILE_NAME = 'CATelcoCustomerChurnTrainingSample.csv'

In [3]:
# Connect to the workspace identified by the configuration constants.
ws = Workspace(
    subscription_id=SUBSCRIPTION_ID,
    resource_group=RESOURCE_GROUP,
    workspace_name=WORKSPACE_NAME,
)

# Persist the workspace details through the SDK's write_config().
ws.write_config()

print('Workspace loaded:', ws.name)

Workspace loaded: vienna


In [4]:
# Make sure the local data folder exists before downloading into it.
os.makedirs(DATA_DIR, exist_ok=True)

# URLs always use '/' as separator, so build the address explicitly rather
# than with os.path.join, which inserts the platform separator ('\\' on Windows).
source_file_url = '{}/{}'.format(SOURCE_URL.rstrip('/'), FILE_NAME)

# Kept as the last expression so the cell still displays urlretrieve's result.
urllib.request.urlretrieve(source_file_url,
                           filename=os.path.join(DATA_DIR, FILE_NAME))

('/extdrive1/home/sasuke/dev/amlsamples/Customer_churn/data/CATelcoCustomerChurnTrainingSample.csv',
 <http.client.HTTPMessage at 0x7f1349711a58>)

In [5]:
# Look up the workspace blob datastore; `default_datastore` is kept as a
# second name for the same object, as in the original chained assignment.
default_store = ws.datastores["workspaceblobstore"]
default_datastore = default_store

# Upload the local data folder under the 'churn' path, replacing existing files.
default_store.upload(
    src_dir=DATA_DIR,
    target_path='churn',
    overwrite=True,
    show_progress=True,
)

Uploading /extdrive1/home/sasuke/dev/amlsamples/Customer_churn/data/CATelcoCustomerChurnTrainingSample.csv
Uploaded /extdrive1/home/sasuke/dev/amlsamples/Customer_churn/data/CATelcoCustomerChurnTrainingSample.csv, 1 files out of an estimated total of 1


$AZUREML_DATAREFERENCE_6d47d885c9c04471bb0715385999b37c

In [6]:
# Reuse the cluster name from the configuration cell instead of repeating the
# literal, so the two cannot drift apart.
cluster_name = CLUSTER_NAME

try:
    # Attach to the compute target if the workspace already has it.
    cluster = ComputeTarget(ws, cluster_name)
    print(cluster_name, "found")
except ComputeTargetException:
    # Lookup failed: provision a single-node STANDARD_NC6 cluster and wait for it.
    print(cluster_name, "not found, provisioning....")
    provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',
                                                                max_nodes=1)

    cluster = ComputeTarget.create(ws, cluster_name, provisioning_config)
    cluster.wait_for_completion(show_output=True)

gpu-cluster found


In [7]:
# Conda packages for the run configuration handed to the preprocessing step.
cd = CondaDependencies()
for conda_pkg in ('pandas', 'matplotlib', 'numpy', 'scikit-learn'):
    cd.add_conda_package(conda_pkg)

run_config = RunConfiguration(conda_dependencies=cd,
                              framework="python")

# Run on the cluster, inside Docker on the default GPU base image, with
# user-managed dependencies switched off.
run_config.target = cluster
run_config.environment.python.user_managed_dependencies = False
run_config.environment.docker.enabled = True
run_config.environment.docker.base_image = DEFAULT_GPU_IMAGE

In [8]:
# PyTorch estimator that runs the training entry script from SCRIPT_DIR on the
# GPU cluster, with the extra conda/pip packages listed below.
estimator = PyTorch(
    source_directory=SCRIPT_DIR,
    entry_script='svdkl_entry.py',
    compute_target=cluster,
    use_gpu=True,
    conda_packages=['pandas', 'numpy', 'scikit-learn'],
    pip_packages=['gpytorch'],
)

In [9]:
# Randomly sampled search space. The keys look like the command-line flags of
# the training entry script (presumably svdkl_entry.py — verify against it).
search_space = {
    '--batch-size': choice(512, 1024),
    '--epochs': choice(500),
    '--neural-net-lr': loguniform(-6, -2),
    '--likelihood-lr': loguniform(-6, -2),
    '--grid-size': choice(32, 64),
    '--grid-bounds': choice(-1, 0),
    '--latent-dim': choice(2),
    '--num-mixtures': choice(2, 4, 6, 8),
}
ps = RandomParameterSampling(search_space)

# Early-termination policy applied to the child runs.
early_termination_policy = BanditPolicy(slack_factor=0.1, evaluation_interval=10)

# HyperDrive configuration: maximise the 'auc' metric over up to 200 runs,
# at most 10 of them concurrently.
hd_config = HyperDriveRunConfig(
    estimator=estimator,
    hyperparameter_sampling=ps,
    policy=early_termination_policy,
    primary_metric_name='auc',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=200,
    max_concurrent_runs=10,
)

In [10]:
# Reference to the 'churn' path on the datastore (the upload target used earlier).
input_dir = DataReference(
    datastore=default_store,
    data_reference_name="input_data",
    path_on_datastore="churn",
)

# Intermediate pipeline data: written by the preprocessing step and consumed
# by the HyperDrive step.
processed_dir = PipelineData(
    name='processed_data',
    datastore=default_store,
)

In [11]:
# Step 1: run preprocess.py, reading from input_dir and writing to processed_dir.
# The compute target is passed by name here, not as the `cluster` object.
pre_processing = PythonScriptStep(
    name='preprocess dataset',
    script_name='preprocess.py',
    arguments=[
        '--input_path', input_dir,
        '--output_path', processed_dir,
    ],
    inputs=[input_dir],
    outputs=[processed_dir],
    compute_target=cluster_name,
    runconfig=run_config,
)

# Step 2: hyperparameter search; it takes processed_dir as its input and passes
# it to the entry script as --data-folder.
hd_step = HyperDriveStep(
    name="hyper parameters tunning",
    hyperdrive_run_config=hd_config,
    estimator_entry_script_arguments=['--data-folder', processed_dir],
    inputs=[processed_dir],
)

In [12]:
# Only hd_step is listed; the recorded output shows the preprocessing step is
# created as well (hd_step takes processed_dir, which that step outputs).
pipeline = Pipeline(
    workspace=ws,
    steps=[hd_step],
    default_datastore=default_store,
)

# NOTE(review): the experiment name literal differs in case from
# EXPERIMENT_NAME ("customer_churn") — confirm which one is intended.
experiment = Experiment(ws, 'Customer_churn')
pipeline_run = experiment.submit(pipeline)

Created step hyper parameters tunning [097240ea][e55fbda4-a5cf-41c1-ba79-195dd6fe1cdd], (This step is eligible to reuse a previous run's output)
Created step preprocess dataset [aead4ab0][263a9a66-b243-4104-a239-6515b1f2f9cb], (This step is eligible to reuse a previous run's output)
Using data reference input_data for StepId [26e22a39][42a849d1-76be-4501-8196-ebeb90431857], (Consumers of this data are eligible to reuse prior runs.)
Submitted pipeline run: cf188257-60b9-4c85-8a3b-7e72bfd8115f


In [13]:
# Show the run-details widget for the submitted pipeline run.
run_widget = RunDetails(pipeline_run)
run_widget.show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

In [25]:
# Presumably requests cancellation of the submitted pipeline run.
# NOTE(review): with "Restart & Run All" this executes right after the
# submission cell — confirm this cell is meant to stay in the notebook.
pipeline_run.cancel()

In [42]:
# Scratch check: display the relative path formed from 'churn' and FILE_NAME.
os.path.join('churn',FILE_NAME)

'churn/CATelcoCustomerChurnTrainingSample.csv'

In [60]:
def mychoice(*options):
    """Build the serialized "choice" expression for a discrete set of options.

    Scratch helper used in this notebook to inspect the expression produced
    for tuple-valued options (presumably mirroring the SDK's ``choice``
    helper — verify against the installed SDK).

    :param options: Either a single list, a single non-empty ``range``, or
        any number of comma separated values to choose from.
    :return: ``["choice", [values]]`` where ``values`` is the list of options.
        A list argument is embedded as-is (not copied); a range is expanded
        into a list.
    :rtype: list
    :raises AzureMLException: If no option is given, if a list or range is
        combined with other arguments, or if the range is empty.
    """
    if not options:
        raise AzureMLException("Please specify an input for choice.")

    error_msg = "Choice only accepts single list, single range() or any number of arbitrary comma separated inputs."
    for item in options:
        is_range = isinstance(item, range)
        if not (is_range or isinstance(item, list)):
            continue
        # A list/range must be the only argument; an empty range is rejected
        # (an empty list is accepted, as before).
        if len(options) > 1 or (is_range and not item):
            raise AzureMLException(error_msg)
        return ["choice", [list(item) if is_range else item]]

    # Plain comma separated values: the options themselves form the list.
    return ["choice", [list(options)]]

In [62]:
# Scratch check: display the expression mychoice builds for tuple-valued options.
mychoice((-1,1),(0,1))


['choice', [[(-1, 1), (0, 1)]]]

In [65]:
# Scratch check: pass the same tuple-valued options to `choice` (presumably
# brought in by the wildcard hyperdrive import) inside a sampling object.
ps2 = RandomParameterSampling(
    {'test':choice((-1,1),(0,1))})


In [66]:
# Scratch check: display the sampling object's repr.
ps2

<azureml.train.hyperdrive.sampling.RandomParameterSampling at 0x7fe42c00db00>

In [68]:
# Scratch check: display the sampling object's SAMPLING_NAME attribute.
ps2.SAMPLING_NAME

'RANDOM'