In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n',
       )

run = exp.start_logging(snapshot_directory=None)

Workspace name: ws-1
Azure region: eastus2
Subscription id: cca320dc-f2a1-497e-8a7a-eecff48abbb9
Resource group: rg1


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
cluster_name = "compute-cluster"

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

try:
    aml_cluster = ComputeTarget(workspace=ws,name= cluster_name)
    print("Found existing cluster")
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size = "Standard_D2_V2",
                                                                                    min_nodes=0,
                                                                                   max_nodes=4)
    aml_cluster = ComputeTarget.create(ws,cluster_name,compute_config)
    
aml_cluster.wait_for_completion(show_output=True)
    

InProgress.........................................
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os

# Specify parameter sampler
param_space={
    '--C':choice(0.001,0.01,0.1,1,10),
    '--max_iter':choice(range(1,100,10))
}
ps = RandomParameterSampling(param_space)
# Specify a Policy
policy = BanditPolicy(slack_amount=0.2,
                     evaluation_interval=1,
                     delay_evaluation=5)

if "training" not in os.listdir():
    os.mkdir("./training")

# Setup environment for your training run
sklearn_env = Environment.from_conda_specification(name='sklearn-env', file_path='conda_dependencies.yml')

# Create a ScriptRunConfig Object to specify the configuration details of your training job
src = ScriptRunConfig(source_directory="./",
                     script="train.py",
                     environment=sklearn_env,
                      compute_target=aml_cluster,
                     arguments=['--C',1.0,'--max_iter',100])

# Create a HyperDriveConfig using the src object, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(run_config=src,
                                    hyperparameter_sampling =ps,
                                    primary_metric_name="Accuracy",
                                    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                    max_total_runs=6,
                                    max_concurrent_runs=4)

In [4]:
# Submit your hyperdrive run to the experiment and show run details with the widget.
experiment = Experiment(workspace=ws,name='hyperdrive_training')
hyperdrive_run = experiment.submit(config= hyperdrive_config, show_output=True)

In [5]:
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)


_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_64a813de-cc8b-48ca-a593-6a3d9bf594c8
Web View: https://ml.azure.com/runs/HD_64a813de-cc8b-48ca-a593-6a3d9bf594c8?wsid=/subscriptions/cca320dc-f2a1-497e-8a7a-eecff48abbb9/resourcegroups/rg1/workspaces/ws-1&tid=433635a2-449e-4337-a0ce-f02710ee940d

Streaming azureml-logs/hyperdrive.txt

[2022-11-30T09:55:29.647118][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space
[2022-11-30T09:55:30.3049315Z][SCHEDULER][INFO]Scheduling job, id='HD_64a813de-cc8b-48ca-a593-6a3d9bf594c8_0' 
[2022-11-30T09:55:30.4505783Z][SCHEDULER][INFO]Scheduling job, id='HD_64a813de-cc8b-48ca-a593-6a3d9bf594c8_1' 
[2022-11-30T09:55:30.5364564Z][SCHEDULER][INFO]Scheduling job, id='HD_64a813de-cc8b-48ca-a593-6a3d9bf594c8_2' 
[2022-11-30T09:55:30.558375][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.
[2022-11-30T09:55:30.6522950Z][SCHEDULER][INFO]Scheduling job, id='HD_64a813de-cc8b-48ca-a593-6a3d9bf594c8_3' 
[2022-11-30T09:55:30.7504550Z]

{'runId': 'HD_64a813de-cc8b-48ca-a593-6a3d9bf594c8',
 'target': 'compute-cluster',
 'status': 'Completed',
 'startTimeUtc': '2022-11-30T09:55:29.080699Z',
 'endTimeUtc': '2022-11-30T10:02:31.685062Z',
 'services': {},
 'properties': {'primary_metric_config': '{"name":"Accuracy","goal":"maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '16e32a07-7186-4b7e-bce1-9a8d34e4f84d',
  'user_agent': 'python/3.8.5 (Linux-5.15.0-1022-azure-x86_64-with-glibc2.10) msrest/0.7.1 Hyperdrive.Service/1.0.0 Hyperdrive.SDK/core.1.47.0',
  'space_size': '50',
  'score': '0.9133535660091047',
  'best_child_run_id': 'HD_64a813de-cc8b-48ca-a593-6a3d9bf594c8_5',
  'best_metric_status': 'Succeeded',
  'best_data_container_id': 'dcid.HD_64a813de-cc8b-48ca-a593-6a3d9bf594c8_5'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'configuration': None,
  'attribution': None,
  'telemetryValues': {'amlCli

In [6]:
import joblib
# Get your best run and save the model from that run.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
print("best_run_metric ",best_run.get_metrics())
print("best_run_details :",best_run.get_details())


best_run_metric  {'Regularization Strength:': 0.01, 'Max iterations:': 81, 'Accuracy': 0.9133535660091047}
best_run_details : {'runId': 'HD_64a813de-cc8b-48ca-a593-6a3d9bf594c8_5', 'target': 'compute-cluster', 'status': 'Completed', 'startTimeUtc': '2022-11-30T10:00:54.032043Z', 'endTimeUtc': '2022-11-30T10:01:12.063723Z', 'services': {}, 'properties': {'_azureml.ComputeTargetType': 'amlctrain', 'ContentSnapshotId': '16e32a07-7186-4b7e-bce1-9a8d34e4f84d', 'ProcessInfoFile': 'azureml-logs/process_info.json', 'ProcessStatusFile': 'azureml-logs/process_status.json'}, 'inputDatasets': [], 'outputDatasets': [], 'runDefinition': {'script': 'train.py', 'command': '', 'useAbsolutePath': False, 'arguments': ['--C', '1', '--max_iter', '100', '--C', '0.01', '--max_iter', '81'], 'sourceDirectoryDataStore': None, 'framework': 'Python', 'communicator': 'None', 'target': 'compute-cluster', 'dataReferences': {}, 'data': {}, 'outputData': {}, 'datacaches': [], 'jobName': None, 'maxRunDurationSeconds': 

In [7]:
for file in hyperdrive_run.get_file_names():
    print(file)

azureml-logs/hyperdrive.txt


In [8]:
best_run = hyperdrive_run.get_best_run_by_primary_metric()

In [9]:
hyperdrive_run.get_children_sorted_by_primary_metric(top=1)

[{'run_id': 'HD_64a813de-cc8b-48ca-a593-6a3d9bf594c8_5',
  'hyperparameters': '{"--C": 0.01, "--max_iter": 81}',
  'best_primary_metric': 0.9133535660091047,
  'status': 'Completed'}]

In [10]:
best_run.get_file_names()

['logs/azureml/dataprep/0/backgroundProcess.log',
 'logs/azureml/dataprep/0/backgroundProcess_Telemetry.log',
 'logs/azureml/dataprep/0/rslex.log.2022-11-30-10',
 'outputs/model.joblib',
 'system_logs/cs_capability/cs-capability.log',
 'system_logs/hosttools_capability/hosttools-capability.log',
 'system_logs/lifecycler/execution-wrapper.log',
 'system_logs/lifecycler/lifecycler.log',
 'system_logs/metrics_capability/metrics-capability.log',
 'system_logs/snapshot_capability/snapshot-capability.log',
 'user_logs/std_log.txt']

In [11]:
model=best_run.register_model(model_name='model',model_path='outputs/model.joblib')

In [12]:
best_fitted_model = best_run.download_file('outputs/model.joblib')

In [15]:
from azureml.core import Model
model = Model.register(
    workspace=ws, 
    model_name='capstone-hyperdrive-model', 
    model_path='./model.joblib'
)

Registering model capstone-hyperdrive-model


# Model Deployment


In [13]:
# from azureml.data.dataset_factory import TabularDatasetFactory

# # Create TabularDataset using TabularDatasetFactory
# # Data is available at: 
# # "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

# url =  "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

# ds = TabularDatasetFactory.from_delimited_files(path=url)

In [14]:
# from train import clean_data

# # Use the clean_data function to clean your data.
# x, y = clean_data(ds)
# from sklearn.model_selection import train_test_split
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
# x_train['outcome'] = y_train
# x_test['outcome'] = y_test

In [None]:
if "data" not in os.listdir():
    os.mkdir("./data")

In [None]:
x_train.to_csv("data/train_data.csv", index=False)
ds = ws.get_default_datastore()
ds.upload(src_dir='./data', target_path='bankmarketing', overwrite=True, show_progress=True)
 
# Upload the training data as a tabular dataset for access during training on remote compute
train_data = TabularDatasetFactory.from_delimited_files(path=ds.path('bankmarketing/train_data.csv'))


In [None]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.

automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task="classification",
    primary_metric="accuracy",
    compute_target = aml_cluster, 
    training_data=train_data,
    label_column_name='outcome',
    featurization="auto",
    enable_early_stopping = True,
    n_cross_validations=10)

In [None]:
# Submit your automl run

from azureml.core.experiment import Experiment
automl_experiment = Experiment(ws,'automl-experiment')
automl_run = automl_experiment.submit(automl_config, show_output=True)

In [None]:
# Retrieve and save your best automl model.
best_run, fitted_model = automl_run.get_output()
best_run_metrics = best_run.get_metrics()

In [None]:
import joblib
joblib.dump(fitted_model, 'model.pkl')

In [None]:
from azureml.core import Model
model = Model.register(
    workspace=ws, 
    model_name='capstone-hyperdrive-model', 
    model_path='./model.pkl'
)

In [None]:
best_run.get_details()

In [None]:
for step_ in fitted_model.named_steps:
    print(step_)

In [None]:
best_automl_model=best_run.register_model(model_name='model',model_path='outputs/model.joblib')
joblib.dump(value=best_automl_model,filename='automl_model.joblib')
best_fitted_model = best_run.download_file('outputs/automl_model.joblib')

In [None]:
best_run.get_details()


In [None]:
#aml_cluster.delete()