In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n',
       )

run = exp.start_logging(snapshot_directory=None)

Workspace name: *******************
Azure region: ******************
Subscription id:********************
Resource group: ****************


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
cluster_name = "demo-cluster"

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

try:
    aml_cluster = ComputeTarget(workspace=ws,name= cluster_name)
    print("Found existing cluster")
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size = "Standard_D2_V2",
                                                                                    min_nodes=0,
                                                                                   max_nodes=4)
    aml_cluster = ComputeTarget.create(ws,cluster_name,compute_config)
    
aml_cluster.wait_for_completion(show_output=True)
    

Found existing cluster
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [5]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os

# Specify parameter sampler
param_space={
    '--C':choice(0.001,0.01,0.1,1,10),
    '--max_iter':choice(range(1,100,10))
}
ps = RandomParameterSampling(param_space)
# Specify a Policy
policy = BanditPolicy(slack_amount=0.2,
                     evaluation_interval=1,
                     delay_evaluation=5)

if "training" not in os.listdir():
    os.mkdir("./training")

# Setup environment for your training run
sklearn_env = Environment.from_conda_specification(name='sklearn-env', file_path='conda_dependencies.yml')

# Create a ScriptRunConfig Object to specify the configuration details of your training job
src = ScriptRunConfig(source_directory="./",
                     script="train.py",
                     environment=sklearn_env,
                      compute_target=aml_cluster,
                     arguments=['--C',1.0,'--max_iter',100])

# Create a HyperDriveConfig using the src object, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(run_config=src,
                                    hyperparameter_sampling =ps,
                                    primary_metric_name="Accuracy",
                                    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                    max_total_runs=6,
                                    max_concurrent_runs=4)

In [37]:
# Submit your hyperdrive run to the experiment and show run details with the widget.
experiment = Experiment(workspace=ws,name='hyperdrive_training')
hyperdrive_run = experiment.submit(config= hyperdrive_config, show_output=True)

In [38]:
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)


_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_9b66ed05-6041-48ac-8332-70583b20381e
Web View: https://ml.azure.com/runs/HD_9b66ed05-6041-48ac-8332-70583b20381e?wsid=/subscriptions/bb379b6e-1466-40b0-9fa1-cadacc0825e6/resourcegroups/ml-azure-demo/workspaces/ml-azure-ws1&tid=e121a0b0-3856-4ebc-b09a-5c85afed02f9

Streaming azureml-logs/hyperdrive.txt

[2022-11-19T11:55:06.829029][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space
[2022-11-19T11:55:07.4578754Z][SCHEDULER][INFO]Scheduling job, id='HD_9b66ed05-6041-48ac-8332-70583b20381e_0' 
[2022-11-19T11:55:07.5103935Z][SCHEDULER][INFO]Scheduling job, id='HD_9b66ed05-6041-48ac-8332-70583b20381e_1' 
[2022-11-19T11:55:07.5804907Z][SCHEDULER][INFO]Scheduling job, id='HD_9b66ed05-6041-48ac-8332-70583b20381e_2' 
[2022-11-19T11:55:07.6842291Z][SCHEDULER][INFO]Successfully scheduled a job. Id='HD_9b66ed05-6041-48ac-8332-70583b20381e_0' 
[2022-11-19T11:55:07.6824320Z][SCHEDULER][INFO]Successfully scheduled a job. Id='HD_9b66ed05-6041-48ac-8332-70583b20381e_1' 
[2

{'runId': 'HD_9b66ed05-6041-48ac-8332-70583b20381e',
 'target': 'demo-cluster',
 'status': 'Completed',
 'startTimeUtc': '2022-11-19T11:55:06.267872Z',
 'endTimeUtc': '2022-11-19T11:59:53.539662Z',
 'services': {},
 'properties': {'primary_metric_config': '{"name":"Accuracy","goal":"maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'ead91f0f-a574-49ad-9273-f9d80a2cfc86',
  'user_agent': 'python/3.8.5 (Linux-5.15.0-1017-azure-x86_64-with-glibc2.10) msrest/0.7.1 Hyperdrive.Service/1.0.0 Hyperdrive.SDK/core.1.44.0',
  'space_size': '50',
  'score': '0.9141122913505311',
  'best_child_run_id': 'HD_9b66ed05-6041-48ac-8332-70583b20381e_2',
  'best_metric_status': 'Succeeded',
  'best_data_container_id': 'dcid.HD_9b66ed05-6041-48ac-8332-70583b20381e_2'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'configuration': None,
  'attribution': None,
  'telemetryValues': {'amlClient

In [39]:
import joblib
# Get your best run and save the model from that run.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
print("best_run_metric ",best_run.get_metrics())
print("best_run_details :",best_run.get_details())


best_run_metric  {'Regularization Strength:': 0.01, 'Max iterations:': 61, 'Accuracy': 0.9141122913505311}
best_run_details : {'runId': 'HD_9b66ed05-6041-48ac-8332-70583b20381e_2', 'target': 'demo-cluster', 'status': 'Completed', 'startTimeUtc': '2022-11-19T11:56:49.289322Z', 'endTimeUtc': '2022-11-19T11:57:07.64661Z', 'services': {}, 'properties': {'_azureml.ComputeTargetType': 'amlctrain', 'ContentSnapshotId': 'ead91f0f-a574-49ad-9273-f9d80a2cfc86', 'ProcessInfoFile': 'azureml-logs/process_info.json', 'ProcessStatusFile': 'azureml-logs/process_status.json'}, 'inputDatasets': [], 'outputDatasets': [], 'runDefinition': {'script': 'train.py', 'command': '', 'useAbsolutePath': False, 'arguments': ['--C', '1', '--max_iter', '100', '--C', '0.01', '--max_iter', '61'], 'sourceDirectoryDataStore': None, 'framework': 'Python', 'communicator': 'None', 'target': 'demo-cluster', 'dataReferences': {}, 'data': {}, 'outputData': {}, 'datacaches': [], 'jobName': None, 'maxRunDurationSeconds': 2592000

In [40]:
for file in hyperdrive_run.get_file_names():
    print(file)

azureml-logs/hyperdrive.txt


In [41]:
best_run = hyperdrive_run.get_best_run_by_primary_metric()

In [52]:
hyperdrive_run.get_children_sorted_by_primary_metric(top=1)

[{'run_id': 'HD_9b66ed05-6041-48ac-8332-70583b20381e_2',
  'hyperparameters': '{"--C": 0.01, "--max_iter": 61}',
  'best_primary_metric': 0.9141122913505311,
  'status': 'Completed'}]

In [48]:
best_run.get_file_names()

['logs/azureml/dataprep/0/backgroundProcess.log',
 'logs/azureml/dataprep/0/backgroundProcess_Telemetry.log',
 'logs/azureml/dataprep/0/rslex.log.2022-11-19-11',
 'outputs/model.joblib',
 'system_logs/cs_capability/cs-capability.log',
 'system_logs/hosttools_capability/hosttools-capability.log',
 'system_logs/lifecycler/execution-wrapper.log',
 'system_logs/lifecycler/lifecycler.log',
 'system_logs/metrics_capability/metrics-capability.log',
 'system_logs/snapshot_capability/snapshot-capability.log',
 'user_logs/std_log.txt']

In [47]:
model=best_run.register_model(model_name='model',model_path='outputs/model.joblib')

In [50]:
best_fitted_model = best_run.download_file('outputs/model.joblib')

In [26]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

url =  "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

ds = TabularDatasetFactory.from_delimited_files(path=url)

In [27]:
from train import clean_data

# Use the clean_data function to clean your data.
x, y = clean_data(ds)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
x_train['outcome'] = y_train
x_test['outcome'] = y_test


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [28]:
if "data" not in os.listdir():
    os.mkdir("./data")

In [29]:
x_train.to_csv("data/train_data.csv", index=False)
ds = ws.get_default_datastore()
ds.upload(src_dir='./data', target_path='bankmarketing', overwrite=True, show_progress=True)
 
# Upload the training data as a tabular dataset for access during training on remote compute
train_data = TabularDatasetFactory.from_delimited_files(path=ds.path('bankmarketing/train_data.csv'))


"Datastore.upload" is deprecated after version 1.0.69. Please use "Dataset.File.upload_directory" to upload your files             from a local directory and create FileDataset in single method call. See Dataset API change notice at https://aka.ms/dataset-deprecation.


Uploading an estimated of 1 files
Uploading ./data/train_data.csv
Uploaded ./data/train_data.csv, 1 files out of an estimated total of 1
Uploaded 1 files


In [30]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.

automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task="classification",
    primary_metric="accuracy",
    compute_target = aml_cluster, 
    training_data=train_data,
    label_column_name='outcome',
    featurization="auto",
    enable_early_stopping = True,
    n_cross_validations=10)

In [31]:
# Submit your automl run

from azureml.core.experiment import Experiment
automl_experiment = Experiment(ws,'automl-experiment')
automl_run = automl_experiment.submit(automl_config, show_output=True)

Submitting remote run.
No run_configuration provided, running on demo-cluster with default configuration
Running on remote compute: demo-cluster


Experiment,Id,Type,Status,Details Page,Docs Page
automl-experiment,AutoML_c65ee599-425d-4655-a492-de5d809cd8e7,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+------------------------------+--------------------------------+-------------------------------------

In [33]:
# Retrieve and save your best automl model.
best_run, fitted_model = automl_run.get_output()
best_run_metrics = best_run.get_metrics()

Package:azureml-automl-runtime, training version:1.46.1, current version:1.44.0
Package:azureml-core, training version:1.46.0, current version:1.44.0
Package:azureml-dataprep, training version:4.5.7, current version:4.2.2
Package:azureml-dataprep-rslex, training version:2.11.4, current version:2.8.1
Package:azureml-dataset-runtime, training version:1.46.0, current version:1.44.0
Package:azureml-defaults, training version:1.46.0, current version:1.44.0
Package:azureml-interpret, training version:1.46.0, current version:1.44.0
Package:azureml-mlflow, training version:1.46.0, current version:1.44.0
Package:azureml-pipeline-core, training version:1.46.0, current version:1.44.0
Package:azureml-responsibleai, training version:1.46.0, current version:1.44.0
Package:azureml-telemetry, training version:1.46.0, current version:1.44.0
Package:azureml-train-automl-client, training version:1.46.0, current version:1.44.0
Package:azureml-train-automl-runtime, training version:1.46.1, current version:

In [35]:
for step_ in fitted_model.named_steps:
    print(step_)

datatransformer
prefittedsoftvotingclassifier


In [36]:
best_run.register_model(model_name = "automl_model.pkl", model_path = './outputs/')

Model(workspace=Workspace.create(name='ml-azure-ws1', subscription_id='bb379b6e-1466-40b0-9fa1-cadacc0825e6', resource_group='ml-azure-demo'), name=automl_model.pkl, id=automl_model.pkl:3, version=3, tags={}, properties={})

In [66]:
best_run.get_details()


{'runId': 'HD_9b66ed05-6041-48ac-8332-70583b20381e_2',
 'target': 'demo-cluster',
 'status': 'Completed',
 'startTimeUtc': '2022-11-19T11:56:49.289322Z',
 'endTimeUtc': '2022-11-19T11:57:07.64661Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'amlctrain',
  'ContentSnapshotId': 'ead91f0f-a574-49ad-9273-f9d80a2cfc86',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'script': 'train.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--C',
   '1',
   '--max_iter',
   '100',
   '--C',
   '0.01',
   '--max_iter',
   '61'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'demo-cluster',
  'dataReferences': {},
  'data': {},
  'outputData': {},
  'datacaches': [],
  'jobName': None,
  'maxRunDurationSeconds': 2592000,
  'nodeCount': 1,
  'instanceTypes': [],
  'priority': None,
 

In [51]:
aml_cluster.delete()