In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.get(name="quick-starts-ws-128450")
exp = Experiment(workspace=ws, name="first-udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-128450
Azure region: southcentralus
Subscription id: f39cb977-6a3a-445b-a26a-b9a791c5fd89
Resource group: aml-quickstarts-128450


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###
cpu_cluster_name = "udacity-project-one"
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Cluster Exist --- proceed!.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes = 4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
    cpu_cluster.wait_for_completion(show_output=True)

Cluster Exist --- proceed!.


In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os

# Specify parameter sampler
### YOUR CODE HERE ###
ps = RandomParameterSampling({
    "--C": uniform(0.0, 1.0), 
    "--max_iter": choice(50, 100, 150, 200)
})


# Specify a Policy
### YOUR CODE HERE ###
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=1)

#if "training" not in os.listdir():
#    os.mkdir("./training")
####shutil.copy('./train.py', script_folder)

# Create a SKLearn estimator for use with train.py
est = SKLearn(source_directory=os.getcwd(), compute_target=cpu_cluster, entry_script='train.py') ### YOUR CODE HERE ###

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
### YOUR CODE HERE ###
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name="Accuracy",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_concurrent_runs=4,
    max_total_runs=40
)

In [4]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

### YOUR CODE HERE ###
hyperdriverun = exp.submit(hyperdrive_config, show_output=True)
RunDetails(hyperdriverun).show()



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [5]:
import joblib
# Get your best run and save the model from that run.
### YOUR CODE HERE ###
best_run = hyperdriverun.get_best_run_by_primary_metric()
#print(hyperdrive_run.get_best_run_by_primary_metric())
best_run_metrics = best_run.get_metrics()
print('Best run Id:', best_run.id)
print('\n Accuracy:', best_run_metrics["Accuracy"])


Best run Id: HD_326ca30f-7d55-4faa-87d8-e5447d0adb0f_19

 Accuracy: 0.9139354212187424


In [6]:
os.makedirs('outputs',exist_ok=True)
joblib.dump(value=best_run.id, filename='outputs/model.joblib')

['outputs/model.joblib']

In [7]:
best_run.get_file_names()[-1]

'outputs/model.joblib'

In [8]:
# insert this after fitting the model
best_run.register_model(model_name='best_model', model_path='logs/azureml/job_release_azureml.log')

Model(workspace=Workspace.create(name='quick-starts-ws-128450', subscription_id='f39cb977-6a3a-445b-a26a-b9a791c5fd89', resource_group='aml-quickstarts-128450'), name=best_model, id=best_model:2, version=2, tags={}, properties={})

In [9]:
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core.dataset import Dataset
# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

ds = TabularDatasetFactory.from_delimited_files('https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv')### YOUR CODE HERE ###

In [10]:
from train import clean_data

# Use the clean_data function to clean your data.
### YOUR DATA OBJECT HERE ###
x, y = clean_data(ds)

In [11]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y)

In [12]:
#concatenating the training dataset
import pandas as pd
ds_am = pd.concat([x_train,y_train],axis=1)

In [13]:
#Additional optional settings:
automl_settings = {
    'enable_early_stopping': True,
    'iteration_timeout_minutes':5,
    'max_concurrent_iterations':4,
    'featurization': 'auto'
}

In [19]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data=ds_am,
    label_column_name='y',
    n_cross_validations=5)

In [20]:
# Submit your automl run

automl_run = exp.submit(automl_config,show_output=True)### YOUR CODE HERE ###

Running on local machine
Parent Run ID: AutoML_0120513d-678c-4d7a-8a4d-fc4543c98f6f

Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely p

In [21]:
automl_run

Experiment,Id,Type,Status,Details Page,Docs Page
first-udacity-project,AutoML_0120513d-678c-4d7a-8a4d-fc4543c98f6f,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [22]:
# Wait for the remote run to complete
automl_run.wait_for_completion()

{'runId': 'AutoML_0120513d-678c-4d7a-8a4d-fc4543c98f6f',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2020-11-30T11:12:17.853732Z',
 'endTimeUtc': '2020-11-30T11:44:51.178129Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'local',
  'DataPrepJsonString': None,
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.18.0", "azureml-train": "1.18.0", "azureml-train-restclients-hyperdrive": "1.18.0", "azureml-train-core": "1.18.0", "azureml-train-automl": "1.18.0", "azureml-train-automl-runtime": "1.18.0", "azureml-train-automl-client": "1.18.0", "azureml-tensorboard": "1.18.0", "azureml-telemetry": "1.18.0", "azureml-sdk": "1.18.0", "azureml-samples": "0+unknow

In [23]:
# Retrieve and save your best automl model.
best_automl_run, best_model = automl_run.get_output()
best_automl_run.register_model(model_name = "best_run_automl.pkl", model_path = './outputs/')
### YOUR CODE HERE ###

Model(workspace=Workspace.create(name='quick-starts-ws-128450', subscription_id='f39cb977-6a3a-445b-a26a-b9a791c5fd89', resource_group='aml-quickstarts-128450'), name=best_run_automl.pkl, id=best_run_automl.pkl:2, version=2, tags={}, properties={})

In [24]:
#Take a look to the parameters of the best run
best_automl_run.get_tags()

{'ensembled_iterations': '[32, 0, 1, 28, 36, 14, 34, 15, 4]',
 'ensembled_algorithms': "['XGBoostClassifier', 'LightGBM', 'XGBoostClassifier', 'XGBoostClassifier', 'XGBoostClassifier', 'RandomForest', 'XGBoostClassifier', 'SGD', 'SGD']",
 'ensemble_weights': '[0.07142857142857142, 0.07142857142857142, 0.2857142857142857, 0.07142857142857142, 0.14285714285714285, 0.14285714285714285, 0.07142857142857142, 0.07142857142857142, 0.07142857142857142]',
 'best_individual_pipeline_score': '0.9163158673384885',
 'best_individual_iteration': '32',
 'model_explanation': 'True'}

In [25]:
#Also take a look to the recommended function
from pprint import pprint

def print_model(model, prefix=""):
    for step in model.steps:
        print(prefix + step[0])
        if hasattr(step[1], 'estimators') and hasattr(step[1], 'weights'):
            pprint({'estimators': list(
                e[0] for e in step[1].estimators), 'weights': step[1].weights})
            print()
            for estimator in step[1].estimators:
                print_model(estimator[1], estimator[0] + ' - ')
        else:
            pprint(step[1].get_params())
            print()

print_model(best_model)

datatransformer
{'enable_dnn': None,
 'enable_feature_sweeping': None,
 'feature_sweeping_config': None,
 'feature_sweeping_timeout': None,
 'featurization_config': None,
 'force_text_dnn': None,
 'is_cross_validation': None,
 'is_onnx_compatible': None,
 'logger': None,
 'observer': None,
 'task': None,
 'working_dir': None}

prefittedsoftvotingclassifier
{'estimators': ['32', '0', '1', '28', '36', '14', '34', '15', '4'],
 'weights': [0.07142857142857142,
             0.07142857142857142,
             0.2857142857142857,
             0.07142857142857142,
             0.14285714285714285,
             0.14285714285714285,
             0.07142857142857142,
             0.07142857142857142,
             0.07142857142857142]}

32 - standardscalerwrapper
{'class_name': 'StandardScaler',
 'copy': True,
 'module_name': 'sklearn.preprocessing._data',
 'with_mean': False,
 'with_std': False}

32 - xgboostclassifier
{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_

In [26]:

#delete the Compute target in the notebook
cpu_cluster.delete()