In [2]:
from azureml.core import Workspace, Experiment

ws = Workspace.get(name="quick-starts-ws-134468")
exp = Experiment(workspace=ws, name="udacity-project-1")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-134468
Azure region: southcentralus
Subscription id: 1b944a9b-fdae-4f97-aeb1-b7eea0beac53
Resource group: aml-quickstarts-134468


In [3]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###
cpu_cluster_name = "compute-project1"
vm_size = "Standard_D2_V2"
try:
    compute_target = ComputeTarget(workspace = ws, name= cpu_cluster_name)
    print("Found already existing")
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size=vm_size, max_nodes=4)
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True)

Found already existing

Running


In [4]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os

# Specify parameter sampler
ps = RandomParameterSampling({
    "--C": uniform(0.001, 100),
    "--max_iter": choice(10, 20, 30, 40,50,60,70,80,90,100)
}
)
# Specify a Policy
policy = BanditPolicy(evaluation_interval=5, slack_factor=0.5)

if "training" not in os.listdir():
    os.mkdir("./training")

# Create a SKLearn estimator for use with train.py
est = SKLearn(source_directory=".", compute_target=compute_target, entry_script="train.py")

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(hyperparameter_sampling=ps,
                                    primary_metric_name="Accuracy", 
                                    estimator=est, 
                                    policy=policy,
                                    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                    max_total_runs=6)

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.


In [5]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

hypdrive_run = exp.submit(hyperdrive_config, show_output=True)



In [6]:
RunDetails(hypdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [7]:
import joblib
# Get your best run and save the model from that run.

hd_best_run = hypdrive_run.get_best_run_by_primary_metric()
hd_best_model = hd_best_run.register_model(model_name="hd_best_model" , model_path="./")

In [8]:
hd_best_run

Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project-1,HD_3854fa53-9812-4498-afcd-043ee533c4f7_0,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [9]:
best_run_metrics = hd_best_run.get_metrics()
parameter_values = hd_best_run.get_details()

print("Best Run Id: ", hd_best_run.id)
print("Accuracy: ", best_run_metrics["Accuracy"])
print("Parameters: ", parameter_values["runDefinition"]["arguments"])

Best Run Id:  HD_3854fa53-9812-4498-afcd-043ee533c4f7_0
Accuracy:  0.9
Parameters:  ['--C', '72.00660149628659', '--max_iter', '60']


In [10]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

ds = TabularDatasetFactory.from_delimited_files("https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv")

In [11]:
from train import clean_data

# Use the clean_data function to clean your data.
x, y = clean_data(ds)

In [12]:
print(type(x))
print(type(y))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [13]:
import pandas as pd
y_df = pd.DataFrame(y, columns=['y'])
y_df

Unnamed: 0,y
0,0
1,0
2,0
3,0
4,0
...,...
32945,0
32946,0
32947,0
32948,0


In [14]:
x_train=pd.concat([x,y_df], axis=1)
x_train

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,y
0,57,1,0,0,1,5,1,371,1,999,...,0,0,0,0,1,0,0,0,0,0
1,55,1,0,1,0,5,4,285,2,999,...,1,0,0,0,0,0,0,0,1,0
2,33,1,0,0,0,5,5,52,1,999,...,0,0,0,1,0,0,0,0,0,0
3,36,1,0,0,0,6,5,355,4,999,...,1,0,0,0,1,0,0,0,0,0
4,27,1,0,1,0,7,5,189,2,999,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32945,56,1,0,0,1,7,1,116,1,999,...,0,1,0,0,0,0,0,0,0,0
32946,37,1,0,0,1,7,5,69,7,999,...,0,0,0,0,0,0,0,1,0,0
32947,26,0,0,0,0,5,2,135,4,999,...,0,0,0,0,0,0,0,1,0,0
32948,31,0,0,0,0,4,1,386,1,999,...,0,0,0,1,0,0,0,0,0,0


In [15]:
x_train.to_csv('training/train.csv')


In [17]:
default_ds = ws.get_default_datastore()
default_ds

{
  "name": "workspaceblobstore",
  "container_name": "azureml-blobstore-c518fc2e-a074-445e-81cc-0784b3819938",
  "account_name": "mlstrg134468",
  "protocol": "https",
  "endpoint": "core.windows.net"
}

In [22]:
default_ds.upload(src_dir='training/', target_path='d/')

Uploading an estimated of 1 files
Target already exists. Skipping upload for d/train.csv
Uploaded 0 files


$AZUREML_DATAREFERENCE_55db6bc0e34e41e28b045f65e49b5134

In [25]:
from azureml.core import Dataset

final_data= Dataset.Tabular.from_delimited_files(path=[(default_ds,('d/train.csv'))])

In [26]:
final_data

{
  "source": [
    "('workspaceblobstore', 'd/train.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ]
}

In [27]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task= 'classification',
    primary_metric= 'accuracy',
    training_data= final_data,
    label_column_name= 'y',
    n_cross_validations= 5,
    compute_target = compute_target
    )

In [29]:
# Submit your automl run
automl_run = exp.submit(automl_config, show_output=True)

Running on remote.
No run_configuration provided, running on compute-project1 with default configuration
Running on remote compute: compute-project1
Parent Run ID: AutoML_5091c0ee-191c-499c-be16-66e9377a7891

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------

In [31]:
RunDetails(automl_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [32]:
automl_best_run, model= automl_run.get_output()
automl_best_run

Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project-1,AutoML_5091c0ee-191c-499c-be16-66e9377a7891_23,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [33]:
# Retrieve and save your best automl model.

automl_model= automl_best_run.register_model(model_name = 'automl_best_model', model_path='./')

In [36]:
metrics = automl_best_run.get_metrics()
parameters = automl_best_run.get_details()

print("Best run ID: ",automl_best_run.id)
print("Accuracy: ",metrics["accuracy"])
print("Optimized Parameters: ", parameters)

Best run ID:  AutoML_5091c0ee-191c-499c-be16-66e9377a7891_23
Accuracy:  0.9173899848254932
Optimized Parameters:  {'runId': 'AutoML_5091c0ee-191c-499c-be16-66e9377a7891_23', 'target': 'compute-project1', 'status': 'Completed', 'startTimeUtc': '2021-01-12T17:34:20.468865Z', 'endTimeUtc': '2021-01-12T17:35:37.361079Z', 'properties': {'runTemplate': 'automl_child', 'pipeline_id': '__AutoML_Ensemble__', 'pipeline_spec': '{"pipeline_id":"__AutoML_Ensemble__","objects":[{"module":"azureml.train.automl.ensemble","class_name":"Ensemble","spec_class":"sklearn","param_args":[],"param_kwargs":{"automl_settings":"{\'task_type\':\'classification\',\'primary_metric\':\'accuracy\',\'verbosity\':20,\'ensemble_iterations\':15,\'is_timeseries\':False,\'name\':\'udacity-project-1\',\'compute_target\':\'compute-project1\',\'subscription_id\':\'1b944a9b-fdae-4f97-aeb1-b7eea0beac53\',\'region\':\'southcentralus\',\'spark_service\':None}","ensemble_run_id":"AutoML_5091c0ee-191c-499c-be16-66e9377a7891_23","ex

In [37]:
model._final_estimator

PreFittedSoftVotingClassifier(classification_labels=None,
                              estimators=[('0',
                                           Pipeline(memory=None,
                                                    steps=[('maxabsscaler',
                                                            MaxAbsScaler(copy=True)),
                                                           ('lightgbmclassifier',
                                                            LightGBMClassifier(boosting_type='gbdt',
                                                                               class_weight=None,
                                                                               colsample_bytree=1.0,
                                                                               importance_type='split',
                                                                               learning_rate=0.1,
                                                                               max_

In [38]:
compute_target.delete()