## Imports

In [1]:
import os
import pandas as pd
import numpy as np
import logging

from azureml.train.automl import AutoMLConfig
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.core.dataset import Dataset
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston

## Get the workspace

In [2]:
# Identify the Azure ML workspace this notebook runs against.
subscription_id = '95bcf3b7-9903-4d62-9b7b-00484a87a6cb'
resource_group = 'ResearchProject'
workspace_name = 'AutoML'

try:
    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)
    # write the details of the workspace to a configuration file to the notebook library
    ws.write_config()
    print("Workspace configuration succeeded.")
except Exception:
    # Every later cell needs `ws`. Report the failure, then re-raise so the
    # notebook stops here with the real cause instead of continuing and failing
    # later with a NameError. Catching Exception rather than using a bare
    # `except:` also lets KeyboardInterrupt / SystemExit propagate untouched.
    print("Workspace not accessible.")
    raise

Workspace configuration succeeded.


## Get compute

In [3]:
# Name of the CPU cluster to reuse, or to create when it is missing
cpu_cluster_name = "Automl-Compute"

try:
    # Look the cluster up by name; a ComputeTargetException here is treated
    # as "no such cluster yet".
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    # Not found: describe the cluster and ask the workspace to provision it.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_F2S_V2',
        max_nodes=3,
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=cpu_cluster_name,
        provisioning_configuration=compute_config,
    )
else:
    print('Found existing cluster, use it.')

# Wait on the compute target in both cases, echoing its status output.
compute_target.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Load Data

In [4]:
# Load the Boston housing data and assemble one frame: the feature columns
# (named after boston.feature_names) plus the target as a PRICE column.
boston = load_boston()

df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['PRICE'] = boston.target

In [5]:
# Per-column summary statistics (count, mean, std, min, quartiles, max),
# covering the features and the PRICE target.
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [6]:
# (rows, columns) of the assembled frame
df.shape

(506, 14)

In [7]:
# Shuffle the rows with a fixed seed so that a fresh "Restart & Run All"
# always produces the same train/test split (an unseeded shuffle yields a
# different split, and therefore different results, on every run).
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Hold out the first 100 shuffled rows for testing and train on the rest.
# .iloc makes it explicit that the slices are positional.
train_data = df.iloc[100:]
test_data = df.iloc[:100]

In [8]:
# Report the size of each split, one per line
print(
    f'Train shape: {train_data.shape}',
    f'Test shape: {test_data.shape}',
    sep='\n',
)

Train shape: (406, 14)
Test shape: (100, 14)


In [9]:
# Data source and format (Pandas (local) or TabularDataset (remote compute))

# Stage both splits as CSV files in a local 'data' folder (created if absent)
os.makedirs('data', exist_ok=True)
train_data.to_csv("data/train_data.csv", index=False)
test_data.to_csv("data/test_data.csv", index=False)

# Copy the local folder into the 'boston' path of the workspace's default datastore
ds = ws.get_default_datastore()
ds.upload(
    src_dir='./data',
    target_path='boston',
    overwrite=True,
    show_progress=True,
)

# Reference the uploaded CSVs as tabular datasets so remote compute can read them
train_dataset = Dataset.Tabular.from_delimited_files(
    path=ds.path('boston/train_data.csv')
)
test_dataset = Dataset.Tabular.from_delimited_files(
    path=ds.path('boston/test_data.csv')
)

# Name of the target column
label = "PRICE"

Uploading an estimated of 2 files
Uploading ./data\test_data.csv
Uploaded ./data\test_data.csv, 1 files out of an estimated total of 2
Uploading ./data\train_data.csv
Uploaded ./data\train_data.csv, 2 files out of an estimated total of 2
Uploaded 2 files


## Configure Experiment

In [10]:
# Experiment settings, grouped by purpose
automl_settings = {
    # Exit criteria
    "experiment_timeout_hours": 0.3,
    "enable_early_stopping": True,
    "iteration_timeout_minutes": 5,
    # Parallelism
    "max_concurrent_iterations": 4,
    "max_cores_per_iteration": -1,
    # Validation and scoring
    "n_cross_validations": 2,
    "primary_metric": 'r2_score',
    # Data featurization (automatically scaled and normalized)
    "featurization": 'auto',
    # Logging
    "verbosity": logging.INFO,
}

# Regression task on the uploaded training dataset, run on the remote cluster;
# the shared settings above are merged in as keyword arguments.
automl_config = AutoMLConfig(
    task='regression',
    debug_log='automl_errors.log',
    compute_target=compute_target,
    experiment_exit_score=1,
    enable_onnx_compatible_models=True,
    training_data=train_dataset,
    label_column_name=label,
    **automl_settings,
)

# Experiment under which the run is recorded in the workspace
experiment_name = 'Boston_AutoML'
experiment = Experiment(workspace=ws, name=experiment_name)

In [11]:
# Submit the AutoML configuration and stream its progress into the notebook
remote_run = experiment.submit(automl_config, show_output=True)
print(remote_run)

# Make sure the run has finished before asking for its result
remote_run.wait_for_completion()

# get_output() yields a (run, fitted model) pair; unpack it into the two names
automl_output = remote_run.get_output()
best_run_customized, fitted_model_customized = automl_output

Running on remote.
No run_configuration provided, running on Automl-Compute with default configuration
Running on remote compute: Automl-Compute
Parent Run ID: AutoML_85845d76-6c35-4d06-b2df-014b856e9e5e

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         High cardinality feature detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and no high cardinality features were detected.
              Learn more about high cardinality feature handling: https://aka.ms/AutomatedMLFeaturization

****************************************************************************************************

****************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description