## Imports

In [1]:
import os
import pandas as pd
import numpy as np
import logging

from azureml.train.automl import AutoMLConfig
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.core.dataset import Dataset
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

from sklearn.model_selection import train_test_split

## Get the workspace

In [2]:
subscription_id = '95bcf3b7-9903-4d62-9b7b-00484a87a6cb'
resource_group = 'ResearchProject'
workspace_name = 'AutoML'

try:
    ws = Workspace(subscription_id = subscription_id, resource_group = resource_group, workspace_name = workspace_name)
    # write the details of the workspace to a configuration file to the notebook library
    ws.write_config()
    print("Workspace configuration succeeded.")
except:
    print("Workspace not accessible.")

Workspace configuration succeeded.


## Get compute

In [3]:
# Choose a name for your CPU cluster
cpu_cluster_name = "Automl-Compute"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_F2S_V2', max_nodes=3)
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Load Data

In [4]:
df = pd.read_csv('../Data/wine_data.csv')

df = df.sample(frac=1).reset_index(drop=True)

train_data = df[70:]
test_data = df[:70]

In [5]:
df.head(10)

Unnamed: 0,inputs Alcohol,MalicAcid,Ash,AlcalinityOfAsh,Magnesium,TotalPhenols,flavanoids,NonflavanoidsPhenols,Proanthocyanins,ColorIntensity,Hue,OD280/OD315,Proline,Cultivar
0,12.2,3.03,2.32,19.0,96,1.25,0.49,0.4,0.73,5.5,0.66,1.83,510,2
1,11.96,1.09,2.3,21.0,101,3.38,2.14,0.13,1.65,3.21,0.99,3.13,886,1
2,13.45,3.7,2.6,23.0,111,1.7,0.92,0.43,1.46,10.68,0.85,1.56,695,2
3,12.43,1.53,2.29,21.5,86,2.74,3.15,0.39,1.77,3.94,0.69,2.84,352,1
4,12.72,1.75,2.28,22.5,84,1.38,1.76,0.48,1.63,3.3,0.88,2.42,488,1
5,12.85,1.6,2.52,17.8,95,2.48,2.37,0.26,1.46,3.93,1.09,3.63,1015,0
6,12.34,2.45,2.46,21.0,98,2.56,2.11,0.34,1.31,2.8,0.8,3.38,438,1
7,13.52,3.17,2.72,23.5,97,1.55,0.52,0.5,0.55,4.35,0.89,2.06,520,2
8,13.08,3.9,2.36,21.5,113,1.41,1.39,0.34,1.14,9.4,0.57,1.33,550,2
9,12.77,3.43,1.98,16.0,80,1.63,1.25,0.43,0.83,3.4,0.7,2.12,372,1


In [6]:
# Data source and format (Pandas (local) or TabularDataset (remote compute))
if not os.path.isdir('data'):
    os.mkdir('data')
    
# Save the train data to a csv to be uploaded to the datastore
pd.DataFrame(train_data).to_csv("data/train_data.csv", index=False)
pd.DataFrame(test_data).to_csv("data/test_data.csv", index=False)

ds = ws.get_default_datastore()
ds.upload(src_dir='./data', target_path='wine', overwrite=True, show_progress=True)

# Upload the training data as a tabular dataset for access during training on remote compute
train_dataset = Dataset.Tabular.from_delimited_files(path=ds.path('wine/train_data.csv'))
test_dataset = Dataset.Tabular.from_delimited_files(path=ds.path('wine/test_data.csv'))

label = "Cultivar"

Uploading an estimated of 2 files
Uploading ./data\test_data.csv
Uploaded ./data\test_data.csv, 1 files out of an estimated total of 2
Uploading ./data\train_data.csv
Uploaded ./data\train_data.csv, 2 files out of an estimated total of 2
Uploaded 2 files


## Configure Experiment

In [7]:
# Configure experiment settings
# Data featurization (automatically scaled and normalized)
# Exit criteria

automl_settings = {
    "experiment_timeout_hours" : 0.3,
    "enable_early_stopping" : True,
    "iteration_timeout_minutes": 5,
    "max_concurrent_iterations": 4,
    "max_cores_per_iteration": -1,
    "n_cross_validations": 2,
    "primary_metric": 'accuracy',
    "featurization": 'auto',
    "verbosity": logging.INFO,
}

automl_config = AutoMLConfig(task = 'classification',
                             debug_log = 'automl_errors.log',
                             compute_target=compute_target,
                             experiment_exit_score = 1,
                             enable_onnx_compatible_models=True,
                             training_data = train_dataset,
                             label_column_name = label,
                             **automl_settings
                            )

# Choose a name for experiment
experiment_name = 'Wine_AutoML'
experiment = Experiment(ws, experiment_name)

In [8]:
remote_run = experiment.submit(automl_config, show_output = False)

print(remote_run)

remote_run.wait_for_completion()

best_run_customized, fitted_model_customized = remote_run.get_output()

Running on remote.
Run(Experiment: Wine_AutoML,
Id: AutoML_2881bc33-8622-427d-8c5a-660de4069668,
Type: automl,
Status: NotStarted)


## Transparency

In [9]:
custom_featurizer = fitted_model_customized.named_steps['datatransformer']
df = custom_featurizer.get_featurization_summary()
pd.DataFrame(data=df)

AttributeError: 'NoneType' object has no attribute 'named_steps'

In [None]:
df = custom_featurizer.get_featurization_summary(is_user_friendly=False)
pd.DataFrame(data=df)

In [None]:
df = custom_featurizer.get_stats_feature_type_summary()
pd.DataFrame(data=df)

## Results