# AutoML Classification experiment using Remote-AML-Compute and AML-Datasets
## Data: German credit dataset loaded from Azure ML Dataset

##  Get Azure ML Workspace to use

In [6]:
import logging

from matplotlib import pyplot as plt
import pandas as pd
import os

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.core.dataset import Dataset
from azureml.train.automl import AutoMLConfig

In [8]:
# Connect to the Azure ML workspace via Workspace.from_config() and display
# a small transposed table identifying it.
ws = Workspace.from_config()

# Identifying properties of the workspace, keyed by the label shown in the table.
output = {
    'Subscription ID': ws.subscription_id,
    'Workspace': ws.name,
    'Resource Group': ws.resource_group,
    'Location': ws.location,
}

# Show full cell contents instead of truncating long values. `None` is the
# supported "no limit" value; the old `-1` sentinel was deprecated in pandas 1.0
# and is rejected by newer releases. Fall back to `-1` only for pre-1.0 pandas,
# whose option validator accepts integers only.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# A single-row frame with an empty index label; transposed so each property is a row.
outputDf = pd.DataFrame(data=output, index=[''])
outputDf.T

Unnamed: 0,Unnamed: 1
Subscription ID,7d48758f-d40b-4252-854c-e7d8f2ed7645
Workspace,amlworkspacesahiep2141
Resource Group,MCW_Synapse
Location,westeurope


## Load data from Azure ML Datasets 
Pandas DataFrame only used to check out the data

In [10]:
# Load the dataset stored in the workspace under the name 'german-credit'
aml_dataset = ws.datasets['german-credit']
# Materialize it as a pandas DataFrame only to take a sneak peek at the data and schema;
# `aml_dataset` itself is what the later cells transform and split.
df = aml_dataset.to_pandas_dataframe()
# Display the first five rows as the cell output
df.head(5)

Unnamed: 0,Sno,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [11]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns;
# the pandas DataFrame is used only for inspection.
df.describe()

Unnamed: 0,Sno,Age,Job,Credit amount,Duration
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,499.5,35.546,1.904,3271.258,20.903
std,288.819436,11.375469,0.653614,2822.736876,12.058814
min,0.0,19.0,0.0,250.0,4.0
25%,249.75,27.0,2.0,1365.5,12.0
50%,499.5,33.0,2.0,2319.5,18.0
75%,749.25,42.0,2.0,3972.25,24.0
max,999.0,75.0,3.0,18424.0,72.0


## Clean up the initial dataset 
#### (Using AML Tabular Dataset .drop_columns() method )

In [12]:
# Dropping Sno column since it is merely an identifier
# NOTE(review): `aml_dataset` is rebound to the trimmed dataset here, so re-running
# this cell operates on a dataset that no longer has 'Sno' — confirm drop_columns
# tolerates a missing column, or re-run the load cell first.
aml_dataset = aml_dataset.drop_columns(['Sno'])
# Refresh the inspection DataFrame so it reflects the dropped column
df = aml_dataset.to_pandas_dataframe()
# Display summary statistics of the remaining numeric columns
df.describe()

Unnamed: 0,Age,Job,Credit amount,Duration
count,1000.0,1000.0,1000.0,1000.0
mean,35.546,1.904,3271.258,20.903
std,11.375469,0.653614,2822.736876,12.058814
min,19.0,0.0,250.0,4.0
25%,27.0,2.0,1365.5,12.0
50%,33.0,2.0,2319.5,18.0
75%,42.0,2.0,3972.25,24.0
max,75.0,3.0,18424.0,72.0


## Split the original AML Tabular Dataset into train/test AML Tabular Datasets (using the AML Dataset `random_split` function)

In [14]:
# Partition the Azure ML tabular dataset with its own random_split method.
# 0.9 is the split percentage handed to the API and the fixed seed keeps the
# partition reproducible between runs. API reference:
# https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset?view=azure-ml-py#random-split-percentage--seed-none-
train_dataset, test_dataset = aml_dataset.random_split(percentage=0.9, seed=1)

# Pandas copies of both partitions; the training copy is for inspection only,
# the test copy is reused further down when scoring the best model.
train_dataset_df, test_dataset_df = (
    train_dataset.to_pandas_dataframe(),
    test_dataset.to_pandas_dataframe(),
)

## Get (or create) the remote AML compute target

In [17]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the CPU cluster to reuse, or to create when it is missing
cpu_cluster_name = "cpu-cluster"

# Look the cluster up first; a ComputeTargetException from the lookup is
# treated as "not found" and triggers creation of a new cluster.
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS12_V2',
        max_nodes=6,
    )
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Wait until the (existing or newly created) cluster reports completion,
# echoing its status while waiting.
compute_target.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## List and select primary metric to drive the AutoML classification problem


List of possible primary metrics is here:
https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-train#primary-metric

In [19]:
from azureml.train import automl

# Get a list of valid metrics for your given task
# (the returned list is displayed as the cell output; one of these names is
# what the AutoML settings expect as "primary_metric")
automl.utilities.get_primary_metrics('classification')

['accuracy',
 'precision_score_weighted',
 'norm_macro_recall',
 'average_precision_score_weighted',
 'AUC_weighted']

## Define AutoML Experiment settings (With AML Remote Compute)

In [20]:
# Tunable AutoML settings, kept separate so they can be unpacked into AutoMLConfig.
automl_settings = dict(
    n_cross_validations=5,
    primary_metric='accuracy',
    enable_early_stopping=True,
    # This is a limit for testing purposes; increase it as per cluster size.
    max_concurrent_iterations=2,
    # This is a time limit for testing purposes; remove it for real use cases,
    # since it drastically limits the ability to find the best model possible.
    experiment_timeout_hours=0.25,
    verbosity=logging.INFO,
    featurization='auto',
)

# Classification experiment trained on the remote compute target, using the
# training split and "Risk" as the label column.
automl_config = AutoMLConfig(
    task='classification',
    debug_log='automl_errors.log',
    compute_target=compute_target,
    training_data=train_dataset,
    label_column_name="Risk",
    **automl_settings
)

## Run Experiment (on AML Remote Compute) with multiple child runs under the covers

In [21]:
from datetime import datetime
from azureml.core import Experiment

# Build an experiment name from the current month-day-year-hour, so runs
# submitted within the same hour share one experiment name.
now = datetime.now()
time_string = now.strftime("%m-%d-%Y-%H")
experiment_name = "credit-automl-remote-{0}".format(time_string)
print(experiment_name)

experiment = Experiment(workspace=ws, name=experiment_name)

# Submit the AutoML configuration; show_output=True prints progress in the cell.
run = experiment.submit(automl_config, show_output=True)

04-18-2021-19
credit-automl-remote-04-18-2021-19
Running on remote.
No run_configuration provided, running on cpu-cluster with default configuration
Running on remote compute: cpu-cluster
Parent Run ID: AutoML_098df595-23b7-4148-bebf-279fd95a12f8

Something went wrong while printing the experiment progress but the run is still executing on the compute target. 
Please check portal for updated status: https://ml.azure.com/experiments/credit-automl-remote-04-18-2021-19/runs/AutoML_098df595-23b7-4148-bebf-279fd95a12f8?wsid=/subscriptions/7d48758f-d40b-4252-854c-e7d8f2ed7645/resourcegroups/MCW_Synapse/workspaces/amlworkspacesahiep2141
Manual run timing: --- 434.9810309410095 seconds needed for running the whole Remote AutoML Experiment ---


## Explore results with Widget
#### Widget for Monitoring Runs

The widget will first report a "loading" status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.

**Note:** The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details

In [23]:
from azureml.widgets import RunDetails
# Render the monitoring widget for the submitted AutoML run
RunDetails(run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

## Retrieve the 'Best Model' (Scikit-Learn model)

In [None]:
# get_output() yields a pair, unpacked here as the best run and its fitted model;
# `fitted_model` is what the prediction cells below call .predict() on.
best_run, fitted_model = run.get_output()
# Print both objects for a quick look at what was selected
print(best_run)
print(fitted_model)

## Make Predictions

### Extract X values (feature columns) and y labels from the test dataset (kept as pandas objects) for predicting

In [None]:
import pandas as pd

# Separate the label column from the features WITHOUT mutating test_dataset_df.
# The previous `pop('Risk')` removed the column in place and then aliased the
# same frame as x_test_df, so the cell behaved differently on a second run and
# silently changed test_dataset_df for every other consumer.
if 'Risk' in test_dataset_df.columns:
    # Label/y column as a Series
    y_test_df = test_dataset_df['Risk']

# Feature columns only; errors='ignore' keeps the cell working when the label
# column has already been removed from the frame (same tolerance as the guard above).
x_test_df = test_dataset_df.drop(columns=['Risk'], errors='ignore')

### Make the actual Predictions

In [None]:
# Try the best model: predict labels for the held-out test features
y_predictions = fitted_model.predict(x_test_df)

# Print only the first ten predictions to keep the output short
print('10 predictions: ')
print(y_predictions[:10])

In [None]:
# Sanity check: show the shape of the predictions returned for the test set
y_predictions.shape

### Calculate the Accuracy with Test Dataset (Not used for training)

In [None]:
from sklearn.metrics import accuracy_score

# Compare the true test labels with the model's predictions; the score is
# displayed as the cell's output value below the printed caption.
print('Accuracy:')
accuracy_score(y_test_df, y_predictions)