# Automated ML

In [1]:
#Import required Dependencies
from azureml.data.dataset_factory import TabularDatasetFactory
import joblib
from azureml.train.automl import AutoMLConfig
from azureml.core import Workspace, Experiment, Dataset
from azureml.core.environment import Environment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.widgets import RunDetails
import os
import joblib
from azureml.core import Experiment
from azureml.core.model import Model

In [2]:
# Installing xgboost 0.90 in order to avoid compatibility issues when retrieving the best model
!pip install xgboost==0.90
# The kernel should be restarted after executing this cell in order to apply the changes
# Then the notebook can be executed normally



In [3]:
# Load the workspace from the local configuration and open the experiment
# named "AutoML"; `exp` is the handle later used to submit runs.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="AutoML")

# Echo the workspace coordinates, one entry per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

# Open an interactive run on the experiment.
# NOTE(review): `run` is not referenced again in the cells visible here - confirm it is still needed.
run = exp.start_logging()

# Name of the CPU cluster that will execute the AutoML iterations.
cpu_cluster_name = "autoMlCluster"

# Reuse the cluster when it already exists; otherwise provision a new one
# with vm_size='STANDARD_D12_V2' and at most 4 nodes.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D12_V2',
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
    print("New cluster is created")
else:
    print('Found existing cluster')

# Wait for the provisioning operation to finish before continuing.
cpu_cluster.wait_for_completion(show_output=True)


Workspace name: quick-starts-ws-142131
Azure region: southcentralus
Subscription id: aa7cf8e8-d23f-4bce-a7b9-1f0b4e0ac8ee
Resource group: aml-quickstarts-142131
New cluster is created
Creating....
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Dataset

### Overview
The dataset used to develop both models comes from Kaggle. It describes heart failure patients through 12 categorical and numerical features that can be used to forecast mortality caused by cardiovascular diseases (CVDs). The dataset contains a total of 299 observations.

#### Task
As mentioned above, this data is used to predict heart failures caused by CVDs, with the goal of providing an early-detection ML model based on the 12 clinical variables included in the dataset.

#### Access
To train the AutoML model, I registered the dataset from local files using the Datasets hub in Azure ML Studio.
In contrast, to train the customised model with HyperDrive, I used the following dataset URL from GitHub: https://github.com/htrismicristo/Capstone-Project-Azure-ML-Engineer-Microsoft-Udacity/blob/main/heart_failure_clinical_records_dataset.csv

In [4]:
# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required
from azureml.core import Workspace, Dataset

# Resolve the workspace from the local configuration instead of hard-coding the
# subscription, resource group and workspace name: hard-coded identifiers tie the
# notebook to one specific workspace and expose the subscription id in the source.
workspace = Workspace.from_config()

# Keep the identifier variables available for any later cell that reads them,
# now derived from the resolved workspace.
subscription_id = workspace.subscription_id
resource_group = workspace.resource_group
workspace_name = workspace.name

# Fetch the dataset registered under the name 'Heart Failure Data' and show it
# as a pandas DataFrame.
dataset = Dataset.get_by_name(workspace, name='Heart Failure Data')
dataset.to_pandas_dataframe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280,0


## AutoML Configuration

In summary, these are the main settings and parameters used for the AutoML model on the Azure platform:

* n_cross_validations = 5. It sets the number of cross validations to carry out.
* iterations = 30. It specifies how many algorithm and parameter combinations to test during the experiment.
* max_concurrent_iterations = 4. It represents the maximum number of iterations that could be performed in parallel.
* primary_metric = Accuracy. This is the metric which will be optimized for model training and selection.
* label_column_name = DEATH_EVENT. The target column, which indicates whether or not the patient died from heart failure.
* task = classification. Based on the project's goal and the nature of the data, the task is clearly a classification one.
* experiment_timeout_minutes = 30. It determines the maximum amount of time all iterations can take before the experiment terminates. 

In [5]:
# Settings forwarded to AutoMLConfig as keyword arguments: the task and the metric
# to optimise, the training data and its label column, and the experiment budget
# (timeout, number of iterations, concurrency, cross-validation folds).
automl_settings = dict(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data=dataset,
    label_column_name='DEATH_EVENT',
    iterations=30,
    max_concurrent_iterations=4,
    n_cross_validations=5,
)

# Build the AutoML configuration, targeting the CPU cluster as compute.
automl_config = AutoMLConfig(compute_target=cpu_cluster, **automl_settings)


In [6]:
# Submit the AutoML configuration to the experiment and keep the returned run
# handle in `remote_run`; show_output=True prints the run's progress below the cell.
remote_run=exp.submit(automl_config, show_output=True)


Running on remote.
No run_configuration provided, running on autoMlCluster with default configuration
Running on remote compute: autoMlCluster
Parent Run ID: AutoML_ca9f86d7-590e-439a-9ecf-6e3de59abab6

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing value

## Run Details

In [7]:
# Display the RunDetails widget for the submitted AutoML run.
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

## Best Model



In [8]:

# Retrieve the best run of the AutoML experiment together with its fitted model.
best_run_auto, fitted_model_auto = remote_run.get_output()

# Collect the metrics logged by the best run and the arguments recorded in its
# run definition.
# NOTE(review): `parameter_values` is not used in the cells visible here.
best_run_metrics = best_run_auto.get_metrics()
best_run_details = best_run_auto.get_details()
parameter_values = best_run_details['runDefinition']['arguments']

# Report the best run, its id and its accuracy.
print('The best run automl model\n', best_run_auto)

print('\nThe Id for best run is:', best_run_auto.id)
print('The Accuracy: is', best_run_metrics['accuracy'])



The best run automl model
 Run(Experiment: AutoML,
Id: AutoML_ca9f86d7-590e-439a-9ecf-6e3de59abab6_28,
Type: azureml.scriptrun,
Status: Completed)

The Id for best run is: AutoML_ca9f86d7-590e-439a-9ecf-6e3de59abab6_28
The Accuracy: is 0.8696610169491524


In [9]:
# Fetch the best run and its fitted model from the AutoML run, then persist the
# fitted model to a local joblib file.
best_run_model, fitted_model_a = remote_run.get_output()
joblib.dump(value=fitted_model_a, filename="bestModelAutoML.joblib")

# Show the best run followed by the fitted pipeline, each on its own line.
print(best_run_model, fitted_model_a, sep='\n\n'[:1])

Run(Experiment: AutoML,
Id: AutoML_ca9f86d7-590e-439a-9ecf-6e3de59abab6_28,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                  min_samples_leaf=0.035789473684210524,
                                                                                                  min_samples_split=0.19736842105263158

In [10]:
#print the estimators
from pprint import pprint

def print_model(model, prefix=""):
    """Print every step of a pipeline-like model, descending into ensembles.

    Each entry of ``model.steps`` is read as ``(name, object)``. The step name is
    printed with ``prefix`` prepended. A step object exposing both ``estimators``
    and ``weights`` is treated as an ensemble: the member names and the weights
    are pretty-printed, then each member is printed recursively with
    ``"<member name> - "`` as its prefix. Any other step object has the result of
    its ``get_params()`` pretty-printed.

    Args:
        model: Object with a ``steps`` sequence of ``(name, object)`` entries.
        prefix: Text prepended to every step name printed at this level.
    """
    for entry in model.steps:
        step_name, step_obj = entry[0], entry[1]
        print(prefix + step_name)

        is_ensemble = hasattr(step_obj, 'estimators') and hasattr(step_obj, 'weights')
        if not is_ensemble:
            # Plain step: dump its parameters and move on to the next one.
            pprint(step_obj.get_params())
            print()
            continue

        # Ensemble step: summarise the members first, then expand each of them.
        member_names = [member[0] for member in step_obj.estimators]
        pprint({'estimators': member_names, 'weights': step_obj.weights})
        print()
        for member in step_obj.estimators:
            print_model(member[1], member[0] + ' - ')

# Print every step of the fitted AutoML pipeline, including the ensemble members.
print_model(fitted_model_a)

datatransformer
{'enable_dnn': None,
 'enable_feature_sweeping': None,
 'feature_sweeping_config': None,
 'feature_sweeping_timeout': None,
 'featurization_config': None,
 'force_text_dnn': None,
 'is_cross_validation': None,
 'is_onnx_compatible': None,
 'logger': None,
 'observer': None,
 'task': None,
 'working_dir': None}

prefittedsoftvotingclassifier
{'estimators': ['11', '21', '15', '13', '17', '19'],
 'weights': [0.16666666666666666,
             0.16666666666666666,
             0.16666666666666666,
             0.16666666666666666,
             0.16666666666666666,
             0.16666666666666666]}

11 - standardscalerwrapper
{'class_name': 'StandardScaler',
 'copy': True,
 'module_name': 'sklearn.preprocessing._data',
 'with_mean': False,
 'with_std': True}

11 - randomforestclassifier
{'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 0.2,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity

In [11]:
# Print the final estimator of the fitted pipeline.
print(fitted_model_a._final_estimator)


PreFittedSoftVotingClassifier(classification_labels=None,
                              estimators=[('11',
                                           Pipeline(memory=None,
                                                    steps=[('standardscalerwrapper',
                                                            <azureml.automl.runtime.shared.model_wrappers.StandardScalerWrapper object at 0x7fa0206f0198>),
                                                           ('randomforestclassifier',
                                                            RandomForestClassifier(bootstrap=False,
                                                                                   ccp_alpha=0.0,
                                                                                   class_weight=None,
                                                                                   criterion='entropy',
                                                                                   max_depth=Non.

In [12]:
# Register the model of the best run under the name 'bestModelAutoML.joblib'.
# NOTE(review): model_path='./' presumably refers to files stored with the run rather
# than the local bestModelAutoML.joblib written by joblib.dump - confirm which
# artifacts end up in the registered model.
the_bestmodel = best_run_model.register_model(model_name='bestModelAutoML.joblib', model_path='./')

In [13]:
# Display the registered model object (workspace, name, id, version, tags, properties).
the_bestmodel

Model(workspace=Workspace.create(name='quick-starts-ws-142131', subscription_id='aa7cf8e8-d23f-4bce-a7b9-1f0b4e0ac8ee', resource_group='aml-quickstarts-142131'), name=bestModelAutoML.joblib, id=bestModelAutoML.joblib:1, version=1, tags={}, properties={})