# Automated ML with MS Azure

In [1]:
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.widgets import RunDetails
from azureml.data.dataset_factory import TabularDatasetFactory
from sklearn.model_selection import train_test_split
import pandas as pd
from azureml.core.dataset import Dataset
from azureml.train.automl.utilities import get_primary_metrics
from azureml.train.automl import AutoMLConfig
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.core.webservice import Webservice
from azureml.core.model import Model
from azureml.core.environment import Environment
from azureml.automl.core.shared import constants
import joblib
import time


## Dataset

### Source of the data set 
For this project we are using files from [Kaggle](https://www.kaggle.com/andrewmvd/heart-failure-clinical-data). In this dataset are data on cardiovascular diseases (CVDs). Which are the number one cause of death worldwide, claiming the lives of an estimated 17 million people each year. This represents approximately 31% of all deaths worldwide.
Heart failure is one of the common events caused by CVDs. This dataset contains 12 characteristics that can be used to predict mortality from heart failure.
In order for people with cardiovascular disease or at high cardiovascular risk (due to the presence of one or more risk factors such as hypertension, diabetes, hyperlipidaemia or established disease) to receive early detection and treatment, these datasets attempt to improve prediction.

### Content of the data set
The dataset contains 12 features that can be used to predict mortality from heart failure:
- age: Age of the patient
- amaemia: Decrease of red blood cells or hemoglobin
- creatinine_phosphokinase: Level of the CPK enzyme in the blood (mcg/L)
- diabetes: If the patient has diabetes
- ejection_fraction: Percentage of blood leaving the heart at each contraction
- high_blood_pressure: If the patient has hypertension
- platelets: Platelets in the blood (kiloplatelets/mL)
- serum_creatinine: Level of serum creatinine in the blood (mg/dL)
- serum_sodium: Level of serum sodium in the blood (mEq/L)
- sex: Woman or man (Gender at birth)
- smoking: patient smokes or not
- time: Follow-up period (days)

### Target
Our goal is to develop a machine learning algorithm that can detect whether a person is likely to die from heart failure. This will help in diagnosis and early prevention. For this, the above mentioned 12 features in the dataset are used to develop a model for the detection.

### Attention!
This is an experiment that was developed in the course of a test for the Udacity learning platform. Do not use this model in a medical environment or for acute indications. Always consult your doctor for medical questions or the medical emergency service in acute cases!


In [2]:
# Creating a new experiment with AutoML

ws = Workspace.from_config()
string_name_experiment = "automl_experiment_heart_failure"

experiment=Experiment(ws, string_name_experiment)

print("Workspace name: " + ws.name, 
      "Azure region: " + ws.location, 
      "Subscription ID: "  + ws.subscription_id, 
      "Resource group: " + ws.resource_group, sep = "\n")

run = experiment.start_logging()

Workspace name: quick-starts-ws-144480
Azure region: southcentralus
Subscription id: d4ad7261-832d-46b2-b093-22156001df5b
Resource group: aml-quickstarts-144480


In [3]:
# Checking and printing existing compute targets
compute_targets= ws.compute_targets
for name, ct in compute_targets.items():
    print("Name: "+ name) 
    print("Type: " +ct.type)
    print("State: "+ ct.provisioning_state)

ProjectPC ComputeInstance Succeeded
cpu-compute AmlCompute Succeeded


In [4]:
# Dfine compute claster name
compute_cluster_name= "computcluster"

#Check if compute cluster already exists

try:
    # Using exist cluster
    compute_cluster=ComputeTarget(workspace=ws, name=compute_cluster_name)
    print("Found existing cluster, use it")
except ComputeTargetException:
    #Create compute cluster
    print("Creating new cluster")
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',max_nodes=5)
    compute_cluster = ComputeTarget.create(ws, compute_cluster_name, compute_config)

# Wait for loading
compute_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it...
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [35]:
# Get data from CSV file
data=pd.read_csv ("heart_failure_clinical_records_dataset.csv")
data.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [36]:
# split into train and test datasets
train_df,test_df=train_test_split(data,shuffle=True)


train_df

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
118,65.0,1,113,1,60,1,203000.00,0.90,140,0,0,94,0
224,58.0,0,582,1,25,0,504000.00,1.00,138,1,0,205,0
43,72.0,0,127,1,50,1,218000.00,1.00,134,1,0,33,0
207,85.0,0,212,0,38,0,186000.00,0.90,136,1,0,187,0
151,62.0,0,30,1,60,1,244000.00,0.90,139,1,0,117,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,73.0,0,582,0,20,0,263358.03,1.83,134,1,0,198,1
46,51.0,0,1380,0,25,1,271000.00,0.90,130,1,0,38,1
145,50.0,0,185,0,30,0,266000.00,0.70,141,1,1,112,0
174,65.0,0,198,1,35,1,281000.00,0.90,137,1,1,146,0


In [37]:
#save train/test dataset as .csv and upload to datastore
if not os.path.isdir("data"):
    os.mkdir("data")
pd.DataFrame(train_df).to_csv("data/train_data.csv", index=False)
pd.DataFrame(test_df).to_csv("data/test_data.csv", index=False)

ds = ws.get_default_datastore()
ds.upload(src_dir="./data", target_path="heart_failure", overwrite=True, show_progress=True)

Uploading an estimated of 2 files
Uploading ./data/test_data.csv
Uploaded ./data/test_data.csv, 1 files out of an estimated total of 2
Uploading ./data/train_data.csv
Uploaded ./data/train_data.csv, 2 files out of an estimated total of 2
Uploaded 2 files


$AZUREML_DATAREFERENCE_a0b4b9dc46be479698f1790305931d50

In [38]:
#Load dataset as TabularDataset
train_data = Dataset.Tabular.from_delimited_files(path=ds.path("heart_failure/train_data.csv"))

In [39]:
get_primary_metrics("classification")

['AUC_weighted',
 'norm_macro_recall',
 'accuracy',
 'average_precision_score_weighted',
 'precision_score_weighted']

## AutoML Configuration
The AutoML Config class is a way of leveraging the AutoML SDK to automate machine learning. 

The AutoML settings file is a dictionary that specifies all the parameters controlling the experiment like experiment_timeout_minutes, whether or not to enable early stoppong, number of cross validations, the primary metric, etc.

In the AutoML Config, we specify the task, the training data to be used, the target column name, the compute target and we pass the automl settings dictionary.

In [33]:
# automl settings 
automl_settings = {
    "enable_early_stopping" : True,
    "experiment_timeout_minutes": 20,
    "n_cross_validations": 4,
    "featurization": "auto",
    "primary_metric": "accuracy",
    "verbosity": logging.INFO
}

# automl config (with onnx compatible modus)
automl_config = AutoMLConfig(
    task="classification",
    debug_log = "automl_errors.log",
    training_data=train_data,
    label_column_name="DEATH_EVENT",
    compute_target=compute_cluster,
    enable_onnx_compatible_models=True,
    **automl_settings
)

In [34]:
# Submit your experiment
remote_run = experiment.submit(automl_config,show_output=True)

Submitting remote run.
No run_configuration provided, running on cpu-compute with default configuration
Running on remote compute: cpu-compute


Experiment,Id,Type,Status,Details Page,Docs Page
heart-failure-automl,AutoML_b7740e25-c266-481a-b486-ff34d918d2bf,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

****************************************************************************************************

TYPE:         High cardinality feature detection
STATUS

## Run Details

In [40]:
from azureml.widgets import RunDetails
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [41]:
remote_run.wait_for_completion()

{'runId': 'AutoML_b7740e25-c266-481a-b486-ff34d918d2bf',
 'target': 'cpu-compute',
 'status': 'Completed',
 'startTimeUtc': '2021-05-10T20:14:49.148641Z',
 'endTimeUtc': '2021-05-10T20:41:46.77788Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '3',
  'target': 'cpu-compute',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"8ecd0f9d-ca67-4d27-a7bf-95acbba8346b\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.27.0", "azureml-train": "1.27.0", "azureml-train-restclients-hyperdrive": "1.27.0", "azureml-train-core": "1.27.0", "azureml-train-automl": "1.27.0", "azureml-train-automl-runtime": "1.27.0", "azureml-train-automl-client": "1.27.0", "azur

## Best Model

In [42]:
#get best model and print all the metrics
best_run, fitted_model = remote_run.get_output()

best_run_metrics = best_run.get_metrics() 
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)
    

f1_score_macro 0.8803631187767703
f1_score_weighted 0.8964915996023645
weighted_accuracy 0.9137485302109578
average_precision_score_weighted 0.9433829432466446
precision_score_micro 0.8974774774774774
recall_score_macro 0.8769973544973545
recall_score_weighted 0.8974774774774774
average_precision_score_micro 0.9407831339029317
precision_score_macro 0.8908576416703351
AUC_weighted 0.9389927689594356
f1_score_micro 0.8974774774774775
log_loss 0.3161240594637804
average_precision_score_macro 0.9316316915909795
norm_macro_recall 0.7539947089947091
precision_score_weighted 0.9018098989461217
accuracy 0.8974774774774774
AUC_micro 0.9431034602169737
balanced_accuracy 0.8769973544973545
AUC_macro 0.9389927689594356
matthews_correlation 0.7673002428942496
recall_score_micro 0.8974774774774774
confusion_matrix aml://artifactId/ExperimentRun/dcid.AutoML_b7740e25-c266-481a-b486-ff34d918d2bf_19/confusion_matrix
accuracy_table aml://artifactId/ExperimentRun/dcid.AutoML_b7740e25-c266-481a-b486-ff34d9

In [43]:
# Details of best model as well as the parameters of the best run
fitted_model

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                               n_jobs=1,
                                                                                               nthread=None,
                                                                                               objective='binary:logistic',
                                                      

In [44]:
best_run

Experiment,Id,Type,Status,Details Page,Docs Page
heart-failure-automl,AutoML_b7740e25-c266-481a-b486-ff34d918d2bf_19,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [45]:
#Save the best model
best_run.register_model(model_name = "automl_best_model.pkl", model_path = "./outputs/")
joblib.dump(fitted_model, filename= "outputs/automl_model.pkl")

['outputs/automl_model.pkl']

## Model Deployment

In [46]:
# Download score.py and env file
best_run.download_file("outputs/scoring_file_v_1_0_0.py", "inference/score.py")
best_run.download_file(constants.CONDA_ENV_FILE_PATH,"automl_env.yml")

In [47]:
# Register the model
model_name = best_run.properties["model_name"]
description = "An AutoML model trained on heart failure data to predict whether a death will occur or not. This is only a test!"
tags = None
model = remote_run.register_model(model_name = model_name, description = description, tags = tags)

print(remote_run.model_id)

AutoMLb7740e25c19


In [48]:
# Create inference config
script_file_name= "inference/score.py"
inference_config = InferenceConfig(entry_script=script_file_name)

aciconfig = AciWebservice.deploy_configuration(cpu_cores = 2, 
                                               memory_gb = 4, 
                                               tags = {"area": "hfData", "type": "automl_classification"}, 
                                               description = "Heart Failure Prediction (Experiment!)")

aci_service_name = "automl-heart-failure-model"
print(aci_service_name)
aci_service = Model.deploy(ws, aci_service_name, [model], inference_config, aciconfig)
aci_service.wait_for_deployment(True)
print(aci_service.state)

automl-heart-failure
Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-05-10 20:46:23+00:00 Creating Container Registry if not exists..
2021-05-10 20:46:59+00:00 Use the existing image.
2021-05-10 20:46:59+00:00 Generating deployment configuration..
2021-05-10 20:47:00+00:00 Submitting deployment to compute..
2021-05-10 20:47:24+00:00 Checking the status of deployment automl-heart-failure..
2021-05-10 20:50:50+00:00 Checking the status of inference endpoint automl-heart-failure.
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy


In [49]:
# Enable Application Insights
aci_service.update(enable_app_insights=True)

In [60]:
# State / Swagger/ Scoring information
print("State "+ aci_service.state)
print("Swagger URI " + aci_service.swagger_uri)
print("Scoring URI " + aci_service.scoring_uri)

State Healthy
Swagger URI http://0e6c64ff-efc5-4ae7-a193-7e7234d6027e.southcentralus.azurecontainer.io/swagger.json
Scoring URI http://0e6c64ff-efc5-4ae7-a193-7e7234d6027e.southcentralus.azurecontainer.io/score


In the cell below, a request is sent to the web service deployed to test it.

In [61]:
import requests
import json

# Short waiting time, it's stabler in process with this
time.sleep(10)

# URL for the web service, should be similar to:
print ("Scoring URL: "+aci_service.scoring_uri)
scoring_uri = aci_service.scoring_uri


# Two data sets are evaluated, we then receive two results back for this
data = {"data":
        [
          {
            "age": 70.0,
            "anaemia": 1,
            "creatinine_phosphokinase": 4020,
            "diabetes": 1,
            "ejection_fraction": 32,
            "high_blood_pressure": 1,
            "platelets": 234558.23,
            "serum_creatinine": 1.4,
            "serum_sodium": 125,
            "sex": 1,
            "smoking": 0,
            "time": 12
          },
          {
            "age": 65.0,
            "anaemia": 0,
            "creatinine_phosphokinase": 4221,
            "diabetes": 0,
            "ejection_fraction": 22,
            "high_blood_pressure": 0,
            "platelets": 404567.23,
            "serum_creatinine": 1.1,
            "serum_sodium": 115,
            "sex": 0,
            "smoking": 1,
            "time": 7
          },
      ]
    }
# Convert to JSON string
input_data = json.dumps(data)
with open("data.json", "w") as _f:
    _f.write(input_data)

# Set the content type
headers = {"Content-Type": "application/json"}
# If authentication is enabled, set the authorization header

# Make the request and display the response
resp = requests.post(scoring_uri, input_data, headers=headers)
print(resp.json())

{"result": [1, 1]}


In [62]:
#Print the logs of the deployed service
aci_service.get_logs()



In [64]:
# Save file in ONNX
from azureml.automl.runtime.onnx_convert import OnnxConverter
automl_best_run_onnx, automl_fitted_model_onnx = remote_run.get_output(return_onnx_model=True)
OnnxConverter.save_onnx_model(automl_fitted_model_onnx, './AutoML.onnx' )


#Delete the deployed service (wait bevor this do 5 minutes)
time.sleep(300)
aci_service.delete()

print ("Done")

WebserviceException: WebserviceException:
	Message: There is a deployment operation in flight for the Service: automl-heart-failure
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "There is a deployment operation in flight for the Service: automl-heart-failure"
    }
}