# Deploy a DecisionTree model using Azure ML Python SDK 

In [1]:
# SDK v2 client class, authentication helper and SDK v1 workspace handle.
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

import azureml.core
from azureml.core import Workspace

# Credential object that is later handed to MLClient.
credential = DefaultAzureCredential()

# Load the workspace from the local config and show which one was picked up.
workspace = Workspace.from_config()
workspace_summary = "\n".join(
    [
        "Workspace name: " + workspace.name,
        "Workspace region: " + workspace.location,
        "Subscription id: " + workspace.subscription_id,
        "Resource group: " + workspace.resource_group,
    ]
)
print(workspace_summary)

Workspace name: ws001
Workspace region: eastus
Subscription id: eb1308d1-6004-4e62-aebb-ec8032a938d0
Resource group: ml


In [2]:
# Build the SDK v2 client for the same workspace that was loaded above:
# the subscription, resource group and workspace name are read from `workspace`.
workspace_scope = {
    "subscription_id": workspace.subscription_id,
    "resource_group_name": workspace.resource_group,
    "workspace_name": workspace.name,
}
ml_client = MLClient(credential=credential, **workspace_scope)
print(ml_client)

MLClient(credential=<azure.identity._credentials.default.DefaultAzureCredential object at 0x7f23b0f648e0>,
         subscription_id=eb1308d1-6004-4e62-aebb-ec8032a938d0,
         resource_group_name=ml,
         workspace_name=ws001)


## Creating a compute cluster resource and connecting it to the current workspace

In [3]:
from azure.ai.ml.entities import AmlCompute

# Name assigned to the compute cluster
cpu_compute_target = "CC00"

try:
    # Reuse the cluster when one with this name can be fetched from the workspace.
    cpu_cluster = ml_client.compute.get(cpu_compute_target)
    print(
        f"Compute resource {cpu_compute_target} already existed under compute cluster tab."
    )

except Exception:
    # NOTE(review): every failure of the lookup (not only "cluster not found")
    # ends up here and triggers a creation attempt — consider narrowing the
    # exception type once the SDK's not-found error is confirmed.
    print("Initializing new computer cluster")

    # Describe the cluster: VM size, autoscaling bounds (0 to 2 nodes) and the
    # idle time, in seconds, before nodes are scaled down.
    cpu_cluster = AmlCompute(
        name=cpu_compute_target,
        type="amlcompute",
        size="STANDARD_F4S_V2",
        min_instances=0,
        max_instances=2,
        idle_time_before_scale_down=180,
        tier="Dedicated",
    )
    print(
        f"Compute resource with name {cpu_cluster.name} will be created, using instance {cpu_cluster.size}"
    )

    # begin_create_or_update only starts the operation; wait for it so that
    # cpu_cluster holds the provisioned compute instead of a poller, matching
    # how the endpoint and deployment cells call .result().
    cpu_cluster = ml_client.compute.begin_create_or_update(cpu_cluster).result()

You already have a cluster named CC00, we'll reuse it as is.


## Dataset 

In [6]:
from azureml.core import Dataset
import pandas as pd

# Default datastore of the workspace; it is the upload target for the dataset.
datastore = workspace.get_default_datastore()

# Read the local CSV into memory (pandas is just one path into Azure ML).
df = pd.read_csv('./heart.csv')

# Register the dataframe as a named tabular dataset in the workspace.
registration_args = {
    "dataframe": df,
    "name": 'Heart_dataset',
    "description": 'The dataset for cardiovascular diseases prediction',
    "target": datastore,
}
ds = Dataset.Tabular.register_pandas_dataframe(**registration_args)


Message: rslex failed, falling back to clex.
Payload: {"pid": 9870, "source": "azureml.dataprep", "version": "4.9.5", "trace": "azureml|data|dataset_factory.py, line 655 in function register_pandas_dataframe.\nazureml|data|_loggerfactory.py, line 132 in function wrapper.\ntmp|ipykernel_9870|3551696286.py, line 11 in function <module>.", "subscription": "", "run_id": "", "resource_group": "", "workspace_name": "", "experiment_id": "", "location": "", "rslex_version": "2.16.4"}
Failed to extract subscription information, Exception=AttributeError; 'Logger' object has no attribute 'activity_info'
Failed to extract subscription information, Exception=AttributeError; 'Logger' object has no attribute 'activity_info'


## Creating environment for Model training

In [7]:
import os

# Folder that receives the conda specification; an existing folder is reused.
dependencies_dir = "./dependencies"
os.makedirs(name=dependencies_dir, exist_ok=True)

In [8]:
%%writefile {dependencies_dir}/conda.yml
# Conda specification referenced by the "aml-scikit-learn" environment that
# the training job runs in.
name: model-env
channels:
  - conda-forge
dependencies:
  # Interpreter and packages resolved by conda (all pinned except the pandas range).
  - python=3.8
  - numpy=1.21.2
  - pip=21.2.4
  - scikit-learn=0.24.2
  - scipy=1.7.1
  - pandas>=1.1,<1.2
  # Packages installed through pip inside the conda environment.
  - pip:
    - inference-schema[numpy-support]==1.3.0
    - xlrd==2.0.1
    # mlflow is imported by src/main.py for run logging and model registration.
    - mlflow== 1.26.1
    - azureml-mlflow==1.42.0
    - psutil>=5.8,<5.9
    - tqdm>=4.59,<4.60
    - ipykernel~=6.0
    # NOTE(review): matplotlib is the only dependency without a version constraint.
    - matplotlib

Overwriting ./dependencies/conda.yml


In [9]:
from azure.ai.ml.entities import Environment

custom_env_name = "aml-scikit-learn"

# Describe the environment: a base Docker image plus the conda specification
# written to the dependencies folder.
conda_spec_path = os.path.join(dependencies_dir, "conda.yml")
pipeline_job_env = Environment(
    name=custom_env_name,
    description="Custom environment for Heart Defaults pipeline",
    tags={"scikit-learn": "0.24.2"},
    conda_file=conda_spec_path,
    image="mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest",
)

# Register the environment with the workspace and keep the returned entity,
# which carries the assigned version.
pipeline_job_env = ml_client.environments.create_or_update(pipeline_job_env)

registration_message = f"Environment with name {pipeline_job_env.name} is registered to workspace, the environment version is {pipeline_job_env.version}"
print(registration_message)

Environment with name aml-scikit-learn is registered to workspace, the environment version is 3


## Creating Training Script

In [10]:
import os

# Folder that receives the training script; an existing folder is reused.
train_src_dir = "./src"
os.makedirs(name=train_src_dir, exist_ok=True)

In [17]:
%%writefile {train_src_dir}/main.py
import os
import argparse
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

def _parse_args():
    """Build the command-line parser of the training script and parse the arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, help="path to input data")
    parser.add_argument("--test_train_ratio", type=float, required=False, default=0.25)
    # Accepted so that existing job commands keep working; the decision tree
    # trained below has no learning rate, so the value is not used.
    parser.add_argument("--learning_rate", required=False, default=0.2, type=float)
    # Tree depth is a whole number, so it is parsed as int (it was float before).
    parser.add_argument("--max_depth", required=False, default=6, type=int)
    parser.add_argument("--min_samples_leaf", required=False, default=2, type=int)
    parser.add_argument("--random_state", required=False, default=42, type=int)
    parser.add_argument("--max_leaf_nodes", required=False, default=6, type=int)
    parser.add_argument("--registered_model_name", type=str, help="model name")
    return parser.parse_args()


def main():
    """Train a DecisionTreeClassifier on the heart dataset and register it via MLflow.

    Command-line arguments:
        --data: path of the CSV file to train on; falls back to "heart.csv"
            in the working directory when omitted.
        --test_train_ratio: fraction of the rows held out for evaluation.
        --learning_rate: accepted but unused.
        --max_depth, --min_samples_leaf, --max_leaf_nodes: tree hyper-parameters.
        --random_state: seed used for both the train/test split and the tree.
        --registered_model_name: name under which the model is logged,
            registered and saved.

    The CSV must contain a "target" column, which is used as the label.
    """
    args = _parse_args()

    # The context manager closes the MLflow run even when training raises.
    with mlflow.start_run():
        # enable autologging
        mlflow.sklearn.autolog()

        ###################
        #<prepare the data>
        ###################
        print(" ".join(f"{k}={v}" for k, v in vars(args).items()))

        # Honour --data; keep the previous hard-coded file as the fallback.
        data_path = args.data or "heart.csv"
        print("input data:", data_path)

        heart_df = pd.read_csv(data_path)
        print(heart_df.head())

        # Use the requested hold-out fraction and seed the split so that
        # repeated runs evaluate on the same rows.
        X_train, X_test, y_train, y_test = train_test_split(
            heart_df.drop("target", axis=1),
            heart_df["target"],
            test_size=args.test_train_ratio,
            random_state=args.random_state,
        )
        ####################
        #</prepare the data>
        ####################

        ##################
        #<train the model>
        ##################
        dtc = DecisionTreeClassifier(
            max_depth=args.max_depth,
            min_samples_leaf=args.min_samples_leaf,
            random_state=args.random_state,
            max_leaf_nodes=args.max_leaf_nodes,
        )
        dtc.fit(X_train, y_train)

        y_pred = dtc.predict(X_test)

        print(classification_report(y_test, y_pred))
        ###################
        #</train the model>
        ###################

        ##########################
        #<save and register model>
        ##########################
        # Registering the model to the workspace
        print("Registering the model via MLFlow")
        mlflow.sklearn.log_model(
            sk_model=dtc,
            registered_model_name=args.registered_model_name,
            artifact_path=args.registered_model_name,
        )

        # Saving the model to a file
        mlflow.sklearn.save_model(
            sk_model=dtc,
            path=os.path.join(args.registered_model_name, "trained_model"),
        )
        ###########################
        #</save and register model>
        ###########################


if __name__ == "__main__":
    main()

Overwriting ./src/main.py


# Create command and input


In [18]:
from azure.ai.ml import command
from azure.ai.ml import Input

registered_model_name = "heart_defaults_model"

# Values substituted into the ${{inputs.*}} placeholders of the command line.
job_inputs = {
    # Plain path string; presumably resolved relative to the uploaded ./src/
    # folder, so heart.csv is expected to be there — TODO confirm.
    "data": "./heart.csv",
    "test_train_ratio": 0.2,
    "learning_rate": 0.25,
    "registered_model_name": registered_model_name,
}

# Command line executed inside the job; main.py comes from the code folder.
job_command = "python main.py --data ${{inputs.data}} --test_train_ratio ${{inputs.test_train_ratio}} --learning_rate ${{inputs.learning_rate}} --registered_model_name ${{inputs.registered_model_name}}"

job = command(
    inputs=job_inputs,
    code="./src/",  # location of source code
    command=job_command,
    environment="aml-scikit-learn@latest",
    compute="CC00",
    experiment_name="train_model_heart_default_prediction",
    display_name="heart_default_prediction",
)

# Submit job


Using the workspace handle created earlier (`ml_client`) and the job defined above, we can submit our model training:

In [None]:
# Submit the training job and keep the returned entity so it can be tracked.
returned_job = ml_client.create_or_update(job)
print(f"Submitted job {returned_job.name}: {returned_job.studio_url}")

# Block until the job has finished while streaming its logs. The deployment
# cell further down looks up the model that this job registers, so running the
# notebook top to bottom only works when the job has completed here.
ml_client.jobs.stream(returned_job.name)

```
!!! MAKE SURE THE ABOVE JOB IS COMPLETED BEFORE PROCEEDING TO THE NEXT STEP
```

# Create an online endpoint

Now that you have a registered model (it was logged with MLflow, and this notebook does not supply a separate inference script), it's time to create your online endpoint. The endpoint name needs to be unique in the entire Azure region.

In [20]:
## Create online endpoint

import uuid

# Append the first eight hex characters of a random UUID so the name is unique.
endpoint_suffix = uuid.uuid4().hex[:8]
online_endpoint_name = "heart-endpoint-" + endpoint_suffix

In [21]:
from azure.ai.ml.entities import (
    ManagedOnlineEndpoint,
    ManagedOnlineDeployment,
    Model,
    Environment,
)

# Describe the new online endpoint: key-based authentication plus descriptive tags.
endpoint_tags = {
    "training_dataset": "heart_defaults",
    "model_type": "sklearn.DecisionTreeClassifier",
}
endpoint = ManagedOnlineEndpoint(
    name=online_endpoint_name,
    description="this is an online endpoint",
    auth_mode="key",
    tags=endpoint_tags,
)

# Start the creation and wait for it to finish before reporting the state.
creation_poller = ml_client.online_endpoints.begin_create_or_update(endpoint)
endpoint = creation_poller.result()

print(f"Endpoint {endpoint.name} provisioning state: {endpoint.provisioning_state}")

Endpoint heart-endpoint-e53a2e5b provisioning state: Succeeded


```
!!! THE ENDPOINT CREATION MAY TAKE 6-8 MINUTES
```

In [22]:
# Fetch the endpoint back from the workspace by name to confirm it was created.
endpoint = ml_client.online_endpoints.get(name=online_endpoint_name)

retrieval_message = f'Endpoint "{endpoint.name}" with provisioning state "{endpoint.provisioning_state}" is retrieved'
print(retrieval_message)

Endpoint "heart-endpoint-e53a2e5b" with provisioning state "Succeeded" is retrieved


# Deploy model to the created endpoint

In [24]:
# Collect the version numbers registered under the model name.
registered_versions = [
    int(m.version) for m in ml_client.models.list(name=registered_model_name)
]

# Without this guard an unfinished (or failed) training job surfaces as the
# opaque "max() arg is an empty sequence"; raise the same exception type with
# an actionable message instead.
if not registered_versions:
    raise ValueError(
        f"No registered version of model '{registered_model_name}' was found; "
        "make sure the training job has completed successfully before deploying."
    )

# Check latest version of the model
latest_model_version = max(registered_versions)

# Pick latest model version
model = ml_client.models.get(name=registered_model_name, version=latest_model_version)

# Online deployment named "blue" that serves the model on a single instance.
blue_deployment = ManagedOnlineDeployment(
    name="blue",
    endpoint_name=online_endpoint_name,
    model=model,
    instance_type="STANDARD_F4S_V2",
    instance_count=1,
)

# Start the deployment and wait for it to finish before reporting the state.
blue_deployment_results = ml_client.online_deployments.begin_create_or_update(blue_deployment).result()

print(
    f"Deployment {blue_deployment_results.name} provisioning state: {blue_deployment_results.provisioning_state}"
)

Check: endpoint heart-endpoint-e53a2e5b exists
data_collector is not a known attribute of class <class 'azure.ai.ml._restclient.v2022_02_01_preview.models._models_py3.ManagedOnlineDeployment'> and will be ignored


.........................................................................................

# Clean up resources
If you're not going to use the endpoint, delete it to stop using the resource. Make sure no other deployments are using an endpoint before you delete it.

In [None]:
# Wait for the deletion to finish so that a failed delete is reported in this
# cell, consistent with how the creation cells call .result() on begin_* calls.
ml_client.online_endpoints.begin_delete(name=online_endpoint_name).result()