In [18]:
from azure.ai.ml import load_component
from azure.ai.ml.dsl import pipeline
from azure.ai.ml import MLClient, Input
from azure.identity import DefaultAzureCredential, EnvironmentCredential
from azure.ai.ml.entities import AmlCompute
from dotenv import load_dotenv
import os
import pandas as pd

**Init variables**

In [44]:
# Workspace coordinates. Each value is taken from an environment variable when
# one is set and otherwise falls back to the original empty placeholder, so the
# notebook can be run without editing this cell or committing identifiers.
# NOTE: the variable name `subcription` (sic) is kept because a later cell
# passes it to get_azure_credential().
subcription = os.environ.get("AZURE_SUBSCRIPTION_ID", "")
resource_group = os.environ.get("AZURE_RESOURCE_GROUP", "")
workspace_name = os.environ.get("AZURE_ML_WORKSPACE_NAME", "")
# Local CSV file that is later wrapped in an Input and fed to the pipeline.
uri_file_path = "./data/water_potability.csv"

In [19]:
def get_comput_target(ml_client, name="cpu-cluster", family='Standard_DS2_v2'):
    """Return the compute cluster called ``name``, creating it if the lookup fails.

    Args:
        ml_client: Client exposing ``compute.get`` and
            ``compute.begin_create_or_update``.
        name: Name of the compute cluster to look up or create.
        family: VM size passed as ``size`` when a new cluster is created.

    Returns:
        The object returned by ``ml_client.compute.get`` when the lookup
        succeeds, otherwise the result of the create-or-update operation.
    """
    try:
        # Reuse the compute target if it already exists.
        cpu_cluster = ml_client.compute.get(name)
    except Exception:
        # NOTE(review): any lookup failure (not only "not found") leads to a
        # creation attempt; consider narrowing the exception type.
        cluster_spec = AmlCompute(
            name=name,
            type="amlcompute",
            size=family,
            min_instances=0,
            max_instances=4,
            idle_time_before_scale_down=180,
            tier="Dedicated",
        )
        # Block until provisioning finishes so the cluster can be used right away.
        cpu_cluster = ml_client.compute.begin_create_or_update(cluster_spec).result()

    # The original fell off the end here, so callers always received None.
    return cpu_cluster

In [20]:

def get_azure_credential(subscription_id, resource_group, workspace):
    """Create an ``MLClient`` authenticated with ``DefaultAzureCredential``.

    When all three identifiers are provided the client is built from them
    directly. If any of them is empty, the workspace is resolved from a
    ``config.json`` file via ``MLClient.from_config`` instead.

    Args:
        subscription_id: Azure subscription ID, or an empty value to use config.json.
        resource_group: Resource group name, or an empty value to use config.json.
        workspace: Workspace name, or an empty value to use config.json.

    Returns:
        The constructed ``MLClient``.
    """
    credential = DefaultAzureCredential()

    if subscription_id and resource_group and workspace:
        # Explicit coordinates: no config.json is needed. The original built a
        # from_config client first and then discarded it.
        return MLClient(credential, subscription_id, resource_group, workspace)

    # Placeholders left blank: fall back to the workspace described in config.json.
    return MLClient.from_config(credential)

# Build the workspace client from the values set in the "Init variables" cell.
ml_client = get_azure_credential(subcription, resource_group, workspace_name)
# Alias of the local CSV path; used later as the path of the pipeline's Input.
data_store_uri_file = uri_file_path

Found the config file in: /config.json


In [6]:
# Look up the "cpu-cluster" compute (a creation is attempted if the lookup
# fails). The pipeline below refers to the same name via default_compute.
compute_target = get_comput_target(ml_client, name="cpu-cluster")

In [38]:
# Load every pipeline step from its YAML component definition.
# Data preparation steps: cleaning, then splitting.
clean_component = load_component(source="./components/clean-component/clean.yml")
split_component = load_component(source="./components/split-component/split.yml")

# Training steps, one per model. NOTE: "decission" is the spelling used in the
# path itself; do not correct it here without renaming the directory and file.
tree_regression_component = load_component(source="./components/decission-trees-component/decission_trees.yml")
logistic_regression_component = load_component(source="./components/logistic-regression-component/logistic_regression.yml")

# Scoring step: both names are loaded from the same score.yml definition.
logistic_score_component = load_component(source="./components/score-component/score.yml")
tree_score_component = load_component(source="./components/score-component/score.yml")

# Evaluation step: both names are loaded from the same eval.yml definition.
logistic_eval_component = load_component(source="./components/eval-component/eval.yml")
tree_eval_component = load_component(source="./components/eval-component/eval.yml")




In [39]:
# Define a pipeline that cleans and splits the input data, then trains, scores
# and evaluates two models (logistic regression and decision tree) on the same
# train/test split. Returns a dict with one evaluation output per model:
# "pipeline_eval_output" (logistic regression) and "tree_pipeline_eval_output"
# (decision tree).
@pipeline(
    default_compute='cpu-cluster',
)
def water_potability_decision_tree_dummy(pipeline_input_data):
    # Shared data preparation.
    clean_node = clean_component(training_data=pipeline_input_data)
    split_node = split_component(clean_data=clean_node.outputs.model_output)

    # Train both models on the same training split.
    logistic_regression = logistic_regression_component(
        training_data=split_node.outputs.training_data
    )
    tree_regression = tree_regression_component(
        training_data=split_node.outputs.training_data
    )

    # Score each model on the held-out split. The tree branch now uses the
    # tree_* components that were loaded for it; they come from the same YAML
    # files as the logistic_* ones, which the original reused here.
    tree_score_node = tree_score_component(
        test_data=split_node.outputs.testing_data,
        model_input=tree_regression.outputs.model_output,
    )
    score_node = logistic_score_component(
        test_data=split_node.outputs.testing_data,
        model_input=logistic_regression.outputs.model_output,
    )

    # Evaluate each model's scoring result against the same test data.
    eval_node = logistic_eval_component(
        scoring_result=score_node.outputs.score_output,
        test_data=split_node.outputs.testing_data,
    )
    tree_eval_node = tree_eval_component(
        scoring_result=tree_score_node.outputs.score_output,
        test_data=split_node.outputs.testing_data,
    )

    return {
        "pipeline_eval_output": eval_node.outputs.eval_output,
        "tree_pipeline_eval_output": tree_eval_node.outputs.eval_output,
    }


In [40]:


# Create a pipeline job definition (nothing is submitted in this cell).
# The local CSV path is wrapped in a uri_file Input for the pipeline.
water_potability_ds =  Input(type="uri_file", path=data_store_uri_file)

# Calling the @pipeline-decorated function builds the job from that input.
pipeline_job = water_potability_decision_tree_dummy(pipeline_input_data=water_potability_ds)


In [41]:
# Submit the pipeline job under the given experiment name. The name
# `pipeline_job` is rebound to the object returned by the service.
pipeline_job = ml_client.jobs.create_or_update(
    pipeline_job, experiment_name="pipeline_water_potability_dummy"
)
# Bare last expression: displays the submitted job's summary in the cell output.
pipeline_job

[32mUploading decission_trees_src (0.0 MBs): 100%|██████████| 1949/1949 [00:00<00:00, 44953.72it/s]
[39m

[32mUploading eval_src (0.0 MBs): 100%|██████████| 1644/1644 [00:00<00:00, 20882.10it/s]
[39m



Experiment,Name,Type,Status,Details Page
pipeline_water_potability_dummy,helpful_sand_8q9jvm4ws4,pipeline,Preparing,Link to Azure Machine Learning studio


In [42]:
# Wait until the job completes, streaming its progress to the cell output.
# The job is addressed by the name assigned when it was submitted.
ml_client.jobs.stream(pipeline_job.name)


RunId: helpful_sand_8q9jvm4ws4
Web View: https://ml.azure.com/runs/helpful_sand_8q9jvm4ws4?wsid=/subscriptions/46169265-43c5-42f4-b171-b27bdd8e5afa/resourcegroups/rchoque/workspaces/rc-ml-test

Execution Summary
RunId: helpful_sand_8q9jvm4ws4
Web View: https://ml.azure.com/runs/helpful_sand_8q9jvm4ws4?wsid=/subscriptions/46169265-43c5-42f4-b171-b27bdd8e5afa/resourcegroups/rchoque/workspaces/rc-ml-test



In [43]:
# Download all the outputs of the job into ./pipeline_output
# (all=True requests everything rather than a single named output).
output = ml_client.jobs.download(name=pipeline_job.name, download_path='./pipeline_output', all=True)

Downloading artifact azureml://subscriptions/46169265-43c5-42f4-b171-b27bdd8e5afa/resourcegroups/rchoque/workspaces/rc-ml-test/datastores/workspaceblobstore/paths/azureml/855d6893-2fa0-471c-b98c-2391376367fa/eval_output/ to pipeline_output/named-outputs/pipeline_eval_output
Downloading artifact azureml://subscriptions/46169265-43c5-42f4-b171-b27bdd8e5afa/resourcegroups/rchoque/workspaces/rc-ml-test/datastores/workspaceblobstore/paths/azureml/1b771380-7aef-457d-bccc-3caa21f4ab49/eval_output/ to pipeline_output/named-outputs/tree_pipeline_eval_output
Downloading artifact azureml://datastores/workspaceartifactstore/paths/ExperimentRun/dcid.helpful_sand_8q9jvm4ws4/ to pipeline_output/artifacts
Ran into a deserialization error. Ignoring since this is failsafe deserialization
Traceback (most recent call last):
  File "/anaconda/envs/azureml_py38/lib/python3.8/site-packages/msrest/serialization.py", line 1509, in failsafe_deserialize
    return self(target_obj, data, content_type=content_type