<h1>Table of Contents<span class="tocSkip"></span></h1>
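<div class="toc"><ul class="toc-item"><li><span><a href="#Environment" data-toc-modified-id="Environment-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Environment</a></span></li><li><span><a href="#Compute-Target-and-container" data-toc-modified-id="Compute-Target-and-container-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Compute Target and container</a></span></li><li><span><a href="#Cleaning-step" data-toc-modified-id="Cleaning-step-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Cleaning step</a></span></li><li><span><a href="#Split-step" data-toc-modified-id="Split-step-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Split step</a></span></li><li><span><a href="#Train-step" data-toc-modified-id="Train-step-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Train step</a></span></li><li><span><a href="#Evaluation-step" data-toc-modified-id="Evaluation-step-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Evaluation step</a></span></li><li><span><a href="#Pipeline" data-toc-modified-id="Pipeline-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Pipeline</a></span></li><li><span><a href="#Run-pipeline" data-toc-modified-id="Run-pipeline-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Run pipeline</a></span></li><li><span><a href="#Run-Published-Pipeline" data-toc-modified-id="Run-Published-Pipeline-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Run Published Pipeline</a></span></li></ul></div>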

In [1]:
import azureml.core
from azureml.core import Experiment
from azureml.core import Workspace, Datastore, Dataset
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.core.runconfig import RunConfiguration
from azureml.data.data_reference import DataReference
from azureml.data.datapath import DataPath
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.core import PublishedPipeline
from azureml.pipeline.core.graph import PipelineParameter
from azureml.pipeline.steps import PythonScriptStep

# Environment

In [2]:
# Notebook-wide settings: where the step scripts live and which cluster runs them.
steps_dir = './pipeline_steps'
cpu_cluster_name = "cpucluster"

# Workspace handle (loaded from the local config) and the blob datastore that
# every PipelineData below writes to.
ws = Workspace.from_config()
def_blob_store = Datastore(workspace=ws, name="workspaceblobstore")

# Compute Target and container

In [3]:
# Look up the compute target named in the config cell and wait until it is ready.
cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
cpu_cluster.wait_for_completion(show_output=True)

# Packages installed into the conda environment that every step runs in.
pip_packages = [
    'azureml-dataprep[fuse,pandas]',
    'azureml.core',
]
conda_packages = [
    'scikit-learn==0.22',
    'pandas==0.24.2',
    'pyarrow==0.16.0',
]

# Run configuration shared by all pipeline steps.
run_amlcompute = RunConfiguration()
run_amlcompute.target = cpu_cluster

# Execute inside Docker, starting from the default CPU base image.
run_amlcompute.environment.docker.enabled = True
run_amlcompute.environment.docker.base_image = DEFAULT_CPU_IMAGE

# Let Azure ML build the conda environment from the dependencies declared here
# rather than relying on a pre-provisioned, user-managed one.
run_amlcompute.environment.python.user_managed_dependencies = False
run_amlcompute.environment.python.conda_dependencies = CondaDependencies.create(
    python_version='3.7.7',
    pip_packages=pip_packages,
    conda_packages=conda_packages,
)

Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


# Cleaning step

In [4]:
# Registered dataset that feeds the first step of the pipeline.
dataset_full = Dataset.get_by_name(ws, name="annonces_ds")

# Intermediate output of the cleaning step, stored on the default blob store and
# exposed to the next step as a dataset parsed from delimited files.
# NOTE(review): file_extension="" presumably lifts the extension filter on the
# files that are read — confirm against the SDK docs.
clean_ds = (
    PipelineData("dataset_clean", datastore=def_blob_store)
    .as_dataset()
    .parse_delimited_files(file_extension="")
)

# clean.py receives the input dataset's name and the output location as CLI arguments.
clean_step = PythonScriptStep(
    script_name="clean.py",
    source_directory=steps_dir,
    arguments=["--input", dataset_full.name, "--output", clean_ds],
    inputs=[dataset_full.as_named_input(dataset_full.name)],
    outputs=[clean_ds],
    compute_target=cpu_cluster,
    runconfig=run_amlcompute,
)

# Split step

In [5]:
# Intermediate outputs of the split step: one dataset per partition.
train_ds = PipelineData("dataset_train", datastore=def_blob_store).as_dataset()
valid_ds = PipelineData("dataset_valid", datastore=def_blob_store).as_dataset()

# Partition sizes are pipeline parameters so they can be overridden per run
# (see the "Run Published Pipeline" section) without rebuilding the pipeline.
train_size = PipelineParameter(name="TrainSize", default_value=800)
valid_size = PipelineParameter(name="ValidSize", default_value=200)

# split.py reads the cleaned dataset by its input name and writes both partitions.
split_step = PythonScriptStep(
    script_name="split.py",
    source_directory=steps_dir,
    arguments=[
        "--dataset", clean_ds.name,
        "--train", train_ds,
        "--valid", valid_ds,
        "--trainsize", train_size,
        "--validsize", valid_size,
    ],
    inputs=[clean_ds.as_named_input(clean_ds.name)],
    outputs=[train_ds, valid_ds],
    compute_target=cpu_cluster,
    runconfig=run_amlcompute,
    allow_reuse=True,
)

# Train step

In [6]:
# Location where train.py writes the fitted model; the evaluation step reads it back.
trained_model = PipelineData("model", datastore=def_blob_store)

# train.py gets the training partition (by name) and the model output location.
train_step = PythonScriptStep(
    script_name="train.py",
    source_directory=steps_dir,
    arguments=[
        "--dataset", train_ds.name,
        "--model", trained_model,
    ],
    inputs=[train_ds],
    outputs=[trained_model],
    compute_target=cpu_cluster,
    runconfig=run_amlcompute,
)

# Evaluation step

In [71]:
# eval.py consumes the validation partition and the trained model; it declares
# no pipeline outputs of its own.
eval_step = PythonScriptStep(
    script_name="eval.py",
    source_directory=steps_dir,
    arguments=[
        "--dataset", valid_ds.name,
        "--model", trained_model,
    ],
    inputs=[valid_ds, trained_model],
    compute_target=cpu_cluster,
    runconfig=run_amlcompute,
)

# Pipeline

In [72]:
# Assemble the four steps (clean, split, train, eval) into a single pipeline.
train_pipeline = Pipeline(
    workspace=ws,
    steps=[clean_step, split_step, train_step, eval_step],
)

# Run pipeline

In [73]:
# Submit the pipeline under the 'TrainingPipeline' experiment and keep the run handle.
pipeline_run = Experiment(workspace=ws, name='TrainingPipeline').submit(train_pipeline)

Created step clean.py [53bf69f2][23fcfeb1-89d4-4263-a9b5-12d2d363f3bf], (This step will run and generate new outputs)
Created step split.py [27179a1a][373fe879-629f-4859-87b9-0a1c7d0ea747], (This step will run and generate new outputs)
Created step train.py [6ebcd3f3][b017286a-11ed-4a47-90e8-af6f0a8a7855], (This step will run and generate new outputs)
Created step eval.py [52e6b291][7e73d664-1eac-4fd6-843e-28087fec857b], (This step will run and generate new outputs)
Submitted PipelineRun 58d3a91f-2796-4e7e-b3af-2fa9a663831a
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/TrainingPipeline/runs/58d3a91f-2796-4e7e-b3af-2fa9a663831a?wsid=/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourcegroups/ProjectGroup2/workspaces/RealEstatePG2


In [74]:
# Block until the pipeline run finishes; step logs are streamed into the cell
# output and the final status is the cell's result.
# NOTE(review): the streamed summary includes SAS-signed log URLs (sig=...) —
# clear this cell's output before committing or sharing the notebook.
pipeline_run.wait_for_completion()

PipelineRunId: 58d3a91f-2796-4e7e-b3af-2fa9a663831a
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/TrainingPipeline/runs/58d3a91f-2796-4e7e-b3af-2fa9a663831a?wsid=/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourcegroups/ProjectGroup2/workspaces/RealEstatePG2
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 90a7af8c-dc6d-4bdb-a802-d873ee34f965
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/TrainingPipeline/runs/90a7af8c-dc6d-4bdb-a802-d873ee34f965?wsid=/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourcegroups/ProjectGroup2/workspaces/RealEstatePG2
StepRun( clean.py ) Status: NotStarted
StepRun( clean.py ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_45b3b46f8ce43ec73ea35eeccbd4b8cd2dfeaa51fa3a6a0febc117f75b46c2f8_d.txt
2020-04-03T21:08:23Z Starting output-watcher...
2020-04-03T21:08:23Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
Login Succeeded
Using defau




StepRunId: 5bbc8e32-cda1-46b5-a517-7bff26b09de1
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/TrainingPipeline/runs/5bbc8e32-cda1-46b5-a517-7bff26b09de1?wsid=/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourcegroups/ProjectGroup2/workspaces/RealEstatePG2
StepRun( split.py ) Status: Queued
StepRun( split.py ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_45b3b46f8ce43ec73ea35eeccbd4b8cd2dfeaa51fa3a6a0febc117f75b46c2f8_d.txt
2020-04-03T21:10:18Z Starting output-watcher...
2020-04-03T21:10:18Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
Login Succeeded
Using default tag: latest
latest: Pulling from azureml/azureml_c30f0e4e66ae5a324014f63a272f7ba7
Digest: sha256:ea753b74af41e7144b92336e1aa829a7b89e4ed5aa8e1419f322d6c02da9a14d
Status: Image is up to date for realestatepg7d804713.azurecr.io/azureml/azureml_c30f0e4e66ae5a324014f63a272f7ba7:latest
12dd0e2c0e73a756088d6d05c1560c030e8df2c38568eca98aa4af1d3c6d4f47
2020/0




StepRunId: 7a3702c1-04fd-42ed-8f6d-3e0891005bf8
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/TrainingPipeline/runs/7a3702c1-04fd-42ed-8f6d-3e0891005bf8?wsid=/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourcegroups/ProjectGroup2/workspaces/RealEstatePG2
StepRun( train.py ) Status: NotStarted
StepRun( train.py ) Status: Queued
StepRun( train.py ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_45b3b46f8ce43ec73ea35eeccbd4b8cd2dfeaa51fa3a6a0febc117f75b46c2f8_d.txt
2020-04-03T21:13:16Z Starting output-watcher...
2020-04-03T21:13:16Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
Login Succeeded
Using default tag: latest
latest: Pulling from azureml/azureml_c30f0e4e66ae5a324014f63a272f7ba7
Digest: sha256:ea753b74af41e7144b92336e1aa829a7b89e4ed5aa8e1419f322d6c02da9a14d
Status: Image is up to date for realestatepg7d804713.azurecr.io/azureml/azureml_c30f0e4e66ae5a324014f63a272f7ba7:latest
6c56a6b98c89b2c6d13774edf1994799




StepRunId: 5a2a5e06-c0e7-492e-b2b0-6cb3250935fc
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/TrainingPipeline/runs/5a2a5e06-c0e7-492e-b2b0-6cb3250935fc?wsid=/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourcegroups/ProjectGroup2/workspaces/RealEstatePG2
StepRun( eval.py ) Status: NotStarted
StepRun( eval.py ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_45b3b46f8ce43ec73ea35eeccbd4b8cd2dfeaa51fa3a6a0febc117f75b46c2f8_d.txt
2020-04-03T21:15:58Z Starting output-watcher...
2020-04-03T21:15:58Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
Login Succeeded
Using default tag: latest
latest: Pulling from azureml/azureml_c30f0e4e66ae5a324014f63a272f7ba7
Digest: sha256:ea753b74af41e7144b92336e1aa829a7b89e4ed5aa8e1419f322d6c02da9a14d
Status: Image is up to date for realestatepg7d804713.azurecr.io/azureml/azureml_c30f0e4e66ae5a324014f63a272f7ba7:latest
a1b4a430ea675bf17e480adfb28eec71bbc21e3fe52c2595062d8b66814c03bb
2020



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '58d3a91f-2796-4e7e-b3af-2fa9a663831a', 'status': 'Completed', 'startTimeUtc': '2020-04-03T21:06:54.611935Z', 'endTimeUtc': '2020-04-03T21:17:27.612006Z', 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{"TrainSize":"800","ValidSize":"200"}'}, 'inputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://realestastoragef42c16f2b.blob.core.windows.net/azureml/ExperimentRun/dcid.58d3a91f-2796-4e7e-b3af-2fa9a663831a/logs/azureml/executionlogs.txt?sv=2019-02-02&sr=b&sig=g2WM%2F0NNqtV5t5xiGXK%2F99G53L%2FVki6wkYk8fBHfxJM%3D&st=2020-04-03T21%3A07%3A35Z&se=2020-04-04T05%3A17%3A35Z&sp=r', 'logs/azureml/stderrlogs.txt': 'https://realestastoragef42c16f2b.blob.core.windows.net/azureml/ExperimentRun/dcid.58d3a91f-2796-4e7e-b3af-2fa9a663831a/logs/azureml/stderrlogs.txt?sv=2019-02-02&sr=b&sig=H7kHo3Qbbr81P2UO1UtoHMI%2B1nYbPkudPfeRaMzrBGs%3D&st=2

'Finished'

In [75]:
# Show the metrics recorded for the run (rmse, r2, rmsle, msle, mape in the
# saved output) — presumably logged by eval.py; confirm against the step script.
pipeline_run.get_metrics()

{'rmse': 115.0517634746127,
 'r2': 0.7243306309269382,
 'rmsle': 0.1399839405365404,
 'msle': 0.01959550360813768,
 'mape': 0.11361708878496973}

In [76]:
# Publish the pipeline behind the completed run so it can be re-submitted later,
# either through the SDK or through its REST endpoint.
published_pipeline = pipeline_run.publish_pipeline(
    name="Training_pipeline",
    description="This is a training pipeline for the realestate project",
    version="1.0",
)

In [83]:
# REST endpoint URL of the published pipeline; the requests.post call at the end
# of the notebook targets it.
endpoint = published_pipeline.endpoint
endpoint

'https://westeurope.api.azureml.ms/pipelines/v1.0/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourceGroups/ProjectGroup2/providers/Microsoft.MachineLearningServices/workspaces/RealEstatePG2/PipelineRuns/PipelineSubmit/b802a661-a430-471f-92a7-2fa8215bf583'

In [82]:
# Id of the published pipeline; PublishedPipeline.get uses it to re-fetch the
# pipeline in the "Run Published Pipeline" section.
published_pipeline_id = published_pipeline.id
published_pipeline_id

'b802a661-a430-471f-92a7-2fa8215bf583'

# Run Published Pipeline

In [None]:
# Re-fetch the published pipeline by id, as a separate session that only knows
# the id would have to do.
published_pipeline = PublishedPipeline.get(workspace=ws, id=published_pipeline_id)

# The original cell referenced an `experiment` variable that is never defined in
# this notebook (NameError on a fresh kernel). Create it explicitly, using the
# same experiment name as the first submission.
experiment = Experiment(ws, 'TrainingPipeline')

# Submit the published pipeline with explicit values for its two parameters.
pipeline_run = experiment.submit(
    published_pipeline,
    continue_on_step_failure=False,
    pipeline_parameters={"TrainSize": 800,
                         "ValidSize": 200},
)

In [None]:
from azureml.core.authentication import AzureCliAuthentication

# Build the authentication header that is passed as `headers` to the REST call
# on the published pipeline's endpoint.
# NOTE(review): presumably requires an active `az login` session — confirm.
cli_auth = AzureCliAuthentication()
aad_token = cli_auth.get_authentication_header()

In [None]:
from azureml.pipeline.core import PublishedPipeline
import requests

# Trigger the published pipeline through its REST endpoint, authenticating with
# the header obtained from the Azure CLI login. The timeout keeps the cell from
# hanging indefinitely if the service does not answer.
response = requests.post(endpoint,
                         headers=aad_token,
                         json={"ExperimentName": "Training",
                               "ParameterAssignments": {"TrainSize": 800,
                                                        "ValidSize": 200}},
                         timeout=60)

# Surface HTTP errors (expired token, wrong endpoint, bad parameters) right away
# instead of silently continuing with a failed submission.
response.raise_for_status()