<h1>Table of Contents<span class="tocSkip"></span></h1>
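<div class="toc"><ul class="toc-item"><li><span><a href="#Environment" data-toc-modified-id="Environment-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Environment</a></span></li><li><span><a href="#Compute-Target-and-container" data-toc-modified-id="Compute-Target-and-container-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Compute Target and container</a></span></li><li><span><a href="#Cleaning-step" data-toc-modified-id="Cleaning-step-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Cleaning step</a></span></li><li><span><a href="#Split-step" data-toc-modified-id="Split-step-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Split step</a></span></li><li><span><a href="#Train-step" data-toc-modified-id="Train-step-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Train step</a></span></li><li><span><a href="#Evaluation-step" data-toc-modified-id="Evaluation-step-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Evaluation step</a></span></li><li><span><a href="#Pipeline" data-toc-modified-id="Pipeline-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Pipeline</a></span></li><li><span><a href="#Run-pipeline" data-toc-modified-id="Run-pipeline-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Run pipeline</a></span></li><li><span><a href="#Run-Published-Pipeline" data-toc-modified-id="Run-Published-Pipeline-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Run Published Pipeline</a></span></li></ul></div>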

In [1]:
import azureml.core
from azureml.data.data_reference import DataReference
from azureml.data.datapath import DataPath
from azureml.core import Workspace, Datastore, Dataset
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.core.graph import PipelineParameter
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.core import Experiment

# Environment

In [2]:
# Folder passed to every step as its source directory, and the name of the compute cluster to use.
steps_dir = './pipeline_steps'
cpu_cluster_name = "cpucluster"

# Connect to the workspace, then fetch the blob datastore that backs the pipeline outputs.
ws = Workspace.from_config()
def_blob_store = Datastore(ws, "workspaceblobstore")

# Compute Target and container

In [3]:
# Attach to the existing cluster by name and block until it reports completion.
cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
cpu_cluster.wait_for_completion(show_output=True)

# Packages that make up the conda environment the step scripts run in.
pip_packages = [
    'azureml-dataprep[fuse,pandas]',
    'azureml.core',
]
conda_packages = [
    'scikit-learn==0.22',
    'pandas==0.24.2',
    'pyarrow==0.16.0',
]

# One run configuration, shared by every step of the pipeline, targeting the cluster above.
run_amlcompute = RunConfiguration()
run_amlcompute.target = cpu_cluster

# Run inside Docker, on top of the default CPU base image.
run_amlcompute.environment.docker.enabled = True
run_amlcompute.environment.docker.base_image = DEFAULT_CPU_IMAGE

# Dependencies are not user-managed: the environment is built from the conda specification below.
run_amlcompute.environment.python.user_managed_dependencies = False
run_amlcompute.environment.python.conda_dependencies = CondaDependencies.create(
    python_version='3.7.7',
    pip_packages=pip_packages,
    conda_packages=conda_packages,
)

Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


# Cleaning step

In [65]:
# Registered dataset that feeds the first step of the pipeline.
dataset_full = Dataset.get_by_name(ws, name="annonces_ds")

# Intermediate output of the cleaning step, exposed to later steps as a delimited-file dataset.
clean_ds = (
    PipelineData("dataset_clean", datastore=def_blob_store)
    .as_dataset()
    .parse_delimited_files()
)

# clean.py is given the *name* of the input dataset and the output location on its command line;
# the dataset itself is attached as a named input under that same name.
clean_step = PythonScriptStep(
    script_name="clean.py",
    source_directory=steps_dir,
    arguments=[
        "--input", dataset_full.name,
        "--output", clean_ds,
    ],
    inputs=[dataset_full.as_named_input(dataset_full.name)],
    outputs=[clean_ds],
    compute_target=cpu_cluster,
    runconfig=run_amlcompute,
)

# Split step

In [72]:
# Outputs of the split step: one delimited-file dataset for training, one for validation.
train_ds = (
    PipelineData("dataset_train", datastore=def_blob_store)
    .as_dataset()
    .parse_delimited_files()
)
valid_ds = (
    PipelineData("dataset_valid", datastore=def_blob_store)
    .as_dataset()
    .parse_delimited_files()
)

# Pipeline parameters with their default values.
# NOTE: the two names are not spelled alike ("Train_Size" vs "ValidSize"); anything that overrides
# them must use these exact strings.
train_size = PipelineParameter(name="Train_Size", default_value=800)
valid_size = PipelineParameter(name="ValidSize", default_value=200)

# split.py consumes the cleaned dataset and produces the train and validation datasets.
split_step = PythonScriptStep(
    script_name="split.py",
    source_directory=steps_dir,
    arguments=[
        "--dataset", clean_ds,
        "--train", train_ds,
        "--valid", valid_ds,
        "--trainsize", train_size,
        "--validsize", valid_size,
    ],
    inputs=[clean_ds],
    outputs=[train_ds, valid_ds],
    compute_target=cpu_cluster,
    runconfig=run_amlcompute,
)

# Train step

In [73]:
# Pipeline output named "model"; train.py receives it through --model and declares it as an output.
trained_model = PipelineData("model", datastore=def_blob_store)

# train.py consumes the training dataset produced by the split step.
train_step = PythonScriptStep(
    script_name="train.py",
    source_directory=steps_dir,
    arguments=[
        "--dataset", train_ds,
        "--model", trained_model,
    ],
    inputs=[train_ds],
    outputs=[trained_model],
    compute_target=cpu_cluster,
    runconfig=run_amlcompute,
)

# Evaluation step

In [74]:
# eval.py consumes the validation dataset and the model produced by the train step.
# NOTE(review): --model is given `trained_model.name` (the literal string "model"), whereas
# train.py is given the PipelineData object itself. Confirm that eval.py resolves the model
# location from that name rather than expecting a path on the command line.
eval_step = PythonScriptStep(
    script_name="eval.py",
    source_directory=steps_dir,
    arguments=[
        "--dataset", valid_ds,
        "--model", trained_model.name,
    ],
    inputs=[valid_ds, trained_model],
    compute_target=cpu_cluster,
    runconfig=run_amlcompute,
)

# Pipeline

In [75]:
# Bundle the four steps (clean, split, train, evaluate) into a single pipeline.
train_pipeline = Pipeline(
    workspace=ws,
    steps=[clean_step, split_step, train_step, eval_step],
)

# Run pipeline

In [76]:
# Submit the pipeline as a run of the 'TrainingPipeline' experiment.
experiment = Experiment(ws, 'TrainingPipeline')
pipeline_run = experiment.submit(train_pipeline)

Created step clean.py [4c27f273][9cffc60b-fc4f-4b38-b834-4dbb554a55fa], (This step will run and generate new outputs)Created step split.py [001ae348][577764fc-630c-4ef6-b1c6-31f964f2226d], (This step will run and generate new outputs)

Created step train.py [d1d2310d][c9d92338-769b-4adc-8497-cbd343253240], (This step will run and generate new outputs)
Created step eval.py [2ec79550][e663af80-2d1b-4ae5-aa63-dc40dca74767], (This step will run and generate new outputs)
Submitted PipelineRun 892ebe57-d526-4412-933f-15b596de903f
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/TrainingPipeline/runs/892ebe57-d526-4412-933f-15b596de903f?wsid=/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourcegroups/ProjectGroup2/workspaces/RealEstatePG2


In [None]:
# Block until the submitted pipeline run finishes, streaming its status and logs into the cell.
pipeline_run.wait_for_completion(show_output=True)

PipelineRunId: 892ebe57-d526-4412-933f-15b596de903f
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/TrainingPipeline/runs/892ebe57-d526-4412-933f-15b596de903f?wsid=/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourcegroups/ProjectGroup2/workspaces/RealEstatePG2
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: aa4cb714-98ea-4946-8ee2-98d520f42968
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/TrainingPipeline/runs/aa4cb714-98ea-4946-8ee2-98d520f42968?wsid=/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourcegroups/ProjectGroup2/workspaces/RealEstatePG2
StepRun( clean.py ) Status: NotStarted
StepRun( clean.py ) Status: Queued
StepRun( clean.py ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_c28e972a09d951131dd68c43762d6835bc1a89decf530156d552579d1b9d1186_d.txt
2020-04-03T13:56:06Z Starting output-watcher...
2020-04-03T13:56:06Z IsDedicatedCompute == True, won't poll for Low Pri Pre

In [None]:
# Register the trained model in the workspace under the name 'FirstModel'.
# NOTE(review): `pipeline_run` is the pipeline-level run, while the model is produced by the
# train.py step as the "model" PipelineData output. Confirm that 'outputs/model' really exists
# among this run's files; registering from the train step's own run may be required instead.
model = pipeline_run.register_model(model_name='FirstModel', model_path='outputs/model')

In [None]:
# Publish the pipeline behind the submitted run so it can be triggered again later.
# The call returns a PublishedPipeline; keep a handle on it instead of discarding it, because
# that object (not the run) is what exposes the pipeline's id and REST endpoint.
published_pipeline = pipeline_run.publish_pipeline(
    name="Training_pipeline",
    description="This is a training pipeline for the realestate project",
    version="1.0")

# Display the published pipeline, as the bare call did before.
published_pipeline

In [None]:
# Intended to display the REST endpoint used to trigger the pipeline.
# NOTE(review): PipelineRun does not appear to expose an `endpoint` attribute; the REST endpoint
# belongs to the PublishedPipeline object returned by `publish_pipeline(...)`. Confirm, and read
# the endpoint from that object instead.
pipeline_run.endpoint

# Run Published Pipeline

In [None]:
from azureml.core.authentication import AzureCliAuthentication

# Authenticate with the credentials of the current Azure CLI session and build the HTTP
# authentication header that is sent along with REST calls to the published pipeline.
cli_auth = AzureCliAuthentication()
aad_token = cli_auth.get_authentication_header()

In [None]:
from azureml.pipeline.core import PublishedPipeline
import requests

# `train_pipeline` is the unpublished Pipeline object and carries no REST endpoint, so resolve
# the published pipeline by the name it was published under.
published_matches = [
    candidate for candidate in PublishedPipeline.list(ws)
    if candidate.name == "Training_pipeline"
]
if not published_matches:
    raise RuntimeError('No published pipeline named "Training_pipeline" was found in the workspace')

# If that name has been published more than once, the first match returned by the service is used.
# The keys of "ParameterAssignments" must match the PipelineParameter names exactly: the
# validation-size parameter is declared as "ValidSize" (no underscore), unlike "Train_Size".
response = requests.post(published_matches[0].endpoint,
                         headers=aad_token,
                         json={"ExperimentName": "Training",
                               "ParameterAssignments": {"Train_Size": 800,
                                                        "ValidSize": 200}})

# Fail loudly on an HTTP error (expired token, bad payload, ...) instead of ignoring the response.
response.raise_for_status()