In [1]:
import os
import azureml.core
from azureml.core import Workspace, Experiment, Datastore
from azureml.widgets import RunDetails
 
from azureml.core import Dataset
 
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.core import PipelineRun, StepRun, PortDataReference
from azureml.pipeline.steps import PythonScriptStep
 
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
 
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
 
from azureml.core.model import Model
 
# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.35.0


In [2]:
# connect to your workspace
ws = Workspace.from_config()

### Set up a datastore

In [3]:
# Get the blob storage associated with the workspace
def_blob_store = ws.get_default_datastore()

### Upload a dataset (If the dataset does not exist on the datastore)

In [16]:
def_blob_store.upload_files(['/Users/howardlin/Documents/Class/Machine Learning operation/week 4/Assignment2/data/df_clean.csv'],
                     target_path='data',
                     overwrite=True, show_progress=True)

Uploading an estimated of 1 files
Uploading /Users/howardlin/Documents/Class/Machine Learning operation/week 4/Assignment2/data/df_clean.csv
Uploaded /Users/howardlin/Documents/Class/Machine Learning operation/week 4/Assignment2/data/df_clean.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_9fea95ea0da5419aa26950795468a07d

### create an Azure Machine Learning compute for running your steps

In [4]:
aml_compute_target = "demo-cluster"
try:
    aml_compute = AmlCompute(ws, aml_compute_target)
    print("found existing compute target.")
except ComputeTargetException:
    print("creating new compute target")
    
    provisioning_config = AmlCompute.provisioning_configuration(vm_size = "STANDARD_D2_V2",
                                                                min_nodes = 1, 
                                                                max_nodes = 4)    
    aml_compute = ComputeTarget.create(ws, aml_compute_target, provisioning_config)
    aml_compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
    
print("Azure Machine Learning Compute attached")

found existing compute target.
Azure Machine Learning Compute attached


### Configure the training run's environment

In [5]:
aml_run_config = RunConfiguration()
 
aml_run_config.target = aml_compute
aml_run_config.environment.docker.enabled = True
aml_run_config.environment.docker.base_image = "mcr.microsoft.com/azureml/base:latest"
 
aml_run_config.environment.python.user_managed_dependencies = False
 
aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['pandas','scikit-learn','numpy','xgboost'], 
    pip_packages=['joblib','azureml-sdk','fusepy'], 
    pin_sdk_version=False)

'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


### Define the CSV file as input dataset.

In [7]:
# Import and register new dataset
df = Dataset.Tabular.from_delimited_files(def_blob_store.path('./data/df_clean.csv'))
df = df.register(ws, 'heartattack_data',create_new_version = True)

In [6]:
# Or get data from existing dataset on Azure
df = Dataset.get_by_name(workspace = ws,
                                 name = 'heartattack_data', 
                                 version = 1)

### Define the intermediary datasets and the output from each step.

In [7]:
raw_data = df.as_named_input('raw_data')
train_data = PipelineData("train_data", datastore=def_blob_store).as_dataset()
test_data = PipelineData("test_data", datastore=def_blob_store).as_dataset()
scaler_file = PipelineData("scaler_file", datastore=def_blob_store)
model_file = PipelineData("model_file", datastore=def_blob_store)

### Construct your pipeline steps

#### Step 1: Data preparation

In [8]:
source_directory="./prep"
step1 = PythonScriptStep(name="prep_step",
                         script_name="./prep_with_scaler.py", 
                         arguments=["--train", train_data,"--test", test_data,"--scaler",scaler_file],
                         inputs=[raw_data],
                         outputs=[train_data,test_data,scaler_file],                         
                         compute_target=aml_compute, 
                         runconfig=aml_run_config,
                         source_directory=source_directory,
                         allow_reuse=True)

#### Step 2: model training

In [9]:
source_directory="./train"
step2 = PythonScriptStep(name="train_step",
                         script_name="./train_rf.py", 
                         arguments=["--train", train_data,"--test", test_data,"--model",model_file],
                         inputs=[train_data,test_data],
                         outputs=[model_file],                         
                         compute_target=aml_compute, 
                         runconfig=aml_run_config,
                         source_directory=source_directory,
                         allow_reuse=True)

### Create a pipeline

In [10]:
steps = [step1,step2]
pipeline1 = Pipeline(workspace=ws, steps=steps)
pipeline_run1 = Experiment(ws, 'heartattack_prediction').submit(pipeline1)

Created step prep_step [18453909][cc9612db-d9f8-41a4-b093-ff50ec9ed0a2], (This step is eligible to reuse a previous run's output)
Created step train_step [ba83f935][82c0c6ee-5848-43de-8ab6-34dcebc26029], (This step is eligible to reuse a previous run's output)
Submitted PipelineRun b298f8ef-c4bc-4828-85f0-d3f5135489fa
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/b298f8ef-c4bc-4828-85f0-d3f5135489fa?wsid=/subscriptions/8767f4b2-b039-4104-ab02-91dded909118/resourcegroups/mlop/workspaces/MLOP&tid=83b02c92-5f26-48ed-9e5b-6c2fca46a8e6


In [11]:
pipeline_run1.wait_for_completion(show_output=True)

PipelineRunId: b298f8ef-c4bc-4828-85f0-d3f5135489fa
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/b298f8ef-c4bc-4828-85f0-d3f5135489fa?wsid=/subscriptions/8767f4b2-b039-4104-ab02-91dded909118/resourcegroups/mlop/workspaces/MLOP&tid=83b02c92-5f26-48ed-9e5b-6c2fca46a8e6
PipelineRun Status: Running


StepRunId: d3354dca-e604-43df-ab03-8805103fb46c
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/d3354dca-e604-43df-ab03-8805103fb46c?wsid=/subscriptions/8767f4b2-b039-4104-ab02-91dded909118/resourcegroups/mlop/workspaces/MLOP&tid=83b02c92-5f26-48ed-9e5b-6c2fca46a8e6

StepRun(prep_step) Execution Summary
StepRun( prep_step ) Status: Finished
{'runId': 'd3354dca-e604-43df-ab03-8805103fb46c', 'status': 'Completed', 'startTimeUtc': '2021-11-03T02:34:22.001415Z', 'endTimeUtc': '2021-11-03T02:34:22.082027Z', 'services': {}, 'properties': {'azureml.reusedrunid': 'b5b3888e-4e0b-4bdc-8982-c3de56d3016d', 'azureml.reusednodeid': '4e19f548', 'azureml.reusedpipeline': '


Streaming azureml-logs/75_job_post-tvmps_de0dd5c18e9120cd0a8f6e3b09f7f37f221590e88d239deda513746c13df6e26_d.txt
[2021-11-03T02:35:06.384246] Entering job release
[2021-11-03T02:35:07.179073] Starting job release
[2021-11-03T02:35:07.179729] Logging experiment finalizing status in history service.
Starting the daemon thread to refresh tokens in background for process with pid = 233
[2021-11-03T02:35:07.180815] job release stage : upload_datastore starting...
[2021-11-03T02:35:07.181468] job release stage : start importing azureml.history._tracking in run_history_release.
[2021-11-03T02:35:07.181561] job release stage : execute_job_release starting...
[2021-11-03T02:35:07.195219] job release stage : copy_batchai_cached_logs starting...
[2021-11-03T02:35:07.195596] job release stage : copy_batchai_cached_logs completed...
[2021-11-03T02:35:07.199759] Entering context manager injector.
[2021-11-03T02:35:07.212599] job release stage : upload_datastore completed...
[2021-11-03T02:35:07.2670



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': 'b298f8ef-c4bc-4828-85f0-d3f5135489fa', 'status': 'Completed', 'startTimeUtc': '2021-11-03T02:34:20.825307Z', 'endTimeUtc': '2021-11-03T02:36:25.055597Z', 'services': {}, 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{}', 'azureml.pipelineComponent': 'pipelinerun'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://mlop6016320412.blob.core.windows.net/azureml/ExperimentRun/dcid.b298f8ef-c4bc-4828-85f0-d3f5135489fa/logs/azureml/executionlogs.txt?sv=2019-07-07&sr=b&sig=bDGl2nIXKA5h5KpoKrqN2UYXsJS8K5DIPrjIPmQjfh4%3D&skoid=af705135-2b6a-4643-8ab3-c089f2ca0024&sktid=83b02c92-5f26-48ed-9e5b-6c2fca46a8e6&skt=2021-11-02T21%3A13%3A12Z&ske=2021-11-04T05%3A23%3A12Z&sks=b&skv=2019-07-07&st=2021-11-03T02%3A26%3A29Z&se=2021-11-03T10%3A36%3A29Z&sp=r', 'logs/azureml/stderrlogs.txt': 'https://mlop6016320412.blob

'Finished'

### Register the output model

#### Download the model to local

In [37]:
train_step = pipeline_run1.find_step_run('train_step')[0]
step_run_output = train_step.get_output("model_file")

port_data_reference = step_run_output.get_port_data_reference()
port_data_reference.download(local_path=".")
model_file=port_data_reference.path_on_datastore

In [38]:
model=model_file+"/model.pkl"
os.makedirs("model", exist_ok=True)
os.popen("cp "+ model +" model")

<os._wrap_close at 0x7fb8b2091220>

#### Register model

In [12]:
pipeline_run1.find_step_run('train_step')[0].register_model(model_name = 'rf_model_scaled', model_path = "model_file", 
                                                            datasets =[('train test data',df)])

Model(workspace=Workspace.create(name='MLOP', subscription_id='8767f4b2-b039-4104-ab02-91dded909118', resource_group='mlop'), name=rf_model_scaled, id=rf_model_scaled:2, version=2, tags={}, properties={})

#### Register scaler

In [13]:
pipeline_run1.find_step_run('prep_step')[0].register_model(model_name = 'scaler', model_path = "scaler_file", 
                                                            datasets =[('train test data',df)])

Model(workspace=Workspace.create(name='MLOP', subscription_id='8767f4b2-b039-4104-ab02-91dded909118', resource_group='mlop'), name=scaler, id=scaler:2, version=2, tags={}, properties={})