In [1]:
import os
import azureml.core
from azureml.core import Workspace, Experiment, Datastore
from azureml.widgets import RunDetails
 
from azureml.core import Dataset
 
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.core import PipelineRun, StepRun, PortDataReference
from azureml.pipeline.steps import PythonScriptStep
 
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
 
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
 
from azureml.core.model import Model
 
# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.35.0


In [2]:
# connect to your workspace
ws = Workspace.from_config()

### Set up a datastore

In [3]:
# Get the blob storage associated with the workspace
def_blob_store = ws.get_default_datastore()

### Upload a dataset (If the dataset does not exist on the datastore)

In [16]:
def_blob_store.upload_files(['/Users/howardlin/Documents/Class/Machine Learning operation/week 4/Assignment2/data/df_clean.csv'],
                     target_path='data',
                     overwrite=True, show_progress=True)

Uploading an estimated of 1 files
Uploading /Users/howardlin/Documents/Class/Machine Learning operation/week 4/Assignment2/data/df_clean.csv
Uploaded /Users/howardlin/Documents/Class/Machine Learning operation/week 4/Assignment2/data/df_clean.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_9fea95ea0da5419aa26950795468a07d

### create an Azure Machine Learning compute for running your steps

In [4]:
aml_compute_target = "demo-cluster"
try:
    aml_compute = AmlCompute(ws, aml_compute_target)
    print("found existing compute target.")
except ComputeTargetException:
    print("creating new compute target")
    
    provisioning_config = AmlCompute.provisioning_configuration(vm_size = "STANDARD_D2_V2",
                                                                min_nodes = 1, 
                                                                max_nodes = 4)    
    aml_compute = ComputeTarget.create(ws, aml_compute_target, provisioning_config)
    aml_compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
    
print("Azure Machine Learning Compute attached")

found existing compute target.
Azure Machine Learning Compute attached


### Configure the training run's environment

In [5]:
aml_run_config = RunConfiguration()
 
aml_run_config.target = aml_compute
aml_run_config.environment.docker.enabled = True
aml_run_config.environment.docker.base_image = "mcr.microsoft.com/azureml/base:latest"
 
aml_run_config.environment.python.user_managed_dependencies = False
 
aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['pandas','scikit-learn','numpy','xgboost'], 
    pip_packages=['joblib','azureml-sdk','fusepy'], 
    pin_sdk_version=False)

'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


### Define the CSV file as input dataset.

In [7]:
# Import and register new dataset
df = Dataset.Tabular.from_delimited_files(def_blob_store.path('./data/df_clean.csv'))
df = df.register(ws, 'heartattack_data',create_new_version = True)

In [30]:
# Or get data from existing dataset on Azure
df = Dataset.get_by_name(workspace = ws,
                                 name = 'heartattack_data', 
                                 version = 1)

### Define the intermediary datasets and the output from each step.

In [31]:
raw_data = df.as_named_input('raw_data')
train_data = PipelineData("train_data", datastore=def_blob_store).as_dataset()
test_data = PipelineData("test_data", datastore=def_blob_store).as_dataset()
model_file = PipelineData("model_file", datastore=def_blob_store)

### Construct your pipeline steps

#### Step 1: Data preparation

In [32]:
source_directory="./prep"
step1 = PythonScriptStep(name="prep_step",
                         script_name="./prep.py", 
                         arguments=["--train", train_data,"--test", test_data],
                         inputs=[raw_data],
                         outputs=[train_data,test_data],                         
                         compute_target=aml_compute, 
                         runconfig=aml_run_config,
                         source_directory=source_directory,
                         allow_reuse=True)

#### Step 2: model training

In [37]:
source_directory="./train"
step2 = PythonScriptStep(name="train_step",
                         script_name="./train_rf_change_hyper.py", 
                         arguments=["--train", train_data,"--test", test_data,"--model",model_file],
                         inputs=[train_data,test_data],
                         outputs=[model_file],                         
                         compute_target=aml_compute, 
                         runconfig=aml_run_config,
                         source_directory=source_directory,
                         allow_reuse=True)

### Create a pipeline

In [38]:
steps = [step1,step2]
pipeline1 = Pipeline(workspace=ws, steps=steps)
pipeline_run1 = Experiment(ws, 'heartattack_prediction').submit(pipeline1)

Created step prep_step [028cb43c][8d8ccd3c-c83d-436c-ab74-bced0535701f], (This step is eligible to reuse a previous run's output)Created step train_step [637f3656][caf78173-3e1d-475d-bde7-63ef3ae1a6b0], (This step will run and generate new outputs)

Submitted PipelineRun 8a8a2256-2355-4ad6-af96-ff8ab2bafb66
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/8a8a2256-2355-4ad6-af96-ff8ab2bafb66?wsid=/subscriptions/8767f4b2-b039-4104-ab02-91dded909118/resourcegroups/mlop/workspaces/MLOP&tid=83b02c92-5f26-48ed-9e5b-6c2fca46a8e6


In [39]:
pipeline_run1.wait_for_completion(show_output=True)

PipelineRunId: 8a8a2256-2355-4ad6-af96-ff8ab2bafb66
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/8a8a2256-2355-4ad6-af96-ff8ab2bafb66?wsid=/subscriptions/8767f4b2-b039-4104-ab02-91dded909118/resourcegroups/mlop/workspaces/MLOP&tid=83b02c92-5f26-48ed-9e5b-6c2fca46a8e6
PipelineRun Status: Running


StepRunId: 1fb01f6d-bc64-4359-9e3b-ffb692b70c9e
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/1fb01f6d-bc64-4359-9e3b-ffb692b70c9e?wsid=/subscriptions/8767f4b2-b039-4104-ab02-91dded909118/resourcegroups/mlop/workspaces/MLOP&tid=83b02c92-5f26-48ed-9e5b-6c2fca46a8e6

StepRun(prep_step) Execution Summary
StepRun( prep_step ) Status: Finished
{'runId': '1fb01f6d-bc64-4359-9e3b-ffb692b70c9e', 'status': 'Completed', 'startTimeUtc': '2021-11-03T02:55:19.233252Z', 'endTimeUtc': '2021-11-03T02:55:19.308985Z', 'services': {}, 'properties': {'azureml.reusedrunid': '66388176-e22d-4bbb-8d2b-8d0892683d9a', 'azureml.reusednodeid': '11c603db', 'azureml.reusedpipeline': '




StepRunId: 163b909c-6f11-4a32-bdb3-5ea8914c3125
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/163b909c-6f11-4a32-bdb3-5ea8914c3125?wsid=/subscriptions/8767f4b2-b039-4104-ab02-91dded909118/resourcegroups/mlop/workspaces/MLOP&tid=83b02c92-5f26-48ed-9e5b-6c2fca46a8e6
StepRun( train_step ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_de0dd5c18e9120cd0a8f6e3b09f7f37f221590e88d239deda513746c13df6e26_d.txt
2021-11-03T02:55:32Z Successfully mounted a/an Blobfuse File System at /mnt/batch/tasks/shared/LS_root/jobs/mlop/azureml/163b909c-6f11-4a32-bdb3-5ea8914c3125/mounts/workspaceblobstore
2021-11-03T02:55:33Z The vmsize standard_d2_v2 is not a GPU VM, skipping get GPU count by running nvidia-smi command.
2021-11-03T02:55:33Z Starting output-watcher...
2021-11-03T02:55:33Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
2021-11-03T02:55:33Z Executing 'Copy ACR Details file' on 10.0.0.4
2021-11-03T02:55:33Z Copy ACR Details file succeeded 


Streaming azureml-logs/75_job_post-tvmps_de0dd5c18e9120cd0a8f6e3b09f7f37f221590e88d239deda513746c13df6e26_d.txt
[2021-11-03T02:56:01.269056] Entering job release
[2021-11-03T02:56:02.042133] Starting job release
[2021-11-03T02:56:02.043069] Logging experiment finalizing status in history service.
Starting the daemon thread to refresh tokens in background for process with pid = 234
[2021-11-03T02:56:02.043519] job release stage : upload_datastore starting...[2021-11-03T02:56:02.045932] job release stage : start importing azureml.history._tracking in run_history_release.
[2021-11-03T02:56:02.046059] job release stage : execute_job_release starting...

[2021-11-03T02:56:02.058608] job release stage : copy_batchai_cached_logs starting...
[2021-11-03T02:56:02.058902] job release stage : copy_batchai_cached_logs completed...[2021-11-03T02:56:02.068501] Entering context manager injector.

[2021-11-03T02:56:02.076272] job release stage : upload_datastore completed...
[2021-11-03T02:56:02.1240



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '8a8a2256-2355-4ad6-af96-ff8ab2bafb66', 'status': 'Completed', 'startTimeUtc': '2021-11-03T02:55:18.024117Z', 'endTimeUtc': '2021-11-03T02:57:23.900335Z', 'services': {}, 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{}', 'azureml.pipelineComponent': 'pipelinerun'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://mlop6016320412.blob.core.windows.net/azureml/ExperimentRun/dcid.8a8a2256-2355-4ad6-af96-ff8ab2bafb66/logs/azureml/executionlogs.txt?sv=2019-07-07&sr=b&sig=o9A1FKZ56K%2BzmjiGaEKhVwbgygg%2FmHk8t%2FQAe9NBs%2Fo%3D&skoid=af705135-2b6a-4643-8ab3-c089f2ca0024&sktid=83b02c92-5f26-48ed-9e5b-6c2fca46a8e6&skt=2021-11-02T21%3A13%3A12Z&ske=2021-11-04T05%3A23%3A12Z&sks=b&skv=2019-07-07&st=2021-11-03T02%3A47%3A26Z&se=2021-11-03T10%3A57%3A26Z&sp=r', 'logs/azureml/stderrlogs.txt': 'https://mlop6016320

'Finished'

### Register the output model

#### Download the model to local

In [37]:
train_step = pipeline_run1.find_step_run('train_step')[0]
step_run_output = train_step.get_output("model_file")

port_data_reference = step_run_output.get_port_data_reference()
port_data_reference.download(local_path=".")
model_file=port_data_reference.path_on_datastore

In [38]:
model=model_file+"/model.pkl"
os.makedirs("model", exist_ok=True)
os.popen("cp "+ model +" model")

<os._wrap_close at 0x7fb8b2091220>

#### Register model

In [40]:
pipeline_run1.find_step_run('train_step')[0].register_model(model_name = 'rf_model', model_path = "model_file", 
                                                            datasets =[('train test data',df)])

Model(workspace=Workspace.create(name='MLOP', subscription_id='8767f4b2-b039-4104-ab02-91dded909118', resource_group='mlop'), name=rf_model, id=rf_model:3, version=3, tags={}, properties={})