In [None]:
# Load (or reload) the IPython autoreload extension and use mode 2 so that
# imported modules are re-imported automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2

from datetime import datetime, timezone
import os
import shutil
from pathlib import Path

import glob2 as glob
from azureml.core import Dataset, Experiment, Environment, Run, ScriptRunConfig, Workspace
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import MpiConfiguration, RunConfiguration
from azureml.core import Datastore
from azureml.core.conda_dependencies import CondaDependencies
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import Pipeline

In [None]:
# Repository root, taken as the directory three levels above the current
# working directory (parents[0] is the direct parent, parents[2] the third ancestor).
_working_dir = Path.cwd()
REPO_DIR = _working_dir.parents[2]

In [None]:
# Connect to the Azure ML workspace described by the local configuration file.
# NOTE(review): the path points at a '.py' file — confirm it really holds the workspace config.
workspace = Workspace.from_config(path='./config.py')
# Echo the workspace object as the cell output.
workspace

In [None]:
# Handles to the two datastores, looked up by name in the workspace.
cgm_result_datastore = Datastore(workspace=workspace, name='cgm_result_datastore')
cgm_datasets_datastore = Datastore(workspace=workspace, name='cgm_datasets_datastore')

In [None]:
# Name under which the SQL dataset is registered in the workspace.
sql_dataset_name = 'sql-dataset'
# Fetch it from the workspace's datasets, indexed by registered name.
registered_datasets = workspace.datasets
sql_dataset = registered_datasets[sql_dataset_name]

In [None]:
# Name under which the blob (result) dataset is registered in the workspace.
blob_dataset_name = "cgm-result-dataset"
# Fetch it from the workspace's datasets, indexed by registered name.
registered_datasets = workspace.datasets
blob_dataset = registered_datasets[blob_dataset_name]

In [None]:
# Compute cluster used for the pipeline; previously tried: 'mhinsche-cpu', "gpu-cluster".
cluster_name = 'pipeline-cpu'
compute_target = ComputeTarget(workspace, cluster_name)
# Echo the compute target as the cell output.
compute_target

In [None]:
# Run configuration shared by the pipeline steps: it targets the cluster
# selected above and starts out with the "AzureML-Tutorial" environment.
# NOTE(review): a later cell reassigns aml_run_config.environment to cgm_env.
aml_run_config = RunConfiguration()
aml_run_config.target = compute_target

curated_environment = Environment.get(workspace, "AzureML-Tutorial")
aml_run_config.environment = curated_environment

In [None]:
curated_env_name = "cgm-env"

# Manual switch: True fetches the environment already registered in the
# workspace, False builds it locally from the repository's conda specification.
ENV_EXISTS = True
if not ENV_EXISTS:
    conda_spec_path = REPO_DIR / "environment_train.yml"
    cgm_env = Environment.from_conda_specification(name=curated_env_name, file_path=conda_spec_path)
    # Run inside Docker, on top of the OpenMPI/CUDA base image.
    cgm_env.docker.enabled = True
    cgm_env.docker.base_image = 'mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04'
    # cgm_env.register(workspace)  # Please be careful not to overwrite existing environments
else:
    cgm_env = Environment.get(workspace=workspace, name=curated_env_name)

In [None]:
# Use the project environment (cgm_env) for the run configuration; this
# overrides whatever environment was assigned to aml_run_config before.
aml_run_config.environment = cgm_env

In [None]:
# Source folder and entry script executed by the data preparation step.
dataprep_source_dir = "./dataprep_src"
entry_point = "prepare.py"

# Expose both registered datasets to the step under fixed input names.
sql_ds_input = sql_dataset.as_named_input('input1')
blob_ds_input = blob_dataset.as_named_input('input2')

# The step writes its output to a UTC-timestamped folder on the datasets datastore.
dataset_out_dir = datetime.now(timezone.utc).strftime("dataset-%Y-%m-%d-%H-%M-%S")
output_data1 = OutputFileDatasetConfig(destination=(cgm_datasets_datastore, dataset_out_dir))
# register_on_complete() hands back the config that carries the registration
# request, so the returned object (not output_data1) is the one passed to the
# step; otherwise 'prepared_output_data' may never get registered.
output_data_dataset = output_data1.register_on_complete(name='prepared_output_data')

data_prep_step = PythonScriptStep(
    script_name=entry_point,
    source_directory=dataprep_source_dir,
    # NOTE(review): "--input" is passed twice — confirm prepare.py collects
    # repeated flags (or reads the named inputs) rather than keeping only the last one.
    arguments=["--input", sql_ds_input,
               "--input", blob_ds_input.as_mount(),
               "--output", output_data_dataset],
    compute_target=compute_target,
    runconfig=aml_run_config,
    allow_reuse=True
)

In [None]:
# Steps that make up the dataset-creation stage (currently only data preparation).
create_dataset = [data_prep_step]
# Build the pipeline; the stage list is passed as a single element of `steps`.
pipeline_steps = [create_dataset]
pipeline1 = Pipeline(workspace=workspace, steps=pipeline_steps)

In [None]:
# Submit the pipeline under the 'Compare_Models_Exp' experiment,
# then block until the run has finished.
experiment = Experiment(workspace, 'Compare_Models_Exp')
pipeline_run1 = experiment.submit(pipeline1)
pipeline_run1.wait_for_completion()