# MNIST-Azure
## Pipeline Test
### By: Sebastian Goodfellow

In [3]:
# Configure Notebook
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Import 3rd party libraries
import os
import sys
import  azureml.core
from azureml.core import Workspace, Experiment, Datastore
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.widgets import RunDetails
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import PythonScriptStep, DataTransferStep
from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_CPU_IMAGE

# Import local libraries
sys.path.insert(0, './../')
from mnistazure.config import DATA_PATH, TENSORBOARD_PATH

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Initialize Workspace

In [5]:
# Connect to the existing 'mnist-azure' Azure ML workspace
workspace = Workspace.get(
    name='mnist-azure',
    resource_group='spector-ai',
    subscription_id='30284b70-31e1-4b93-b620-26959f80a8f9',
)

# Fetch the 'workspacefilestore' datastore registered in that workspace
datastore = Datastore.get(
    datastore_name='workspacefilestore',
    workspace=workspace,
)

# Directory handed to the pipeline steps as their source_directory
# (the parent of the current working directory)
source_directory = './../'

In [11]:
# List every datastore registered in the workspace as "<name> <type>"
datastores = workspace.datastores
for store_name, store in datastores.items():
    print(f'{store_name} {store.datastore_type}')

workspacefilestore AzureFile
workspaceblobstore AzureBlob
raw AzureFile
train AzureFile


# Initialize Compute Target

In [83]:
# List the compute targets attached to the workspace
# (iterating the mapping yields the target names, one per line)
cts = workspace.compute_targets
for target_name in cts:
    print(target_name)

aml-compute


In [90]:
# Reuse the 'aml-compute' target when it exists; otherwise provision it.
aml_compute_target = 'aml-compute'
try:
    # Raises ComputeTargetException when no target with this name is found
    aml_compute = AmlCompute(workspace=workspace, name=aml_compute_target)
except ComputeTargetException:
    print('Creating new compute target')
    # Cluster of STANDARD_D2_V2 VMs scaling between 1 and 4 nodes
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        min_nodes=1,
        max_nodes=4,
    )
    aml_compute = ComputeTarget.create(
        workspace=workspace,
        name=aml_compute_target,
        provisioning_configuration=provisioning_config,
    )
    # Block until provisioning finishes (or 20 minutes elapse), echoing progress
    aml_compute.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20,
    )
else:
    print('Found existing compute target')

Found existing compute target


# Create Data References

In [118]:
# Intermediate pipeline output holding the raw data (written by step 1)
pl_raw_data_ref = PipelineData(
    name='pl_raw_data',
    datastore=datastore,
    pipeline_output_name='pl_raw_data',
    is_directory=True,
    output_overwrite=True,
)

# Intermediate pipeline output holding the training data (written by step 2)
pl_train_data_ref = PipelineData(
    name='pl_train_data',
    datastore=datastore,
    pipeline_output_name='pl_train_data',
    is_directory=True,
    output_overwrite=True,
)

# Reference to the 'train_data' path on the datastore
train_data_ref = DataReference(
    datastore=datastore,
    data_reference_name='train_data',
    path_on_datastore='train_data',
)

# Build Pipeline
### Setup Pipeline Environment

In [119]:
# Run configuration shared by the pipeline steps
run_config = RunConfiguration()

# Execute inside Docker, starting from the default CPU base image
run_config.environment.docker.enabled = True
run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE

# Dependencies are not user-managed; they come from the conda specification set below
run_config.environment.python.user_managed_dependencies = False

# Prepare the environment automatically at execution time if it is not ready yet
run_config.auto_prepare_environment = True

# Conda environment definition loaded from pipeline_env.yml in the parent directory
run_config.environment.python.conda_dependencies = CondaDependencies(
    conda_dependencies_file_path='./../pipeline_env.yml'
)

### Step 1: Download MNIST Dataset

In [120]:
# Step 1: run pipeline_1.py, which receives the raw-data output location via --output
step_1 = PythonScriptStep(
    script_name='pipeline_1.py',
    source_directory=source_directory,
    arguments=['--output', pl_raw_data_ref],
    inputs=None,
    outputs=[pl_raw_data_ref],
    compute_target=aml_compute,
    runconfig=run_config,
)

### Step 2: Create Training Dataset

In [121]:
# Step 2: run pipeline_2.py, reading step 1's raw-data output (--input)
# and writing the training-data output (--output)
step_2 = PythonScriptStep(
    script_name='pipeline_2.py',
    source_directory=source_directory,
    arguments=['--input', pl_raw_data_ref, '--output', pl_train_data_ref],
    inputs=[pl_raw_data_ref],
    outputs=[pl_train_data_ref],
    compute_target=aml_compute,
    runconfig=run_config,
)

In [None]:
# Assemble both steps into a pipeline bound to the workspace
pipeline = Pipeline(
    workspace=workspace,
    steps=[step_1, step_2],
)

# Submit the pipeline under the 'Data_dependency' experiment
# (regenerate_outputs=False is passed through unchanged)
pipeline_run = (
    Experiment(workspace=workspace, name='Data_dependency')
    .submit(pipeline, regenerate_outputs=False)
)

# Show the run-monitoring widget for the submitted run
RunDetails(pipeline_run).show()