# AzureML using SDK

This document shows how to implement an AzureML workflow using the SDK.

**Steps**

1. Create Workspace 
2. Create Datastore
3. Create Dataset
4. Create Compute

### Create an AzureML Workspace

* **Workspace**
* **Config** file: reused later when creating the datastore, dataset, and other resources.


In [None]:
from azureml.core import Workspace

# Settings for the new workspace, collected in one place
workspace_settings = {
    'name': 'myworkspace',
    'subscription_id': '<azure-subscription-id>',
    'resource_group': 'myresourcegroup',
    'create_resource_group': True,
    'location': 'eastus2',
}

ws = Workspace.create(**workspace_settings)

# Save the workspace config under ./config; the later cells load it from
# there with Workspace.from_config('./config')
ws.write_config(path='./config')

### Create an Azure Datastore

* Azure Blob Container

In [None]:
from azureml.core import Workspace, Datastore

# Load the workspace from the saved config directory
ws = Workspace.from_config(path='./config')

# Connection details of the blob container to register
# NOTE(review): the account key looks like a placeholder; confirm that real
# keys are never committed with the notebook.
blob_settings = {
    'datastore_name': 'azure_sdk_blob01',
    'container_name': 'azuremlstb01blob',
    'account_name': 'azuremllsb01',
    'account_key': 'abc-bbc-aba',
}

# Register the blob container as a datastore of the workspace
az_store = Datastore.register_azure_blob_container(workspace=ws, **blob_settings)

### Create Dataset

1. Access to Workspace
2. Access to Datastore
3. Create dataset under datastore

In [None]:
from azureml.core import Workspace, Datastore, Dataset

# Access the workspace from the saved config
ws = Workspace.from_config('./config')

# Access the datastore registered as "azure_sdk_blob01"
az_store = Datastore.get(ws, "azure_sdk_blob01")

# (datastore, relative path) pair pointing at the csv file
csv_path = [(az_store, "Loan Data/Loan Approval Prediction.csv")]

# Create a tabular dataset from the delimited file
loan_dataset = Dataset.Tabular.from_delimited_files(path=csv_path)

# Register the dataset in the workspace. The name is spelled exactly as the
# other cells look it up with Dataset.get_by_name ("Loan Applications Using SDK").
loan_dataset = loan_dataset.register(workspace=ws,
                                     name="Loan Applications Using SDK",
                                     create_new_version=True)

### Access Workspace, Datastore, Datasets using SDK

In [None]:
# ---------------------------------------
# Access workspace from the saved config
# ---------------------------------------

ws = Workspace.from_config('./config')

# List all workspaces within a subscription
ws_list = Workspace.list(subscription_id="4235-234")


# ---------------------------------------
# Access the default datastore from workspace
# ---------------------------------------

az_default_store = ws.get_default_datastore()

# List all datastores (names)
store_list = list(ws.datastores)


# ---------------------------------------
# Access datasets
# ---------------------------------------

# get dataset by name from a workspace
az_dataset = Dataset.get_by_name(ws, "Loan Applications Using SDK")

# list dataset names from workspace: read ws.datasets here, since
# ws.datastores would repeat the datastore names collected above
dataset_list = list(ws.datasets.keys())

### Pandas DataFrame in and out of AzureML

In [None]:
# Handles used by this cell: workspace, custom datastore, registered dataset
# and the workspace's default datastore
ws = Workspace.from_config(path='./config')
az_store = Datastore.get(workspace=ws, datastore_name="azure_sdk_blob01")
az_dataset = Dataset.get_by_name(workspace=ws, name="Loan Applications Using SDK")
az_default_store = ws.get_default_datastore()

# ---------------------------------------
# Dataset -> DataFrame -> Dataset round trip
# ---------------------------------------

# Pull the registered dataset into a pandas DataFrame
df = az_dataset.to_pandas_dataframe()

# Push the DataFrame back as a new dataset named 'upload_df_dataset',
# targeting the custom datastore
upload_settings = {'dataframe': df, 'target': az_store, 'name': 'upload_df_dataset'}
az_ds_from_df = Dataset.Tabular.register_pandas_dataframe(**upload_settings)



### Upload local file to storage account via datastore

In [None]:
ws = Workspace.from_config('./config')
az_store = Datastore.get(ws, "azure_sdk_blob01")
az_dataset = Dataset.get_by_name(ws, "Loan Applications Using SDK")
az_default_store = ws.get_default_datastore()

# Upload individual local files
files_list = ['./data/test1.csv', './data/test2.csv']

# The keyword for the list of local paths is `files` (not `file`)
az_store.upload_files(files=files_list,
                      target_path='loan Data/',
                      relative_root='./data/',
                      overwrite=True)

# Upload a whole local folder
az_store.upload(src_dir='./data',
                target_path='loan Data/data',
                overwrite=True)

### Create Experiment

Create the experiment on the local machine
* Get the datasets
* Log the results

Finally, access the Workspace to view the run.

In [None]:
# Create/Access  an Experiment object
from azureml.core import Experiment

# Experiment handle in the workspace (`ws` comes from the earlier cells)
experiment = Experiment(workspace=ws, name='Loan-SDK-Exp01')

# ---------------------------------------
# Run an experiment
# ---------------------------------------

# Open an interactive run on the experiment
new_run = experiment.start_logging()

# Work done inside the run: load the dataset (`az_dataset` comes from the
# earlier cells) and count its rows
df = az_dataset.to_pandas_dataframe()
row_count = len(df)

# Record the row count as a metric of the run
new_run.log(name='Total obserrvations:', value=row_count)

# Mark the run as completed
new_run.complete()

### Run an experiment from config script

* Create a Workspace
* Create an Experiment using a config script
  * Create the experiment object
  * Submit the experiment with a script run config
  * Inside the submitted script, attach to the run that is already in progress instead of starting a new one:
    `Run.get_context()` returns the run the script was submitted under, so there is no need to start a new run.

In [None]:
from azureml.core import Workspace, Experiment, ScriptRunConfig

# ---------------------------------------
# Load the workspace from the saved config
# ---------------------------------------

ws = Workspace.from_config(path='./config')

# ---------------------------------------
# Build the script configuration, then submit it as an experiment run
# ---------------------------------------

# The script to run is shown in the next cell
script_settings = {'source_directory': '.', 'script': 'experiment run script.py'}
script_config = ScriptRunConfig(**script_settings)

new_experiment = Experiment(workspace=ws, name="new_expriment")
new_run = new_experiment.submit(config=script_config)

# Block until the submitted run has finished
new_run.wait_for_completion()

In [None]:
# -- experiment run script.py: the script submitted through ScriptRunConfig

from azureml.core import Dataset, Experiment, Run

# ---------------------------------------
# Run an experiment
# ---------------------------------------

# Attach to the run this script was submitted under
new_run = Run.get_context()

# When this file runs as a submitted script, notebook variables such as
# `az_dataset` are not defined, so resolve the dataset from the run's workspace.
ws = new_run.experiment.workspace
az_dataset = Dataset.get_by_name(ws, "Loan Applications Using SDK")

# stuff
df = az_dataset.to_pandas_dataframe()

# Log the metrics to the workspace
new_run.log('Total obserrvations:', len(df))

# Complete an experiment run
new_run.complete()

### Run Script as Training Model to AzureML

In the previous `# stuff` part, we can switch to a local model-training script

* Access the Workspace
* Create and register custom environment, and dependencies 
* Get the context of the experiment run
* Local training and prediction procedures(# stuff)

In [None]:
# Import required classes from AzureML
from azureml.core import Workspace, Experiment, Run, ScriptRunConfig, Environment
from azureml.core.environment import CondaDependencies

# Access the Workspace
ws = Workspace.from_config('./config')

# Create a custom environment
myenv = Environment(name='mytrainingenv')

# Declare the conda packages the environment needs
my_dep = CondaDependencies.create(conda_packages=['scikit-learn'])
myenv.python.conda_dependencies = my_dep

# Register the environment in the workspace
myenv.register(ws)

# Submit the training script as an experiment run.
# The Experiment keyword is lowercase `workspace`; `Workspace=` raises a TypeError.
new_exp = Experiment(workspace=ws, name='Prediction_to_run')
script_config = ScriptRunConfig(source_directory='.', script='training.py')

new_run = new_exp.submit(config=script_config)

# Block until the run finishes (the method is wait_for_completion)
new_run.wait_for_completion()


In [None]:
# training.py
from azureml.core import Run

# get context of experiment run
new_run = Run.get_context()

# Training stuffs here
# Placeholder results: replace with the values produced by the training code.
total_observations = 0
metric_value = 0.0

# Logging run: Run.log needs both a metric name and a value
new_run.log('Total observation', total_observations)
new_run.log('metrics:', metric_value)

new_run.complete()

### Provision a Compute Cluster

* Access the Workspace
* Create a Compute Cluster
* Attach the Compute Cluster to an Experiment

In [None]:
# Access the Workspace
from azureml.core import Workspace

ws = Workspace.from_config('./config')

# Provision a Compute Cluster
from azureml.core.compute import ComputeTarget, AmlCompute

# Configuration of the compute cluster
compute_config = AmlCompute.provisioning_configuration(
    vm_size='STANDARD_D11_V2',
    max_nodes=2
)

# ComputeTarget.create needs the provisioning configuration as well as the
# workspace and name; its keyword is `provisioning_configuration`.
compute_target = ComputeTarget.create(
    workspace=ws,
    name='aml-cluster',
    provisioning_configuration=compute_config
)

# Create the cluster
computer_cluster = ComputeTarget.create(
    ws,
    name='my-cluster-001',
    provisioning_configuration=compute_config
)

# Block until provisioning has finished
computer_cluster.wait_for_completion()

### Automate Model Training

Build two pipelines: 
* Data processing ```DataPrep.py```
  * Read the data
  * Select/Drop columns
  * Replace Missing values
  * Normalize the data
  * Upload the data and log metrics
  * Save and Pass the data to the next step
  
* Build and train model ```Training.py```
  * Read the data saved in previous step
  * Split the data into training and testing
  * Train and test the model
  * Upload the results and log metric

Another script to create and submit the pipelines ```Pipeline.py```
1. Create the environment
2. Assign compute clusters
3. Create data transfer folder
4. Define pipeline steps
5. Build the pipeline
6. Create/access an experiment
7. Run the pipeline

In [None]:
from azureml.core import Workspace, Experiment, Run, ScriptRunConfig, Environment
from azureml.core.environment import CondaDependencies

# Access the Workspace
ws = Workspace.from_config('./config')

# Create a custom environment for the pipeline steps
myenv = Environment(name='mytrainingenv')

# Provision a Compute Cluster
from azureml.core.compute import ComputeTarget, AmlCompute

# Configuration of the compute cluster
compute_config = AmlCompute.provisioning_configuration(
    vm_size='STANDARD_D11_V2',
    max_nodes=2
)

# Create the cluster; the configuration keyword is `provisioning_configuration`
computer_cluster = ComputeTarget.create(
    ws,
    name='my-cluster-001',
    provisioning_configuration=compute_config
)

# Block until provisioning has finished
computer_cluster.wait_for_completion()

# ----------------------------------------------
# Create pipelines
# ----------------------------------------------

# Run configuration shared by both steps: cluster + environment
from azureml.core.runconfig import RunConfiguration

run_config = RunConfiguration()
run_config.target = computer_cluster
run_config.environment = myenv

# Define the pipeline steps
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.data import PipelineData

# Registered dataset used as raw input, and the intermediate folder that
# carries the prepared data from step 01 to step 02
input_ds = ws.datasets.get('Defaults')
dataFolder = PipelineData('datafolder', datastore=ws.get_default_datastore())

# Step 01 - Data Preparation
# DataPrep.py reads the input as run.input_datasets['raw_data'] and receives
# the output folder through --datafolder. The keyword is `outputs` (plural).
data_prep_step = PythonScriptStep(
    name='01 Data Preparation',
    script_name='DataPrep.py',
    compute_target=computer_cluster,
    source_directory='.',
    inputs=[input_ds.as_named_input('raw_data')],
    outputs=[dataFolder],
    runconfig=run_config,
    arguments=['--datafolder', dataFolder]
)

# Step 02 - Build and train model
# Takes the folder written by step 01 as its input
train_step = PythonScriptStep(
    name='02 Build and train model',
    script_name='Training.py',
    compute_target=computer_cluster,
    source_directory='.',
    inputs=[dataFolder],
    runconfig=run_config,
    arguments=['--datafolder', dataFolder]
)

from azureml.pipeline.core import Pipeline
from azureml.core import Experiment

# Assemble the pipeline from the two steps defined above
steps = [data_prep_step, train_step]
new_pipeline = Pipeline(workspace=ws, steps=steps)

# ----------------------------------------------
# Submit the pipeline under an experiment
# ----------------------------------------------
new_experiment = Experiment(workspace=ws, name='Loan-SDK-Exp01')
new_pipeline_run = new_experiment.submit(new_pipeline)

# Block until the pipeline run finishes, showing its output
new_pipeline_run.wait_for_completion(show_output=True)

#### Enable command line argument to run the script

In [None]:
# The standard-library module is `argparse` (there is no `argparser`)
from argparse import ArgumentParser

parser = ArgumentParser()

# Declare one argument; call add_argument again for each further argument
parser.add_argument('--datafolder', type=str, dest='datafolder', help='data folder path')

# Parse the command line
args = parser.parse_args()

# Access the argument through its `dest` name
print('data folder:', args.datafolder)


#### Create DataPrep.py

In [None]:
import os
# The standard-library module is `argparse` (there is no `argparser`)
from argparse import ArgumentParser

from azureml.core import Run

# Get the run context
run = Run.get_context()

# Access the workspace
ws = run.experiment.workspace

# Load the named input dataset 'raw_data' into a DataFrame
df = run.input_datasets['raw_data'].to_pandas_dataframe()

# ----------------------------------------------
# Data Preparation Code Here
# ----------------------------------------------

# Get the arguments from the pipeline job
parser = ArgumentParser()
parser.add_argument('--datafolder', type=str)
args = parser.parse_args()

# Create the folder if it does not exist.
# The attribute matches the argument name: `datafolder`, not `datafolders`.
os.makedirs(args.datafolder, exist_ok=True)

# Save the DataFrame to a csv file in that folder
path = os.path.join(args.datafolder, 'defaults_prep.csv')
df.to_csv(path, index=False)

# Complete the run
run.complete()

#### Training.py

In [None]:
import os
# The standard-library module is `argparse` (there is no `argparser`)
from argparse import ArgumentParser

from azureml.core import Run
import pandas as pd

# Get the arguments from the pipeline job
parser = ArgumentParser()
parser.add_argument('--datafolder', type=str)
args = parser.parse_args()

# Get the run context
run = Run.get_context()

# Access the workspace
ws = run.experiment.workspace

# ----------------------------------------------
# Training Code Here
# ----------------------------------------------
# Load the csv file from the data folder.
# The attribute matches the argument name: `datafolder`, not `datafolders`.
path = os.path.join(args.datafolder, 'defaults_prep.csv')
df = pd.read_csv(path)

# Complete the run
run.complete()

### SDK in Azure Designer

Besides using the SDK to create the pipeline, we can also embed a Python script in the Azure Designer with the Execute Python Script module. A Zip Bundle can also be used to upload a package to the module.

Select the Execute Python Script module and write the code to execute. Save the script, then open the Run settings to choose a compute.

In [None]:
# Python Script Module
import pandas as pd

def azureml_main(dataframe1 = None, dataframe2 = None):
    """Drop the 'fw' and 'edu_num' columns and split features from the target.

    Args:
        dataframe1: input pandas DataFrame; its last column is used as the target.
        dataframe2: second input, not used here.

    Returns:
        A tuple ``(X, y)``: every remaining column except the last, and the
        last column.
    """
    trimmed = dataframe1.drop(columns=['fw', 'edu_num'])

    features = trimmed.iloc[:, :-1]
    target = trimmed.iloc[:, -1]
    return features, target

In [None]:
# Zip Bundle
# 1 -- Packed the previous process script into a file as a zip bundle
# DataPrep.py
from azureml.core import Run

# Run context of the current execution and the workspace it belongs to
run = Run.get_context()
ws = run.experiment.workspace

def data_prep():
    """Load the 'Defaults' dataset and return it without 'fw' and 'edu_num'."""
    defaults_ds = ws.datasets.get('Defaults')
    frame = defaults_ds.to_pandas_dataframe()
    return frame.drop(columns=['fw', 'edu_num'])

# 2 -- Pack the DataPrep.py and other dependencies into a zip file
# 3 -- Create a Dataset in the workspace with uploaded packed ZIP file
# 4 -- Drag the Dataset and create a Python Script Module in the Designer to import DataPrep.py

# Python Script Module
import pandas as pd

def azureml_main(dataframe1 = None, dataframe2 = None):
    """Return the result of ``DataPrep.data_prep()``; both inputs are unused."""
    # Imported inside the function, as in the original module code
    from DataPrep import data_prep

    return data_prep()
