# Create a Pipeline

## Connect to Workspace

In [1]:
import azureml.core
from azureml.core import Workspace

# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.44.0 to work with ml-tabular_synthesis


# Create Compute or connect to it

In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

vm_size="STANDARD_DS11_V2"
vm_size="STANDARD_NC6_PROMO"
compute_name = "cpu-ds11-cluster"
compute_name = "gpu-nc6-promo"

try:
    # Check for existing compute target
    compute_target = ComputeTarget(workspace=ws, name=compute_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size=vm_size, max_nodes=2)
        compute_target = ComputeTarget.create(ws, compute_name, compute_config)
        compute_target.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)

Found existing cluster, use it.


# alternative: select current compute instance

In [3]:
from azureml.core.compute_target import ComputeTargetException
from azureml.core.compute import ComputeTarget

compute_name = "gpu-nc6-promo"
try:
    # Check for existing compute target
    compute_target = ComputeTarget(workspace=ws, name=compute_name)
    print('Found existing Target, use it.')
except ComputeTargetException as e:
    print("Compute not found: ", e)

Found existing Target, use it.


# Environment and config

In [4]:
import os
# Create a folder for the pipeline step files
experiment_folder = '../../git_repos/Tabular-Data-Synthesis/src'
os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder)

../../git_repos/Tabular-Data-Synthesis/src


In [5]:
from azureml.core import  Environment

# Create a Python environment for the experiment (from a .yml file)
env = Environment.from_conda_specification("tabsyn", "environment.yml")

# Register the environment 
env.register(workspace=ws)
registered_env = Environment.get(ws, 'tabsyn')

In [6]:
from azureml.core.runconfig import RunConfiguration
pipeline_run_config = RunConfiguration()

# Use the compute you created above. 
pipeline_run_config.target = compute_target

# Assign the environment to the run configuration
pipeline_run_config.environment = registered_env

print ("Run configuration created.")

Run configuration created.


# Define Pipeline

input_args = [  "--config_path",
                "tabular_synthesis/data/config/adult.json",
                "--learn_sigma",
                "True",
                "--iterations",
                "5000",
                "--anneal_lr",
                "True",
                "--batch_size",
                "64",
                "--lr",
                "3e-4",
                "--save_interval",
                "10000",
                "--weight_decay",
                "0.05",
                "--classifier_attention_resolutions",
                "32,16,8",
                "--classifier_depth",
                "2",
                "--classifier_width",
                "64",
                "--classifier_pool",
                "attention",
                "--classifier_resblock_updown",
                "True",
                "--classifier_use_scale_shift_norm",
                "True",
                "--log_interval",
                "25",
                "--eval_interval",
                "50"
            ]

In [None]:
input_args = [  "--config_path",
                "tabular_synthesis/data/config/adult.json",
                "--learn_sigma",
                "True",
                "--iterations",
                "5000",
                "--anneal_lr",
                "True",
                "--batch_size",
                "64",
                "--lr",
                "2e-3",
                "--save_interval",
                "10000",
                "--weight_decay",
                "0.01",
                "--classifier_attention_resolutions",
                "32,16,8",
                "--classifier_depth",
                "4",
                "--classifier_width",
                "64",
                "--classifier_pool",
                "attention",
                "--classifier_resblock_updown",
                "True",
                "--classifier_use_scale_shift_norm",
                "True",
                "--log_interval",
                "10",
                "--eval_interval",
                "25",
                "--noised",
                "True",
                "--classifier_use_fp16",
                "True"
            ]

# Create Pipeline for classifier train

In [None]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
import tempfile
import os

# Get the training dataset
adult_ds = ws.datasets.get("adult_train")


# Create an OutputFileDatasetConfig (temporary Data Reference) for data passed from step 1 to step 2
output = OutputFileDatasetConfig("output")

# Step 1, Run the data prep script
classifier_train = PythonScriptStep(name = "classifier_train",
                                source_directory = experiment_folder,
                                script_name = "tabular_synthesis/classifier_train_azure.py",
                                arguments = ["--dataset_path", adult_ds.as_download(),
                                             '--output_path', output] + input_args,
                                compute_target = compute_target,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

print("Pipeline steps defined")

In [None]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Construct the pipeline
pipeline_steps = [classifier_train]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Create an experiment and run the pipeline
experiment = Experiment(workspace=ws, name = 'classifier_train_test0')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

# Pipeline for diffusion train

In [20]:
model_args =[
    "--learn_sigma",
    "True",
    "--class_cond",
    "False",
    "--num_channels",
    "128",
    "--num_res_blocks",
    "3",
    ]

training_args = [
    "--iterations",
    "-25",
    "--save_interval",
    "10000",
    "--log_interval",
    "5",
    "--diffusion_steps",
    "500",
    "--noise_schedule",
    "linear",
    "--lr",
    "1e-4",
    "--weight_decay",
    "0.01",
    "--batch_size",
    "64",
    "--use_fp16",
    "True",
    "--rescale_timesteps",
    "True"
]


location_args = [  
    "--config_path",
    "tabular_synthesis/data/config/adult.json",
]

input_args = location_args + model_args + training_args

In [21]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
import tempfile
import os

# Get the training dataset
adult_ds = ws.datasets.get("adult_train")


# Create an OutputFileDatasetConfig (temporary Data Reference) for data passed from step 1 to step 2
output = OutputFileDatasetConfig("output")



train_args =  ["--dataset_path", adult_ds.as_download(), '--output_path', output] + input_args

# Step 1, Run the data prep script
image_train = PythonScriptStep(name = "image_training",
                                source_directory = experiment_folder,
                                script_name = "tabular_synthesis/image_train_azure.py",
                                arguments = train_args,
                                compute_target = compute_target,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

output2=OutputFileDatasetConfig("output2")
sample_process_args = [
                "--diffusion_steps",
                "200",
                "--noise_schedule",
                "cosine",
                "--num_samples",
                "-1",
                "--batch_size",
                "128",
                "--use_fp16",
                "True"
]
sample_args = ["--dataset_path", adult_ds.as_download(), "--model_path", output.as_input(), "--output_path", output2] + model_args + sample_process_args + location_args

image_sample = PythonScriptStep(name = "image_sampling",
                                source_directory = experiment_folder,
                                script_name = "tabular_synthesis/image_sample_azure.py",
                                arguments = sample_args,
                                compute_target = compute_target,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

final_output=OutputFileDatasetConfig("final_output")

eval_args = ["--real_dataset_path", adult_ds.as_download(),"--synthetic_dataset", output2.as_input(), "--output_path", final_output] + location_args


quick_eval = PythonScriptStep(name = "dataset_evaluation",
                                source_directory = experiment_folder,
                                script_name = "tabular_synthesis/quick_evaluation_azure.py",
                                arguments = eval_args,
                                compute_target = compute_target,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

In [22]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Construct the pipeline
pipeline_steps = [image_train, image_sample, quick_eval]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print(pipeline_steps)
print("Pipeline is built.")

# Create an experiment and run the pipeline
experiment = Experiment(workspace=ws, name = 'image_train_sample_eval_no_class')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

[<azureml.pipeline.steps.python_script_step.PythonScriptStep object at 0x7f07eb4cd310>, <azureml.pipeline.steps.python_script_step.PythonScriptStep object at 0x7f07eb4cda30>, <azureml.pipeline.steps.python_script_step.PythonScriptStep object at 0x7f07eb88e4c0>]
Pipeline is built.
Created step image_training [01d5284f][e91574c2-9f7e-4733-aacf-3f697f4742f0], (This step will run and generate new outputs)
Created step image_sampling [dd704ffe][8df98ab1-3bc1-4610-a005-9391e6135290], (This step will run and generate new outputs)Created step dataset_evaluation [5ca9161d][e504daef-44db-4432-93f6-d7913048bb62], (This step will run and generate new outputs)

Submitted PipelineRun 00dd8c22-5c98-4335-84be-e17504477092
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/00dd8c22-5c98-4335-84be-e17504477092?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular_synthesis&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
Pipeline submitt

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 00dd8c22-5c98-4335-84be-e17504477092
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/00dd8c22-5c98-4335-84be-e17504477092?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular_synthesis&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 64b3b331-99c0-4eb2-89ab-6ed504727d2f
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/64b3b331-99c0-4eb2-89ab-6ed504727d2f?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular_synthesis&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
StepRun( image_training ) Status: Queued
StepRun( image_training ) Status: Running

StepRun(image_training) Execution Summary
StepRun( image_training ) Status: Finished
{'runId': '64b3b331-99c0-4eb2-89ab-6ed504727d2f', 'target': 'gpu-nc6-promo', 'status': 'Completed', 'startTimeUtc': '2022-11-03T15:05:

'Finished'

In [11]:
import torch as th
softmax(th.rand_like(th.Tensor([4,1,2,2])))

NameError: name 'softmax' is not defined

In [None]:
th.rand_like(th.Tensor([4,1,2,2])) * 0.1