# Azure Scripts:
This notebook contains the code that was used to launch the scripts on Microsoft Azure Machine Learning.


# Setup

### Connect to Workspace

In [1]:
import azureml.core
from azureml.core import Workspace

# Connect to the workspace described by the locally saved config file.
ws = Workspace.from_config()

# Report the SDK version and the workspace we are attached to.
sdk_version, workspace_name = azureml.core.VERSION, ws.name
print('Ready to use Azure ML {} to work with {}'.format(sdk_version, workspace_name))

Ready to use Azure ML 1.47.0 to work with ml-tabular-synthesis-us


### Create Compute or connect to it

In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name and VM size of the cluster to attach to (or create).
# The CPU alternative is kept for reference.
# vm_size="STANDARD_DS11_V2"
vm_size = "STANDARD_NC6_PROMO"
# compute_name = "cpu-ds11-cluster"
compute_name = "gpu-standard-nc6"

try:
    # Reuse the compute target if it already exists in the workspace.
    compute_target = ComputeTarget(workspace=ws, name=compute_name)
    print('Found existing compute, use it.')
except ComputeTargetException:
    # Not found: provision a new cluster with at most two nodes.
    compute_config = AmlCompute.provisioning_configuration(vm_size=vm_size, max_nodes=2)
    try:
        compute_target = ComputeTarget.create(ws, compute_name, compute_config)
        compute_target.wait_for_completion(show_output=True)
    except Exception as ex:
        # Show the provisioning error, then re-raise: swallowing it would
        # leave `compute_target` undefined and make every later cell fail
        # with an unrelated NameError.
        print(ex)
        raise

Found existing compute, use it.


### Environment and config

In [3]:
import os

# Absolute path of the current working directory; used as the
# `source_directory` of every pipeline step below.
exp_folder = os.path.abspath(os.curdir)

In [None]:
from azureml.core import  Environment

# Build the "tddpm" environment from the conda specification in environment.yml.
env = Environment.from_conda_specification(name="tddpm", file_path="environment.yml")

# Register it in the workspace, then fetch the registered version back.
env.register(workspace=ws)
registered_env = Environment.get(workspace=ws, name='tddpm')

### Pipeline run config

In [5]:
from azureml.core.runconfig import RunConfiguration
# Run configuration shared by the pipeline steps defined in this notebook.
pipeline_run_config = RunConfiguration()

# Execute on the compute target resolved above.
pipeline_run_config.target = compute_target

# Set PYTHONPATH for the remote run (presumably so `tab_ddpm` is importable).
# NOTE: the original author recorded that this did not work.
pipeline_run_config.environment_variables = {"PYTHONPATH": "tab_ddpm"}

# Run inside the registered environment.
pipeline_run_config.environment = registered_env

print("Run configuration created.")


Run configuration created.


## Tune Evaluation model
Firstly, the machine learning evaluation model (catboost|mlp) needs to be tuned to find the best hyperparameters.

Make sure the hyperparameters that were found are saved at "tuned_models/{catboost|mlp}/{dataset}_cv.json".

In [None]:
# Positional arguments for scripts/tune_evaluation_model.py.
# Their meaning is defined by that script; presumably dataset, model,
# tuning split and device -- confirm against the script.
input_args = [
    "adult-bgm",
    "catboost",  # or mlp
    "val",
    "cuda",
]

In [None]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.runconfig import DockerConfiguration
import tempfile
import os
# docker_runtime_config=DockerConfiguration(use_docker=True),


# Single step: run scripts/tune_evaluation_model.py on the compute target.
pipeline_simple = PythonScriptStep(
    name="tune_eval_model",
    source_directory=exp_folder,
    script_name="scripts/tune_evaluation_model.py",
    arguments=input_args,
    compute_target=compute_target,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)
print("Pipeline steps defined")

# Wrap the step in a pipeline.
pipeline_steps = [pipeline_simple]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit under the 'tune_eval_model' experiment, show the run widget and
# wait for the run to finish while printing its output.
experiment = Experiment(workspace=ws, name='tune_eval_model')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

# Define Pipeline

In [None]:
# Command-line arguments for scripts/pipeline.py: the experiment config file
# plus the stage flags to enable.
input_args = [
    "--config", "exp/adult-bgm/ddpm_cb_best/config.toml",
    "--train",
    "--sample",
    "--eval",
]

In [None]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.runconfig import DockerConfiguration
import tempfile
import os
# docker_runtime_config=DockerConfiguration(use_docker=True),


# Single step: run scripts/pipeline.py with the arguments defined above.
pipeline_simple = PythonScriptStep(
    name="pipeline_simple",
    source_directory=exp_folder,
    script_name="scripts/pipeline.py",
    arguments=input_args,
    compute_target=compute_target,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)
print("Pipeline steps defined")

# Wrap the step in a pipeline.
pipeline_steps = [pipeline_simple]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit under the 'tdppm_setup_test1' experiment, show the run widget and
# wait for the run to finish while printing its output.
experiment = Experiment(workspace=ws, name='tdppm_setup_test1')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

# Create Pipeline for Pipeline train

In [None]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.runconfig import DockerConfiguration
import tempfile
import os
# docker_runtime_config=DockerConfiguration(use_docker=True),


# Single step: run scripts/pipeline.py with whatever `input_args` is
# currently defined in the kernel (this cell does not set it itself).
pipeline_simple = PythonScriptStep(
    name="pipeline_simple",
    source_directory=exp_folder,
    script_name="scripts/pipeline.py",
    arguments=input_args,
    compute_target=compute_target,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)
print("Pipeline steps defined")

# Wrap the step in a pipeline.
pipeline_steps = [pipeline_simple]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit under the 'tdppm_setup_test1' experiment, show the run widget and
# wait for the run to finish while printing its output.
experiment = Experiment(workspace=ws, name='tdppm_setup_test1')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

# Tuning Pipeline


In [6]:
# Name shared by the pipeline step, the experiment, and the script argument.
EXP_NAME = "ddpm_ft_sim_tune_quantile"

# Arguments for scripts/tune_ddpm.py.
input_args = [
    "adult",                 # dsname
    "26048",                 # train_size
    "synthetic",
    "catboost",
    EXP_NAME,
    "--eval_seeds",
    "--optimize_sim_score",  # new
    # "--debug"
]

In [None]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
import tempfile
import os
from azureml.core.runconfig import DockerConfiguration

# docker_runtime_config=DockerConfiguration(use_docker=True, arguments=[f'-e PYTHONPATH {os.getcwd()}'])
#docker_runtime_config=DockerConfiguration(use_docker=True)

# Single step: run scripts/tune_ddpm.py; the step is named after EXP_NAME.
pipeline_simple = PythonScriptStep(
    name=EXP_NAME,
    source_directory=exp_folder,
    script_name="scripts/tune_ddpm.py",
    arguments=input_args,
    compute_target=compute_target,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)
print("Pipeline steps defined")

# Wrap the step in a pipeline.
pipeline_steps = [pipeline_simple]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit under an experiment named EXP_NAME, show the run widget and
# wait for the run to finish while printing its output.
experiment = Experiment(workspace=ws, name=EXP_NAME)
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

Pipeline steps defined
Pipeline is built.
Created step ddpm_ft_sim_tune_quantile [170c5147][ad9ec5d8-a9b5-40d6-825a-d30d976b0af5], (This step will run and generate new outputs)
Submitted PipelineRun 9d2a52af-cd08-444c-a88a-777824189025
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/9d2a52af-cd08-444c-a88a-777824189025?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 9d2a52af-cd08-444c-a88a-777824189025
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/9d2a52af-cd08-444c-a88a-777824189025?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
PipelineRun Status: Running


StepRunId: 89343d20-b21c-470d-be72-537fb334cb0b
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/89343d20-b21c-470d-be72-537fb334cb0b?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
StepRun( ddpm_ft_sim_tune_quantile ) Status: NotStarted
StepRun( ddpm_ft_sim_tune_quantile ) Status: Running


# Evaluation Pipeline

In [6]:
# Arguments for scripts/eval_seeds.py (TabDDPM, evaluated on real data).
input_args = [
    "--config", "exp/adult/ddpm_bgm_best/config.toml",
    "10",        # n_eval_seeds
    "ddpm",
    "real",      # real
    "catboost",
    "5",         # n_sample_seeds
]

In [None]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
import tempfile
import os


# Single step: run scripts/eval_seeds.py with the arguments defined above.
pipeline_simple = PythonScriptStep(
    name="eval_seeds",
    source_directory=exp_folder,
    script_name="scripts/eval_seeds.py",
    arguments=input_args,
    compute_target=compute_target,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)
print("Pipeline steps defined")

# Wrap the step in a pipeline.
pipeline_steps = [pipeline_simple]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit under the 'tdppm_eval_seeds' experiment, show the run widget and
# wait for the run to finish while printing its output.
experiment = Experiment(workspace=ws, name='tdppm_eval_seeds')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

Pipeline steps defined
Pipeline is built.
Created step eval_seeds [6eade92c][53d72a4c-7d30-43e1-8606-ad522c1d6d4b], (This step will run and generate new outputs)
Submitted PipelineRun cc989ba6-0356-47b1-8bb4-ec247234bb28
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/cc989ba6-0356-47b1-8bb4-ec247234bb28?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: cc989ba6-0356-47b1-8bb4-ec247234bb28
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/cc989ba6-0356-47b1-8bb4-ec247234bb28?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
PipelineRun Status: Running


StepRunId: abc517f1-35e6-4fd3-a240-231325afc4f9
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/abc517f1-35e6-4fd3-a240-231325afc4f9?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
StepRun( eval_seeds ) Status: Running


# Smote

In [5]:
# Arguments for scripts/eval_seeds.py (SMOTE baseline, synthetic data).
input_args = [
    "--config", "exp/adult/smote/config.toml",
    "10",         # n_eval_seeds
    "smote",
    "synthetic",  # real
    "catboost",
    "5",          # n_sample_seeds
]

In [None]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
import tempfile
import os


# Single step: run scripts/eval_seeds.py with the SMOTE arguments.
pipeline_simple = PythonScriptStep(
    name="eval_seeds",
    source_directory=exp_folder,
    script_name="scripts/eval_seeds.py",
    arguments=input_args,
    compute_target=compute_target,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)
print("Pipeline steps defined")

# Wrap the step in a pipeline.
pipeline_steps = [pipeline_simple]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit under the 'smote_eval_seeds' experiment, show the run widget and
# wait for the run to finish while printing its output.
experiment = Experiment(workspace=ws, name='smote_eval_seeds')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

# CTGAN / TVAE

## Pipeline train

In [None]:
# Arguments for CTGAN/pipeline_tvae.py: TVAE config file, training stage only.
input_args = [
    "--config", "exp/adult/tvae/config.toml",
    "--train",
]

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.runconfig import DockerConfiguration
import tempfile
import os
# docker_runtime_config=DockerConfiguration(use_docker=True),


# Single step: run CTGAN/pipeline_tvae.py with the TVAE training arguments.
pipeline_simple = PythonScriptStep(
    name="pipeline_simple",
    source_directory=exp_folder,
    script_name="CTGAN/pipeline_tvae.py",
    arguments=input_args,
    compute_target=compute_target,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)
print("Pipeline steps defined")

# Wrap the step in a pipeline.
pipeline_steps = [pipeline_simple]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit under a TVAE-specific experiment. The name was previously
# 'tdppm_setup_test1' (copied from the TabDDPM cells), which logged TVAE
# training runs into the TabDDPM test experiment.
experiment = Experiment(workspace=ws, name='train_tvae')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

## Eval seeds

In [6]:
# Arguments for scripts/eval_seeds.py (TVAE, synthetic data).
input_args = [
    "--config", "exp/adult/tvae/config.toml",
    "10",         # n_eval_seeds
    "tvae",
    "synthetic",  # real
    "catboost",
    "5",          # n_sample_seeds
]

In [None]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
import tempfile
import os


# Single step: run scripts/eval_seeds.py with the TVAE arguments.
pipeline_simple = PythonScriptStep(
    name="eval_seeds",
    source_directory=exp_folder,
    script_name="scripts/eval_seeds.py",
    arguments=input_args,
    compute_target=compute_target,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)
print("Pipeline steps defined")

# Wrap the step in a pipeline.
pipeline_steps = [pipeline_simple]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit under the 'tvae_eval_seeds' experiment, show the run widget and
# wait for the run to finish while printing its output.
experiment = Experiment(workspace=ws, name='tvae_eval_seeds')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

## Tune

In [6]:
# Arguments for CTGAN/tune_tvae.py. Positional meanings are defined by that
# script (presumably data path, train size, eval type, device -- confirm).
input_args = [
    "data/adult/",
    "26048",
    "synthetic",
    "cuda",
    "--optimize_sim_score",
]

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
import tempfile
import os


# Single step: run CTGAN/tune_tvae.py with the tuning arguments.
pipeline_simple = PythonScriptStep(
    name="tune_tvae",
    source_directory=exp_folder,
    script_name="CTGAN/tune_tvae.py",
    arguments=input_args,
    compute_target=compute_target,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)
print("Pipeline steps defined")

# Wrap the step in a pipeline.
pipeline_steps = [pipeline_simple]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit under the "tune_tvae" experiment, show the run widget and
# wait for the run to finish while printing its output.
experiment = Experiment(workspace=ws, name="tune_tvae")
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

Pipeline steps defined
Pipeline is built.
Created step tune_tvae [df878ff2][5c7e78da-c19f-4e99-95d6-bd9c1938d548], (This step will run and generate new outputs)
Submitted PipelineRun 1004f41e-3fc6-4dc3-9698-c63cd3eed830
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/1004f41e-3fc6-4dc3-9698-c63cd3eed830?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 1004f41e-3fc6-4dc3-9698-c63cd3eed830
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/1004f41e-3fc6-4dc3-9698-c63cd3eed830?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
PipelineRun Status: Running


StepRunId: dd234d82-4125-4ca6-9427-c397fed9af64
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/dd234d82-4125-4ca6-9427-c397fed9af64?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
StepRun( tune_tvae ) Status: NotStarted
StepRun( tune_tvae ) Status: Running

StepRun(tune_tvae) Execution Summary
StepRun( tune_tvae ) Status: Canceled
{'runId': 'dd234d82-4125-4ca6-9427-c397fed9af64', 'target': 'gpu-standard-nc6', 'status': 'Queued', 'services': {}, 'properties': {'ContentSnapshotId': '082ce88f-e36f-4fe3-a14

'Canceled'

# CTABGAN

## Pipeline

In [None]:
# Arguments for CTAB-GAN/pipeline_ctabgan.py: config file, training stage only.
input_args = [
    "--config", "exp/adult/ctabgan/config.toml",
    "--train",
]

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.runconfig import DockerConfiguration
import tempfile
import os
# docker_runtime_config=DockerConfiguration(use_docker=True),


# Single step: run CTAB-GAN/pipeline_ctabgan.py with the training arguments.
pipeline_simple = PythonScriptStep(
    name="pipeline_simple",
    source_directory=exp_folder,
    script_name="CTAB-GAN/pipeline_ctabgan.py",
    arguments=input_args,
    compute_target=compute_target,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)
print("Pipeline steps defined")

# Wrap the step in a pipeline.
pipeline_steps = [pipeline_simple]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit under the 'train_ctab-gan' experiment, show the run widget and
# wait for the run to finish while printing its output.
experiment = Experiment(workspace=ws, name='train_ctab-gan')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

Pipeline steps defined
Pipeline is built.
Created step pipeline_simple [90530a86][22c69987-60ac-4842-b176-ba92bda2b9ac], (This step will run and generate new outputs)
Submitted PipelineRun 4a781617-4a0f-4b0f-b421-c07b9bac7d01
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/4a781617-4a0f-4b0f-b421-c07b9bac7d01?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 4a781617-4a0f-4b0f-b421-c07b9bac7d01
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/4a781617-4a0f-4b0f-b421-c07b9bac7d01?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 9b213723-df8e-4884-938e-32f2fc083bc8
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/9b213723-df8e-4884-938e-32f2fc083bc8?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
StepRun( pipeline_simple ) Status: NotStarted
StepRun( pipeline_simple ) Status: Running
Performing interactive authentication. Please follow the instructions on the terminal.


## Eval seeds

In [None]:
# Arguments for scripts/eval_seeds.py (CTAB-GAN, synthetic data).
input_args = [
    "--config", "exp/adult/ctabgan/config.toml",
    "10",         # n_eval_seeds
    "ctabgan",
    "synthetic",  # real
    "catboost",
    "5",          # n_sample_seeds
]
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
import tempfile
import os


# Single step: run scripts/eval_seeds.py with the CTAB-GAN arguments.
pipeline_simple = PythonScriptStep(
    name="eval_seeds",
    source_directory=exp_folder,
    script_name="scripts/eval_seeds.py",
    arguments=input_args,
    compute_target=compute_target,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)
print("Pipeline steps defined")

# Wrap the step in a pipeline.
pipeline_steps = [pipeline_simple]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit under the 'ctabgan_eval_seeds' experiment, show the run widget and
# wait for the run to finish while printing its output.
experiment = Experiment(workspace=ws, name='ctabgan_eval_seeds')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

## Tune

In [None]:
# Arguments for CTAB-GAN/tune_ctabgan.py. Positional meanings are defined by
# that script (presumably data path, train size, eval type, device -- confirm).
input_args = [
    "data/adult/",
    "26048",
    "synthetic",
    "cuda:0",
    "--optimize_sim_score",
    # "--debug"
]

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
import tempfile
import os


# Single step: run CTAB-GAN/tune_ctabgan.py with the tuning arguments.
pipeline_simple = PythonScriptStep(
    name="tune_ctabgan",
    source_directory=exp_folder,
    script_name="CTAB-GAN/tune_ctabgan.py",
    arguments=input_args,
    compute_target=compute_target,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)
print("Pipeline steps defined")

# Wrap the step in a pipeline.
pipeline_steps = [pipeline_simple]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit under the "tune_ctabgan" experiment, show the run widget and
# wait for the run to finish while printing its output.
experiment = Experiment(workspace=ws, name="tune_ctabgan")
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

Pipeline steps defined
Pipeline is built.
Created step tune_ctabgan [a534c365][7c99a9bf-ebe1-4baf-a575-1b1dd39245ce], (This step will run and generate new outputs)
Submitted PipelineRun 46599e32-ef2e-45e1-ba16-484948ee5c0f
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/46599e32-ef2e-45e1-ba16-484948ee5c0f?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 46599e32-ef2e-45e1-ba16-484948ee5c0f
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/46599e32-ef2e-45e1-ba16-484948ee5c0f?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
PipelineRun Status: Running


StepRunId: cb725dea-8cdc-48fb-beae-6ab22da4b92e
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/cb725dea-8cdc-48fb-beae-6ab22da4b92e?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
StepRun( tune_ctabgan ) Status: NotStarted
StepRun( tune_ctabgan ) Status: Running


# CTABGAN PLUS

## Pipeline Train

In [6]:
# Arguments for CTAB-GAN-Plus/pipeline_ctabganp.py: config file, training only.
input_args = [
    "--config", "exp/adult/ctabgan-plus/config.toml",
    "--train",
]

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.runconfig import DockerConfiguration
import tempfile
import os
# docker_runtime_config=DockerConfiguration(use_docker=True),


# Single step: run CTAB-GAN-Plus/pipeline_ctabganp.py with the training arguments.
pipeline_simple = PythonScriptStep(
    name="pipeline_simple",
    source_directory=exp_folder,
    script_name="CTAB-GAN-Plus/pipeline_ctabganp.py",
    arguments=input_args,
    compute_target=compute_target,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)
print("Pipeline steps defined")

# Wrap the step in a pipeline.
pipeline_steps = [pipeline_simple]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit under the 'train_ctab-gan-plus' experiment, show the run widget and
# wait for the run to finish while printing its output.
experiment = Experiment(workspace=ws, name='train_ctab-gan-plus')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

Pipeline steps defined
Pipeline is built.
Created step pipeline_simple [74dbd10d][9e508a19-a3a7-4bed-8b07-5c96037736ea], (This step will run and generate new outputs)
Submitted PipelineRun 57fec28b-d314-4cb6-8b4c-89062abfeae3
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/57fec28b-d314-4cb6-8b4c-89062abfeae3?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 57fec28b-d314-4cb6-8b4c-89062abfeae3
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/57fec28b-d314-4cb6-8b4c-89062abfeae3?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
PipelineRun Status: Running


StepRunId: e297483e-c797-490d-af4f-d7f638737292
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/e297483e-c797-490d-af4f-d7f638737292?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
StepRun( pipeline_simple ) Status: Queued
StepRun( pipeline_simple ) Status: Running


## Eval Seeds

In [None]:
# Arguments for scripts/eval_seeds.py (CTAB-GAN-Plus, synthetic data).
input_args = [
    "--config", "exp/adult/ctabgan-plus/config.toml",
    "10",            # n_eval_seeds
    "ctabgan-plus",
    "synthetic",     # real
    "catboost",
    "5",             # n_sample_seeds
]
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
import tempfile
import os


# Single step: run scripts/eval_seeds.py with the CTAB-GAN-Plus arguments.
pipeline_simple = PythonScriptStep(
    name="eval_seeds",
    source_directory=exp_folder,
    script_name="scripts/eval_seeds.py",
    arguments=input_args,
    compute_target=compute_target,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)
print("Pipeline steps defined")

# Wrap the step in a pipeline.
pipeline_steps = [pipeline_simple]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit under a CTAB-GAN-Plus-specific experiment. The name was previously
# 'ctabgan_eval_seeds', identical to the plain CTAB-GAN evaluation cell, so
# runs of both models ended up mixed in the same experiment.
experiment = Experiment(workspace=ws, name='ctabgan-plus_eval_seeds')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

Pipeline steps defined
Pipeline is built.
Created step eval_seeds [a8974f04][d603be2c-45b9-4b39-9dc4-46bf90b61d84], (This step will run and generate new outputs)
Submitted PipelineRun 630565b8-aaa6-4e5b-a0d3-5905a3102950
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/630565b8-aaa6-4e5b-a0d3-5905a3102950?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 630565b8-aaa6-4e5b-a0d3-5905a3102950
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/630565b8-aaa6-4e5b-a0d3-5905a3102950?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
PipelineRun Status: Running


StepRunId: 2f6fcf45-7c6c-445f-b273-05c48af92201
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/2f6fcf45-7c6c-445f-b273-05c48af92201?wsid=/subscriptions/49641ae7-6237-4363-b149-e721ac81137a/resourcegroups/rg-tabular_synthesis/workspaces/ml-tabular-synthesis-us&tid=84c31ca0-ac3b-4eae-ad11-519d80233e6f
StepRun( eval_seeds ) Status: NotStarted
StepRun( eval_seeds ) Status: Running


## Tune

In [None]:
# Arguments for CTAB-GAN-Plus/tune_ctabgan.py. Positional meanings are defined
# by that script (presumably data path, train size, eval type, device -- confirm).
# NOTE(review): "--debug" is active here, whereas the sibling tune cells have
# it commented out -- confirm this is intended before a full tuning run.
input_args = [
    "data/adult/",
    "26048",
    "synthetic",
    "cuda",
    "--optimize_sim_score",
    "--debug",
]

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
import tempfile
import os


# Single pipeline step: run the CTAB-GAN+ tuning script with `input_args`
# on the configured compute target.
pipeline_simple = PythonScriptStep(
    name="tune_ctabganplus",
    source_directory=exp_folder,
    script_name="CTAB-GAN-Plus/tune_ctabgan.py",
    arguments=input_args,
    compute_target=compute_target,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)
print("Pipeline steps defined")

# Wrap the step in a pipeline bound to the workspace.
pipeline_steps = [pipeline_simple]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit under the "tune_ctabganplus" experiment; regenerate_outputs=True is
# passed so the step is executed again even though allow_reuse is set.
experiment = Experiment(workspace=ws, name="tune_ctabganplus")
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")

# Display the run widget and wait for the run to finish, printing its output.
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

# Pipeline for diffusion training, sampling and evaluation

In [None]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
import tempfile
import os


# Model flags, written as flag/value pairs. The sampling step defined later in
# this cell reuses this list.
model_args = [
    "--learn_sigma", "True",
    "--class_cond", "False",
    "--num_channels", "128",
    "--num_res_blocks", "3",
]

# Flags passed only to the training script (flag/value pairs).
training_args = [
    "--iterations", "-100",
    "--save_interval", "10000",
    "--log_interval", "5",
    "--diffusion_steps", "500",
    "--noise_schedule", "cosine",
    "--lr", "2e-4",
    "--weight_decay", "0.01",
    "--batch_size", "128",
    "--use_fp16", "True",
    "--rescale_timesteps", "True",
]

# Path of the dataset configuration file, relative to the step's source directory.
location_args = ["--config_path", "tabular_synthesis/data/config/adult.json"]

# Complete flag list for training: config location first, then model, then training flags.
input_args = [*location_args, *model_args, *training_args]



# Look up the registered training dataset. `ws.datasets` is a name -> dataset
# mapping, so `.get` yields None for an unknown name; fail here with a clear
# message instead of an AttributeError on `.as_download()` further down.
adult_ds = ws.datasets.get("adult_train")
if adult_ds is None:
    raise ValueError(
        "Dataset 'adult_train' is not registered in workspace '{}'".format(ws.name)
    )


# Intermediate output written by the training step and read by the sampling step
# (passed below as --output_path and --model_path respectively).
output = OutputFileDatasetConfig("output")

train_args = ["--dataset_path", adult_ds.as_download(), "--output_path", output] + input_args

# Step 1: train the diffusion model.
# `exp_folder` is the source-directory variable defined in the setup section and
# used by the other pipeline cells of this notebook.
image_train = PythonScriptStep(
    name="image_training",
    source_directory=exp_folder,
    script_name="tabular_synthesis/image_train_azure.py",
    arguments=train_args,
    compute_target=compute_target,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)

# Intermediate output written by the sampling step and read by the evaluation step.
output2 = OutputFileDatasetConfig("output2")
sample_process_args = [
    "--diffusion_steps", "500",
    "--noise_schedule", "cosine",
    "--num_samples", "-1",
    "--batch_size", "128",
    "--use_fp16", "True",
]
sample_args = (
    ["--dataset_path", adult_ds.as_download(), "--model_path", output.as_input(), "--output_path", output2]
    + model_args
    + sample_process_args
    + location_args
)

# Step 2: sample from the trained model; consumes the training step's output.
image_sample = PythonScriptStep(
    name="image_sampling",
    source_directory=exp_folder,
    script_name="tabular_synthesis/image_sample_azure.py",
    arguments=sample_args,
    compute_target=compute_target,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)

# Output location handed to the evaluation script.
final_output = OutputFileDatasetConfig("final_output")

eval_args = [
    "--real_dataset_path", adult_ds.as_download(),
    "--synthetic_dataset", output2.as_input(),
    "--output_path", final_output,
] + location_args

# Step 3: evaluation script; receives the real dataset and the sampling step's output.
quick_eval = PythonScriptStep(
    name="dataset_evaluation",
    source_directory=exp_folder,
    script_name="tabular_synthesis/quick_evaluation_azure.py",
    arguments=eval_args,
    compute_target=compute_target,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)

In [None]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Assemble the three steps (train, sample, evaluate) into one pipeline.
pipeline_steps = [
    image_train,
    image_sample,
    quick_eval,
]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print(pipeline_steps)
print("Pipeline is built.")

# Submit the pipeline under its own experiment; regenerate_outputs=True is
# passed so the steps are executed again instead of reusing earlier outputs.
experiment = Experiment(workspace=ws, name='image_train_sample_eval_no_class')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")

# Display the run widget and wait for the run to finish, printing its output.
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)