In [1]:
from azureml.core import Workspace, LinkedService
from azureml.widgets import RunDetails

In [7]:
# Name of the linked service that is looked up on the Azure ML workspace.
synapse_linked = "synw-dmpbackup-westeu-01p-linked"
# Name under which the Spark pool is attached as a compute target.
synapse_compute_name = "cc-small"
# Name of the Spark pool passed to the attach configuration.
synapse_pool_name = "test"

In [8]:
# Workspace handle shared by every later cell. No arguments are passed, so the
# SDK's default configuration lookup decides which workspace is loaded.
ws = Workspace.from_config()

## Retrieve the link between your Azure Synapse Analytics workspace and your Azure Machine Learning workspace

In [9]:
# Print every linked service registered on the workspace.
for service in LinkedService.list(ws):
    print("Service: {}".format(service))

# Fetch the one linked service this notebook works with, by its configured name.
linked_service = LinkedService.get(ws, synapse_linked)

Service: LinkedService(workspace=Workspace.create(name='mlw-dmpbackup-westeu-01p', subscription_id='25a89471-60b3-4c91-b44f-ca49f38e6137', resource_group='rg-dmpbackup-westeu-01p'), name=synw-dmpbackup-westeu-01p-linked, type=LinkedServiceLinkType.synapse, linked_service_resource_id=/subscriptions/25a89471-60b3-4c91-b44f-ca49f38e6137/resourceGroups/rg-dmpbackup-westeu-01p/providers/Microsoft.Synapse/workspaces/synw-dmpbackup-westeu-01p, system_assigned_identity_principal_id=4920cd8e-47e7-4407-8d4b-2dc91a133c2f


## Attach your Apache Spark pool as a compute target for Azure Machine Learning

In [10]:
from azureml.core.compute import SynapseCompute, ComputeTarget


# Describe which Spark pool of the linked service should be attached.
attach_config = SynapseCompute.attach_configuration(
    linked_service=linked_service, type="SynapseSpark", pool_name=synapse_pool_name
)

# Attach the pool to the workspace under the configured compute name.
synapse_compute = ComputeTarget.attach(
    workspace=ws, name=synapse_compute_name, attach_configuration=attach_config
)

# Wait for the attach (provisioning) operation to finish before moving on.
synapse_compute.wait_for_completion()

Provisioning operation finished, operation "Succeeded"


## Create a SynapseSparkStep that uses the linked Apache Spark pool

In [6]:
from azureml.core.environment import Environment
from azureml.pipeline.steps import SynapseSparkStep

# Environment handed to the step; the SDK reports that only its conda
# dependencies are used in a Synapse Spark run.
env = Environment(name="myenv")
env.python.conda_dependencies.add_pip_package("azureml-core>=1.20.0")

# Command-line arguments forwarded to the script, as flag/value pairs.
step_1_arguments = [
    "--hday", "2021-12-01",
    "--out_dataset_name", "one-user-dataset",
    "--out_dataset_desc", "dataset with only one user",
]

# Pipeline step that runs the script on the attached Synapse compute.
step_1 = SynapseSparkStep(
    name="synapse-spark",
    file="prep-dataset-synapse.py",
    source_directory="./code",
    arguments=step_1_arguments,
    compute_target=synapse_compute_name,
    environment=env,
    # Spark driver / executor sizing for this step.
    driver_memory="7g",
    driver_cores=4,
    executor_memory="7g",
    executor_cores=2,
    num_executors=1,
)

only conda_dependencies specified in environment will be used in Synapse Spark run.


In [None]:
from azureml.pipeline.core import Pipeline

# Single-step pipeline, submitted under the "synapse-pipeline" experiment with
# regenerate_outputs=True.
pipeline = Pipeline(steps=[step_1], workspace=ws)
pipeline_run = pipeline.submit("synapse-pipeline", regenerate_outputs=True)

# Show the run widget, then wait for the run while streaming its output.
pipeline_widget = RunDetails(pipeline_run)
pipeline_widget.show()
pipeline_run.wait_for_completion(show_output=True)

In [11]:
from azureml.core import RunConfiguration
from azureml.core import ScriptRunConfig 

from azureml.core.environment import CondaDependencies
# Pip requirement attached to the run's Python environment.
conda_dep = CondaDependencies()
conda_dep.add_pip_package("azureml-core==1.20.0")

# PySpark run configuration that targets the attached Synapse compute.
run_config = RunConfiguration(framework="pyspark")
run_config.target = synapse_compute_name
run_config.environment.python.conda_dependencies = conda_dep

# Spark settings, written into the configuration one key at a time in the
# order listed here.
spark_settings = {
    "spark.driver.memory": "7g",
    "spark.driver.cores": 2,
    "spark.executor.memory": "7g",
    "spark.executor.cores": 1,
    "spark.executor.instances": 1,
}
for setting_name, setting_value in spark_settings.items():
    run_config.spark.configuration[setting_name] = setting_value

# Script, source folder and arguments bound to the run configuration above.
script_run_config = ScriptRunConfig(
    source_directory="./code",
    script="prep-dataset-synapse.py",
    arguments=[
        "--hday", "2021-12-01",
        "--out_dataset_name", "one-user-dataset",
        "--out_dataset_desc", "dataset with only one user",
    ],
    run_config=run_config,
)

In [12]:
from azureml.core import Experiment

# Submit the script run under the "synapse-spark" experiment.
exp = Experiment(name="synapse-spark", workspace=ws)
run = exp.submit(config=script_run_config)

# Show the run widget, then wait for the run while streaming its output.
run_widget = RunDetails(run)
run_widget.show()
run.wait_for_completion(show_output=True)

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

RunId: synapse-spark_1654782772_9f61aa5e
Web View: https://ml.azure.com/runs/synapse-spark_1654782772_9f61aa5e?wsid=/subscriptions/25a89471-60b3-4c91-b44f-ca49f38e6137/resourcegroups/rg-dmpbackup-westeu-01p/workspaces/mlw-dmpbackup-westeu-01p&tid=041d21aa-b4ab-4ad1-891d-62207b3367ef
