In [1]:
import azureml.core
from azureml.core import Workspace, Datastore

In [2]:
ws = Workspace.from_config(path='./config.json')

In [3]:
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies

In [4]:
env = Environment(name='bookstrator')

In [5]:
conda_dep = CondaDependencies()

# Installs scikit-learn version 0.21.3 conda package
conda_dep.add_pip_package("PyMuPDF==1.16.11==0.21.3")
conda_dep.add_pip_package("fitz")

# Adds dependencies to PythonSection of myenv
env.python.conda_dependencies = conda_dep

In [6]:
env.register(workspace=ws)

{
    "name": "bookstrator",
    "version": "3",
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "python": {
        "userManagedDependencies": false,
        "interpreterPath": "python",
        "condaDependenciesFile": null,
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "conda-forge"
            ],
            "dependencies": [
                "python=3.6.2",
                {
                    "pip": [
                        "azureml-defaults",
                        "PyMuPDF==1.16.11==0.21.3",
                        "fitz"
                    ]
                }
            ],
            "name": "azureml_cdf40690fdf959df045b77be181464fc"
        }
    },
    "docker": {
        "enabled": false,
        "baseImage": "mcr.microsoft.com/azureml/base:intelmpi2018.3-ubuntu16.04",
        "baseDockerfile": null,
        "sharedVolumes": true,
        "shmSize": null,
        "argum

In [7]:
# Default datastore
def_data_store = ws.get_default_datastore()

In [12]:
# Get the blob storage associated with the workspace
def_blob_store = Datastore(ws, "workspaceblobstore")

In [11]:
# Get file storage associated with the workspace
def_file_store = Datastore(ws, "workspacefilestore")

In [15]:
def_blob_store.upload_files(
    ['./raw-data/aesops-fables.pdf'],
    target_path='aesops-fables',
    overwrite=True
)

Uploading an estimated of 1 files
Uploading ./raw-data/aesops-fables.pdf
Uploaded ./raw-data/aesops-fables.pdf, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_580ae3d5a50544c5bd446e7f3aecc896

In [9]:
from azureml.data.data_reference import DataReference

In [13]:
blob_input_data = DataReference(
    datastore=def_blob_store,
    data_reference_name='test_data',
    path_on_datastore='aesops-fables/aesops-fables.pdf'
)

In [14]:
from azureml.pipeline.core import PipelineData

In [15]:
output_data1 = PipelineData(
    "output_data1",
    datastore=def_blob_store,
    output_name="output_data1"
)

In [16]:
from azureml.pipeline.steps import PythonScriptStep

In [17]:
from azureml.core.compute import ComputeTarget, AmlCompute

print(AmlCompute.supported_vmsizes(workspace=ws))

[{'name': 'Standard_D1_v2', 'vCPUs': 1, 'gpus': 0, 'memoryGB': 3.5, 'maxResourceVolumeMB': 51200}, {'name': 'Standard_D2_v2', 'vCPUs': 2, 'gpus': 0, 'memoryGB': 7.0, 'maxResourceVolumeMB': 102400}, {'name': 'Standard_D3_v2', 'vCPUs': 4, 'gpus': 0, 'memoryGB': 14.0, 'maxResourceVolumeMB': 204800}, {'name': 'Standard_D4_v2', 'vCPUs': 8, 'gpus': 0, 'memoryGB': 28.0, 'maxResourceVolumeMB': 409600}, {'name': 'Standard_D11_v2', 'vCPUs': 2, 'gpus': 0, 'memoryGB': 14.0, 'maxResourceVolumeMB': 102400}, {'name': 'Standard_D12_v2', 'vCPUs': 4, 'gpus': 0, 'memoryGB': 28.0, 'maxResourceVolumeMB': 204800}, {'name': 'Standard_D13_v2', 'vCPUs': 8, 'gpus': 0, 'memoryGB': 56.0, 'maxResourceVolumeMB': 409600}, {'name': 'Standard_D14_v2', 'vCPUs': 16, 'gpus': 0, 'memoryGB': 112.0, 'maxResourceVolumeMB': 819200}, {'name': 'Standard_D1', 'vCPUs': 1, 'gpus': 0, 'memoryGB': 3.5, 'maxResourceVolumeMB': 51200}, {'name': 'Standard_D2', 'vCPUs': 2, 'gpus': 0, 'memoryGB': 7.0, 'maxResourceVolumeMB': 102400}, {'nam

In [19]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Choose a name for your CPU cluster
cpu_cluster_name = "cpucluster"

# Verify that cluster does not exist already
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_DS15_v2',
                                                           max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [20]:
parse_step = PythonScriptStep(
    script_name='convert_to_csv.py',
    arguments=['--input_file', blob_input_data, '--output_file', output_data1, '--start_page', 1, '--end_page', 88],
    inputs=[blob_input_data],
    outputs=[output_data1],
    compute_target=cpu_cluster,
    runconfig=
    source_directory='../PDFReader'
)

In [21]:
from azureml.pipeline.core import Pipeline

In [22]:
pipeline1 = Pipeline(workspace=ws, steps=[parse_step])

In [None]:
cts = ws.compute_targets
for ct in cts:
    print(ct)

In [27]:
from azureml.core import ScriptRunConfig, Experiment

experiment = Experiment(ws, 'Parse_Aesop_PDF')

In [28]:
# setup run configuration for the step below, for the sake of virtual env
runconfig = ScriptRunConfig(source_directory="example", script="example.py")
runconfig.run_config.target = "local"
runconfig.run_config.environment = env
run = experiment.submit(pipeline1, config=runconfig)

TypeError: submit() got multiple values for argument 'config'