In [1]:
import azureml.core
from azureml.core import Workspace, Datastore

In [2]:
ws = Workspace.from_config(path='./config.json')

In [3]:
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies

In [4]:
env = Environment(name='bookstrator')

In [26]:
conda_dep = CondaDependencies()

# Installs scikit-learn version 0.21.3 conda package
conda_dep.add_pip_package("PyMuPDF==1.16.11==0.21.3")
conda_dep.add_pip_package("fitz")

# Adds dependencies to PythonSection of myenv
env.python.conda_dependencies = conda_dep

In [27]:
env.register(workspace=ws)

{
    "name": "bookstrator",
    "version": "3",
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "python": {
        "userManagedDependencies": false,
        "interpreterPath": "python",
        "condaDependenciesFile": null,
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "conda-forge"
            ],
            "dependencies": [
                "python=3.6.2",
                {
                    "pip": [
                        "azureml-defaults",
                        "PyMuPDF==1.16.11==0.21.3",
                        "fitz"
                    ]
                }
            ],
            "name": "azureml_cdf40690fdf959df045b77be181464fc"
        }
    },
    "docker": {
        "enabled": false,
        "baseImage": "mcr.microsoft.com/azureml/base:intelmpi2018.3-ubuntu16.04",
        "baseDockerfile": null,
        "sharedVolumes": true,
        "shmSize": null,
        "argum

In [28]:
# Default datastore
def_data_store = ws.get_default_datastore()

In [29]:
# Get the blob storage associated with the workspace
def_blob_store = Datastore(ws, "workspaceblobstore")

In [30]:
# Get file storage associated with the workspace
def_file_store = Datastore(ws, "workspacefilestore")

In [15]:
def_blob_store.upload_files(
    ['./raw-data/aesops-fables.pdf'],
    target_path='aesops-fables',
    overwrite=True
)

Uploading an estimated of 1 files
Uploading ./raw-data/aesops-fables.pdf
Uploaded ./raw-data/aesops-fables.pdf, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_580ae3d5a50544c5bd446e7f3aecc896

In [31]:
from azureml.data.data_reference import DataReference

In [32]:
blob_input_data = DataReference(
    datastore=def_blob_store,
    data_reference_name='test_data',
    path_on_datastore='aesops-fables/aesops-fables.pdf'
)

In [33]:
from azureml.pipeline.core import PipelineData

In [34]:
output_data1 = PipelineData(
    "output_data1",
    datastore=def_blob_store,
    output_name="output_data1"
)

In [35]:
from azureml.pipeline.steps import PythonScriptStep

In [34]:
from azureml.core.compute import ComputeTarget, AmlCompute

print(AmlCompute.supported_vmsizes(workspace=ws))

[{'name': 'Standard_D1_v2', 'vCPUs': 1, 'gpus': 0, 'memoryGB': 3.5, 'maxResourceVolumeMB': 51200}, {'name': 'Standard_D2_v2', 'vCPUs': 2, 'gpus': 0, 'memoryGB': 7.0, 'maxResourceVolumeMB': 102400}, {'name': 'Standard_D3_v2', 'vCPUs': 4, 'gpus': 0, 'memoryGB': 14.0, 'maxResourceVolumeMB': 204800}, {'name': 'Standard_D4_v2', 'vCPUs': 8, 'gpus': 0, 'memoryGB': 28.0, 'maxResourceVolumeMB': 409600}, {'name': 'Standard_D11_v2', 'vCPUs': 2, 'gpus': 0, 'memoryGB': 14.0, 'maxResourceVolumeMB': 102400}, {'name': 'Standard_D12_v2', 'vCPUs': 4, 'gpus': 0, 'memoryGB': 28.0, 'maxResourceVolumeMB': 204800}, {'name': 'Standard_D13_v2', 'vCPUs': 8, 'gpus': 0, 'memoryGB': 56.0, 'maxResourceVolumeMB': 409600}, {'name': 'Standard_D14_v2', 'vCPUs': 16, 'gpus': 0, 'memoryGB': 112.0, 'maxResourceVolumeMB': 819200}, {'name': 'Standard_D1', 'vCPUs': 1, 'gpus': 0, 'memoryGB': 3.5, 'maxResourceVolumeMB': 51200}, {'name': 'Standard_D2', 'vCPUs': 2, 'gpus': 0, 'memoryGB': 7.0, 'maxResourceVolumeMB': 102400}, {'nam

In [36]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Choose a name for your CPU cluster
cpu_cluster_name = "cpucluster"

# Verify that cluster does not exist already
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_DS15_v2',
                                                           max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [37]:
parse_step = PythonScriptStep(
    script_name='convert_to_csv.py',
    arguments=['--input_file', blob_input_data, '--output_file', output_data1, '--start_page', 1, '--end_page', 88],
    inputs=[blob_input_data],
    outputs=[output_data1],
    compute_target=cpu_cluster_name,
    source_directory='../PDFReader'
)

In [38]:
from azureml.pipeline.core import Pipeline

In [39]:
pipeline1 = Pipeline(workspace=ws, steps=[parse_step])

In [None]:
cts = ws.compute_targets
for ct in cts:
    print(ct)

In [42]:
from azureml.core import Experiment

# Submit the pipeline to be run
pipeline_run1 = Experiment(ws, 'Parse_Aesop_PDF').submit(pipeline1)
pipeline_run1.wait_for_completion()

Submitted PipelineRun fbb42733-0c02-4a0f-9385-7c5e84be4fa4
Link to Azure Machine Learning studio: https://ml.azure.com/experiments/Parse_Aesop_PDF/runs/fbb42733-0c02-4a0f-9385-7c5e84be4fa4?wsid=/subscriptions/cd309673-f356-437c-b53a-ef46d5ec9635/resourcegroups/bookstrator_intro/workspaces/Bookstrator_1




PipelineRunId: fbb42733-0c02-4a0f-9385-7c5e84be4fa4
Link to Portal: https://ml.azure.com/experiments/Parse_Aesop_PDF/runs/fbb42733-0c02-4a0f-9385-7c5e84be4fa4?wsid=/subscriptions/cd309673-f356-437c-b53a-ef46d5ec9635/resourcegroups/bookstrator_intro/workspaces/Bookstrator_1
PipelineRun Status: Running


StepRunId: 6aa29bf6-07e6-434c-bf02-92fe5eded71f
Link to Portal: https://ml.azure.com/experiments/Parse_Aesop_PDF/runs/6aa29bf6-07e6-434c-bf02-92fe5eded71f?wsid=/subscriptions/cd309673-f356-437c-b53a-ef46d5ec9635/resourcegroups/bookstrator_intro/workspaces/Bookstrator_1
StepRun( convert_to_csv.py ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_7c0dae618436e8ff1b3b102cee6a30345590acff28cffe88311093069c6751ed_d.txt
2020-03-01T14:02:27Z Starting output-watcher...
2020-03-01T14:02:27Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
Login Succeeded
Using default tag: latest
latest: Pulling from azureml/azureml_bbf736b212d7e6227e70e54a9e4b7f44
a1298f4ce990: Pu

ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "User program failed with ModuleNotFoundError: No module named 'fitz'",
        "detailsUri": "https://aka.ms/azureml-known-errors",
        "details": [],
        "debugInfo": {
            "type": "ModuleNotFoundError",
            "message": "No module named 'fitz'",
            "stackTrace": "  File \"/mnt/batch/tasks/shared/LS_root/jobs/bookstrator_1/azureml/6aa29bf6-07e6-434c-bf02-92fe5eded71f/mounts/workspaceblobstore/azureml/6aa29bf6-07e6-434c-bf02-92fe5eded71f/azureml-setup/context_manager_injector.py\", line 127, in execute_with_context\n    runpy.run_path(sys.argv[0], globals(), run_name=\"__main__\")\n  File \"/azureml-envs/azureml_1b417bb747e35859ebf611fb43071e9c/lib/python3.6/runpy.py\", line 263, in run_path\n    pkg_name=pkg_name, script_name=fname)\n  File \"/azureml-envs/azureml_1b417bb747e35859ebf611fb43071e9c/lib/python3.6/runpy.py\", line 96, in _run_module_code\n    mod_name, mod_spec, pkg_name, script_name)\n  File \"/azureml-envs/azureml_1b417bb747e35859ebf611fb43071e9c/lib/python3.6/runpy.py\", line 85, in _run_code\n    exec(code, run_globals)\n  File \"convert_to_csv.py\", line 1, in <module>\n    from PDFReader import PDFReader\n  File \"/mnt/batch/tasks/shared/LS_root/jobs/bookstrator_1/azureml/6aa29bf6-07e6-434c-bf02-92fe5eded71f/mounts/workspaceblobstore/azureml/6aa29bf6-07e6-434c-bf02-92fe5eded71f/PDFReader.py\", line 1, in <module>\n    import fitz\n"
        }
    },
    "time": "0001-01-01T00:00:00.000Z"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"User program failed with ModuleNotFoundError: No module named 'fitz'\",\n        \"detailsUri\": \"https://aka.ms/azureml-known-errors\",\n        \"details\": [],\n        \"debugInfo\": {\n            \"type\": \"ModuleNotFoundError\",\n            \"message\": \"No module named 'fitz'\",\n            \"stackTrace\": \"  File \\\"/mnt/batch/tasks/shared/LS_root/jobs/bookstrator_1/azureml/6aa29bf6-07e6-434c-bf02-92fe5eded71f/mounts/workspaceblobstore/azureml/6aa29bf6-07e6-434c-bf02-92fe5eded71f/azureml-setup/context_manager_injector.py\\\", line 127, in execute_with_context\\n    runpy.run_path(sys.argv[0], globals(), run_name=\\\"__main__\\\")\\n  File \\\"/azureml-envs/azureml_1b417bb747e35859ebf611fb43071e9c/lib/python3.6/runpy.py\\\", line 263, in run_path\\n    pkg_name=pkg_name, script_name=fname)\\n  File \\\"/azureml-envs/azureml_1b417bb747e35859ebf611fb43071e9c/lib/python3.6/runpy.py\\\", line 96, in _run_module_code\\n    mod_name, mod_spec, pkg_name, script_name)\\n  File \\\"/azureml-envs/azureml_1b417bb747e35859ebf611fb43071e9c/lib/python3.6/runpy.py\\\", line 85, in _run_code\\n    exec(code, run_globals)\\n  File \\\"convert_to_csv.py\\\", line 1, in <module>\\n    from PDFReader import PDFReader\\n  File \\\"/mnt/batch/tasks/shared/LS_root/jobs/bookstrator_1/azureml/6aa29bf6-07e6-434c-bf02-92fe5eded71f/mounts/workspaceblobstore/azureml/6aa29bf6-07e6-434c-bf02-92fe5eded71f/PDFReader.py\\\", line 1, in <module>\\n    import fitz\\n\"\n        }\n    },\n    \"time\": \"0001-01-01T00:00:00.000Z\"\n}"
    }
}