In [36]:
import azureml.core
from azureml.core import Workspace, Datastore

In [37]:
ws = Workspace.from_config(path='./config.json')

In [38]:
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies

In [39]:
env = Environment(name='bookstrator')

In [40]:
conda_dep = CondaDependencies()

# Installs scikit-learn version 0.21.3 conda package
conda_dep.add_pip_package("PyMuPDF==1.16.11==0.21.3")
conda_dep.add_pip_package("fitz")

# Adds dependencies to PythonSection of myenv
env.python.conda_dependencies = conda_dep

In [41]:
env.register(workspace=ws)

{
    "name": "bookstrator",
    "version": "3",
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "python": {
        "userManagedDependencies": false,
        "interpreterPath": "python",
        "condaDependenciesFile": null,
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "conda-forge"
            ],
            "dependencies": [
                "python=3.6.2",
                {
                    "pip": [
                        "azureml-defaults",
                        "PyMuPDF==1.16.11==0.21.3",
                        "fitz"
                    ]
                }
            ],
            "name": "azureml_cdf40690fdf959df045b77be181464fc"
        }
    },
    "docker": {
        "enabled": false,
        "baseImage": "mcr.microsoft.com/azureml/base:intelmpi2018.3-ubuntu16.04",
        "baseDockerfile": null,
        "sharedVolumes": true,
        "shmSize": null,
        "argum

In [42]:
# Default datastore
def_data_store = ws.get_default_datastore()

In [43]:
# Get the blob storage associated with the workspace
def_blob_store = Datastore(ws, "workspaceblobstore")

In [44]:
# Get file storage associated with the workspace
def_file_store = Datastore(ws, "workspacefilestore")

In [15]:
def_blob_store.upload_files(
    ['./raw-data/aesops-fables.pdf'],
    target_path='aesops-fables',
    overwrite=True
)

Uploading an estimated of 1 files
Uploading ./raw-data/aesops-fables.pdf
Uploaded ./raw-data/aesops-fables.pdf, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_580ae3d5a50544c5bd446e7f3aecc896

In [45]:
from azureml.data.data_reference import DataReference

In [46]:
blob_input_data = DataReference(
    datastore=def_blob_store,
    data_reference_name='test_data',
    path_on_datastore='aesops-fables/aesops-fables.pdf'
)

In [47]:
from azureml.pipeline.core import PipelineData

In [48]:
output_data1 = PipelineData(
    "output_data1",
    datastore=def_blob_store,
    output_name="output_data1"
)

In [49]:
from azureml.pipeline.steps import PythonScriptStep

In [50]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Choose a name for your CPU cluster
cpu_cluster_name = "cpucluster"

# Verify that cluster does not exist already
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_DS15_v2',
                                                           max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [60]:
from azureml.core import RunConfiguration

# setup run configuration for the step below, for the sake of virtual env
runconfig = RunConfiguration(conda_dependencies=conda_dep)

In [61]:
parse_step = PythonScriptStep(
    script_name='convert_to_csv.py',
    arguments=['--input_file', blob_input_data, '--output_file', output_data1, '--start_page', 1, '--end_page', 88],
    inputs=[blob_input_data],
    outputs=[output_data1],
    compute_target=cpu_cluster,
    runconfig=runconfig,
    source_directory='../PDFReader'
)

In [62]:
from azureml.pipeline.core import Pipeline

In [63]:
pipeline1 = Pipeline(workspace=ws, steps=[parse_step])

In [None]:
cts = ws.compute_targets
for ct in cts:
    print(ct)

In [64]:
from azureml.core import Experiment

# Submit the pipeline to be run
pipeline_run1 = Experiment(ws, 'Parse_Aesop_PDF').submit(pipeline1)
pipeline_run1.wait_for_completion()

Created step convert_to_csv.py [19faa7e9][6cb830d9-8c20-4bad-8d40-be97a3496f87], (This step is eligible to reuse a previous run's output)
Using data reference test_data for StepId [10cd4867][8a1a6ee0-2771-46da-87d6-892348348b05], (Consumers of this data are eligible to reuse prior runs.)
Submitted PipelineRun 69ccb149-61fb-49ea-b629-66b184a7e607
Link to Azure Machine Learning studio: https://ml.azure.com/experiments/Parse_Aesop_PDF/runs/69ccb149-61fb-49ea-b629-66b184a7e607?wsid=/subscriptions/cd309673-f356-437c-b53a-ef46d5ec9635/resourcegroups/bookstrator_intro/workspaces/Bookstrator_1
PipelineRunId: 69ccb149-61fb-49ea-b629-66b184a7e607
Link to Portal: https://ml.azure.com/experiments/Parse_Aesop_PDF/runs/69ccb149-61fb-49ea-b629-66b184a7e607?wsid=/subscriptions/cd309673-f356-437c-b53a-ef46d5ec9635/resourcegroups/bookstrator_intro/workspaces/Bookstrator_1
PipelineRun Status: Running


StepRunId: ec813bad-e307-4f12-a49e-9c565fe9b61d
Link to Portal: https://ml.azure.com/experiments/Parse_

2020/03/03 11:27:34 Container failed during run: acb_step_0. No retries remaining.
failed to run step ID: acb_step_0: exit status 1

Run ID: cj1 failed after 1m20s. Error: failed during run, err: exit status 1

StepRun(convert_to_csv.py) Execution Summary
StepRun( convert_to_csv.py ) Status: Failed

This compute target type doesn't support non-Docker runs; overriding run configuration to enable Docker.


ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "Image build failed. For more details, check log file azureml-logs/20_image_build_log.txt",
        "details": []
    },
    "correlation": {
        "operation": null,
        "request": "6b76c0b03d6029fb"
    },
    "environment": "centralus",
    "location": "centralus",
    "time": "2020-03-03T11:27:42.580087Z"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"Image build failed. For more details, check log file azureml-logs/20_image_build_log.txt\",\n        \"details\": []\n    },\n    \"correlation\": {\n        \"operation\": null,\n        \"request\": \"6b76c0b03d6029fb\"\n    },\n    \"environment\": \"centralus\",\n    \"location\": \"centralus\",\n    \"time\": \"2020-03-03T11:27:42.580087Z\"\n}"
    }
}