In [1]:
import numpy as np
from azureml.core import Workspace, Dataset, Datastore
from azureml.core import ScriptRunConfig, RunConfiguration, Experiment
from azureml.train.estimator import Estimator
from azureml.data.data_reference import DataReference
from azureml.core import Environment
#from azureml.data.dataset_consumption_config import DatasetConsumptionConfig

In [2]:
ws = Workspace.from_config()

In [3]:
def_blob_store = Datastore(ws, "workspaceblobstore")
def_file_store = Datastore(ws, "workspacefilestore")

In [4]:
localenv = Environment(name="localenv")

localenv= Environment.from_conda_specification("localenv", './environment.yml')
#localenv.docker = False
localenv

{
    "name": "localenv",
    "version": null,
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "python": {
        "userManagedDependencies": false,
        "interpreterPath": "python",
        "condaDependenciesFile": null,
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "name": "base",
            "dependencies": [
                "python=3.7.6",
                "pandas==0.24.2",
                "scikit-learn==0.22",
                "pyarrow==0.16.0",
                {
                    "pip": [
                        "azureml-dataprep[fuse,pandas]",
                        "azureml.core",
                        "azureml.train"
                    ]
                }
            ]
        }
    },
    "docker": {
        "enabled": false,
        "baseImage": "mcr.microsoft.com/azureml/base:intelmpi2018.3-ubuntu16.04",
        "baseDockerfile": null,
        "sharedVolumes": true,
        "shmSize": "2g",
      

# Testing clean.py

In [5]:
dataset = Dataset.get_by_name(ws, name='annonces_ds')
experiment = Experiment(ws, "CleanTest")

clean_est = Estimator(source_directory='./pipeline_steps', entry_script='clean.py',
                script_params= {"--input": 'annonces_ds', "--output": 'cleantest'},
                inputs=[dataset.as_named_input('annonces_ds')],
                compute_target='local',
                environment_definition=localenv
               )

run = experiment.submit(clean_est)

run.wait_for_completion(show_output=True)

RunId: CleanTest_1585831165_ff151d16
Web View: https://ml.azure.com/experiments/CleanTest/runs/CleanTest_1585831165_ff151d16?wsid=/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourcegroups/ProjectGroup2/workspaces/RealEstatePG2

Streaming azureml-logs/60_control_log.txt

Streaming log file azureml-logs/60_control_log.txt

Streaming azureml-logs/70_driver_log.txt

Starting the daemon thread to refresh tokens in background for process with pid = 2893
Entering Run History Context Manager.
Preparing to call script [ clean.py ] with arguments: ['--input', 'annonces_ds', '--output', 'cleantest']
After variable expansion, calling script [ clean.py ] with arguments: ['--input', 'annonces_ds', '--output', 'cleantest']

  notcolocation_rows = ~(df_full['description'].str.contains("([Cc]oloc)")).astype('Bool')


The experiment completed successfully. Finalizing run...
Logging experiment finalizing status in history service.
Starting the daemon thread to refresh tokens in background for pr

{'runId': 'CleanTest_1585831165_ff151d16',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2020-04-02T12:39:27.419546Z',
 'endTimeUtc': '2020-04-02T12:40:47.034149Z',
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': 'e671dbd1-fae5-4450-869a-98eee83ed50b'},
 'inputDatasets': [{'dataset': {'id': '5a88c2bd-5165-485b-a81f-bc8290270b5a'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'annonces_ds', 'mechanism': 'Direct'}}],
 'runDefinition': {'script': 'clean.py',
  'useAbsolutePath': False,
  'arguments': ['--input', 'annonces_ds', '--output', 'cleantest'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'local',
  'dataReferences': {},
  'data': {'annonces_ds': {'dataLocation': {'dataset': {'id': '5a88c2bd-5165-485b-a81f-bc8290270b5a'},
     'dataPath': None},
    'createOutputDirectories': False,
    'mechanism': 'Direct',
    'environmentVariableName': 'annonces_ds',
    'pathOnComput

In [6]:
run.download_file('outputs/cleantest', output_file_path='./cleantmp.csv')
!head ./cleantmp.csv

,idannonce,typedebien,codepostal,idtypechauffage,idtypecuisine,si_balcon,nb_chambres,nb_pieces,si_sdbain,si_sdEau,etage,prix,surface,dpeC,date
0,154680523,Appartement,33000,gaz collectif,,0.0,2.0,3,1,0,2.0,843,60.0,0.0,2019-03-28 15:30:41.565724000
1,155037587,Appartement,33700,individuel électrique radiateur,aucune,0.0,0.0,1,0,0,2.0,512,21.35,195.0,2019-03-28 19:37:48.081944028
2,152350643,Appartement,33140,,,1.0,2.0,3,0,0,3.0,859,69.48,138.0,2019-03-28 23:44:54.598164056
3,155513075,Appartement,33400,,,1.0,1.0,2,0,0,3.0,695,47.25,162.0,2019-03-29 03:52:01.114384084
4,155025785,Appartement,33000,gaz collectif,,0.0,2.0,3,1,0,4.0,814,68.0,0.0,2019-03-29 07:59:07.630604112
5,155551059,Appartement,33700,individuel électrique,américaine,1.0,2.0,3,1,0,0.0,756,65.51,0.0,2019-03-29 12:06:14.146824141
7,154854333,Appartement,33600,,équipée,0.0,1.0,2,0,1,0.0,720,36.0,0.0,2019-03-29 20:20:27.179264197
8,155325225,Appartement,33600,individuel,,1.0,1.0,2,0,1,1.0,564,37.01,167.0,2019-03-30 

In [7]:
clean_ref = def_blob_store.upload_files(
    ['./cleantmp.csv'],
    target_path='tmp/clean.csv',
    overwrite=True)
clean_ref.data_reference_name = "clean_data"
clean_ref.mode = 'download' # 'download'
clean_ref

Uploading an estimated of 1 files
Uploading ./cleantmp.csv
Uploaded ./cleantmp.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_clean_data

In [8]:
datastore_paths = [(def_blob_store, 'tmp/clean.csv')]
clean_ds = Dataset.Tabular.from_delimited_files(path=datastore_paths)
# # clean_ds = clean_ds.register(workspace=ws,
# #                            name='clean_ds',
# #                            description='annonces data clean')
# clean_ds = DatasetConsumptionConfig('clean_ds', clan_ds, mode='direct', path_on_compute=None)

# Testing split.py

In [9]:
split_est = Estimator(source_directory='./pipeline_steps',
                      entry_script='split.py',
                      script_params= {"--dataset": "clean_data",
                                      "--train": "train_ds",
                                      "--valid": "valid_ds",
                                      "--trainsize": 400,
                                      "--validsize": 100},
                      #inputs=[clean_ref],
                      inputs=[dataset.as_named_input('clean_data')],
                      compute_target='local',
                      environment_definition=localenv
                     )

run = experiment.submit(split_est)

run.wait_for_completion(show_output=True)

RunId: CleanTest_1585831259_04769e28
Web View: https://ml.azure.com/experiments/CleanTest/runs/CleanTest_1585831259_04769e28?wsid=/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourcegroups/ProjectGroup2/workspaces/RealEstatePG2

Streaming azureml-logs/60_control_log.txt

Streaming log file azureml-logs/60_control_log.txt

Streaming azureml-logs/70_driver_log.txt

Starting the daemon thread to refresh tokens in background for process with pid = 3127
Entering Run History Context Manager.
Preparing to call script [ split.py ] with arguments: ['--dataset', 'clean_data', '--train', 'train_ds', '--valid', 'valid_ds', '--trainsize', '400', '--validsize', '100']
After variable expansion, calling script [ split.py ] with arguments: ['--dataset', 'clean_data', '--train', 'train_ds', '--valid', 'valid_ds', '--trainsize', '400', '--validsize', '100']

Dataset:  TabularDataset
{
  "source": [
    "('workspaceblobstore', './data/annonces_timestamp.csv')"
  ],
  "definition": [
    "GetDatast



The experiment completed successfully. Finalizing run...
Logging experiment finalizing status in history service.
Starting the daemon thread to refresh tokens in background for process with pid = 3127
Cleaning up all outstanding Run operations, waiting 300.0 seconds
2 items cleaning up...
Cleanup took 0.7075161933898926 seconds

Execution Summary
RunId: CleanTest_1585831259_04769e28
Web View: https://ml.azure.com/experiments/CleanTest/runs/CleanTest_1585831259_04769e28?wsid=/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourcegroups/ProjectGroup2/workspaces/RealEstatePG2



{'runId': 'CleanTest_1585831259_04769e28',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2020-04-02T12:41:00.968763Z',
 'endTimeUtc': '2020-04-02T12:42:21.405955Z',
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': 'e671dbd1-fae5-4450-869a-98eee83ed50b'},
 'inputDatasets': [{'dataset': {'id': '5a88c2bd-5165-485b-a81f-bc8290270b5a'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'clean_data', 'mechanism': 'Direct'}}],
 'runDefinition': {'script': 'split.py',
  'useAbsolutePath': False,
  'arguments': ['--dataset',
   'clean_data',
   '--train',
   'train_ds',
   '--valid',
   'valid_ds',
   '--trainsize',
   '400',
   '--validsize',
   '100'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'local',
  'dataReferences': {},
  'data': {'clean_data': {'dataLocation': {'dataset': {'id': '5a88c2bd-5165-485b-a81f-bc8290270b5a'},
     'dataPath': None},
    'createOutputDirectories': False,

In [14]:
run.download_file('outputs/train_ds', output_file_path='./train_dstmp.csv')
!head ./train_dstmp.csv
run.download_file('outputs/valid_ds', output_file_path='./valid_dstmp.csv')
!head ./valid_dstmp.csv

UserErrorException: UserErrorException:
	Message: File with path outputs/train_ds was not found,
available files include: azureml-logs/60_control_log.txt,azureml-logs/70_driver_log.txt,logs/azureml/3397_azureml.log.
	InnerException None
	ErrorResponse 
{
    "error": {
        "code": "UserError",
        "message": "File with path outputs/train_ds was not found,\navailable files include: azureml-logs/60_control_log.txt,azureml-logs/70_driver_log.txt,logs/azureml/3397_azureml.log."
    }
}

In [11]:
train_ref = def_blob_store.upload_files(
    ['./train_dstmp.csv'],
    target_path='tmp/train.csv',
    overwrite=True)
train_ref.data_reference_name = "train_data"
train_ref.mode = 'download'

valid_ref = def_blob_store.upload_files(
    ['./valid_dstmp.csv'],
    target_path='tmp/valid.csv',
    overwrite=True)
valid_ref.data_reference_name = "valid_data"
valid_ref.mode = 'download'

Uploading an estimated of 1 files
Uploading ./train_dstmp.csv
Uploaded ./train_dstmp.csv, 1 files out of an estimated total of 1
Uploaded 1 files
Uploading an estimated of 1 files
Uploading ./valid_dstmp.csv
Uploaded ./valid_dstmp.csv, 1 files out of an estimated total of 1
Uploaded 1 files


In [12]:
datastore_paths = [(def_blob_store, 'tmp/train.csv')]
train_ds = Dataset.Tabular.from_delimited_files(path=datastore_paths)

datastore_paths = [(def_blob_store, 'tmp/valid.csv')]
valid_ds = Dataset.Tabular.from_delimited_files(path=datastore_paths)

# Testing train.py

In [17]:
split_est = Estimator(source_directory='./pipeline_steps', entry_script='train.py',
                script_params= {"--dataset": "train_data",
                                "--model": "model.pkl"},
                #inputs=[train_ref],
                inputs=[train_ds.as_named_input('train_data')],
                compute_target='local',
                environment_definition=localenv
               )

run = experiment.submit(split_est)

run.wait_for_completion(show_output=True)

RunId: CleanTest_1585832478_f89ae13b
Web View: https://ml.azure.com/experiments/CleanTest/runs/CleanTest_1585832478_f89ae13b?wsid=/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourcegroups/ProjectGroup2/workspaces/RealEstatePG2

Streaming azureml-logs/60_control_log.txt

Streaming log file azureml-logs/60_control_log.txt

Streaming azureml-logs/70_driver_log.txt

Starting the daemon thread to refresh tokens in background for process with pid = 3852
Entering Run History Context Manager.
Preparing to call script [ train.py ] with arguments: ['--dataset', 'train_data', '--model', 'model.pkl']
After variable expansion, calling script [ train.py ] with arguments: ['--dataset', 'train_data', '--model', 'model.pkl']



The experiment completed successfully. Finalizing run...
Logging experiment finalizing status in history service.
Starting the daemon thread to refresh tokens in background for process with pid = 3852
Cleaning up all outstanding Run operations, waiting 300.0 seconds
2 i

{'runId': 'CleanTest_1585832478_f89ae13b',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2020-04-02T13:01:19.937564Z',
 'endTimeUtc': '2020-04-02T13:02:37.479497Z',
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': '7a59caab-e847-4dbf-b977-2230cb47a876'},
 'inputDatasets': [{'dataset': {'id': 'bd605afd-3c00-4d22-b6a7-25881b1cbc46'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'train_data', 'mechanism': 'Direct'}}],
 'runDefinition': {'script': 'train.py',
  'useAbsolutePath': False,
  'arguments': ['--dataset', 'train_data', '--model', 'model.pkl'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'local',
  'dataReferences': {},
  'data': {'train_data': {'dataLocation': {'dataset': {'id': 'bd605afd-3c00-4d22-b6a7-25881b1cbc46'},
     'dataPath': None},
    'createOutputDirectories': False,
    'mechanism': 'Direct',
    'environmentVariableName': 'train_data',
    'pathOnCompute':

In [18]:
model = run.register_model(model_name='test_model',
                           tags={'test': 'test'},
                           model_path='outputs/model.pkl')

In [19]:
model

Model(workspace=Workspace.create(name='RealEstatePG2', subscription_id='68bdd703-8837-469c-80bd-bfb35f3b886f', resource_group='ProjectGroup2'), name=test_model, id=test_model:1, version=1, tags={'test': 'test'}, properties={})

# Testing eval.py

In [21]:
eval_est = Estimator(source_directory='./pipeline_steps', entry_script='eval.py',
                script_params= {"--dataset": "valid_data",
                                "--model": "model.pkl"},
                inputs=[valid_ds.as_named_input('valid_data')],
                compute_target='local',
                environment_definition=localenv
               )

run = experiment.submit(eval_est)

run.wait_for_completion(show_output=True)

RunId: CleanTest_1585832732_87759553
Web View: https://ml.azure.com/experiments/CleanTest/runs/CleanTest_1585832732_87759553?wsid=/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourcegroups/ProjectGroup2/workspaces/RealEstatePG2

Streaming azureml-logs/60_control_log.txt

Streaming log file azureml-logs/60_control_log.txt

Streaming azureml-logs/70_driver_log.txt

Starting the daemon thread to refresh tokens in background for process with pid = 4049
Entering Run History Context Manager.
Preparing to call script [ eval.py ] with arguments: ['--dataset', 'valid_data', '--model', 'model.pkl']
After variable expansion, calling script [ eval.py ] with arguments: ['--dataset', 'valid_data', '--model', 'model.pkl']



The experiment failed. Finalizing run...
Logging experiment finalizing status in history service.
Starting the daemon thread to refresh tokens in background for process with pid = 4049
Cleaning up all outstanding Run operations, waiting 300.0 seconds
1 items cleaning up..

ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "User program failed with NameError: name 'Model' is not defined",
        "detailsUri": "https://aka.ms/azureml-known-errors",
        "details": [],
        "debugInfo": {
            "type": "NameError",
            "message": "name 'Model' is not defined",
            "stackTrace": "  File \"azureml-setup/context_manager_injector.py\", line 127, in execute_with_context\n    runpy.run_path(sys.argv[0], globals(), run_name=\"__main__\")\n  File \"/home/jovyan/.azureml/envs/azureml_845b0473cd8b436f0ee71a08b041558c/lib/python3.7/runpy.py\", line 263, in run_path\n    pkg_name=pkg_name, script_name=fname)\n  File \"/home/jovyan/.azureml/envs/azureml_845b0473cd8b436f0ee71a08b041558c/lib/python3.7/runpy.py\", line 96, in _run_module_code\n    mod_name, mod_spec, pkg_name, script_name)\n  File \"/home/jovyan/.azureml/envs/azureml_845b0473cd8b436f0ee71a08b041558c/lib/python3.7/runpy.py\", line 85, in _run_code\n    exec(code, run_globals)\n  File \"eval.py\", line 19, in <module>\n    model_path = Model.get_model_path(args.model)\n"
        },
        "messageParameters": {}
    },
    "time": "0001-01-01T00:00:00.000Z"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"User program failed with NameError: name 'Model' is not defined\",\n        \"detailsUri\": \"https://aka.ms/azureml-known-errors\",\n        \"details\": [],\n        \"debugInfo\": {\n            \"type\": \"NameError\",\n            \"message\": \"name 'Model' is not defined\",\n            \"stackTrace\": \"  File \\\"azureml-setup/context_manager_injector.py\\\", line 127, in execute_with_context\\n    runpy.run_path(sys.argv[0], globals(), run_name=\\\"__main__\\\")\\n  File \\\"/home/jovyan/.azureml/envs/azureml_845b0473cd8b436f0ee71a08b041558c/lib/python3.7/runpy.py\\\", line 263, in run_path\\n    pkg_name=pkg_name, script_name=fname)\\n  File \\\"/home/jovyan/.azureml/envs/azureml_845b0473cd8b436f0ee71a08b041558c/lib/python3.7/runpy.py\\\", line 96, in _run_module_code\\n    mod_name, mod_spec, pkg_name, script_name)\\n  File \\\"/home/jovyan/.azureml/envs/azureml_845b0473cd8b436f0ee71a08b041558c/lib/python3.7/runpy.py\\\", line 85, in _run_code\\n    exec(code, run_globals)\\n  File \\\"eval.py\\\", line 19, in <module>\\n    model_path = Model.get_model_path(args.model)\\n\"\n        },\n        \"messageParameters\": {}\n    },\n    \"time\": \"0001-01-01T00:00:00.000Z\"\n}"
    }
}