In [1]:
import numpy as np
from azureml.core import Workspace, Dataset, Datastore
from azureml.core import ScriptRunConfig, RunConfiguration, Experiment
from azureml.train.estimator import Estimator
from azureml.data.data_reference import DataReference
from azureml.core import Environment
#from azureml.data.dataset_consumption_config import DatasetConsumptionConfig

In [2]:
ws = Workspace.from_config()

In [3]:
def_blob_store = Datastore(ws, "workspaceblobstore")
def_file_store = Datastore(ws, "workspacefilestore")

In [4]:
localenv = Environment(name="localenv")

localenv= Environment.from_conda_specification("localenv", './environment.yml')
#localenv.docker = False
localenv

{
    "name": "localenv",
    "version": null,
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "python": {
        "userManagedDependencies": false,
        "interpreterPath": "python",
        "condaDependenciesFile": null,
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "name": "machinelearning",
            "channels": [
                "conda-forge",
                "default"
            ],
            "dependencies": [
                "python=3.7.6",
                "pandas==0.24.2",
                "scikit-learn==0.22",
                "pyarrow==0.16.0",
                "numpy==1.17.4",
                "pip",
                {
                    "pip": [
                        "azureml-dataprep[fuse,pandas]",
                        "azureml.core",
                        "azureml.train"
                    ]
                }
            ]
        }
    },
    "docker": {
        "enabled": false,
        "ba

# Testing clean.py

In [5]:
dataset = Dataset.get_by_name(ws, name='annonces_ds')
experiment = Experiment(ws, "CleanTest")

clean_est = Estimator(source_directory='./pipeline_steps', entry_script='clean.py',
                script_params= {"--input": 'annonces_ds', "--output": 'cleantest'},
                inputs=[dataset.as_named_input('annonces_ds')],
                compute_target='local',
                environment_definition=localenv
               )

run = experiment.submit(clean_est)

run.wait_for_completion(show_output=True)

RunId: CleanTest_1585901032_c0a2674e
Web View: https://ml.azure.com/experiments/CleanTest/runs/CleanTest_1585901032_c0a2674e?wsid=/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourcegroups/ProjectGroup2/workspaces/RealEstatePG2

Streaming azureml-logs/70_driver_log.txt

Starting the daemon thread to refresh tokens in background for process with pid = 16288
Entering Run History Context Manager.
Preparing to call script [ clean.py ] with arguments: ['--input', 'annonces_ds', '--output', 'cleantest']
After variable expansion, calling script [ clean.py ] with arguments: ['--input', 'annonces_ds', '--output', 'cleantest']

  notcolocation_rows = ~(df_full['description'].str.contains("([Cc]oloc)")).astype('Bool')


The experiment completed successfully. Finalizing run...
Logging experiment finalizing status in history service.
Starting the daemon thread to refresh tokens in background for process with pid = 16288
Cleaning up all outstanding Run operations, waiting 300.0 seconds
2 ite

{'runId': 'CleanTest_1585901032_c0a2674e',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2020-04-03T08:03:54.258284Z',
 'endTimeUtc': '2020-04-03T08:04:17.824923Z',
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': '9701564f-d4c5-44b2-a4f6-ebbb3ad315a6',
  'azureml.git.repository_uri': 'https://github.com/Simplon-IA-Bdx-1/realestate-guillaume-nicos-pierre-silvia.git',
  'mlflow.source.git.repoURL': 'https://github.com/Simplon-IA-Bdx-1/realestate-guillaume-nicos-pierre-silvia.git',
  'azureml.git.branch': 'azure-dev',
  'mlflow.source.git.branch': 'azure-dev',
  'azureml.git.commit': '355ded4b814b86437056cb9e7ce65f2484dae50d',
  'mlflow.source.git.commit': '355ded4b814b86437056cb9e7ce65f2484dae50d',
  'azureml.git.dirty': 'False'},
 'inputDatasets': [{'dataset': {'id': '5a88c2bd-5165-485b-a81f-bc8290270b5a'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'annonces_ds', 'mechanism': 'Direct'}}],
 'runDefinition': {'script': 'clean.py'

In [6]:
run.download_file('outputs/cleantest', output_file_path='./cleantmp.csv')
!head ./cleantmp.csv

'head' n’est pas reconnu en tant que commande interne
ou externe, un programme exécutable ou un fichier de commandes.


In [7]:
clean_ref = def_blob_store.upload_files(
    ['./cleantmp.csv'],
    target_path='tmp/clean.csv',
    overwrite=True)
clean_ref.data_reference_name = "clean_data"
clean_ref.mode = 'download' # 'download'
clean_ref

Uploading an estimated of 1 files
Uploading ./cleantmp.csv
Uploaded ./cleantmp.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_clean_data

In [8]:
datastore_paths = [(def_blob_store, 'tmp/clean.csv')]
clean_ds = Dataset.Tabular.from_delimited_files(path=datastore_paths)
# # clean_ds = clean_ds.register(workspace=ws,
# #                            name='clean_ds',
# #                            description='annonces data clean')
# clean_ds = DatasetConsumptionConfig('clean_ds', clan_ds, mode='direct', path_on_compute=None)

# Testing split.py

In [9]:
split_est = Estimator(source_directory='./pipeline_steps',
                      entry_script='split.py',
                      script_params= {"--dataset": "clean_data",
                                      "--train": "train_ds",
                                      "--valid": "valid_ds",
                                      "--trainsize": 400,
                                      "--validsize": 100},
                      #inputs=[clean_ref],
                      inputs=[dataset.as_named_input('clean_data')],
                      compute_target='local',
                      environment_definition=localenv
                     )

run = experiment.submit(split_est)

run.wait_for_completion(show_output=True)

RunId: CleanTest_1585901077_74e42c65
Web View: https://ml.azure.com/experiments/CleanTest/runs/CleanTest_1585901077_74e42c65?wsid=/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourcegroups/ProjectGroup2/workspaces/RealEstatePG2

Streaming azureml-logs/70_driver_log.txt

Starting the daemon thread to refresh tokens in background for process with pid = 11920
Entering Run History Context Manager.
Preparing to call script [ split.py ] with arguments: ['--dataset', 'clean_data', '--train', 'train_ds', '--valid', 'valid_ds', '--trainsize', '400', '--validsize', '100']
After variable expansion, calling script [ split.py ] with arguments: ['--dataset', 'clean_data', '--train', 'train_ds', '--valid', 'valid_ds', '--trainsize', '400', '--validsize', '100']

Dataset:  TabularDataset
{
  "source": [
    "('workspaceblobstore', './data/annonces_timestamp.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration"

Cleaning up all outstanding Run operations, waiting 300.0 seconds
2 items cleaning up...
Cleanup took 0.3182518482208252 seconds

Execution Summary
RunId: CleanTest_1585901077_74e42c65
Web View: https://ml.azure.com/experiments/CleanTest/runs/CleanTest_1585901077_74e42c65?wsid=/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourcegroups/ProjectGroup2/workspaces/RealEstatePG2



{'runId': 'CleanTest_1585901077_74e42c65',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2020-04-03T08:04:39.124731Z',
 'endTimeUtc': '2020-04-03T08:04:54.132062Z',
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': '9701564f-d4c5-44b2-a4f6-ebbb3ad315a6',
  'azureml.git.repository_uri': 'https://github.com/Simplon-IA-Bdx-1/realestate-guillaume-nicos-pierre-silvia.git',
  'mlflow.source.git.repoURL': 'https://github.com/Simplon-IA-Bdx-1/realestate-guillaume-nicos-pierre-silvia.git',
  'azureml.git.branch': 'azure-dev',
  'mlflow.source.git.branch': 'azure-dev',
  'azureml.git.commit': '355ded4b814b86437056cb9e7ce65f2484dae50d',
  'mlflow.source.git.commit': '355ded4b814b86437056cb9e7ce65f2484dae50d',
  'azureml.git.dirty': 'False'},
 'inputDatasets': [{'dataset': {'id': '5a88c2bd-5165-485b-a81f-bc8290270b5a'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'clean_data', 'mechanism': 'Direct'}}],
 'runDefinition': {'script': 'split.py',

In [10]:
run.download_file('outputs/train_ds', output_file_path='./train_dstmp.csv')
!head ./train_dstmp.csv
run.download_file('outputs/valid_ds', output_file_path='./valid_dstmp.csv')
!head ./valid_dstmp.csv

'head' nâ€™est pas reconnu en tant que commande interne
ou externe, un programme exÃ©cutable ou un fichier de commandes.
'head' nâ€™est pas reconnu en tant que commande interne
ou externe, un programme exÃ©cutable ou un fichier de commandes.


In [11]:
train_ref = def_blob_store.upload_files(
    ['./train_dstmp.csv'],
    target_path='tmp/train.csv',
    overwrite=True)
train_ref.data_reference_name = "train_data"
train_ref.mode = 'download'

valid_ref = def_blob_store.upload_files(
    ['./valid_dstmp.csv'],
    target_path='tmp/valid.csv',
    overwrite=True)
valid_ref.data_reference_name = "valid_data"
valid_ref.mode = 'download'

Uploading an estimated of 1 files
Uploading ./train_dstmp.csv
Uploaded ./train_dstmp.csv, 1 files out of an estimated total of 1
Uploaded 1 files
Uploading an estimated of 1 files
Uploading ./valid_dstmp.csv
Uploaded ./valid_dstmp.csv, 1 files out of an estimated total of 1
Uploaded 1 files


In [12]:
datastore_paths = [(def_blob_store, 'tmp/train.csv')]
train_ds = Dataset.Tabular.from_delimited_files(path=datastore_paths)

datastore_paths = [(def_blob_store, 'tmp/valid.csv')]
valid_ds = Dataset.Tabular.from_delimited_files(path=datastore_paths)

# Testing train.py

In [13]:
split_est = Estimator(source_directory='./pipeline_steps', entry_script='train.py',
                script_params= {"--dataset": "train_data",
                                "--model": "model.pkl"},
                #inputs=[train_ref],
                inputs=[train_ds.as_named_input('train_data')],
                compute_target='local',
                environment_definition=localenv
               )

run = experiment.submit(split_est)

run.wait_for_completion(show_output=True)

RunId: CleanTest_1585901117_d70c8bdf
Web View: https://ml.azure.com/experiments/CleanTest/runs/CleanTest_1585901117_d70c8bdf?wsid=/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourcegroups/ProjectGroup2/workspaces/RealEstatePG2

Streaming azureml-logs/70_driver_log.txt

Starting the daemon thread to refresh tokens in background for process with pid = 14740
Entering Run History Context Manager.
Preparing to call script [ train.py ] with arguments: ['--dataset', 'train_data', '--model', 'model.pkl']
After variable expansion, calling script [ train.py ] with arguments: ['--dataset', 'train_data', '--model', 'model.pkl']



The experiment completed successfully. Finalizing run...
Logging experiment finalizing status in history service.
Starting the daemon thread to refresh tokens in background for process with pid = 14740
Cleaning up all outstanding Run operations, waiting 300.0 seconds
2 items cleaning up...
Cleanup took 0.47327089309692383 seconds

Execution Summary
RunId: CleanT

{'runId': 'CleanTest_1585901117_d70c8bdf',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2020-04-03T08:05:19.839645Z',
 'endTimeUtc': '2020-04-03T08:05:35.957229Z',
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': '9701564f-d4c5-44b2-a4f6-ebbb3ad315a6',
  'azureml.git.repository_uri': 'https://github.com/Simplon-IA-Bdx-1/realestate-guillaume-nicos-pierre-silvia.git',
  'mlflow.source.git.repoURL': 'https://github.com/Simplon-IA-Bdx-1/realestate-guillaume-nicos-pierre-silvia.git',
  'azureml.git.branch': 'azure-dev',
  'mlflow.source.git.branch': 'azure-dev',
  'azureml.git.commit': '355ded4b814b86437056cb9e7ce65f2484dae50d',
  'mlflow.source.git.commit': '355ded4b814b86437056cb9e7ce65f2484dae50d',
  'azureml.git.dirty': 'False'},
 'inputDatasets': [{'dataset': {'id': 'bd605afd-3c00-4d22-b6a7-25881b1cbc46'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'train_data', 'mechanism': 'Direct'}}],
 'runDefinition': {'script': 'train.py',

In [14]:
model = run.register_model(model_name='test_model',
                           tags={'test': 'test'},
                           model_path='outputs/model.pkl')

In [15]:
model

Model(workspace=Workspace.create(name='RealEstatePG2', subscription_id='68bdd703-8837-469c-80bd-bfb35f3b886f', resource_group='ProjectGroup2'), name=test_model, id=test_model:5, version=5, tags={'test': 'test'}, properties={})

# Testing eval.py

In [16]:
eval_est = Estimator(source_directory='./pipeline_steps', entry_script='eval.py',
                script_params= {"--dataset": "valid_data",
                                "--model": "test_model"},
                inputs=[valid_ds.as_named_input('valid_data')],
                compute_target='local',
                environment_definition=localenv
               )

run = experiment.submit(eval_est)

run.wait_for_completion(show_output=True)

RunId: CleanTest_1585901143_222d895c
Web View: https://ml.azure.com/experiments/CleanTest/runs/CleanTest_1585901143_222d895c?wsid=/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourcegroups/ProjectGroup2/workspaces/RealEstatePG2

Streaming azureml-logs/70_driver_log.txt

Starting the daemon thread to refresh tokens in background for process with pid = 13924
Entering Run History Context Manager.
Preparing to call script [ eval.py ] with arguments: ['--dataset', 'valid_data', '--model', 'test_model']
After variable expansion, calling script [ eval.py ] with arguments: ['--dataset', 'valid_data', '--model', 'test_model']



The experiment completed successfully. Finalizing run...
Logging experiment finalizing status in history service.
Starting the daemon thread to refresh tokens in background for process with pid = 13924
Cleaning up all outstanding Run operations, waiting 300.0 seconds
3 items cleaning up...
Cleanup took 1.5357658863067627 seconds

Execution Summary
RunId: CleanTe

{'runId': 'CleanTest_1585901143_222d895c',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2020-04-03T08:05:46.454482Z',
 'endTimeUtc': '2020-04-03T08:06:02.306128Z',
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': '9701564f-d4c5-44b2-a4f6-ebbb3ad315a6',
  'azureml.git.repository_uri': 'https://github.com/Simplon-IA-Bdx-1/realestate-guillaume-nicos-pierre-silvia.git',
  'mlflow.source.git.repoURL': 'https://github.com/Simplon-IA-Bdx-1/realestate-guillaume-nicos-pierre-silvia.git',
  'azureml.git.branch': 'azure-dev',
  'mlflow.source.git.branch': 'azure-dev',
  'azureml.git.commit': '355ded4b814b86437056cb9e7ce65f2484dae50d',
  'mlflow.source.git.commit': '355ded4b814b86437056cb9e7ce65f2484dae50d',
  'azureml.git.dirty': 'False'},
 'inputDatasets': [{'dataset': {'id': '49fd4a97-6d25-4bba-8880-be6be6c556de'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'valid_data', 'mechanism': 'Direct'}}],
 'runDefinition': {'script': 'eval.py',
