In [64]:
import numpy as np
from azureml.core import Workspace, Dataset, Datastore
from azureml.core import ScriptRunConfig, RunConfiguration, Experiment
from azureml.train.estimator import Estimator
from azureml.data.data_reference import DataReference

In [40]:
# Workspace handle shared by every cell below (datastores, dataset lookup, experiment).
# Loaded via Workspace.from_config(), i.e. from a saved workspace configuration
# rather than from credentials written in the notebook.
ws = Workspace.from_config()

In [43]:
# Look up the two datastores by name in the workspace.
# The blob store is used below to stage intermediate CSVs between step tests.
def_blob_store = Datastore(workspace=ws, name="workspaceblobstore")
def_file_store = Datastore(workspace=ws, name="workspacefilestore")

# Testing clean.py

In [4]:
# Run the cleaning step on its own, locally and without Docker, against the
# registered 'annonces_ds' dataset.
dataset = Dataset.get_by_name(ws, name='annonces_ds')
experiment = Experiment(ws, "CleanTest")

# Command-line arguments forwarded to clean.py: the named input to read and
# the name of the output to produce.
clean_params = {
    "--input": 'annonces_ds',
    "--output": 'cleantest',
}

clean_est = Estimator(
    source_directory='.',
    entry_script='./pipeline_steps/clean.py',
    script_params=clean_params,
    # The dataset is attached under the same name that is passed as --input.
    inputs=[dataset.as_named_input('annonces_ds')],
    compute_target='local',
    use_docker=False,
)

run = experiment.submit(clean_est)

# Block until the run finishes, streaming its logs into the cell output.
run.wait_for_completion(show_output=True)



RunId: CleanTest_1585813664_058ca633
Web View: https://ml.azure.com/experiments/CleanTest/runs/CleanTest_1585813664_058ca633?wsid=/subscriptions/68bdd703-8837-469c-80bd-bfb35f3b886f/resourcegroups/ProjectGroup2/workspaces/RealEstatePG2

Streaming azureml-logs/60_control_log.txt

Streaming log file azureml-logs/60_control_log.txt

Streaming azureml-logs/70_driver_log.txt

Starting the daemon thread to refresh tokens in background for process with pid = 92
Entering Run History Context Manager.
Preparing to call script [ pipeline_steps/clean.py ] with arguments: ['--input', 'annonces_ds', '--output', 'cleantest']
After variable expansion, calling script [ pipeline_steps/clean.py ] with arguments: ['--input', 'annonces_ds', '--output', 'cleantest']

  return func(self, *args, **kwargs)


The experiment completed successfully. Finalizing run...
Logging experiment finalizing status in history service.
Starting the daemon thread to refresh tokens in background for process with pid = 92
Cleani

{'runId': 'CleanTest_1585813664_058ca633',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2020-04-02T07:47:47.59181Z',
 'endTimeUtc': '2020-04-02T07:49:07.669134Z',
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': 'c0f5c2a8-e300-478f-86d0-e39126f72781'},
 'inputDatasets': [{'dataset': {'id': '5a88c2bd-5165-485b-a81f-bc8290270b5a'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'annonces_ds', 'mechanism': 'Direct'}}],
 'runDefinition': {'script': 'pipeline_steps/clean.py',
  'useAbsolutePath': False,
  'arguments': ['--input', 'annonces_ds', '--output', 'cleantest'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'local',
  'dataReferences': {},
  'data': {'annonces_ds': {'dataLocation': {'dataset': {'id': '5a88c2bd-5165-485b-a81f-bc8290270b5a'},
     'dataPath': None},
    'createOutputDirectories': False,
    'mechanism': 'Direct',
    'environmentVariableName': 'annonces_ds',
   

In [58]:
run.download_file('outputs/cleantest', output_file_path='./cleantmp.csv')
!head ./cleantmp.csv

,idannonce,typedebien,codepostal,idtypechauffage,idtypecuisine,si_balcon,nb_chambres,nb_pieces,si_sdbain,si_sdEau,etage,prix,surface,dpeC,date
0,154680523,Appartement,33000,gaz collectif,,0.0,2.0,3,1,0,2.0,843,60.0,0.0,2019-03-28 15:30:41.565724000
1,155037587,Appartement,33700,individuel électrique radiateur,aucune,0.0,0.0,1,0,0,2.0,512,21.35,195.0,2019-03-28 19:37:48.081944028
2,152350643,Appartement,33140,,,1.0,2.0,3,0,0,3.0,859,69.48,138.0,2019-03-28 23:44:54.598164056
3,155513075,Appartement,33400,,,1.0,1.0,2,0,0,3.0,695,47.25,162.0,2019-03-29 03:52:01.114384084
4,155025785,Appartement,33000,gaz collectif,,0.0,2.0,3,1,0,4.0,814,68.0,0.0,2019-03-29 07:59:07.630604112
5,155551059,Appartement,33700,individuel électrique,américaine,1.0,2.0,3,1,0,0.0,756,65.51,0.0,2019-03-29 12:06:14.146824141
7,154854333,Appartement,33600,,équipée,0.0,1.0,2,0,1,0.0,720,36.0,0.0,2019-03-29 20:20:27.179264197
8,155325225,Appartement,33600,individuel,,1.0,1.0,2,0,1,1.0,564,37.01,167.0,2019-03-30 

In [77]:
# Stage the downloaded cleaning output in the default blob store so the next
# step test can consume it as an input.
local_clean_files = ['./cleantmp.csv']
clean_ref = def_blob_store.upload_files(
    local_clean_files, target_path='tmp/clean.csv', overwrite=True
)

# Name the reference "clean_data": the split test passes this same name as --dataset.
clean_ref.data_reference_name = "clean_data"

Uploading an estimated of 1 files
Uploading ./cleantmp.csv
Uploaded ./cleantmp.csv, 1 files out of an estimated total of 1
Uploaded 1 files


# Testing split.py

In [None]:
# Run the split step on its own, locally and without Docker, using the staged
# clean data as input.
split_est = Estimator(source_directory='./pipeline_steps', entry_script='split.py',
                # Every entry needs an explicit "key: value" pair; without the
                # colon the literal is not a valid dict and the cell cannot run.
                script_params= {"--dataset": "clean_data",
                                "--train": "train_ds",
                                "--valid": "valid_ds",
                                "--trainsize": 400,
                                "--validsize": 100},
                inputs=[clean_ref],
                compute_target='local',
                use_docker=False
               )

run = experiment.submit(split_est)

# Block until the run finishes, streaming its logs into the cell output.
run.wait_for_completion(show_output=True)

In [None]:
run.download_file('outputs/train_ds', output_file_path='./train_dstmp.csv')
!head ./cleantmp.csv
run.download_file('outputs/valid_ds', output_file_path='./valid_dstmp.csv')
!head ./cleantmp.csv

In [None]:
def _upload_as_reference(local_file, blob_path, reference_name):
    """Upload one local file to the default blob store and name the returned reference.

    local_file: path of the file on the notebook machine.
    blob_path: target path inside the datastore.
    reference_name: value assigned to the reference's data_reference_name.
    Returns the object produced by upload_files, with its name set.
    """
    reference = def_blob_store.upload_files(
        [local_file],
        target_path=blob_path,
        overwrite=True)
    reference.data_reference_name = reference_name
    return reference


# Stage the train and validation splits; the names match the --dataset
# arguments passed by the train and eval tests.
train_ref = _upload_as_reference('./train_dstmp.csv', 'tmp/train.csv', "train_data")
valid_ref = _upload_as_reference('./valid_dstmp.csv', 'tmp/valid.csv', "valid_data")

# Testing train.py

In [None]:
# Run the training step on its own, locally and without Docker, on the staged
# training split. Bound to train_est (like clean_est / eval_est) so the
# estimator for split.py is not silently overwritten.
train_est = Estimator(source_directory='.', entry_script='./pipeline_steps/train.py',
                script_params= {"--dataset": "train_data",
                                "--model": "model.pkl"},
                inputs=[train_ref],
                compute_target='local',
                use_docker=False
               )

run = experiment.submit(train_est)

# Block until the run finishes, streaming its logs into the cell output.
run.wait_for_completion(show_output=True)

In [None]:
# Register the pickled model from the latest run's outputs under a throwaway
# test name and tag.
test_tags = {'test': 'test'}
model = run.register_model(
    model_name='test_model',
    model_path='outputs/model.pkl',
    tags=test_tags,
)

# Testing eval.py

In [None]:
# Run the evaluation step on its own, locally and without Docker, on the
# staged validation split.
eval_params = {
    "--dataset": "valid_data",
    "--model": "model.pkl",
}

eval_est = Estimator(
    source_directory='.',
    entry_script='./pipeline_steps/eval.py',
    script_params=eval_params,
    inputs=[valid_ref],
    compute_target='local',
    use_docker=False,
)

run = experiment.submit(eval_est)

# Block until the run finishes, streaming its logs into the cell output.
run.wait_for_completion(show_output=True)