In [7]:
#https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml
#Importing libraries
import azureml.core
from azureml.core import Workspace, Dataset
from azureml.core import Experiment
from azureml.widgets import RunDetails
from azureml.core import Run
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.runconfig import DockerConfiguration
from azureml.widgets import RunDetails
import mlflow

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

## Subir los datos a Azure Blob

In [8]:
from azureml.core import Workspace

# Load the workspace through Workspace.from_config().
ws = Workspace.from_config()

# Local CSV that gets uploaded, and the workspace's default datastore as destination.
airlines_delay = '../airlines_delay/airlines_delay.csv'
default_store = ws.get_default_datastore()

# Upload the file under the 'airlines' target path, overwriting an existing copy
# and showing progress while it runs.
default_store.upload_files(
    [airlines_delay],
    target_path='airlines',
    overwrite=True,
    show_progress=True,
)

print("Upload calls completed.")

KeyboardInterrupt: 

## Crear y registrar datasets

In [9]:
from azureml.core import Dataset, Datastore
from azureml.core import Workspace

# Path of the local airlines-delay CSV (same file the upload cell uses).
airlines_delay = '../airlines_delay/airlines_delay.csv'

# Workspace handle used by the cells that follow.
ws = Workspace.from_config()

# NOTE(review): the dataset creation/registration below is commented out; a later
# cell reads a dataset named 'AirlinesDelay' from the workspace instead.

# Default datastore
#default_store = ws.get_default_datastore() 
# Creating a dataset creates a reference to the location of the data source.
# Any subsetting transformations applied to the dataset are stored in the dataset as well.
# The data stays in its current location, so no additional storage cost is incurred.
#airlines_delay_data = Dataset.Tabular.from_delimited_files(default_store.path('UI/2023-03-07_192934_UTC/airlines_delay.csv'))

# Register the datasets with the workspace so they can be reused in other experiments
#  or shared.

#airlines_delay_data = airlines_delay_data.register(ws, 'airlines_delay_data')

## Setup compute

In [11]:
from azureml.core.compute import ComputeTarget, ComputeInstance
from azureml.core.compute_target import ComputeTargetException

# Name of the compute resource to look up (or create).
compute_name = "prueba-DS"

# Reuse the compute target when the workspace already has one with this name;
# otherwise provision a new compute instance under that name.
try:
    aml_compute = ComputeTarget(workspace=ws, name=compute_name)
except ComputeTargetException:
    compute_config = ComputeInstance.provisioning_configuration(
        vm_size='Standard_DS11_v2',
        ssh_public_access=False,
    )
    aml_compute = ComputeTarget.create(ws, compute_name, compute_config)
else:
    print('Existe!')

# Wait on the compute target (existing or newly created), printing its status.
aml_compute.wait_for_completion(show_output=True)

Existe!

Running


## Definir el ambiente de trabajo

In [12]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core import Experiment, ScriptRunConfig, Environment
# Build the experiment's Python environment from the conda specification file.
experiment_env = Environment.from_conda_specification(
    name="experiment_env",
    file_path='./env/environment.yml',
)

# Register the environment in the workspace, then fetch the registered copy by name.
experiment_env.register(workspace=ws)
registered_env = Environment.get(workspace=ws, name='experiment_env')

# Run configuration shared by the pipeline steps: it uses the registered
# environment and targets the compute set up earlier in the notebook.
pipeline_run_config = RunConfiguration()
pipeline_run_config.environment = registered_env
pipeline_run_config.target = aml_compute

The best practice is to keep each step's script and its dependent files in a separate folder, and to specify that folder as the step's source_directory. This reduces the size of the snapshot created for the step (only that specific folder is snapshotted). Because a change to any file in the source_directory triggers a re-upload of the snapshot, separate folders also preserve step reuse when nothing in a step's own source_directory has changed.

In [None]:
# Preprocessing plan
# 1. Drop the column that is not relevant
# 2. Convert categorical variables to numeric
# 3. Standardize numeric variables
# 4. Split the results

## Limpieza de los datos

In [136]:
from azureml.pipeline.core import PipelineData
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
import mlflow
# Link the workspace with MLflow by pointing the tracking URI at the workspace.
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())

# Raw input: the dataset registered in the workspace as 'AirlinesDelay'.
# `.get` yields None for a missing name, so fail fast with a clear message
# instead of an AttributeError on `.as_named_input` further down.
airlines_data_prueba = ws.datasets.get('AirlinesDelay')
if airlines_data_prueba is None:
    raise ValueError("Dataset 'AirlinesDelay' is not registered in the workspace.")

# Output of the cleaning step. A PipelineData(...).as_dataset() alternative is documented at
# https://learn.microsoft.com/en-us/python/api/azureml-pipeline-core/azureml.pipeline.core.pipelinedata?view=azure-ml-py
clean_airlines_data = OutputFileDatasetConfig('cleaned_data')

# Step 1: run data_clean.py on the raw dataset (exposed to the script as the
# named input 'raw_data') and write the result to `clean_airlines_data`.
clean_step = PythonScriptStep(
    name="Clean airlines data",
    script_name="data_clean.py", 
    arguments=["--output_cleanse", clean_airlines_data],
    inputs=[airlines_data_prueba.as_named_input('raw_data')],
    outputs=[clean_airlines_data],
    compute_target=aml_compute,
    runconfig=pipeline_run_config,
    source_directory='./scripts/',
    allow_reuse=True
)

print("Done.")

# Output dataset config that receives the result of the transform step.
transformed_data = OutputFileDatasetConfig('transformed_data')

# Step 2: run data_transform.py, feeding it the cleaning step's output
# (as the input named 'Clean_Data') and the location to write to.
transform_args = [
    '--input_data', clean_airlines_data.as_input(name='Clean_Data'),
    "--output_transform", transformed_data,
]
transform_step = PythonScriptStep(
    script_name="data_transform.py",
    name="transform airlines data",
    source_directory='./scripts/',
    arguments=transform_args,
    runconfig=pipeline_run_config,
    compute_target=aml_compute,
    allow_reuse=True,
)

# Output dataset configs for the train, test and validation splits.
output_split_train = OutputFileDatasetConfig("output_split_train")
output_split_test = OutputFileDatasetConfig("output_split_test")
output_split_validation = OutputFileDatasetConfig("output_split_validation")

# Step 3: run train_test_split.py on the transformed data and write the three splits.
# The input name 'Tranformed_Data' is a runtime identifier and is kept exactly as spelled.
split_args = [
    "--input_data", transformed_data.as_input(name='Tranformed_Data'),
    "--output_train_data", output_split_train,
    "--output_test_data", output_split_test,
    "--output_val_data", output_split_validation,
]
train_test_split_step = PythonScriptStep(
    script_name="train_test_split.py",
    name="split data train test",
    source_directory='./scripts/',
    arguments=split_args,
    runconfig=pipeline_run_config,
    compute_target=aml_compute,
    allow_reuse=True,
)

# PipelineData named "model" on the workspace's default datastore; the training
# step declares it as its output and the validation step consumes it.
datastore = ws.get_default_datastore()
step_output = PipelineData("model", datastore=datastore)

# Step 4: run train_model.py on the train and validation splits and write the
# model to `step_output`.
train_step = PythonScriptStep(
    name = 'Training model',
    script_name = 'train_model.py',
    arguments = ["--input_data_train", output_split_train.as_input(name='train_Data'),
                 "--input_data_val", output_split_validation.as_input(name="val_data"),
                 "--output_model", step_output],
    outputs = [step_output],
    # Set explicitly, like the earlier steps, rather than relying only on the
    # target stored in the shared run configuration.
    compute_target=aml_compute,
    runconfig=pipeline_run_config,
    source_directory='./scripts/',
    allow_reuse=True
)

# Step 5: run val_model.py on the model produced by the training step.
# NOTE(review): the argument binds the model under the input name 'model_output'
# while `inputs` lists the unrenamed PipelineData — confirm both forms are intended.
validation_step = PythonScriptStep(
    name = 'Validation model',
    script_name = 'val_model.py',
    arguments = ["--model_out", step_output.as_input(input_name='model_output')],
    inputs = [step_output],
    # Set explicitly, like the earlier steps, rather than relying only on the
    # target stored in the shared run configuration.
    compute_target=aml_compute,
    runconfig=pipeline_run_config,
    source_directory='./scripts/',
    allow_reuse=True
)

Done.


In [140]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Assemble the five steps into a single pipeline.
pipeline_steps = [
    clean_step,
    transform_step,
    train_test_split_step,
    train_step,
    validation_step,
]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit the pipeline under the 'exp-Airlines' experiment.
# regenerate_outputs=True is passed on submit; the captured log reports every
# step as "will run and generate new outputs".
experiment = Experiment(workspace=ws, name='exp-Airlines')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")

# Show the run widget, then block until the run finishes; the final status
# is the cell's displayed value.
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

Pipeline is built.
Created step Clean airlines data [11e44937][1f5973f7-e91e-42f4-a1b0-6d8124d5dc41], (This step will run and generate new outputs)
Created step transform airlines data [0597984c][b8279037-3c8b-48c6-b3b0-7e150ecbb890], (This step will run and generate new outputs)
Created step split data train test [3848cd1f][12b29be6-3bf1-42c2-b16b-dfac9f9a11a9], (This step will run and generate new outputs)
Created step Training model [c4ccc360][326636f7-6c61-4c3e-9e08-9725c12805f4], (This step will run and generate new outputs)
Created step Validation model [1e3db30f][94164fc2-e626-4b50-ac61-4c13e6f840e2], (This step will run and generate new outputs)
Submitted PipelineRun 321aa4a8-f796-49ff-b1ff-0ede6b48cde3
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/321aa4a8-f796-49ff-b1ff-0ede6b48cde3?wsid=/subscriptions/20d4fdf3-6a4b-4f0b-a842-bd7392136332/resourcegroups/cienciadatos/workspaces/azureml&tid=c7db5234-eb19-42fd-8840-a85829ea4628
Pipeline submitted for execution

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 321aa4a8-f796-49ff-b1ff-0ede6b48cde3
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/321aa4a8-f796-49ff-b1ff-0ede6b48cde3?wsid=/subscriptions/20d4fdf3-6a4b-4f0b-a842-bd7392136332/resourcegroups/cienciadatos/workspaces/azureml&tid=c7db5234-eb19-42fd-8840-a85829ea4628
PipelineRun Status: Running


StepRunId: 4e04ba61-5d6d-4255-98b7-9698198bd4e2
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/4e04ba61-5d6d-4255-98b7-9698198bd4e2?wsid=/subscriptions/20d4fdf3-6a4b-4f0b-a842-bd7392136332/resourcegroups/cienciadatos/workspaces/azureml&tid=c7db5234-eb19-42fd-8840-a85829ea4628

StepRun(Clean airlines data) Execution Summary
StepRun( Clean airlines data ) Status: Finished
{'runId': '4e04ba61-5d6d-4255-98b7-9698198bd4e2', 'target': 'prueba-DS', 'status': 'Completed', 'startTimeUtc': '2023-03-22T01:15:04.336427Z', 'endTimeUtc': '2023-03-22T01:15:29.972676Z', 'services': {}, 'properties': {'ContentSnapshotId': '8f96223c-f20c-4ef8-a4b4-61ac1dbf8c68', 

'Finished'

In [101]:
# Query the MLflow runs of the 'exp-Airlines' experiment into a table.
runs = mlflow.search_runs(experiment_names=["exp-Airlines"])


In [102]:
# Summarize the runs table: columns, non-null counts and dtypes.
runs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188 entries, 0 to 187
Data columns (total 68 columns):
 #   Column                          Non-Null Count  Dtype              
---  ------                          --------------  -----              
 0   run_id                          188 non-null    object             
 1   experiment_id                   188 non-null    object             
 2   status                          188 non-null    object             
 3   artifact_uri                    188 non-null    object             
 4   start_time                      188 non-null    datetime64[ns, UTC]
 5   end_time                        188 non-null    datetime64[ns, UTC]
 6   metrics.Airport GUM             1 non-null      float64            
 7   metrics.OH                      1 non-null      float64            
 8   metrics.Airport ATL             1 non-null      float64            
 9   metrics.Airport LAX             1 non-null      float64            
 10  metrics.9E    

In [103]:
# Seven highest-accuracy runs, after dropping rows with a missing value in any
# of the four selected columns.
(
    runs[['run_id', 'metrics.accuracy', 'metrics.recall', 'metrics.precision']]
    .sort_values(by='metrics.accuracy', ascending=False)
    .dropna(axis=0)
    .head(7)
)

Unnamed: 0,run_id,metrics.accuracy,metrics.recall,metrics.precision
137,6bc3cd25-4c02-4113-92d3-b2abf9a481cb,0.645866,0.637199,0.656305
157,b27cf29b-9396-49c0-95bf-bc093e8ae0eb,0.645828,0.637103,0.656285
162,4a8b63ca-e873-4fe5-9643-72079fb9650b,0.645828,0.637103,0.656285
172,376da46d-f885-4b25-83bf-3bdc690be3ad,0.645828,0.637103,0.656285
177,03393d79-b365-47cb-a26c-207fddc1ab2c,0.645828,0.637103,0.656285
182,8e236cf5-fd9d-43ff-9a90-b9712c479e0c,0.645828,0.637103,0.656285
187,dd354a13-abc8-4e3f-bde6-bc587b2a7c19,0.645828,0.637103,0.656285


In [104]:
# Metrics of the FINISHED runs whose source is train_model.py, best accuracy first.
(
    runs.loc[
        (runs['tags.mlflow.source.name'] == 'train_model.py') & (runs['status'] == 'FINISHED'),
        ['run_id', 'metrics.accuracy', 'metrics.recall', 'metrics.precision'],
    ]
    .sort_values(by='metrics.accuracy', ascending=False)
)

Unnamed: 0,run_id,metrics.accuracy,metrics.recall,metrics.precision
157,b27cf29b-9396-49c0-95bf-bc093e8ae0eb,0.645828,0.637103,0.656285
177,03393d79-b365-47cb-a26c-207fddc1ab2c,0.645828,0.637103,0.656285
182,8e236cf5-fd9d-43ff-9a90-b9712c479e0c,0.645828,0.637103,0.656285
187,dd354a13-abc8-4e3f-bde6-bc587b2a7c19,0.645828,0.637103,0.656285


In [105]:
# Single best FINISHED train_model.py run, ranked by accuracy (one-row DataFrame).
best_id_acc = (
    runs[['run_id', 'metrics.accuracy']]
    .loc[(runs['tags.mlflow.source.name'] == 'train_model.py') & (runs['status'] == 'FINISHED')]
    .sort_values(by='metrics.accuracy', ascending=False)
    .head(1)
)

# Without any matching run there is nothing to report; say so explicitly
# rather than failing with a bare IndexError below.
if best_id_acc.empty:
    raise ValueError("No FINISHED train_model.py runs were found.")

# Read the values positionally instead of parsing the Series' printed form,
# which depends on display formatting and breaks on values containing whitespace.
print(f"Best model accuracy: {round(best_id_acc['metrics.accuracy'].iloc[0], 5)}")
print(f"Best model run id: {best_id_acc['run_id'].iloc[0]}")


Best model accuracy: 0.64583
Best model run id: b27cf29b-9396-49c0-95bf-bc093e8ae0eb


In [116]:
# Most recently finished train_model.py run (one-row DataFrame), by end_time.
recent_run = (
    runs[['run_id', 'metrics.accuracy', 'metrics.recall', 'metrics.precision', 'end_time']]
    .loc[(runs['tags.mlflow.source.name'] == 'train_model.py') & (runs['status'] == 'FINISHED')]
    .sort_values(by='end_time', ascending=False)
    .head(1)
)

# Without any matching run there is no id to look up; report that explicitly.
if recent_run.empty:
    raise ValueError("No FINISHED train_model.py runs were found.")

# Take the id directly from the cell value rather than parsing the Series'
# printed representation.
recent_run_id = recent_run['run_id'].iloc[0]

# Fetch the run from MLflow and show its logged metrics.
todo = mlflow.get_run(recent_run_id)

print(todo.data.metrics)

{'precision': 0.6562854343494391, 'recall': 0.6371029795237891, 'accuracy': 0.6458278994957398}


In [76]:
import mlflow

# Smoke test: log two metrics in a run named 'prueba'.
with mlflow.start_run(run_name='prueba') as run:
    mlflow.log_metric('prueba', 1)
    mlflow.log_metric('prueba_2', 2)
    run_id = run.info.run_id

# Read the run back and print the metrics that were recorded for it.
todo = mlflow.get_run(run_id)
print(todo.data.metrics)

# Refresh the runs table for 'exp-Airlines'. The preview is now the cell's last
# top-level expression, so the notebook displays it; inside the `with` block
# its value was silently discarded.
runs = mlflow.search_runs(
    experiment_names=["exp-Airlines"]
)
runs.head()


    

{'prueba': 1.0, 'prueba_2': 2.0}


In [118]:
# Display the full MLflow record (metrics, params, tags, info) of one run,
# looked up by the run id that an earlier cell printed as the best model run id.
mlflow.get_run("b27cf29b-9396-49c0-95bf-bc093e8ae0eb")

<Run: data=<RunData: metrics={'accuracy': 0.6458278994957398,
 'precision': 0.6562854343494391,
 'recall': 0.6371029795237891}, params={'C': '0.6', 'penalty': 'l2', 'solver': 'newton-cg'}, tags={'azureml.nodeid': '374fe9b2',
 'azureml.pipeline': '9f3692b3-c958-4d83-a9d5-55cbeff15e76',
 'mlflow.parentRunId': '9f3692b3-c958-4d83-a9d5-55cbeff15e76',
 'mlflow.rootRunId': '9f3692b3-c958-4d83-a9d5-55cbeff15e76',
 'mlflow.runName': 'patient_honey_0ft0b569',
 'mlflow.source.name': 'train_model.py',
 'mlflow.source.type': 'JOB',
 'mlflow.user': 'Pablo Andrés Tamayo Flórez'}>, info=<RunInfo: artifact_uri='azureml://eastus.api.azureml.ms/mlflow/v2.0/subscriptions/20d4fdf3-6a4b-4f0b-a842-bd7392136332/resourceGroups/cienciadatos/providers/Microsoft.MachineLearningServices/workspaces/azureml/experiments/76a663a5-eaca-46bd-a40c-8d80d8e136b8/runs/b27cf29b-9396-49c0-95bf-bc093e8ae0eb/artifacts', end_time=1679001907439, experiment_id='76a663a5-eaca-46bd-a40c-8d80d8e136b8', lifecycle_stage='active', run_