In [1]:
#https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml
#Importing libraries
import azureml.core
from azureml.core import Workspace, Dataset
from azureml.core import Experiment
from azureml.widgets import RunDetails
from azureml.core import Run
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.runconfig import DockerConfiguration
from azureml.widgets import RunDetails
import mlflow

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

## Subir los datos a Azure Blob

In [None]:
from azureml.core import Workspace

airlines_delay = '../airlines_delay/airlines_delay.csv'

ws = Workspace.from_config()

# Default datastore
default_store = ws.get_default_datastore() 

default_store.upload_files([airlines_delay], 
                           target_path = 'airlines', 
                           overwrite = True, 
                           show_progress = True)

print("Upload calls completed.")

## Crear y registrar datasets

In [2]:
from azureml.core import Dataset, Datastore
from azureml.core import Workspace

airlines_delay = '../airlines_delay/airlines_delay.csv'

ws = Workspace.from_config()

'''from azureml.core import Workspace  
subscription_id =  "20d4fdf3-6a4b-4f0b-a842-bd7392136332"
resource_group =  "cienciadatos"
workspace_name = "azureml"
  
try:  
    ws = Workspace(subscription_id = subscription_id, resource_group = resource_group, workspace_name = workspace_name)  
    # write the details of the workspace to a configuration file to the notebook library  
    ws.write_config()  
    print("Workspace configuration succeeded. Skip the workspace creation steps below")  
except:  
    print("Workspace not accessible. Change your parameters or create a new workspace below")  
'''
# Default datastore
#default_store = ws.get_default_datastore() 
# Al crear un conjunto de datos, crea una referencia a la ubicación de la fuente de datos. 
# Si aplicó transformaciones de subconjuntos al conjunto de datos, también se almacenarán en el conjunto de datos. 
# Los datos permanecen en su ubicación actual, por lo que no se incurre en ningún costo adicional de almacenamiento.
#airlines_delay_data = Dataset.Tabular.from_delimited_files(default_store.path('UI/2023-03-07_192934_UTC/airlines_delay.csv'))

# Registrar los conjuntos de datos con el workspace de trabajo para que pueda reutilizarlos en otros experimentos
#  o compartirlos.

#airlines_delay_data = airlines_delay_data.register(ws, 'airlines_delay_data')

'from azureml.core import Workspace  \nsubscription_id =  "20d4fdf3-6a4b-4f0b-a842-bd7392136332"\nresource_group =  "cienciadatos"\nworkspace_name = "azureml"\n  \ntry:  \n    ws = Workspace(subscription_id = subscription_id, resource_group = resource_group, workspace_name = workspace_name)  \n    # write the details of the workspace to a configuration file to the notebook library  \n    ws.write_config()  \n    print("Workspace configuration succeeded. Skip the workspace creation steps below")  \nexcept:  \n    print("Workspace not accessible. Change your parameters or create a new workspace below")  \n'

## Setup compute

In [12]:
from azureml.core.compute import ComputeTarget, ComputeInstance
from azureml.core.compute_target import ComputeTargetException

# nombre del cluster
compute_name = "prueba-DS"

# verificación de exixtencia del cluster
try:
    aml_compute = ComputeTarget(workspace=ws, name=compute_name)
    print('Existe!')
except ComputeTargetException:
    
    compute_config = ComputeInstance.provisioning_configuration(vm_size='Standard_DS11_v2',
                                                           ssh_public_access=False)
    aml_compute = ComputeTarget.create(ws, compute_name, compute_config)

aml_compute.wait_for_completion(show_output=True)


Creating..........................................
Running


## Definir el ambiente de trabajo

In [4]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core import Experiment, ScriptRunConfig, Environment
# Create a Python environment for the experiment (from a .yml file)
experiment_env = Environment.from_conda_specification("experiment_env",  './env/environment.yml')

# Register the environment 
experiment_env.register(workspace=ws)
registered_env = Environment.get(ws, 'experiment_env')

# Create a new runconfig object for the pipeline
pipeline_run_config = RunConfiguration()

# Use the compute you created above. 
pipeline_run_config.target = aml_compute

# Assign the environment to the run configuration
pipeline_run_config.environment = registered_env

The best practice is to use separate folders for scripts and its dependent files for each step and specify that folder as the source_directory for the step. This helps reduce the size of the snapshot created for the step (only the specific folder is snapshotted). Since changes in any files in the source_directory would trigger a re-upload of the snapshot, this helps keep the reuse of the step when there are no changes in the source_directory of the step.

In [None]:
# preprocesamiento
#1. eliminar columna que no importa
#2. cambiar variables categoricas en numericas
#3. Estandarizar variables numericas
#4. split resultados

## Pipeline Steps

In [5]:
from azureml.pipeline.core import PipelineData
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
import mlflow
#Linkg workspace with mlflow
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())

airlines_data_prueba = ws.datasets.get('AirlinesDelay')
#https://learn.microsoft.com/en-us/python/api/azureml-pipeline-core/azureml.pipeline.core.pipelinedata?view=azure-ml-py
#clean_airlines_data = PipelineData("clean_airlines_data", datastore=default_store, is_directory=True).as_dataset()
clean_airlines_data = OutputFileDatasetConfig('cleaned_data')

clean_step = PythonScriptStep(
    name="Clean airlines data",
    script_name="data_clean.py", 
    arguments=["--output_cleanse", clean_airlines_data],
    inputs=[airlines_data_prueba.as_named_input('raw_data')],
    outputs=[clean_airlines_data],
    compute_target=aml_compute,
    runconfig=pipeline_run_config,
    source_directory='./scripts/',
    allow_reuse=True
)


#transformed_data = PipelineData('transformed_data', datastore=default_store, is_directory=True).as_dataset()
transformed_data = OutputFileDatasetConfig('transformed_data')

transform_step = PythonScriptStep(
    name="transform airlines data",
    script_name="data_transform.py", 
    arguments=['--input_data', clean_airlines_data.as_input(name='Clean_Data'),
               "--output_transform", transformed_data],
    #inputs=[clean_airlines_data.as_input(name='cleaned_data')],
    #outputs=[transformed_data],
    compute_target=aml_compute,
    runconfig=pipeline_run_config,
    source_directory='./scripts/',
    allow_reuse=True
)

# train and test splits output
#output_split_train = PipelineData("output_split_train", datastore=default_store, is_directory=True).as_dataset()
#output_split_test = PipelineData("output_split_test", datastore=default_store, is_directory=True).as_dataset()
output_split_train = OutputFileDatasetConfig("output_split_train")
output_split_test = OutputFileDatasetConfig("output_split_test")
output_split_validation = OutputFileDatasetConfig("output_split_validation")

train_test_split_step = PythonScriptStep(
    name="split data train test",
    script_name="train_test_split.py", 
    arguments=["--input_data", transformed_data.as_input(name='Tranformed_Data'),
               "--output_train_data", output_split_train,
               "--output_test_data", output_split_test,
               "--output_val_data", output_split_validation],
    #inputs=[transformed_data.as_input()],
    #outputs=[output_split_train, output_split_test],
    compute_target=aml_compute,
    runconfig=pipeline_run_config,
    source_directory='./scripts/',
    allow_reuse=True
)

datastore = ws.get_default_datastore()
step_output = PipelineData("model", datastore=datastore)

train_step = PythonScriptStep(
    name = 'Training model',
    script_name = 'train_model.py',
    arguments = ["--input_data_train", output_split_train.as_input(name='train_Data'),
                 "--input_data_val", output_split_validation.as_input(name="val_data"),
                 "--output_model", step_output],
    #inputs = [output_split_train],
    outputs = [step_output],
    runconfig=pipeline_run_config,
    source_directory='./scripts/',
    allow_reuse=True

)

validation_step = PythonScriptStep(
    name = 'Validation model',
    script_name = 'val_model.py',
    arguments = ["--model_out", step_output.as_input(input_name='model_output')],
    inputs = [step_output],
    #outputs = [],
    runconfig=pipeline_run_config,
    source_directory='./scripts/',
    allow_reuse=True
)

print("Done.")


Done.


In [None]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Construct the pipeline
pipeline_steps = [clean_step, transform_step, train_test_split_step, train_step, validation_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")


# Create an experiment and run the pipeline
experiment = Experiment(workspace=ws, name = 'exp-Airlines')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

## inference step

In [18]:
from azureml.pipeline.steps import PythonScriptStep
import mlflow
#Linkg workspace with mlflow
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())

inference_step = PythonScriptStep(
    name = 'inference',
    script_name = 'inference.py',
    #arguments = ["--model_out", step_output.as_input(input_name='model_output')],
    #inputs = [step_output],
    #outputs = [],
    runconfig=pipeline_run_config,
    source_directory='./scripts_inference/',
    allow_reuse=True
)

In [20]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Construct the pipeline
pipeline_steps_inference = [inference_step]
pipeline_inference = Pipeline(workspace=ws, steps=pipeline_steps_inference)
print("Pipeline is built.")


# Create an experiment and run the pipeline
experiment = Experiment(workspace=ws, name = 'exp-Airlines_inference')
pipeline_run = experiment.submit(pipeline_inference, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

Pipeline is built.
Created step inference [b4b89a3c][d411fcb7-60f1-41b0-acd0-a331ecffdd19], (This step will run and generate new outputs)
Submitted PipelineRun 75cd7140-672f-44df-9d6b-2cd8478b5ee7
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/75cd7140-672f-44df-9d6b-2cd8478b5ee7?wsid=/subscriptions/20d4fdf3-6a4b-4f0b-a842-bd7392136332/resourcegroups/cienciadatos/workspaces/azureml&tid=c7db5234-eb19-42fd-8840-a85829ea4628
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 75cd7140-672f-44df-9d6b-2cd8478b5ee7
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/75cd7140-672f-44df-9d6b-2cd8478b5ee7?wsid=/subscriptions/20d4fdf3-6a4b-4f0b-a842-bd7392136332/resourcegroups/cienciadatos/workspaces/azureml&tid=c7db5234-eb19-42fd-8840-a85829ea4628
PipelineRun Status: Running


StepRunId: 1609daf2-6738-4e15-841d-95b5fff0df99
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/1609daf2-6738-4e15-841d-95b5fff0df99?wsid=/subscriptions/20d4fdf3-6a4b-4f0b-a842-bd7392136332/resourcegroups/cienciadatos/workspaces/azureml&tid=c7db5234-eb19-42fd-8840-a85829ea4628
StepRun( inference ) Status: NotStarted
StepRun( inference ) Status: Running

StepRun(inference) Execution Summary
StepRun( inference ) Status: Finished
{'runId': '1609daf2-6738-4e15-841d-95b5fff0df99', 'target': 'prueba-DS', 'status': 'Completed', 'startTimeUtc': '2023-03-23T15:50:53.981791Z', 'endTimeUtc': '2023-03-23T16:03:55.371434Z', 'services': {}, 'properties': {'Con

'Finished'