In [3]:
# client class interact with azure ml for workspace validation 
from azure.ai.ml import MLClient

# Authentication package
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

try:
    credential = DefaultAzureCredential()
    # Check if given credential can get token successfully.
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential not work
    credential = InteractiveBrowserCredential()

#validate the credential - use rource group informtion to fill the sections-
ml_client = MLClient(
    credential=credential,
    subscription_id="ff2d2126-f4aa-459f-b255-4420677f910a",
    resource_group_name="Azure-2022-project",
    workspace_name="Fraud-detection-system",
)

In [4]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes
#data source CSV format
web_path = "https://datahub.io/machine-learning/creditcard/r/creditcard.csv"

#data entities 
credit_card_dataset = Data(
    name="creditcard_raw_data",
    path=web_path,
    type=AssetTypes.URI_FILE,
    description="Dataset for credit card defaults",
    tags={"source_type": "web", "source": "UCI ML Repo"},
    version="1.0.0",
)

In [5]:
#create_or_update creats or updates the azure resources 
#ml_client client class interact with azure ML

credit_card_dataset = ml_client.data.create_or_update(credit_card_dataset)
#print for Dataset registery
print(
    f"Dataset with name {credit_card_dataset.name} was registered to workspace, the dataset version is {credit_card_dataset.version}"
)

Dataset with name creditcard_raw_data was registered to workspace, the dataset version is 1.0.0


In [6]:
from azure.ai.ml.entities import AmlCompute
#AMlCompute used for defining compute resources 

cpu_compute_target = "cpu-cluster"

try:
    # check if cluster already exists 
    cpu_cluster = ml_client.compute.get(cpu_compute_target)
    print(
        f"You already have a cluster named {cpu_compute_target}, we'll reuse it as is."
    )

except Exception:
    print("Creating a new cpu compute target...")

    # throw and exception incase of not having a cluster 
    cpu_cluster = AmlCompute(
        # Name assigned to the compute cluster
        name="cpu-cluster",
        # Azure ML Compute is the on-demand VM service
        type="amlcompute",
        # VM Family
        size="STANDARD_DS3_V2",
        # Minimum running nodes when there is no job running
        min_instances=0,
        # Nodes in cluster
        max_instances=4,
        # How many seconds will the node running after the job termination
        idle_time_before_scale_down=180,
        # Dedicated or LowPriority. The latter is cheaper but there is a chance of job termination
        tier="Dedicated",
    )

    # creat azure ml service (computing cluster async)
    cpu_cluster = ml_client.begin_create_or_update(cpu_cluster)

print(
    f"AMLCompute with name {cpu_cluster.name} is created, the compute size is {cpu_cluster.size}"
)

Creating a new cpu compute target...
AMLCompute with name cpu-cluster is created, the compute size is STANDARD_DS3_V2


In [8]:
import os
#make directory for dependencies by using os module 
dependencies_dir = "./dependencies"
os.makedirs(dependencies_dir, exist_ok=True)

In [10]:
from azure.ai.ml.entities import Environment
#register the environment to the workspace 
custom_env_name = "aml-scikit-learn2"

pipeline_job_envo = Environment(
    name=custom_env_name,
    description="Custom environment for Credit Card Raw pipeline",
    tags={"scikit-learn": "0.24.2"},
    conda_file=os.path.join(dependencies_dir, "conda.yaml"),
    image="mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest",
    version="1.0",
)
pipeline_job_envo = ml_client.environments.create_or_update(pipeline_job_envo)

print(
    f"Environment with name {pipeline_job_envo.name} is registered to workspace, the environment version is {pipeline_job_envo.version}"
)

Environment with name aml-scikit-learn2 is registered to workspace, the environment version is 1.0


In [11]:
import os
#make directory for dependencies by using os module 
data_prep_src_dir = "./components/data_prep"
os.makedirs(data_prep_src_dir, exist_ok=True)

In [12]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

data_prep_component = command(
    name="data_prep_credit_defaults2",
    display_name="Data preparation for training",
    description="reads a csv input, split the input to train and test",
    inputs={
        "data" : Input(type="uri_folder"),
     },
    outputs=dict(
        train_data=Output(type="uri_folder", mode="rw_mount"),
        test_data=Output(type="uri_folder", mode="rw_mount"),
    ),
    # The source folder of the component
    code=data_prep_src_dir,
    command="""python data_prep.py \
            --data ${{inputs.data}} \
            --train_data ${{outputs.train_data}} --test_data ${{outputs.test_data}} \
            """,
    environment=f"{pipeline_job_envo.name}:{pipeline_job_envo.version}",
)

In [13]:
# importing the Component Package
from azure.ai.ml import load_component

# Loading the component from the yml file
prep_component = load_component(path=os.path.join(data_prep_src_dir, "prep.yml"))

In [14]:
# Now we register the component to the workspace
prep_component = ml_client.create_or_update(prep_component)

# Create (register) the component in your workspace
print(
    f"Component {prep_component.name} with Version {prep_component.version} is registered"
)

Component data_prep_credit_defaults with Version 2022-08-27-04-51-41-5304130 is registered


In [15]:
import os

train_src_dir = "./components/train"
os.makedirs(train_src_dir, exist_ok=True)

In [16]:
# importing the Component Package
from azure.ai.ml import load_component

# Loading the component from the yml file
train_component = load_component(path=os.path.join(train_src_dir, "train.yml"))

In [17]:
# Now we register the component to the workspace
train_component = ml_client.create_or_update(train_component)

# Create (register) the component in your workspace
print(
    f"Component {train_component.name} with Version {train_component.version} is registered"
)

Component train_credit_defaults_model with Version 2022-08-27-04-51-50-9190946 is registered


In [19]:
# the dsl decorator tells the sdk that we are defining an Azure ML pipeline
from azure.ai.ml import dsl, Input, Output


@dsl.pipeline(
    compute=cpu_compute_target,
    description="E2E data_perp-train pipeline",
)
def credit_defaults_pipeline(
    pipeline_job_data_input,
    pipeline_job_registered_model_name,
    pipeline_job_learning_rate,
):
    # using data_prep_function like a python call with its own inputs
    data_prep_job = data_prep_component(
        data=pipeline_job_data_input,
    )

    # using train_func like a python call with its own inputs
    train_job = train_component(
        train_data=data_prep_job.outputs.train_data,  # note: using outputs from previous step
        test_data=data_prep_job.outputs.test_data,  # note: using outputs from previous step
        registered_model_name=pipeline_job_registered_model_name,
       learning_rate = pipeline_job_learning_rate,
    )

    # a pipeline returns a dictionary of outputs
    # keys will code for the pipeline output identifier
    return {
        "pipeline_job_train_data": data_prep_job.outputs.train_data,
        "pipeline_job_test_data": data_prep_job.outputs.test_data,
    }

In [20]:
registered_model_name = "credit_defaults_model"

# Let's instantiate the pipeline with the parameters of our choice
pipeline = credit_defaults_pipeline(
    # pipeline_job_data_input=credit_data,
    pipeline_job_data_input=Input(type="uri_file", path=web_path),
    pipeline_job_learning_rate=0.25,
    pipeline_job_registered_model_name=registered_model_name,
)

In [21]:
import webbrowser

# submit the pipeline job
pipeline_job = ml_client.jobs.create_or_update(
    pipeline,
    # Project's name
    experiment_name="e2e_registered_components",
)
# open the pipeline in web browser
webbrowser.open(pipeline_job.services["Studio"].endpoint)

False

In [22]:
ml_client.jobs.stream(pipeline_job.name)

RunId: purple_soursop_tsb6cms96g
Web View: https://ml.azure.com/runs/purple_soursop_tsb6cms96g?wsid=/subscriptions/ff2d2126-f4aa-459f-b255-4420677f910a/resourcegroups/Azure-2022-project/workspaces/Fraud-detection-system

Streaming logs/azureml/executionlogs.txt

[2022-08-27 04:52:12Z] Submitting 1 runs, first five are: dc903422:4124ffa4-e441-4a0d-aae3-be0b89f58813
