In [52]:
# Handle to the workspace
from azure.ai.ml import MLClient

# Authentication package
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

try:
    credential = DefaultAzureCredential()
    # Probe the credential right away by requesting a token for the Azure
    # management scope; if that raises, the fallback below takes over.
    credential.get_token("https://management.azure.com/.default")
except Exception:
    # DefaultAzureCredential did not work: fall back to an interactive
    # browser-based credential instead.
    credential = InteractiveBrowserCredential()


# Workspace handle used by the rest of the notebook. The workspace is
# addressed by subscription, resource group and workspace name.
ml_client = MLClient(
    subscription_id="ff2d2126-f4aa-459f-b255-4420677f910a",
    resource_group_name="Azure-2022-project",
    workspace_name="Fraud-detection-system",
    credential=credential,
)

In [53]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

# Public URL of the raw CSV file that backs the data asset.
web_path = "https://datahub.io/machine-learning/creditcard/r/creditcard.csv"

# Describe the data asset locally; nothing is sent to the workspace here.
# NOTE(review): the description and tags mention "defaults" and "UCI ML Repo"
# while the path points at datahub.io — confirm the metadata is intended.
credit_card_dataset = Data(
    path=web_path,
    type=AssetTypes.URI_FILE,
    name="creditcard_raw_data",
    version="1.0.0",
    description="Dataset for credit card defaults",
    tags=dict(source_type="web", source="UCI ML Repo"),
)

In [54]:
# Register the asset; the name is rebound to the object returned by the service.
credit_card_dataset = ml_client.data.create_or_update(credit_card_dataset)
print(
    f"Dataset with name {credit_card_dataset.name} was registered to workspace, "
    f"the dataset version is {credit_card_dataset.version}"
)

Dataset with name creditcard_raw_data was registered to workspace, the dataset version is 1.0.0


In [55]:
from azure.ai.ml.entities import AmlCompute

# Name of the compute cluster this notebook reuses or creates.
cpu_compute_target = "cpu-cluster"

try:
    # Reuse the compute target if one with this name already exists.
    cpu_cluster = ml_client.compute.get(cpu_compute_target)
    print(
        f"You already have a cluster named {cpu_compute_target}, we'll reuse it as is."
    )

except Exception:
    # NOTE(review): any failure of the lookup above (not only "cluster not
    # found") ends up here and triggers a creation attempt.
    print("Creating a new cpu compute target...")

    # Describe the Azure ML compute cluster with the intended parameters.
    cpu_cluster = AmlCompute(
        # Name assigned to the compute cluster (same name that was looked up above)
        name=cpu_compute_target,
        # Azure ML Compute is the on-demand VM service
        type="amlcompute",
        # VM family
        size="STANDARD_DS3_V2",
        # Minimum number of nodes kept running when no job is running
        min_instances=0,
        # Maximum number of nodes in the cluster
        max_instances=4,
        # How many seconds a node keeps running after its job terminates
        idle_time_before_scale_down=180,
        # Dedicated or LowPriority. The latter is cheaper but there is a chance of job termination
        tier="Dedicated",
    )

    # Submit the definition through MLClient. begin_create_or_update starts a
    # long-running operation; when it hands back a poller, wait for the final
    # compute entity so that the .name / .size access below works. If the
    # installed SDK already returned the entity, it is used unchanged.
    cpu_cluster = ml_client.begin_create_or_update(cpu_cluster)
    if hasattr(cpu_cluster, "result"):
        cpu_cluster = cpu_cluster.result()

print(
    f"AMLCompute with name {cpu_cluster.name} is created, the compute size is {cpu_cluster.size}"
)

You already have a cluster named cpu-cluster, we'll reuse it as is.
AMLCompute with name cpu-cluster is created, the compute size is STANDARD_D4S_V3


In [57]:
import os

# Folder that holds the environment's dependency files (e.g. the conda spec).
dependencies_dir = "./dependencies"
# exist_ok=True makes re-running this cell a no-op when the folder exists.
os.makedirs(name=dependencies_dir, exist_ok=True)

In [59]:
from azure.ai.ml.entities import Environment

# Name under which the custom environment is registered.
custom_env_name = "aml-scikit-learn"

# Environment definition: a base Docker image plus the conda spec read from
# the dependencies folder.
pipeline_job_envo = Environment(
    name=custom_env_name,
    version="1.0",
    image="mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest",
    conda_file=os.path.join(dependencies_dir, "conda.yaml"),
    description="Custom environment for Credit Card Raw pipeline",
    tags={"scikit-learn": "0.24.2"},
)
# Register it; the name is rebound to the object returned by the service.
pipeline_job_envo = ml_client.environments.create_or_update(pipeline_job_envo)

print(
    f"Environment with name {pipeline_job_envo.name} is registered to workspace, "
    f"the environment version is {pipeline_job_envo.version}"
)

Environment with name aml-scikit-learn is registered to workspace, the environment version is 1.0


In [60]:
import os

# Source folder for the data-preparation component.
data_prep_src_dir = "./components/data_prep"
# Creates intermediate folders as needed; a no-op when the folder exists.
os.makedirs(name=data_prep_src_dir, exist_ok=True)

In [67]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component that runs data_prep.py with one input and two outputs.
# NOTE(review): the description speaks of a csv input while the input type is
# "uri_folder" — confirm this matches what data_prep.py expects.
data_prep_component = command(
    name="data_prep_credit_defaults",
    display_name="Data preparation for training",
    description="reads a csv input, split the input to train and test",
    inputs=dict(data=Input(type="uri_folder")),
    outputs={
        "train_data": Output(type="uri_folder", mode="rw_mount"),
        "test_data": Output(type="uri_folder", mode="rw_mount"),
    },
    # Folder containing the component's source code
    code=data_prep_src_dir,
    # ${{...}} placeholders refer to the inputs/outputs declared above
    command="""python data_prep.py \
            --data ${{inputs.data}} \
            --train_data ${{outputs.train_data}} --test_data ${{outputs.test_data}} \
            """,
    # Environment referenced as "<name>:<version>"
    environment=f"{pipeline_job_envo.name}:{pipeline_job_envo.version}",
)

In [68]:
# importing the Component Package
from azure.ai.ml import load_component

# Build the component object from its YAML definition in the data-prep folder.
# NOTE(review): newer azure-ai-ml releases appear to prefer the `source`
# keyword over `path` — confirm against the installed SDK version.
prep_component = load_component(
    path=os.path.join(data_prep_src_dir, "prep.yml"),
)

In [70]:
# Register (create or update) the loaded component in the workspace; the name
# is rebound to the object returned by the service.
prep_component = ml_client.create_or_update(prep_component)

# Report the registered component's name and version.
print(
    f"Component {prep_component.name} "
    f"with Version {prep_component.version} is registered"
)

AssetException: Error with code: You don't have permission to alter this storage account. Ensure that you have been assigned both Storage Blob Data Reader and Storage Blob Data Contributor roles.

In [None]:
import os

# Source folder for the training component.
train_src_dir = "./components/train"
# Creates intermediate folders as needed; a no-op when the folder exists.
os.makedirs(name=train_src_dir, exist_ok=True)