# Job creation

In [1]:
import os

from azure.ai.ml import command, Input, MLClient, Output
from azure.ai.ml.constants import AssetTypes, InputOutputModes
from azure.ai.ml.entities import AmlCompute, Environment, Data, AzureBlobDatastore
from azure.core.exceptions import ResourceNotFoundError
from azure.identity import DefaultAzureCredential
from dotenv import load_dotenv

In [2]:
# Pull settings from a local .env file into the process environment, then
# expose each one as a notebook-level constant. os.getenv returns None for
# any variable that is not set, so a missing entry only surfaces later,
# at the point where the constant is used.
load_dotenv()

# Workspace coordinates
SUBSCRIPTION_ID = os.getenv("SUBSCRIPTION_ID")
RESOURCE_GROUP = os.getenv("RESOURCE_GROUP")
WORKSPACE_NAME = os.getenv("WORKSPACE_NAME")
LOCATION = os.getenv("LOCATION")

# Storage settings
DATASTORE_NAME = os.getenv("DATASTORE_NAME")
ACCOUNT_NAME = os.getenv("ACCOUNT_NAME")
CONTAINER_NAME = os.getenv("CONTAINER_NAME")
CONNECTION_KEY = os.getenv("CONNECTION_KEY")
DATASET_NAME = os.getenv("DATASET_NAME")

# Compute target (note: the environment variable is COMPUTE_NAME)
COMPUTE_CLUSTER_NAME = os.getenv("COMPUTE_NAME")

In [3]:
# Sanity check: display the workspace name that was read from the environment
# before connecting to it.
WORKSPACE_NAME

'aml-review-analysis-teamc1'

In [4]:
# Open a client handle on the Azure ML workspace described by the constants
# loaded from the environment. DefaultAzureCredential supplies the identity.
credential = DefaultAzureCredential()
ml_client = MLClient(
    credential=credential,
    subscription_id=SUBSCRIPTION_ID,
    resource_group_name=RESOURCE_GROUP,
    workspace_name=WORKSPACE_NAME,
)

In [5]:
# Fetch the registered "amazon_reviews_folder" data asset (version 1).
# Despite the variable name, this is the asset object itself; its `.path`
# attribute is what later cells read as the storage location.
datastore_uri = ml_client.data.get(
    name="amazon_reviews_folder",
    version="1",
)

In [6]:
# Register the local reviews CSV as a data asset, but only if that exact
# name/version is not registered yet. Either way, `data_asset` ends up bound
# to the registered asset when this cell finishes.
reviews_path = "../../data/reviews.csv"

reviews = Data(
    path=reviews_path,
    type=AssetTypes.URI_FILE,
    description="amazon reviews file",
    name="reviews",
    version="raw"
)

try:
    # Look up the same name/version we are about to register so the two
    # cannot drift apart.
    data_asset = ml_client.data.get(name=reviews.name, version=reviews.version)
    print(
        f"Data asset already exists. Name: {reviews.name}, version: {reviews.version}"
    )
except ResourceNotFoundError as e:
    # Only a "not found" response means the asset has to be created. Any other
    # failure (authentication, network, bad workspace) is allowed to propagate
    # instead of triggering an upload.
    print(e)
    data_asset = ml_client.data.create_or_update(reviews)
    print(f"Data asset created. Name: {reviews.name}, version: {reviews.version}")

(UserError) Data version reviews:raw (dataContainerName:version) not found.
Code: UserError
Message: Data version reviews:raw (dataContainerName:version) not found.


[32mUploading reviews.csv[32m (< 1 MB): 226MB [00:11, 19.1MB/s]                                                                [0m
[39m



Data asset created. Name: reviews, version: raw


In [7]:
# Bind `data_asset` to the registered raw reviews asset; the prep job below
# reads its `.path`.
data_asset = ml_client.data.get(
    name="reviews",
    version="raw",
)

In [8]:
# Describe the job output: a folder, mounted read-write, located at the path
# of the "amazon_reviews_folder" asset fetched earlier.
data_type = AssetTypes.URI_FOLDER
output_mode = InputOutputModes.RW_MOUNT
output_path = str(datastore_uri.path)

outputs = dict(
    output_data=Output(
        type=data_type,
        path=output_path,
        mode=output_mode,
    ),
)

In [9]:
# Submit the data-preparation job: runs src/prep/prep.py on the cluster with
# the raw asset path and the target file name passed as command-line inputs.
# NOTE(review): `outputs` declares "output_data", but the command line never
# references ${{outputs.output_data}} -- confirm prep.py writes where intended.
# NOTE(review): the compute name is hard-coded here while COMPUTE_CLUSTER_NAME
# from the config cell is unused -- confirm which one is intended.
prep_inputs = {
    "raw_data": data_asset.path,
    "prep_data": "reviews-prepped.csv",
}
prep_command = (
    "python prep.py"
    " --raw_data ${{inputs.raw_data}}"
    " --prep_data ${{inputs.prep_data}}"
)

job = command(
    code="src/prep",
    command=prep_command,
    inputs=prep_inputs,
    outputs=outputs,
    environment="keras-env@latest",
    compute="cpu-cluster",
    experiment_name="reviews_analysis__prep",
    display_name="amazon_reviews__prep",
)

# Send the job to the workspace and print the Studio link for monitoring.
returned_job = ml_client.create_or_update(job)
aml_url = returned_job.studio_url
print("Monitor the job here : ", aml_url)

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
[32mUploading prep (0.02 MBs): 100%|#

Monitor the job here :  https://ml.azure.com/runs/placid_rail_0zn4l4tt6x?wsid=/subscriptions/ce96fbca-fc23-466f-87e4-9b8cb5316116/resourcegroups/rg-review-analysis-teamc/workspaces/aml-review-analysis-teamc1&tid=65810625-201f-44a9-ba8a-3af31d76a870


In [57]:
# Fetch the registered, already-prepared reviews asset used for training.
# NOTE(review): this rebinds `data_asset`, which previously held the raw
# asset; re-running the prep cell after this one would pass the prepared
# path as raw input.
data_asset = ml_client.data.get(
    name="review-prepped",
    version="1",
)
# Last expression of the cell: display the asset's storage URI.
data_asset.path

'azureml://subscriptions/ce96fbca-fc23-466f-87e4-9b8cb5316116/resourcegroups/rg-review-analysis-teamc/workspaces/aml-review-analysis-teamc/datastores/workspaceblobstore/paths/UI/2024-04-02_183355_UTC/reviews_preprocessed.csv'

In [59]:
# Submit the training job: runs src/train/train.py on the cluster with the
# prepared asset path and the name under which the model is to be registered
# passed as command-line inputs.
train_inputs = {
    "prep_data": data_asset.path,
    "registered_model_name": "naive_bayes_baseline",
}
train_command = (
    "python train.py"
    " --prep_data ${{inputs.prep_data}}"
    " --registered_model_name ${{inputs.registered_model_name}}"
)

job = command(
    code="src/train",
    command=train_command,
    inputs=train_inputs,
    environment="keras-env@latest",
    compute="cpu-cluster",
    experiment_name="reviews_analysis__train",
    display_name="amazon_reviews__train",
)

# Send the job to the workspace and print the Studio link for monitoring.
returned_job = ml_client.create_or_update(job)
aml_url = returned_job.studio_url
print("Monitor the job here : ", aml_url)

[32mUploading train (0.0 MBs): 100%|################################################| 1549/1549 [00:00<00:00, 32544.11it/s][0m
[39m



Monitor the job here :  https://ml.azure.com/runs/sincere_avocado_6dgsk1vp2d?wsid=/subscriptions/ce96fbca-fc23-466f-87e4-9b8cb5316116/resourcegroups/rg-review-analysis-teamc/workspaces/aml-review-analysis-teamc&tid=cf36141c-ddd7-45a7-b073-111f66d0b30c
