# 3.- Azure ML Resources

In [1]:
import yaml
import os
from tqdm import tqdm

from azure.identity import DefaultAzureCredential

from azure.mgmt.resource import ResourceManagementClient

from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError, HttpResponseError

from azure.ai.ml import MLClient
from azure.ai.ml.entities import Workspace

from azure.ai.ml.entities import AmlCompute

## Define Variables

In [16]:
# Load the notebook settings from the YAML file one directory above.
# The encoding is pinned so parsing does not depend on the platform's default locale.
with open("../config.yaml", "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

In [17]:
# Pull every Azure-related setting out of the "azure" section of the config
azure_settings = config["azure"]

subscription_id = azure_settings["subscription_id"]
resource_group_name = azure_settings["resource_group_name"]
workspace_name = azure_settings["workspace_name"]
location = azure_settings["location"]
training_gpu_cluster = azure_settings["training_gpu_cluster"]

## Azure Authentication

In [4]:
# Build the credential object that is handed to every Azure client created below
# (the Resource Management client and both MLClient instances).
credential = DefaultAzureCredential()

## Resource Group

In [5]:
# Client used to look up and create resource groups in the configured subscription
resource_client = ResourceManagementClient(
    credential=credential,
    subscription_id=subscription_id,
)

In [6]:
def create_resource_group(resource_client, resource_group_name, location):
    """Return the named resource group, creating it when it does not exist.

    Args:
        resource_client: Client exposing ``resource_groups.get`` and
            ``resource_groups.create_or_update``.
        resource_group_name: Name of the resource group to look up or create.
        location: Region used only when the group has to be created.

    Returns:
        The existing or newly created resource group object, or ``None`` when
        the lookup fails with anything other than ``ResourceNotFoundError``
        (the error is printed in that case).

    Note:
        An exception raised while creating the group is not handled here and
        propagates to the caller.
    """
    try:
        # Look the group up first so an existing one is reused untouched
        existing_group = resource_client.resource_groups.get(resource_group_name)
        print(f"Resource Group '{resource_group_name}' already exists in '{existing_group.location}'.")
        return existing_group
    except ResourceNotFoundError:
        # The group is missing: create it in the requested location
        created_group = resource_client.resource_groups.create_or_update(
            resource_group_name,
            {"location": location},
        )
        print(f"Resource Group '{resource_group_name}' created in '{created_group.location}'.")
        return created_group
    except Exception as err:
        # Any other lookup failure is reported and signalled with None
        print(f"An error occurred: {err}")
        return None

In [None]:
# Ensure the resource group exists (it is created only when missing)
resource_group = create_resource_group(
    resource_client=resource_client,
    resource_group_name=resource_group_name,
    location=location,
)

## Workspace

In [8]:
# Resource-group-scoped client: no workspace is bound yet, it is used to look up / create one
ml_client = MLClient(
    credential=credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group_name,
)

In [9]:
def create_workspace(ml_client, workspace_name, location):
    """Return the named Azure ML workspace, creating it when it does not exist.

    Args:
        ml_client: Client exposing ``workspaces.get`` and
            ``workspaces.begin_create``.
        workspace_name: Name of the workspace to look up or create.
        location: Region used only when the workspace has to be created.

    Returns:
        The existing or newly created workspace object, or ``None`` when the
        lookup fails with anything other than ``ResourceNotFoundError`` (the
        error is printed in that case).

    Note:
        An exception raised while creating the workspace is not handled here
        and propagates to the caller.
    """
    try:
        # Reuse the workspace when it is already there
        workspace = ml_client.workspaces.get(workspace_name)
        print(f"Workspace '{workspace_name}' already exists in '{workspace.location}'.")
    except ResourceNotFoundError:
        # Missing workspace: start the creation and block until it finishes
        workspace_spec = Workspace(name=workspace_name, location=location)
        creation_poller = ml_client.workspaces.begin_create(workspace_spec)
        workspace = creation_poller.result()
        print(f"Workspace '{workspace_name}' created in '{workspace.location}'.")
    except Exception as err:
        # Any other lookup failure is reported and signalled with None
        print(f"An error occurred: {err}")
        return None
    return workspace


In [None]:
# Ensure the workspace exists (it is created only when missing)
workspace = create_workspace(
    ml_client=ml_client,
    workspace_name=workspace_name,
    location=location,
)

## Get Workspace Storage Account Name

In [None]:
# Keep only the last '/'-separated segment of the workspace's storage account reference
# (presumably a full Azure resource ID whose final segment is the account name - confirm)
storage_account_name = workspace.storage_account.rsplit("/", 1)[-1]

## Create a Compute Resource

In [None]:
# Rebind ml_client to a workspace-scoped client; the compute operations below use it
ml_client = MLClient(
    credential=credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group_name,
    workspace_name=workspace_name,
)

In [21]:
try:
    # Reuse the compute target when a cluster with this name already exists
    gpu_cluster = ml_client.compute.get(training_gpu_cluster)
    print(
        f"You already have a cluster named {training_gpu_cluster}, we'll reuse it as is."
    )

except ResourceNotFoundError:
    # Only a missing cluster triggers creation. Other failures (authentication,
    # network, permissions) now surface instead of being mistaken for "not found".
    print("Creating a new gpu compute target...")

    # Describe the Azure ML compute cluster with the intended parameters
    gpu_cluster = AmlCompute(
        # Name assigned to the compute cluster
        name=training_gpu_cluster,
        # Azure ML Compute is the on-demand VM service
        type="amlcompute",
        # VM size.
        # NOTE(review): STANDARD_D2_V3 looks like a general-purpose CPU size even though
        # the cluster is named as a GPU cluster - confirm the intended SKU.
        size="STANDARD_D2_V3",
        # Minimum number of nodes kept running when no job is running
        min_instances=0,
        # Maximum number of nodes in the cluster
        max_instances=4,
        # Seconds a node stays idle after a job ends before the cluster scales it down
        idle_time_before_scale_down=180,
        # Dedicated or LowPriority. The latter is cheaper but jobs may be pre-empted
        tier="Dedicated",
    )

    # Submit the definition and block until the cluster has been provisioned
    gpu_cluster = ml_client.begin_create_or_update(gpu_cluster).result()

# Printed on both paths (reused or newly created cluster)
print(
    f"AMLCompute with name {gpu_cluster.name} is created, the compute size is {gpu_cluster.size}"
)