In [None]:
### At this point the training scripts in ./src (xgboost_hyperparam.py, xgboost_main.py, arima_main.py) need to be executed with the preprocessed data. The original training was run on Azure ML Studio; if you use another cloud computing service, you will need to adapt the code to your needs.

from azure.ai.ml import MLClient
from azure.ai.ml import command, Input, Output
from azure.identity import DefaultAzureCredential
from azure.ai.ml.sweep import Choice, Uniform, MedianStoppingPolicy
from azureml.core import Workspace,Dataset, Datastore, Experiment
import os
import pandas as pd
import numpy as np

import sys
sys.path.append("../../../")

# Build the workspace handles used throughout the notebook:
#   * ml_client  (azure.ai.ml)   -> registers the environment and submits jobs
#   * workspace / datastore (azureml.core) -> reads the uploaded CSV back for validation
credential = DefaultAzureCredential()

# `config` is a local module found through the sys.path entry appended above;
# it supplies the three identifiers that pin down the workspace.
from config import subscription_id, resource_group, workspace_name

ml_client = MLClient(
    credential=credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name,
)

workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)
datastore = Datastore.get(workspace, datastore_name="workspaceblobstore")

## 1. Set up job environment (only needed the first time)

In [None]:
# Folder that receives the conda specification written in the next cell.
dependencies_dir = "./dependencies"

# Create the folder on first use; an already existing folder is left as it is.
os.makedirs(name=dependencies_dir, exist_ok=True)

In [None]:
%%writefile {dependencies_dir}/conda.yml
# Conda specification used to build the Azure ML job environment.
name: benchmark-env
channels:
  - conda-forge
dependencies:
  # Interpreter and packages resolved by conda.
  - python=3.10
  - numpy
  - pandas
  # Packages installed with pip inside the conda environment.
  - pip:
    - inference-schema[numpy-support]==1.3.0
    - mlflow==2.4.1
    - azureml-mlflow==1.51.0
    - azureml-core==1.53.0
    - psutil>=5.8,<5.9
    - ipykernel~=6.0
    # NOTE(review): u8darts, numpy and pandas are unpinned, so a rebuilt
    # environment may resolve to different versions - consider pinning.
    - u8darts[all]

In [None]:
from azure.ai.ml.entities import Environment

# Name under which the environment is registered in the workspace.
custom_env_name = "benchmark_env"

# Describe the environment: a base Docker image plus the conda file written above.
custom_job_env = Environment(
    name=custom_env_name,
    image="mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest",
    conda_file=os.path.join(dependencies_dir, "conda.yml"),
    description="Virtual environment for Benchmarking",
    tags={"additional": "darts"},
)

# Register (or update) the environment; the returned object replaces the local
# specification and carries the version reported below.
custom_job_env = ml_client.environments.create_or_update(custom_job_env)

registered_name = custom_job_env.name
registered_version = custom_job_env.version
print(
    f"Environment with name {registered_name} is registered to workspace, "
    f"the environment version is {registered_version}"
)

## 2. Set up input data

In [None]:
# URI of the input CSV on the workspace's default blob datastore.
# The short "azureml://datastores/<datastore>/paths/<path>" form is resolved
# against the workspace the job is submitted to, so no subscription, resource
# group or workspace identifiers have to be embedded in the notebook.
# The previous value placed the datastore name directly after "subscriptions/"
# and omitted the resourcegroups/workspaces/datastores segments.
data_path = "azureml://datastores/workspaceblobstore/paths/LocalUpload/00_load_country.csv"

In [None]:
# Load the input file into pandas to check that it is readable.
# The datastore API expects a path relative to the datastore root, so keep only
# the part of the URI that starts at the "LocalUpload" folder.
short_path = f"LocalUpload{data_path.split('LocalUpload')[1]}"

dataset = Dataset.Tabular.from_delimited_files(path=(datastore, short_path))
df = dataset.to_pandas_dataframe()

## 3. XGBoost

### 3.1 Hyperparameter tuning

In [None]:
# Placeholder values for the tunable inputs of the base job. Every one of them
# is replaced by a search distribution when the sweep is defined below.
xgb_default_inputs = {
    "learning_rate": 0.1,
    "subsample": 1,
    "max_leaves": 100,
    "max_depth": 5,
    "gamma": 0,
    "colsample_bytree": 1,
    "min_child_weight": 1,
}

# Base command job: a single run of xgboost_hyperparam.py with one parameter set.
command_job = command(
    code="./src",
    command=""" python xgboost_hyperparam.py \
            --data ${{inputs.data}} \
            --learning_rate ${{inputs.learning_rate}} \
            --subsample ${{inputs.subsample}} \
            --max_leaves ${{inputs.max_leaves}} \
            --max_depth ${{inputs.max_depth}} \
            --gamma ${{inputs.gamma}} \
            --colsample_bytree ${{inputs.colsample_bytree}} \
            --min_child_weight ${{inputs.min_child_weight}} \
            --results ${{outputs.results}} \
            """,
    environment="benchmark_env@latest",
    inputs={
        "data": Input(type="uri_file", path=data_path),
        **xgb_default_inputs,
    },
    outputs={
        "results": Output(type="uri_folder", mode="rw_mount"),
    },
    compute="Standard-NC24ads-A100-v4-10nodes",
)

# Search space: calling the job with distributions overrides the placeholder inputs.
xgb_search_space = {
    "learning_rate": Uniform(min_value=0.005, max_value=0.2),
    "subsample": Uniform(min_value=0.8, max_value=1),
    "max_leaves": Choice(values=range(10, 201, 10)),
    # maximum depth of a tree
    "max_depth": Choice(values=range(5, 31, 5)),
    # minimum loss reduction required to make a further partition on a leaf node of the tree
    "gamma": Uniform(min_value=0, max_value=0.02),
    # fraction of columns to be randomly sampled for each tree
    "colsample_bytree": Uniform(min_value=0.8, max_value=1),
    # minimum sum of instance weight (hessian) needed in a child
    "min_child_weight": Choice(values=range(0, 11, 1)),
}
command_job_for_sweep = command_job(**xgb_search_space)

# Turn the parameterised job into a sweep that minimises the logged "MASE" metric.
sweep_job = command_job_for_sweep.sweep(
    primary_metric="MASE",
    goal="Minimize",
    sampling_algorithm="bayesian",
    compute="Standard-NC24ads-A100-v4-10nodes",
)

# Experiment bookkeeping shown in the studio.
sweep_job.experiment_name = "0821_Benchmarking"
sweep_job.display_name = "Bayesian hyperparameter tuning for XGBoost"
sweep_job.description = "Hyperparameter tuning for XGBoost using Bayesian sampling and Azure ML"

# Budget: at most 100 trials, 10 of them in parallel, 10 hours (in seconds) overall.
sweep_job.set_limits(
    max_total_trials=100,
    max_concurrent_trials=10,
    timeout=10 * 60 * 60,
)

# Early-termination policy for under-performing trials.
# NOTE(review): check in the Azure ML hyperparameter-tuning documentation that an
# early-termination policy is honoured together with Bayesian sampling - TODO confirm.
sweep_job.early_termination = MedianStoppingPolicy(
    delay_evaluation=5,
    evaluation_interval=1,
)

In [None]:
# Send the sweep to the workspace; the returned object describes the created job.
returned_sweep_job = ml_client.jobs.create_or_update(sweep_job)

### 3.2 Train model

In [None]:
# Hyperparameter values selected from the sweep, recorded here for reference.
best_params = dict(
    learning_rate=0.123,
    subsample=0.861,
    max_leaves=20,
    max_depth=30,
    gamma=0.00039,
    colsample_bytree=0.874,
    min_child_weight=7,
)

In [None]:
# Full XGBoost training run: executes xgboost_main.py on the input file and
# writes into the mounted results folder.
# NOTE(review): `best_params` is not passed to this job; presumably
# xgboost_main.py defines the tuned values itself - verify.
job = command(
    code="./src",
    command=""" python xgboost_main.py \
            --data ${{inputs.data}} \
            --results ${{outputs.results}} \
            """,
    # Pinned to registered environment version 20 (the sweep uses "@latest").
    environment="benchmark_env:20",
    inputs={
        "data": Input(type="uri_file", path=data_path),
    },
    outputs={
        "results": Output(type="uri_folder", mode="rw_mount"),
    },
    compute="Standard-NC24ads-A100-v4-10nodes",
    experiment_name="0821_Benchmarking",
    display_name="Full model xgboost",
    description="Benchmark country data",
)

# Submit the job; as the last expression of the cell the returned job is displayed.
ml_client.jobs.create_or_update(job)

## 4. Start ARIMA job

In [None]:
# Country codes present in the validated input data; assign this array to `IDs`
# to submit one ARIMA job per country.
available_ids = df['country'].unique()

# This run is restricted to a single country.
IDs = ['DEU']

# The loop variable is deliberately not called `id`: notebook loop variables
# live in the kernel's global namespace, so that name would shadow the builtin
# `id()` for every cell executed afterwards.
for country_id in IDs:
    # One command job per country: runs arima_main.py on the shared input file
    # for the given country id.
    job = command(
        code="./src",
        command=""" python arima_main.py \
                --data ${{inputs.data}} --id ${{inputs.id}} \
                --results ${{outputs.results}} \
                """,
        # Pinned to registered environment version 20.
        environment="benchmark_env:20",
        inputs=dict(
            data=Input(
                type="uri_file",
                path=data_path
            ),
            id=country_id,
        ),
        outputs=dict(
            results=Output(type="uri_folder",
                           mode="rw_mount"),
        ),
        compute="Standard-NC24ads-A100-v4-10nodes",
        display_name=f"Full model arima for {country_id}",
        experiment_name="0821_Benchmarking",
        description="Benchmark country data",
        # NOTE(review): the job name is fixed per country, so re-submitting
        # presumably requires changing the "_1630" suffix - confirm.
        name=f"arima_{country_id}_1630"
    )

    # submit the job
    ml_client.create_or_update(job)