# Select a Kernel

Set the kernel to 'Python 3.10 - SDK v2' and then select the 'Authenticate' button (if shown). 

Use the [azure.ai.ml](https://learn.microsoft.com/en-us/python/api/overview/azure/ai-ml-readme?view=azure-python)
module to connect to the AML Workspace in Python.

`azure.ai.ml` requires the following parameters to make the connection: 

 - SUBSCRIPTION
 - RESOURCE_GROUP
 - WS_NAME

The values can be obtained using the dropdown menu on the subscription name, in the top-left corner of the
AMLS workspace page. 

In [None]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Workspace coordinates: replace each placeholder with your own value
SUBSCRIPTION = "<SUBSCRIPTION_ID>"
RESOURCE_GROUP = "<RESOURCE_GROUP>"
WS_NAME = "<AML_WORKSPACE_NAME>"

# Credential object that is handed to the client below
credential = DefaultAzureCredential()

# Client handle to the workspace; the following cells go through it
ml_client = MLClient(
    enable_telemetry=False,
    workspace_name=WS_NAME,
    resource_group_name=RESOURCE_GROUP,
    subscription_id=SUBSCRIPTION,
    credential=credential,
)

## Verify that a connection can be made

In [None]:
# Look the workspace up by name and show where it lives
ws = ml_client.workspaces.get(WS_NAME)
print(f"{ws.location} : {ws.resource_group}")

## Create a folder for the training script

In [None]:
import os

# Folder that the training script is written into by the next code cell
train_src_dir = "./src"
# exist_ok=True: re-running this cell does not fail when the folder already exists
os.makedirs(name=train_src_dir, exist_ok=True)

# Create a Training Script

Create the `main.py` file in the source folder:

In [None]:
%%writefile {train_src_dir}/main.py
import os
import argparse
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

def main():
    """Train a gradient-boosting classifier on a CSV file and register it via MLflow.

    Command-line arguments:
        --data: path to the input CSV file (required).
        --test_train_ratio: fraction of the rows held out as the test split
            (default 0.25).
        --n_estimators: forwarded to GradientBoostingClassifier (default 100).
        --learning_rate: forwarded to GradientBoostingClassifier (default 0.1).
        --registered_model_name: name used to register the model, as the
            artifact path, and as the local folder the model is saved
            under (required).
    """

    # input and output arguments
    parser = argparse.ArgumentParser()
    # --data and --registered_model_name are required: without them the script
    # would only fail later, in read_csv or after training has already finished.
    parser.add_argument("--data", type=str, required=True, help="path to input data")
    parser.add_argument("--test_train_ratio", type=float, required=False, default=0.25)
    parser.add_argument("--n_estimators", required=False, default=100, type=int)
    parser.add_argument("--learning_rate", required=False, default=0.1, type=float)
    parser.add_argument(
        "--registered_model_name", type=str, required=True, help="model name"
    )
    args = parser.parse_args()

    # Name of the label column in the input CSV
    label_column = "default payment next month"

    # Start Logging
    # Using the run as a context manager guarantees that it is ended even when
    # loading the data or training raises an exception.
    with mlflow.start_run():
        # enable autologging
        mlflow.sklearn.autolog()

        ###################
        # prepare the data
        ###################
        print(" ".join(f"{k}={v}" for k, v in vars(args).items()))

        print("input data:", args.data)

        # header=1: column names come from the second row of the file;
        # index_col=0: the first column becomes the row index.
        credit_df = pd.read_csv(args.data, header=1, index_col=0)

        mlflow.log_metric("num_samples", credit_df.shape[0])
        # minus one because the label column is not a feature
        mlflow.log_metric("num_features", credit_df.shape[1] - 1)

        train_df, test_df = train_test_split(
            credit_df,
            test_size=args.test_train_ratio,
        )

        ###############################
        # prepare the data for training
        ###############################
        # pop() removes the label column from the frame, so only the
        # feature columns remain for the conversion to an array
        y_train = train_df.pop(label_column)
        X_train = train_df.values

        y_test = test_df.pop(label_column)
        X_test = test_df.values

        #################
        # train the model
        #################
        print(f"Training with data of shape {X_train.shape}")

        clf = GradientBoostingClassifier(
            n_estimators=args.n_estimators, learning_rate=args.learning_rate
        )
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        print(classification_report(y_test, y_pred))

        ##########################
        # save and register model
        ##########################
        # Registering the model to the workspace
        print("Registering the model via MLFlow")
        mlflow.sklearn.log_model(
            sk_model=clf,
            registered_model_name=args.registered_model_name,
            artifact_path=args.registered_model_name,
        )

        # Saving the model to a file
        mlflow.sklearn.save_model(
            sk_model=clf,
            path=os.path.join(args.registered_model_name, "trained_model"),
        )


if __name__ == "__main__":
    main()

# Configure the Command

Now that you have a script that can perform the desired tasks, and a compute resource to run the script on,
you can use a general-purpose `command` that runs command-line actions.

There are several points to note in this command script:

- input is specified as a `uri_file` with the path taken from our credit_card_data data asset
- environment is specified as sklearn-1.5: other environments are available to use if your training script requires additional packages or features. You can explore these environments by following the Assets > Environments link in the left-hand navigation pane
- compute is specified as the same compute instance that we're running for the notebook, but you can target a job to a different compute instance or cluster in your workspace if the job needs additional compute power or GPU access.


In [None]:
from azure.ai.ml import command
from azure.ai.ml import Input

# Name that the training script uses when it registers the model
registered_model_name = "credit_defaults_model"

job = command(
    display_name="credit_default_prediction",
    code="./src/",  # folder that contains main.py
    # Every ${{inputs.<name>}} placeholder refers to the matching key of `inputs`
    command=(
        "python main.py"
        " --data ${{inputs.data}}"
        " --test_train_ratio ${{inputs.test_train_ratio}}"
        " --learning_rate ${{inputs.learning_rate}}"
        " --registered_model_name ${{inputs.registered_model_name}}"
    ),
    inputs={
        "data": Input(
            type="uri_file",
            # find this in 'Assets > Data > $dataset_name > Named asset URI'
            path="<named_asset_uri>",
        ),
        "test_train_ratio": 0.2,
        "learning_rate": 0.25,
        "registered_model_name": registered_model_name,
    },
    environment="azureml://registries/azureml/environments/sklearn-1.5/labels/latest",
    compute="<compute_name>",  # the name of your compute instance
)

# Submit the job

To submit the job, run the following:

In [None]:
# Hand the job to the workspace client; being the last expression of the cell,
# the call's return value is what the notebook shows as the cell output.
ml_client.create_or_update(job)