## Container Runtime HPO Example
This example notebook demonstrates how to use the container runtime HPO API to train a simple XGBoost model using Bayesian optimization, random search, and grid search. And how to leverage the API to retrieve the results.

In [1]:
import xgboost as xgb
import pandas as pd
from snowflake.ml.data.data_connector import DataConnector
from snowflake.ml.modeling import tune
from snowflake.ml.modeling.tune import get_tuner_context
from sklearn import datasets
from entities import search_algorithm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_regression
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions


### Data Ingestion & Define Training Function

In [11]:
######### STEP 0: FOLLOWING CODE SHOULD ALREADY BE AUTO-GENERATED IN SNOWFLAKE NOTEBOOK ##########
try:
    # This should work out inside Snowflake notebook
    session = get_active_session()
except:
    # Fall back to use local snowflake connection configured at "~/.snowsql/config"
    session = Session.builder.configs(SnowflakeLoginOptions()).getOrCreate()

    
######### STEP 1: GENERATE ARTIFICIAL TRAINING DATA FOR ILLUSTRATION PURPOSES ##########
X, y = datasets.load_digits(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
dataset_map = {
    "x_train": DataConnector.from_dataframe(session.create_dataframe(X_train)),
    "y_train": DataConnector.from_dataframe(
        session.create_dataframe(y_train.to_frame())
    ),
    "x_test": DataConnector.from_dataframe(session.create_dataframe(X_test)),
    "y_test": DataConnector.from_dataframe(
        session.create_dataframe(y_test.to_frame())
    ),
}


######### STEP 2: DEFINE TRAINING FUNCTION ##########
def train_func():
    tuner_context = get_tuner_context()
    config = tuner_context.get_hyper_params()
    dm = tuner_context.get_dataset_map()
    model = xgb.XGBClassifier(
        **{k: int(v) if k != "learning_rate" else v for k, v in config.items()},
        random_state=42,
    )
    model.fit(dm["x_train"].to_pandas(), dm["y_train"].to_pandas())
    accuracy = accuracy_score(
        dm["y_train"].to_pandas(), model.predict(dm["x_train"].to_pandas())
    )
    tuner_context.report(metrics={"accuracy": accuracy}, model=model)



  success, _, _, ci_output = write_pandas(
17-Feb-25 17:50:51 - MLRC - INFO - Number of results in first chunk: 1
17-Feb-25 17:50:52 - MLRC - INFO - Number of results in first chunk: 1
17-Feb-25 17:50:52 - MLRC - INFO - Number of results in first chunk: 65
17-Feb-25 17:50:52 - MLRC - INFO - Number of results in first chunk: 1
17-Feb-25 17:50:53 - MLRC - INFO - Number of results in first chunk: 1
  success, _, _, ci_output = write_pandas(
17-Feb-25 17:50:54 - MLRC - INFO - Number of results in first chunk: 1
17-Feb-25 17:50:54 - MLRC - INFO - Number of results in first chunk: 1
17-Feb-25 17:50:54 - MLRC - INFO - Number of results in first chunk: 2
17-Feb-25 17:50:55 - MLRC - INFO - Number of results in first chunk: 1
17-Feb-25 17:50:56 - MLRC - INFO - Number of results in first chunk: 1
17-Feb-25 17:50:56 - MLRC - INFO - Number of results in first chunk: 1
17-Feb-25 17:50:56 - MLRC - INFO - Number of results in first chunk: 1
17-Feb-25 17:50:57 - MLRC - INFO - Number of results in first

### Bayesian Optimization Search

In [None]:
######### STEP 3: START HPO RUN With Bayes Opt Search ##########

tuner = tune.Tuner(
    train_func=train_func,
    search_space={
        "n_estimators": tune.uniform(50, 200),
        "max_depth": tune.uniform(3, 10),
        "learning_rate": tune.uniform(0.01, 0.3),
    },
    tuner_config=tune.TunerConfig(
        metric="accuracy",
        mode="max",
        search_alg=search_algorithm.BayesOpt(),
        num_trials=2,
        max_concurrent_trials=1,
    ),
)

tuner_results = tuner.run(dataset_map=dataset_map)

In [7]:
######### STEP 4: EVALUATE THE HPO RUN RESULT ##########

tuner_results.best_result

Unnamed: 0,accuracy,should_checkpoint,trial_id,time_total_s,config/max_depth,config/n_estimators,config/learning_rate
0,1.0,True,c05571f1,5.695417,9.655,159.799091,0.118617


In [8]:
tuner_results.results

Unnamed: 0,accuracy,should_checkpoint,trial_id,time_total_s,config/max_depth,config/n_estimators,config/learning_rate
0,1.0,True,c05571f1,5.695417,9.655,159.799091,0.118617
1,1.0,True,9e10d9ad,4.097825,4.09213,73.399178,0.183611


In [9]:
tuner_results.best_model

### Random Search

In [14]:
######### STEP 3: START HPO RUN With Random Search ##########

tuner = tune.Tuner(
    train_func=train_func,
    search_space={
        "n_estimators": tune.uniform(50, 200),
        "max_depth": tune.uniform(3, 10),
        "learning_rate": tune.uniform(0.01, 0.3),
    },
    tuner_config=tune.TunerConfig(
        metric="accuracy",
        mode="max",
        search_alg=search_algorithm.RandomSearch(),
        num_trials=2,
        max_concurrent_trials=1,
    ),
)

tuner_results = tuner.run(dataset_map=dataset_map)

17-Feb-25 17:52:33 - MLRC - INFO - Reading /Users/shchen/.snowsql/config for connection parameters defined as connections
17-Feb-25 17:52:33 - MLRC - INFO - Reading /Users/shchen/.snowsql/config for connection parameters defined as connections
17-Feb-25 17:52:33 - MLRC - INFO - Reading /Users/shchen/.snowsql/config for connection parameters defined as connections
17-Feb-25 17:52:33 - MLRC - INFO - Reading /Users/shchen/.snowsql/config for connection parameters defined as connections


2025-02-17 17:52:34,025	INFO job_manager.py:528 -- Runtime env is setting up.

Trial status: 1 PENDING
Current time: 2025-02-17 17:52:37. Total running time: 0s


(ReadResultSetDataSource pid=29929)  * To change owner, run `chown $USER "/Users/shchen/.snowflake/connections.toml"`.
(ReadResultSetDataSource pid=29929)  * To restrict permissions, run `chmod 0600 "/Users/shchen/.snowflake/connections.toml"`.
(ReadResultSetDataSource pid=29929) 

(ReadResultSetDataSource pid=29929)   warn(f"Bad owner or permissions on {str(filep)}{chmod_message}")

(ReadResultSetDataSource pid=29928)  * To change owner, run `chown $USER "/Users/shchen/.snowflake/connections.toml"`.
(ReadResultSetDataSource pid=29928)  * To restrict permissions, run `chmod 0600 "/Users/shchen/.snowflake/connections.toml"`.

(ReadResultSetDataSource pid=29928) 
(ReadResultSetDataSource pid=29928)   warn(f"Bad owner or permissions on {str(filep)}{chmod_message}")

(pid=29920) ✔️  Dataset execution finished in 2.14 seconds: : 0

In [15]:
tuner_results.results

Unnamed: 0,accuracy,should_checkpoint,trial_id,time_total_s,config/learning_rate,config/n_estimators,config/max_depth
0,1.0,True,06deb_00000,5.873114,0.099543,160.051226,9.329063
1,1.0,True,06deb_00001,4.871954,0.221614,167.962748,3.433297


### Grid Search

In [16]:
######### STEP 3: START HPO RUN With Grid Search ##########

tuner = tune.Tuner(
    train_func=train_func,
    search_space = {
        "n_estimators": [50, 51],
        "max_depth": [4,5],
        "learning_rate": [0.01, 0.03]
    },
    tuner_config=tune.TunerConfig(
        metric="accuracy",
        mode="max",
        search_alg=search_algorithm.GridSearch(),
        num_trials=2,
        max_concurrent_trials=1,
    ),
)

tuner_results = tuner.run(dataset_map=dataset_map)

17-Feb-25 17:54:24 - MLRC - INFO - Reading /Users/shchen/.snowsql/config for connection parameters defined as connections
17-Feb-25 17:54:25 - MLRC - INFO - Reading /Users/shchen/.snowsql/config for connection parameters defined as connections
17-Feb-25 17:54:25 - MLRC - INFO - Reading /Users/shchen/.snowsql/config for connection parameters defined as connections
17-Feb-25 17:54:25 - MLRC - INFO - Reading /Users/shchen/.snowsql/config for connection parameters defined as connections


2025-02-17 17:54:25,394	INFO job_manager.py:528 -- Runtime env is setting up.

Trial status: 1 PENDING
Current time: 2025-02-17 17:54:28. Total running time: 0s


(ReadResultSetDataSource pid=30036)  * To change owner, run `chown $USER "/Users/shchen/.snowflake/connections.toml"`.
(ReadResultSetDataSource pid=30036)  * To restrict permissions, run `chmod 0600 "/Users/shchen/.snowflake/connections.toml"`.
(ReadResultSetDataSource pid=30036) 

(ReadResultSetDataSource pid=30036)   warn(f"Bad owner or permissions on {str(filep)}{chmod_message}")
(pid=30028) ✔️  Dataset execution finished in 2.03 seconds: : 0.00 row [00:01, ? row/s]                                                  
(pid=30028) ✔️  Dataset execution finished in 2.03 seconds: : 0.00 row [00:01, ? row/s]
(pid=30028) ✔️  Dataset execution finished in 2.03 seconds: : 0.00 row [00:01, ? row/s]

(pid=30028) ✔️  Dataset execution finished in 2.03 seconds: : 0.00 row [00:01, ? row/s]

(pid=30028) ✔️  Dataset execution finished in 1

In [19]:
# In this example, each parameter has 2 possible values, so the total number of unique combinations is 2 × 2 × 2 = 8.
tuner_results.results

Unnamed: 0,accuracy,should_checkpoint,trial_id,time_total_s,config/n_estimators,config/learning_rate,config/max_depth
0,0.967989,True,493d5_00000,5.543272,50,0.01,4
1,0.983299,True,493d5_00001,4.38812,50,0.03,4
2,0.981211,True,493d5_00002,3.590924,50,0.01,5
3,0.995129,True,493d5_00003,3.486435,50,0.03,5
4,0.967989,True,493d5_00004,3.259031,51,0.01,4
5,0.98469,True,493d5_00005,3.522211,51,0.03,4
6,0.982603,True,493d5_00006,3.254324,51,0.01,5
7,0.995825,True,493d5_00007,3.338931,51,0.03,5
