In [1]:
import os
import yaml

import emoji
import tempfile

import mlflow
from mlflow import MlflowClient
from mlflow.entities import ViewType
from mlflow.models import infer_signature
from mlflow.artifacts import download_artifacts

import sklearn.datasets
from sklearn.metrics import accuracy_score, recall_score
from sklearn.model_selection import train_test_split

import xgboost as xgb

import ray
from ray import train, tune
from ray.air.integrations.mlflow import setup_mlflow

In [2]:
TRACKING_URI = "http://0.0.0.0:5000"
EXP_NAME = "ml_platform"
MODEL_NAME = "iris-classifer"
ALIAS = "champion"
MINIMUM_REQUIRED_ACCURACY = 0.95

In [3]:
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(experiment_name=EXP_NAME)

client = MlflowClient()

2025/01/31 00:06:52 INFO mlflow.tracking.fluent: Experiment with name 'ml_platform' does not exist. Creating a new experiment.


In [None]:
ray.init(num_cpus=6)

In [6]:
conda_env = {
    "name": "mlflow-env",
    "channels": ["conda-forge"],
    "dependencies": [
        "python=3.9.1",
        "pip<=24.0",
        {"pip": ["xgboost==2.1.3", "scikit-learn==1.5.2"]},
    ],
}


def train_function_mlflow(config: dict) -> None:
    setup_mlflow(
        config,
        experiment_name=EXP_NAME,
        tracking_uri=TRACKING_URI,
    )

    # Load dataset
    iris = sklearn.datasets.load_iris(as_frame=True)
    train_x, test_x, train_y, test_y = train_test_split(
        iris.data, iris.target, test_size=0.2
    )

    # Pass the trial's config (hyperparameters for the xgb classifier)
    model = xgb.XGBClassifier(**config)
    model.fit(train_x, train_y)

    predictions = model.predict(test_x)
    # Measure accuracy and recall for this trial
    accuracy = accuracy_score(test_y, predictions)
    recall = recall_score(test_y, predictions, average="micro")

    signature = infer_signature(train_x, model.predict(train_x))

    # Log the metrics as well as artifacts for this trial
    mlflow.log_metrics({"recall": recall, "accuracy": accuracy})
    mlflow.xgboost.log_model(
        model,
        "iris_xgb",
        conda_env=conda_env,
        signature=signature,
        model_format="json",
    )
    # Get the best result later based on the following metrics
    train.report({"accuracy": accuracy, "recall": recall})


def tune_with_setup() -> tune.ResultGrid:
    # Each trial uses 2 cpus. Therfore, we have at most 3 trials running concurrently
    trainable_with_resources = tune.with_resources(train_function_mlflow, {"cpu": 2})
    tuner = tune.Tuner(
        trainable_with_resources,
        tune_config=tune.TuneConfig(
            num_samples=20,
        ),
        run_config=train.RunConfig(
            name="mlflow",
        ),
        param_space={
            "objective": "multi:softmax",
            "eval_metric": ["logloss", "error"],
            "max_depth": tune.randint(1, 9),
            "min_child_weight": tune.choice([1, 2, 3]),
            "subsample": tune.uniform(0.5, 1.0),
            "eta": tune.loguniform(1e-4, 1e-1),
        },
    )

    results = tuner.fit()
    return results

In [7]:
results = tune_with_setup()

0,1
Current time:,2025-01-31 00:07:43
Running for:,00:00:33.12
Memory:,10.0/62.5 GiB

Trial name,status,loc,eta,max_depth,min_child_weight,subsample,iter,total time (s),accuracy,recall
train_function_mlflow_ef30c_00000,TERMINATED,192.168.0.38:1324893,0.0525817,6,1,0.692685,1,1.59573,0.866667,0.866667
train_function_mlflow_ef30c_00001,TERMINATED,192.168.0.38:1324891,0.000325841,4,3,0.924252,1,1.56864,0.9,0.9
train_function_mlflow_ef30c_00002,TERMINATED,192.168.0.38:1324892,0.00112543,3,3,0.915171,1,1.63149,0.966667,0.966667
train_function_mlflow_ef30c_00003,TERMINATED,192.168.0.38:1325334,0.0877102,4,2,0.988244,1,2.26393,0.9,0.9
train_function_mlflow_ef30c_00004,TERMINATED,192.168.0.38:1325335,0.0228883,7,1,0.924898,1,2.23254,0.933333,0.933333
train_function_mlflow_ef30c_00005,TERMINATED,192.168.0.38:1325342,0.0050706,6,3,0.942398,1,2.79626,0.9,0.9
train_function_mlflow_ef30c_00006,TERMINATED,192.168.0.38:1325712,0.0904068,6,1,0.648268,1,3.44265,0.966667,0.966667
train_function_mlflow_ef30c_00007,TERMINATED,192.168.0.38:1325713,0.0584087,4,2,0.698348,1,3.47976,1.0,1.0
train_function_mlflow_ef30c_00008,TERMINATED,192.168.0.38:1325743,0.000547302,6,2,0.567053,1,1.9792,1.0,1.0
train_function_mlflow_ef30c_00009,TERMINATED,192.168.0.38:1326042,0.00228894,4,1,0.689806,1,3.11315,1.0,1.0


2025-01-31 00:07:43,630	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/ssafarveisi/ray_results/mlflow' in 0.0038s.
2025-01-31 00:07:43,635	INFO tune.py:1041 -- Total run time: 33.18 seconds (33.12 seconds for the tuning loop).


In [13]:
# Best run after tunning
best_result = results.get_best_result(metric="accuracy", mode="max")

In [12]:
# Hyperparameters for the best run
best_result.config

{'objective': 'multi:softmax',
 'eval_metric': ['logloss', 'error'],
 'max_depth': 4,
 'min_child_weight': 2,
 'subsample': 0.6983477902707006,
 'eta': 0.05840872266708342}

In [14]:
# Best run metrics (e.g., accuracy and recall)
best_result.metrics_dataframe

Unnamed: 0,accuracy,recall,timestamp,checkpoint_dir_name,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,...,hostname,node_ip,time_since_restore,iterations_since_restore,config/objective,config/eval_metric,config/max_depth,config/min_child_weight,config/subsample,config/eta
0,1.0,1.0,1738278445,,False,1,ef30c_00007,2025-01-31_00-07-25,3.479762,3.479762,...,LXKA-J9SYDX3,192.168.0.38,3.479762,1,multi:softmax,"[logloss, error]",4,2,0.698348,0.058409


In [15]:

# Gather all runs in the experiment
experiment_id = mlflow.get_experiment_by_name(name=EXP_NAME).experiment_id
runs = mlflow.search_runs(
    experiment_ids=[experiment_id], run_view_type=ViewType.ACTIVE_ONLY
)

In [16]:
# Find the run id that maximizes the accuracy
best_run = runs.loc[runs["metrics.accuracy"].idxmax()]
best_run_id = best_run.run_id
best_run_accuracy = best_run["metrics.accuracy"]
model_uri = f"runs:/{best_run_id}/iris_xgb"

In [17]:
# Register the run id model if it meets the minimum required accuracy
if best_run_accuracy >= MINIMUM_REQUIRED_ACCURACY:
    print(emoji.emojize("Model accuracy met the required minimum accuracy :fire:"))
    result = mlflow.register_model(model_uri, MODEL_NAME)
else:
    print(
        emoji.emojize(
            "Best run did not meet the required minimum accuracy :sad_but_relieved_face:"
        )
    )

Successfully registered model 'iris-classifer'.
2025/01/31 00:11:02 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: iris-classifer, version 1


Model accuracy met the required minimum accuracy 🔥


Created version '1' of model 'iris-classifer'.


In [18]:
# Select an alias for the latest model's version (optional)
client.set_registered_model_alias(MODEL_NAME, ALIAS, result.version)

In [19]:
# Load the model and do some predictions
champion_version = mlflow.pyfunc.load_model(f"models:/{MODEL_NAME}@{ALIAS}")

In [20]:
iris = sklearn.datasets.load_iris(as_frame=True)
_, test_x, _, test_y = train_test_split(iris.data, iris.target, test_size=0.2)


champion_version.predict(test_x)

array([0, 2, 2, 0, 0, 1, 1, 0, 1, 2, 2, 0, 0, 0, 1, 2, 2, 1, 2, 1, 2, 1,
       2, 2, 1, 2, 2, 0, 0, 0], dtype=int32)

In [21]:
# Update best model's dependencies

mlflow.models.update_model_requirements(
    model_uri=model_uri,
    operation="add",
    requirement_list=["boto3==1.35.99", "kserve[ray]"],
)

2025/01/31 00:12:48 INFO mlflow.models.model: Retrieving model requirements files from mlflow-artifacts:/1/e950a645ccb140bfb1ebc01726c393d7/artifacts/iris_xgb...
2025/01/31 00:12:49 INFO mlflow.models.model: Done updating requirements!

Old requirements:
['mlflow==2.20.0', 'xgboost==2.1.3', 'scikit-learn==1.5.2']

Updated requirements:
['mlflow==2.20.0',
 'xgboost==2.1.3',
 'scikit-learn==1.5.2',
 'boto3==1.35.99',
 'kserve[ray]']

2025/01/31 00:12:49 INFO mlflow.models.model: Uploading updated requirements files to mlflow-artifacts:/1/e950a645ccb140bfb1ebc01726c393d7/artifacts/iris_xgb...


In [22]:
# Validate the model before deployment
mlflow.models.predict(
    model_uri=model_uri,
    input_data=test_x,
    env_manager="uv",
    install_mlflow=False,
)

2025/01/31 00:13:02 INFO mlflow.models.flavor_backend_registry: Selected backend for flavor 'python_function'
2025/01/31 00:13:03 INFO mlflow.utils.virtualenv: Creating a new environment in /tmp/tmpr3wdtic7/envs/virtualenv_envs/mlflow-ab20aa967a187f9b9ec2be80aa3fdcbd7627537c with python version 3.9.1 using uv
Using CPython [36m3.9.1[39m
Creating virtual environment at: [36m/tmp/tmpr3wdtic7/envs/virtualenv_envs/mlflow-ab20aa967a187f9b9ec2be80aa3fdcbd7627537c[39m
Activate with: [32msource /tmp/tmpr3wdtic7/envs/virtualenv_envs/mlflow-ab20aa967a187f9b9ec2be80aa3fdcbd7627537c/bin/activate[39m
2025/01/31 00:13:03 INFO mlflow.utils.virtualenv: Installing dependencies
[2mUsing Python 3.9.1 environment at: /tmp/tmpr3wdtic7/envs/virtualenv_envs/mlflow-ab20aa967a187f9b9ec2be80aa3fdcbd7627537c[0m
[2mResolved [1m3 packages[0m [2min 93ms[0m[0m
[2mInstalled [1m3 packages[0m [2min 21ms[0m[0m
 [32m+[39m [1mpip[0m[2m==24.2[0m
 [32m+[39m [1msetuptools[0m[2m==74.1.2[0m
 [3

{"predictions": [0, 2, 2, 0, 0, 1, 1, 0, 1, 2, 2, 0, 0, 0, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 2, 2, 0, 0, 0]}

Download the best model's artifacts (e.g., `requirements.txt` and `.python-version`). This is necessary to create the Kserve inference service later. The artifacts will be saved in a specific directory (`best_model_artifacts`) of the git repository.

In [26]:
def get_best_model_artifacts(s3_path: str) -> None:
    """Saves requirements.txt and .python-version in /best_model_artifacts"""

    download_artifacts(
        artifact_uri=s3_path + "requirements.txt", dst_path="./best_model_artifacts"
    )
    print("Saved requirements.txt in /best_model_artifacts")

    with tempfile.TemporaryDirectory() as td:
        download_artifacts(artifact_uri=s3_path + "python_env.yaml", dst_path=td)
        python_env_file_path = os.path.join(td, "python_env.yaml")
        with open(python_env_file_path, "r") as f:
            data = yaml.safe_load(f)
            python_version = data.get("python")
            if not python_version:
                print("Error: No 'python' key found in the YAML file.")
            else:
                python_version_file_path = os.path.join(
                    "./best_model_artifacts", ".python-version"
                )
                with open(python_version_file_path, "w") as f:
                    f.write(str(python_version) + "\n")
    print("Saved .python-version in /best_model_artifacts")

In [27]:
s3_path = f"s3://customerintelligence/ml_platform/mlartifacts/{experiment_id}/{best_run_id}/artifacts/iris_xgb/"
get_best_model_artifacts(s3_path=s3_path)

Saved requirements.txt in /best_model_artifacts
Saved .python-version in /best_model_artifacts


In [25]:
ray.shutdown()