In [1]:
import emoji

import mlflow
from mlflow.models import infer_signature
from mlflow import MlflowClient
from mlflow.models.model import update_model_requirements

import sklearn.datasets
from sklearn.metrics import accuracy_score, recall_score
from sklearn.model_selection import train_test_split

import xgboost as xgb

import ray
from ray import train, tune
from ray.air.integrations.mlflow import setup_mlflow

In [2]:
# Address of the MLflow tracking server used for logging and registry calls.
TRACKING_URI = "http://0.0.0.0:5000"
# MLflow experiment that every tuning trial logs to.
EXP_NAME = "ml_platform"
# Name under which the selected model is registered in the model registry.
# NOTE(review): "classifer" looks like a typo of "classifier"; renaming would
# register under a different model name, so confirm before changing it.
MODEL_NAME = "xgb-breast-cancer-classifer"
# Registry alias assigned to the registered model version.
ALIAS = "champion"
# Accuracy the best run must reach (>=) before it is registered.
MINIMUM_REQUIRED_ACCURACY = 0.95

In [3]:
# Point the MLflow fluent API at the tracking server and select the experiment.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(experiment_name=EXP_NAME)

# Client handle used later to assign the registry alias.
client = MlflowClient()

2025/01/29 01:34:51 INFO mlflow.tracking.fluent: Experiment with name 'ml_platform' does not exist. Creating a new experiment.


In [None]:
# Start a local Ray instance limited to 6 CPUs. ignore_reinit_error keeps the
# cell safe to re-run in a live kernel: without it, a second ray.init() call
# raises RuntimeError because Ray is already initialised.
ray.init(num_cpus=6, ignore_reinit_error=True)

2025-01-29 01:34:53,838	INFO worker.py:1832 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


0,1
Python version:,3.9.1
Ray version:,2.41.0
Dashboard:,http://127.0.0.1:8265


[36m(train_function_mlflow pid=1033594)[0m *** SIGSEGV received at time=1738110937 on cpu 15 ***
[36m(train_function_mlflow pid=1033594)[0m PC: @     0x7655e40d345e  (unknown)  ray::gcs::TaskInfoAccessor::AsyncAddTaskEventData()
[36m(train_function_mlflow pid=1033594)[0m     @     0x7655e5242520       3680  (unknown)
[36m(train_function_mlflow pid=1033594)[0m     @     0x7655e3fb61a5       1392  ray::core::worker::TaskEventBufferImpl::FlushEvents()
[36m(train_function_mlflow pid=1033594)[0m     @     0x7655e3f4216c       1488  ray::core::CoreWorker::Disconnect()
[36m(train_function_mlflow pid=1033594)[0m     @     0x7655e3f4251d       1152  ray::core::CoreWorker::ForceExit()
[36m(train_function_mlflow pid=1033594)[0m     @     0x7655e3f4294f       1680  ray::core::CoreWorker::HandleKillActor()
[36m(train_function_mlflow pid=1033594)[0m     @     0x7655e3f39e54        192  ray::rpc::ServerCallImpl<>::HandleRequestImpl()
[36m(train_function_mlflow pid=1033594)[0m     @ 

In [5]:
# Environment specification handed to mlflow.xgboost.log_model as `conda_env`:
# pinned interpreter, pip, and the pip-installed packages.
conda_env = dict(
    name="mlflow-env",
    channels=["conda-forge"],
    dependencies=[
        "python=3.9.1",
        "pip<=24.0",
        dict(pip=["mlflow==2.20.0", "xgboost==2.1.3", "scikit-learn==1.5.2"]),
    ],
)


def train_function_mlflow(config: dict) -> None:
    """Train and evaluate one XGBoost classifier for a single Ray Tune trial.

    Calls ``setup_mlflow`` with the shared experiment name and tracking URI,
    fits an ``XGBClassifier`` built from ``config`` on the scikit-learn
    breast-cancer dataset, logs accuracy/recall and the fitted model to MLflow,
    and reports the same two metrics back to Ray Tune.

    Args:
        config: Hyperparameters for this trial; forwarded unchanged as keyword
            arguments to ``xgb.XGBClassifier``.
    """
    setup_mlflow(
        config,
        experiment_name=EXP_NAME,
        tracking_uri=TRACKING_URI,
    )

    # Load dataset
    data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)
    # Fix the seed so every trial is scored on the same held-out rows. With an
    # unseeded split, accuracy differences between trials would partly reflect
    # which rows landed in the test set rather than the hyperparameters, and
    # results would change on every re-run.
    train_x, test_x, train_y, test_y = train_test_split(
        data, labels, test_size=0.2, random_state=42
    )

    # Pass the trial's config (hyperparameters for the xgb classifier)
    model = xgb.XGBClassifier(**config)
    model.fit(train_x, train_y)

    predictions = model.predict(test_x)
    # Measure accuracy and recall for this trial on the held-out rows
    accuracy = accuracy_score(test_y, predictions)
    recall = recall_score(test_y, predictions)

    # Input/output schema stored with the logged model, inferred from the
    # training features and the model's predictions on them.
    signature = infer_signature(train_x, model.predict(train_x))

    # Log the metrics as well as artifacts for this trial
    mlflow.log_metrics({"recall": recall, "accuracy": accuracy})
    mlflow.xgboost.log_model(
        model,
        "xgb_models",
        conda_env=conda_env,
        signature=signature,
        model_format="json",
    )
    # Get the best result later based on the following metrics
    train.report({"accuracy": accuracy, "recall": recall})


def tune_with_setup() -> tune.ResultGrid:
    """Run the hyperparameter search over ``train_function_mlflow``.

    Returns:
        The ``ResultGrid`` produced by ``Tuner.fit()`` for 20 sampled trials.
    """
    # Fixed objective/metrics plus the sampled hyperparameter ranges.
    search_space = {
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "max_depth": tune.randint(1, 9),
        "min_child_weight": tune.choice([1, 2, 3]),
        "subsample": tune.uniform(0.5, 1.0),
        "eta": tune.loguniform(1e-4, 1e-1),
    }

    # Each trial reserves 2 CPUs. Therefore, with the 6 CPUs given to
    # ray.init, at most 3 trials run concurrently.
    trainable = tune.with_resources(train_function_mlflow, {"cpu": 2})

    tuner = tune.Tuner(
        trainable,
        param_space=search_space,
        tune_config=tune.TuneConfig(num_samples=20),
        run_config=train.RunConfig(name="mlflow"),
    )
    return tuner.fit()

In [6]:
# Launch the hyperparameter search; `results` holds the returned ResultGrid.
results = tune_with_setup()

0,1
Current time:,2025-01-29 01:35:37
Running for:,00:00:27.73
Memory:,10.9/62.5 GiB

Trial name,status,loc,eta,max_depth,min_child_weight,subsample,iter,total time (s),accuracy,recall
train_function_mlflow_e519d_00000,TERMINATED,192.168.0.58:1031249,0.000192248,1,3,0.605626,1,1.26374,0.666667,1.0
train_function_mlflow_e519d_00001,TERMINATED,192.168.0.58:1031250,0.000715613,7,3,0.64082,1,2.5392,0.684211,1.0
train_function_mlflow_e519d_00002,TERMINATED,192.168.0.58:1031251,0.000136967,6,3,0.537129,1,1.42656,0.587719,1.0
train_function_mlflow_e519d_00003,TERMINATED,192.168.0.58:1031605,0.00336098,4,1,0.988544,1,1.27694,0.947368,1.0
train_function_mlflow_e519d_00004,TERMINATED,192.168.0.58:1031604,0.0445515,1,1,0.640442,1,1.00827,0.964912,0.985915
train_function_mlflow_e519d_00005,TERMINATED,192.168.0.58:1031719,0.0643398,4,1,0.81071,1,0.996416,0.991228,1.0
train_function_mlflow_e519d_00006,TERMINATED,192.168.0.58:1032009,0.0646932,5,3,0.859108,1,0.957932,0.964912,0.985915
train_function_mlflow_e519d_00007,TERMINATED,192.168.0.58:1032008,0.000262611,3,2,0.550478,1,1.10322,0.631579,1.0
train_function_mlflow_e519d_00008,TERMINATED,192.168.0.58:1032016,0.000211546,6,2,0.92184,1,0.920392,0.657895,1.0
train_function_mlflow_e519d_00009,TERMINATED,192.168.0.58:1032419,0.0496187,5,3,0.684554,1,0.849073,0.982456,0.986111


2025-01-29 01:35:37,584	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/ssafarveisi/ray_results/mlflow' in 0.0067s.
2025-01-29 01:35:37,593	INFO tune.py:1041 -- Total run time: 27.79 seconds (27.72 seconds for the tuning loop).


In [7]:
# Select the trial with the highest reported accuracy.
best_result = results.get_best_result(metric="accuracy", mode="max")

In [8]:
# Hyperparameters of the best trial.
best_result.config

{'objective': 'binary:logistic',
 'eval_metric': ['logloss', 'error'],
 'max_depth': 4,
 'min_child_weight': 1,
 'subsample': 0.8107097678436095,
 'eta': 0.06433975027506766}

In [9]:
# Metrics reported by the best trial, as a DataFrame.
best_result.metrics_dataframe

Unnamed: 0,accuracy,recall,timestamp,checkpoint_dir_name,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,...,hostname,node_ip,time_since_restore,iterations_since_restore,config/objective,config/eval_metric,config/max_depth,config/min_child_weight,config/subsample,config/eta
0,0.991228,1.0,1738110917,,False,1,e519d_00005,2025-01-29_01-35-17,0.996416,0.996416,...,LXKA-J9SYDX3,192.168.0.58,0.996416,1,binary:logistic,"[logloss, error]",4,1,0.81071,0.06434


In [10]:
from mlflow.entities import ViewType

# Look up the experiment by name, then fetch its active runs from MLflow.
experiment = mlflow.get_experiment_by_name(EXP_NAME)
experiment_id = experiment.experiment_id
runs = mlflow.search_runs(
    run_view_type=ViewType.ACTIVE_ONLY,
    experiment_ids=[experiment_id],
)

In [11]:
# Pick the logged run with the highest accuracy and build the URI of the
# model artifact it logged under "xgb_models".
best_idx = runs["metrics.accuracy"].idxmax()
best_run = runs.loc[best_idx]
best_run_accuracy = best_run["metrics.accuracy"]
best_run_id = best_run["run_id"]
model_uri = "runs:/{}/xgb_models".format(best_run_id)

In [12]:
# Gate registration on the accuracy threshold: only a good-enough best run is
# registered under MODEL_NAME.
meets_threshold = best_run_accuracy >= MINIMUM_REQUIRED_ACCURACY
if not meets_threshold:
    failure_message = (
        "Best run did not meet the required minimum accuracy :sad_but_relieved_face:"
    )
    print(emoji.emojize(failure_message))
else:
    print(emoji.emojize("Model accuracy met the required minimum accuracy :fire:"))
    result = mlflow.register_model(model_uri, MODEL_NAME)

Successfully registered model 'xgb-breast-cancer-classifer'.
2025/01/29 01:35:51 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgb-breast-cancer-classifer, version 1


Model accuracy met the required minimum accuracy 🔥


Created version '1' of model 'xgb-breast-cancer-classifer'.


In [13]:
# Point the alias at the newly registered version. `result` is only bound when
# the registration cell's accuracy gate passed, so apply the same gate here
# instead of failing with a NameError (or reusing a stale `result` left over
# from earlier kernel state).
if best_run_accuracy >= MINIMUM_REQUIRED_ACCURACY:
    client.set_registered_model_alias(MODEL_NAME, ALIAS, result.version)
else:
    print(f"Alias '{ALIAS}' left unchanged: no new model version was registered.")

In [14]:
# Load the registered model version that the alias points to, via the pyfunc flavor.
champion_version = mlflow.pyfunc.load_model(f"models:/{MODEL_NAME}@{ALIAS}")

In [15]:
# Draw a sample of rows to smoke-test the loaded champion model. The seed is
# fixed so the sampled rows (and the predictions shown below) are reproducible
# across re-runs.
# NOTE: this split is a separate call from the one inside
# train_function_mlflow, so rows here may have been seen during training;
# treat the output as a smoke test, not as an evaluation.
data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)
_, test_x, _, test_y = train_test_split(
    data, labels, test_size=0.2, random_state=42
)

champion_version.predict(test_x)

array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1])

In [16]:
# Validate the model before deployment: run a prediction on the sample rows
# through mlflow.models.predict, using the best run's logged model (model_uri)
# with the "uv" environment manager.
# NOTE(review): install_mlflow=False presumably skips installing MLflow into
# that environment — confirm against the MLflow docs.
mlflow.models.predict(
    model_uri=model_uri,
    input_data=test_x,
    env_manager="uv",
    install_mlflow=False,
)

2025/01/29 01:36:01 INFO mlflow.models.flavor_backend_registry: Selected backend for flavor 'python_function'
2025/01/29 01:36:02 INFO mlflow.utils.virtualenv: Creating a new environment in /tmp/tmpvp2apo75/envs/virtualenv_envs/mlflow-96f1969b3ba316dbc956b1190898f30eb7ddec6d with python version 3.9.1 using uv
Using CPython [36m3.9.1[39m
Creating virtual environment at: [36m/tmp/tmpvp2apo75/envs/virtualenv_envs/mlflow-96f1969b3ba316dbc956b1190898f30eb7ddec6d[39m
Activate with: [32msource /tmp/tmpvp2apo75/envs/virtualenv_envs/mlflow-96f1969b3ba316dbc956b1190898f30eb7ddec6d/bin/activate.fish[39m
2025/01/29 01:36:02 INFO mlflow.utils.virtualenv: Installing dependencies
[2mUsing Python 3.9.1 environment at: /tmp/tmpvp2apo75/envs/virtualenv_envs/mlflow-96f1969b3ba316dbc956b1190898f30eb7ddec6d[0m
[2mResolved [1m3 packages[0m [2min 60ms[0m[0m
[2mInstalled [1m3 packages[0m [2min 19ms[0m[0m
 [32m+[39m [1mpip[0m[2m==24.0[0m
 [32m+[39m [1msetuptools[0m[2m==69.5.1[0m

{"predictions": [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1]}