In [17]:
import os
import yaml

import emoji
import tempfile

import mlflow
from mlflow import MlflowClient
from mlflow.entities import ViewType
from mlflow.models import infer_signature
from mlflow.artifacts import download_artifacts

import sklearn.datasets
from sklearn.metrics import accuracy_score, recall_score
from sklearn.model_selection import train_test_split

import xgboost as xgb

import ray
from ray import train, tune
from ray.air.integrations.mlflow import setup_mlflow

## Glossary

* Ds: Data Scientist

In [18]:
# URI of the MLflow tracking server (used by the fluent API and by each Ray Tune trial)
TRACKING_URI = "http://0.0.0.0:5000"
# Name of the MLflow experiment the trials log to and the later run search reads from
EXP_NAME = "ml_platform"
# Name for the mlflow registered model
# NOTE(review): "classifer" looks like a typo of "classifier"; renaming changes the
# registry entry, so confirm before touching the value.
MODEL_NAME = "iris-classifer"
# Alias for the mlflow registered model
ALIAS = "champion"
# Minimum accuracy the best run must reach before a new model version is registered
MINIMUM_REQUIRED_ACCURACY = 0.95

In [19]:
# Point the fluent MLflow API at the tracking server
mlflow.set_tracking_uri(TRACKING_URI)
# Ds sets the experiment to which the mlflow runs' metrics and artifacts should be logged
mlflow.set_experiment(experiment_name=EXP_NAME)

# Client used later to set the alias on the registered model
client = MlflowClient()

In [4]:
# Start a local ray cluster with 6 CPUs.
# ignore_reinit_error keeps this cell safe to re-run while the kernel is still alive.
ray.init(num_cpus=6, ignore_reinit_error=True)

2025-01-31 19:12:46,376	INFO worker.py:1841 -- Started a local Ray instance.


0,1
Python version:,3.9.1
Ray version:,2.41.0


In [56]:
# Ds sets the dependencies for the logged artifacts.
# The dict is passed as `conda_env` when the trained model is logged to MLflow.
conda_env = {
    "name": "mlflow-env",
    "channels": ["conda-forge"],
    "dependencies": [
        "python=3.9.1",
        "pip<=24.0",
        {"pip": ["xgboost==2.1.3", "scikit-learn==1.5.2"]}, # Minimum dependencies for xgboost model
    ],
}


# Training for the selected hyperparameters happens here
def train_function_mlflow(config: dict) -> None:
    """Train and evaluate one XGBoost classifier for a single Ray Tune trial.

    The trial's hyperparameters are passed straight to ``xgb.XGBClassifier``.
    Accuracy and micro-averaged recall on the held-out split are logged to
    MLflow together with the fitted model, and the same metrics are reported
    back to Ray Tune.

    Args:
        config: Hyperparameters sampled by Ray Tune for this trial.
    """
    # Attach this trial to the MLflow experiment on the tracking server
    setup_mlflow(
        config,
        experiment_name=EXP_NAME,
        tracking_uri=TRACKING_URI,
    )

    # Load sample dataset
    iris = sklearn.datasets.load_iris(as_frame=True)
    # A fixed, stratified split gives every trial the same train/test rows, so
    # accuracy differences between trials come from the hyperparameters and not
    # from a lucky (or unlucky) random split.
    train_x, test_x, train_y, test_y = train_test_split(
        iris.data,
        iris.target,
        test_size=0.2,
        random_state=42,
        stratify=iris.target,
    )

    # Pass the trial's config (hyperparameters for the xgb classifier)
    model = xgb.XGBClassifier(**config)
    model.fit(train_x, train_y)
    predictions = model.predict(test_x)

    # Measure accuracy and recall for this trial
    accuracy = accuracy_score(test_y, predictions)
    recall = recall_score(test_y, predictions, average="micro")

    # Input/output schema stored alongside the logged model
    signature = infer_signature(train_x, model.predict(train_x))

    # Log the metrics as well as artifacts for this trial
    mlflow.log_metrics({"recall": recall, "accuracy": accuracy})
    mlflow.xgboost.log_model(
        model,
        "iris_xgb",
        conda_env=conda_env,
        signature=signature,
        model_format="json",
    )

    # Get the best result later based on the following metrics
    train.report({"accuracy": accuracy, "recall": recall})


def tune_with_setup() -> tune.ResultGrid:
    """Run the concurrent hyperparameter tuning and return the Ray Tune results."""
    # Search space; each trial receives one sampled combination as its config
    search_space = {
        "objective": "multi:softmax",  # Multi-class classification
        "eval_metric": ["logloss", "error"],
        "max_depth": tune.randint(1, 9),
        "min_child_weight": tune.choice([1, 2, 3]),
        "subsample": tune.uniform(0.5, 1.0),
        "eta": tune.loguniform(1e-4, 1e-1),
    }

    # Each trial uses 2 cpus. Therefore, at most 3 trials run concurrently
    # (there are 6 cpus available in the local cluster)
    trainable = tune.with_resources(train_function_mlflow, {"cpu": 2})

    tuner = tune.Tuner(
        trainable,
        param_space=search_space,
        tune_config=tune.TuneConfig(num_samples=20),  # Total number of trials
        run_config=train.RunConfig(name="mlflow"),
    )

    # Start the tuning and hand the results back for later inspection
    return tuner.fit()

In [6]:
# Run the hyperparameter search; `results` holds what tune_with_setup returns
results = tune_with_setup()

0,1
Current time:,2025-01-31 19:13:19
Running for:,00:00:27.78
Memory:,11.2/62.5 GiB

Trial name,status,loc,eta,max_depth,min_child_weight,subsample,iter,total time (s),accuracy,recall
train_function_mlflow_fc00c_00000,TERMINATED,192.168.0.40:1474766,0.000533959,2,2,0.678547,1,2.43252,0.966667,0.966667
train_function_mlflow_fc00c_00001,TERMINATED,192.168.0.40:1474767,0.00049573,2,1,0.642216,1,1.5755,0.966667,0.966667
train_function_mlflow_fc00c_00002,TERMINATED,192.168.0.40:1474768,0.0282576,7,1,0.603154,1,1.56627,1.0,1.0
train_function_mlflow_fc00c_00003,TERMINATED,192.168.0.40:1475145,0.0158271,6,2,0.861275,1,1.53177,0.966667,0.966667
train_function_mlflow_fc00c_00004,TERMINATED,192.168.0.40:1475144,0.000364151,1,1,0.984392,1,0.915632,0.9,0.9
train_function_mlflow_fc00c_00005,TERMINATED,192.168.0.40:1475253,0.0996692,8,2,0.686948,1,1.34163,1.0,1.0
train_function_mlflow_fc00c_00006,TERMINATED,192.168.0.40:1475429,0.0916705,8,2,0.778498,1,1.44207,0.866667,0.866667
train_function_mlflow_fc00c_00007,TERMINATED,192.168.0.40:1475507,0.000394149,2,1,0.948909,1,0.868601,0.966667,0.966667
train_function_mlflow_fc00c_00008,TERMINATED,192.168.0.40:1475508,0.000826573,7,1,0.912702,1,0.960833,1.0,1.0
train_function_mlflow_fc00c_00009,TERMINATED,192.168.0.40:1475787,0.000119442,6,3,0.510851,1,0.815149,0.933333,0.933333


2025-01-31 19:13:19,251	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/ssafarveisi/ray_results/mlflow' in 0.0056s.
2025-01-31 19:13:19,257	INFO tune.py:1041 -- Total run time: 27.82 seconds (27.77 seconds for the tuning loop).


In [7]:
# Best run after tuning: the trial with the highest reported accuracy
best_result = results.get_best_result(metric="accuracy", mode="max")

In [8]:
# Hyperparameters (trial config) of the best run
best_result.config

{'objective': 'multi:softmax',
 'eval_metric': ['logloss', 'error'],
 'max_depth': 7,
 'min_child_weight': 1,
 'subsample': 0.6031536534091184,
 'eta': 0.028257638523651336}

In [9]:
# Metrics reported by the best run (e.g., accuracy and recall), shown as a DataFrame
best_result.metrics_dataframe

Unnamed: 0,accuracy,recall,timestamp,checkpoint_dir_name,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,...,hostname,node_ip,time_since_restore,iterations_since_restore,config/objective,config/eval_metric,config/max_depth,config/min_child_weight,config/subsample,config/eta
0,1.0,1.0,1738347174,,False,1,fc00c_00002,2025-01-31_19-12-54,1.566267,1.566267,...,LXKA-J9SYDX3,192.168.0.40,1.566267,1,multi:softmax,"[logloss, error]",7,1,0.603154,0.028258


In [None]:
# Stop the local ray cluster; tuning is finished and the cells below only use MLflow
ray.shutdown()

In [20]:
# Ds gathers all runs in the experiment (active view only).
# No further filter is applied, so the result is not limited to the trials of one tuning session.
experiment_id = mlflow.get_experiment_by_name(name=EXP_NAME).experiment_id
runs = mlflow.search_runs(
    experiment_ids=[experiment_id], run_view_type=ViewType.ACTIVE_ONLY
)

In [21]:
# Ds finds the run id that maximizes the accuracy (performance metric)
best_run = runs.loc[runs["metrics.accuracy"].idxmax()]
best_run_id = best_run.run_id
best_run_accuracy = best_run["metrics.accuracy"]
# "iris_xgb" is the artifact path the model was logged under during training
model_uri = f"runs:/{best_run_id}/iris_xgb"

In [None]:
# Ds registers the run id model if it meets the minimum required accuracy.
# Reset first so a `result` left over from an earlier execution of this cell is
# never mistaken for the outcome of this one.
result = None
if best_run_accuracy >= MINIMUM_REQUIRED_ACCURACY:
    print(emoji.emojize("Model accuracy met the required minimum accuracy :fire:"))
    result = mlflow.register_model(model_uri, MODEL_NAME)
else:
    print(
        emoji.emojize(
            "Best run did not meet the required minimum accuracy :sad_but_relieved_face:"
        )
    )

Successfully registered model 'iris-classifer'.
2025/01/31 19:13:29 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: iris-classifer, version 1


Model accuracy met the required minimum accuracy 🔥


Created version '1' of model 'iris-classifer'.


In [68]:
# Report the version assigned to the newly registered model (`result` comes from mlflow.register_model)
print(f"Updated the version of the registered model '{MODEL_NAME}' to '{result.version}'")

Updated the version of the registered model 'iris-classifer' to '1'


In [13]:
# Ds selects an alias for the latest model's version (optional as the model deployment is based on the artifact s3 path).
# `result` is the model version returned by mlflow.register_model.
client.set_registered_model_alias(MODEL_NAME, ALIAS, result.version)

In [50]:
# Ds updates the best model's dependencies (mandatory for the model deployment into Kserve cluster).
# The pinned packages below are added to the requirements of the model at `model_uri`.
mlflow.models.update_model_requirements(
    model_uri=model_uri,
    operation="add",
    requirement_list=["mlserver==1.6.1", "mlserver-mlflow==1.6.1", "pydantic==2.7.1", "conda-pack==0.8.1"],
)

2025/02/01 01:10:35 INFO mlflow.models.model: Retrieving model requirements files from mlflow-artifacts:/1/ca1b0f2603494e489c589f45dbaf547f/artifacts/iris_xgb...
2025/02/01 01:10:36 INFO mlflow.models.model: Done updating requirements!

Old requirements:
['mlflow==2.20.0',
 'xgboost==2.1.3',
 'scikit-learn==1.5.2',
 'mlserver==1.6.1',
 'mlserver-mlflow',
 'pydantic==2.7.1',
 'conda-pack==0.8.1']

Updated requirements:
['mlflow==2.20.0',
 'xgboost==2.1.3',
 'scikit-learn==1.5.2',
 'mlserver==1.6.1',
 'mlserver-mlflow==1.6.1',
 'pydantic==2.7.1',
 'conda-pack==0.8.1']

2025/02/01 01:10:36 INFO mlflow.models.model: Uploading updated requirements files to mlflow-artifacts:/1/ca1b0f2603494e489c589f45dbaf547f/artifacts/iris_xgb...


In [72]:
# Ds reloads the data to do a sanity check.
# NOTE: this is a fresh random split (no random_state), so its rows may overlap with
# rows a trial trained on; the call below checks that prediction runs, not accuracy.
iris = sklearn.datasets.load_iris(as_frame=True)
_, test_x, _, test_y = train_test_split(iris.data, iris.target, test_size=0.2)

# Ds validates the model before deployment (env_manager="uv", mlflow itself is not installed on top)
mlflow.models.predict(
    model_uri=model_uri,
    input_data=test_x,
    env_manager="uv",
    install_mlflow=False,
)

2025/02/01 19:08:31 INFO mlflow.models.flavor_backend_registry: Selected backend for flavor 'python_function'
2025/02/01 19:08:46 INFO mlflow.utils.virtualenv: Creating a new environment in /tmp/tmpwnl239k8/envs/virtualenv_envs/mlflow-c551d6c3d16ace22ef4327d512732163beac06b9 with python version 3.9.1 using uv
Using CPython [36m3.9.1[39m
Creating virtual environment at: [36m/tmp/tmpwnl239k8/envs/virtualenv_envs/mlflow-c551d6c3d16ace22ef4327d512732163beac06b9[39m
Activate with: [32msource /tmp/tmpwnl239k8/envs/virtualenv_envs/mlflow-c551d6c3d16ace22ef4327d512732163beac06b9/bin/activate[39m
2025/02/01 19:08:46 INFO mlflow.utils.virtualenv: Installing dependencies
[2mUsing Python 3.9.1 environment at: /tmp/tmpwnl239k8/envs/virtualenv_envs/mlflow-c551d6c3d16ace22ef4327d512732163beac06b9[0m
[2mResolved [1m3 packages[0m [2min 36ms[0m[0m
[2mInstalled [1m3 packages[0m [2min 20ms[0m[0m
 [32m+[39m [1mpip[0m[2m==24.2[0m
 [32m+[39m [1msetuptools[0m[2m==74.1.2[0m
 [3

{"predictions": [2, 0, 2, 1, 2, 1, 2, 2, 1, 0, 2, 1, 1, 2, 0, 2, 0, 2, 0, 1, 1, 2, 0, 2, 1, 0, 1, 0, 1, 1]}

Download the best model's artifacts (`requirements.txt` and `conda.yaml`) and derive a `.python-version` file from the model's `python_env.yaml`. This is necessary to create the KServe inference service later. The files will be saved in a specific directory (`best_model_artifacts`) of the git repository.

In [22]:
def get_best_model_artifacts(s3_path: str) -> None:
    """Saves requirements.txt, conda.yaml and .python-version in ./best_model_artifacts.

    requirements.txt and conda.yaml are downloaded as-is. .python-version is
    written from the 'python' key of the model's python_env.yaml, which is
    only downloaded to a temporary directory.

    Args:
        s3_path: URI of the logged model's artifact directory. A trailing
            slash is optional.
    """
    dst_dir = "./best_model_artifacts"
    os.makedirs(dst_dir, exist_ok=True)
    # Make sure file names are appended after a slash even if the caller omitted it
    base_uri = s3_path if s3_path.endswith("/") else s3_path + "/"

    for file_name in ("requirements.txt", "conda.yaml"):
        download_artifacts(artifact_uri=base_uri + file_name, dst_path=dst_dir)
        print(f"Saved {file_name} in /best_model_artifacts")

    with tempfile.TemporaryDirectory() as td:
        download_artifacts(artifact_uri=base_uri + "python_env.yaml", dst_path=td)
        with open(os.path.join(td, "python_env.yaml"), "r") as env_file:
            # safe_load returns None for an empty file; treat that as "no keys"
            python_version = (yaml.safe_load(env_file) or {}).get("python")

    if not python_version:
        # Keep the best-effort behaviour (no exception), but do not claim success
        print("Error: No 'python' key found in the YAML file.")
        return

    with open(os.path.join(dst_dir, ".python-version"), "w") as version_file:
        version_file.write(str(python_version) + "\n")
    print("Saved .python-version in /best_model_artifacts")

In [23]:
# Artifact location of the best run's logged model; bucket and prefix are hard-coded here.
# NOTE(review): presumably this must match the tracking server's artifact root — confirm.
s3_path = f"s3://customerintelligence/ml_platform/mlartifacts/{experiment_id}/{best_run_id}/artifacts/iris_xgb/"
print(s3_path)
get_best_model_artifacts(s3_path=s3_path)

s3://customerintelligence/ml_platform/mlartifacts/1/ca1b0f2603494e489c589f45dbaf547f/artifacts/iris_xgb/
Saved requirements.txt in /best_model_artifacts
Saved conda.yaml in /best_model_artifacts
Saved .python-version in /best_model_artifacts


In [75]:
# List the files stored under the best model's artifact path
mlflow.artifacts.list_artifacts(
    artifact_uri=s3_path
)

[<FileInfo: file_size=987, is_dir=False, path='MLmodel'>,
 <FileInfo: file_size=240, is_dir=False, path='conda.yaml'>,
 <FileInfo: file_size=678210690, is_dir=False, path='environment.tar.gz'>,
 <FileInfo: file_size=184676, is_dir=False, path='model.json'>,
 <FileInfo: file_size=111, is_dir=False, path='python_env.yaml'>,
 <FileInfo: file_size=122, is_dir=False, path='requirements.txt'>]

In [10]:
def write_best_run_spec(experiment_id: str, run_id: str) -> None:
    """
    Creates (or overwrites) 'best_model_artifacts/run_spec' with the run's identifiers.

    The file holds two KEY=value lines, EXPERIMENT_ID and RUN_ID. The
    'best_model_artifacts' directory is created if it does not exist yet.

    Args:
        experiment_id: Value written after 'EXPERIMENT_ID='.
        run_id: Value written after 'RUN_ID='.
    """
    spec_dir = "best_model_artifacts"
    # Without this, open() raises FileNotFoundError on a fresh checkout
    os.makedirs(spec_dir, exist_ok=True)
    with open(os.path.join(spec_dir, "run_spec"), "w") as f:
        f.write(f'EXPERIMENT_ID={experiment_id}\n')
        f.write(f'RUN_ID={run_id}\n')

In [12]:
# Update the run spec for the best run, using the ids computed earlier in the notebook
# (hard-coded values would go stale as soon as a different run wins)
write_best_run_spec(experiment_id=experiment_id, run_id=best_run_id)