# BentoML + MLFlow
I am foolwing the tutorial available here: [bentoml+MLFlow](https://www.bentoml.com/blog/building-ml-pipelines-with-mlflow-and-bentoml)

## Create Model

In [1]:
import mlflow
from mlflow.models import infer_signature

import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the Iris dataset
X, y = datasets.load_iris(return_X_y=True)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Define the model hyperparameters
params = {
    "solver": "lbfgs",
    "max_iter": 1000,
    "random_state": 8888,
}

# Train the model
lr = LogisticRegression(**params)
lr.fit(X_train, y_train)

# Predict on the test set
y_pred = lr.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

1.0


In [4]:
from datetime import date

# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Create a new MLflow Experiment
mlflow.set_experiment("MLflow+BentoML Quickstart")

# Start an MLflow run
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(params)

    # Log the loss metric
    mlflow.log_metric("accuracy", accuracy)

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Basic LR model for iris data")

    # Infer the model signature
    signature = infer_signature(X_train, lr.predict(X_train))

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=lr,
        name="iris_model", # demo is out of date. Artifact_uri deperecated
        signature=signature,
        input_example=X_train,
        registered_model_name="iris_demo",
    )
    # this function no longer works because MLflow 3.x seperates where models are stored now
    # model_uri = mlflow.get_artifact_uri("iris_model") 

Registered model 'iris_demo' already exists. Creating a new version of this model...
2025/06/23 18:10:59 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: iris_demo, version 2
Created version '2' of model 'iris_demo'.


🏃 View run illustrious-goat-675 at: http://127.0.0.1:8080/#/experiments/1/runs/3bce4092ddfd45ef9ec692d0e779099d
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/1


In [5]:
model_uri = model_info.model_uri
print("Model URI:", model_uri)  # e.g., models:/<model_id>/1

Model URI: models:/m-c63fc5329d3f45f5bfa6aa5ce8864763


In [4]:
mlflow.artifacts.list_artifacts(model_uri)

[<FileInfo: file_size=1061, is_dir=False, path='MLmodel'>,
 <FileInfo: file_size=229, is_dir=False, path='conda.yaml'>,
 <FileInfo: file_size=2640, is_dir=False, path='input_example.json'>,
 <FileInfo: file_size=838, is_dir=False, path='model.pkl'>,
 <FileInfo: file_size=123, is_dir=False, path='python_env.yaml'>,
 <FileInfo: file_size=107, is_dir=False, path='requirements.txt'>,
 <FileInfo: file_size=6740, is_dir=False, path='serving_input_example.json'>]

In [5]:
mlflow.artifacts.download_artifacts(model_uri)

'/tmp/tmprq1pkjtd/'

In [6]:
import bentoml

bento_model = bentoml.mlflow.import_model(
    'iris', 
    model_uri=model_uri,
    labels={
        "team": "bento",
        "stage": "dev",
        "accuracy": accuracy,
        "training_date": str(date.today())
    }
)

  __import__("pkg_resources").declare_namespace(__name__)  # type: ignore
'labels' should be a dict[str, str] and enforced by BentoML. Converting all values to string.


In [7]:
import numpy as np

import bentoml

# Load the latest version of iris model:
iris_model = bentoml.mlflow.load_model("iris:latest")

# Alternatively, load the model by specifying the model tag
# iris_model = bentoml.mlflow.load_model("iris:hu5d7xxs3oxmnuqj")

input_data = np.array([[5.9, 3, 5.1, 1.8]])
res = iris_model.predict(input_data)
print(res)

[2]


In [None]:
# !bentoml serve service.py:IrisClassifier

  __import__("pkg_resources").declare_namespace(__name__)  # type: ignore
2025-06-22T22:33:07+0000 [INFO] [cli] Starting production HTTP BentoServer from "service.py:IrisClassifier" listening on http://localhost:3000 (Press CTRL+C to quit)
  __import__("pkg_resources").declare_namespace(__name__)  # type: ignore
2025-06-22T22:33:11+0000 [INFO] [entry_service:IrisClassifier:1] Service IrisClassifier initialized
^C
2025-06-22T22:33:41+0000 [INFO] [entry_service:IrisClassifier:1] Service instance cleanup finalized


In [14]:
import bentoml
import numpy as np

client = bentoml.SyncHTTPClient("http://localhost:3000")
client.predict(np.array([[5.9, 3, 5.1, 1.8]])) # ['virginica']

['virginica']

In [15]:
# Example of API call with no input validation. expects float inputs
!curl -X 'POST' \
  'http://localhost:3000/predict' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{"input_data": [[5,3,5,2]]}'

{"error":"An unexpected error has occurred, please check the server log."}

In [16]:
import bentoml
import numpy as np

# Example of manual input validation where we force type to float
client = bentoml.SyncHTTPClient("http://localhost:3000")
client.predict(np.array([[1,1,1,1]], dtype='float64'))

['setosa']

In [17]:
import requests
from concurrent.futures import ThreadPoolExecutor
import time
import random

CONCURRENCY = 20        # Number of threads (concurrent requests)
TOTAL_REQUESTS = 1000     # Total number of requests to send
client = bentoml.SyncHTTPClient("http://localhost:3000")

from sklearn.datasets import load_iris
iris = load_iris()
data_samples = iris.data.tolist()
payloads = [random.choice(data_samples) for _ in range(TOTAL_REQUESTS)]

def send_request(index, data):
    """Send a single HTTP request and print the result."""
    try:
        start_time = time.time()
        response = client.predict(np.array([data]))
        duration = time.time() - start_time
    except Exception as e:
        print(f"Request {index}: Error -> {e}")

print(f"Sending {TOTAL_REQUESTS} requests to {client.url} with concurrency {CONCURRENCY}...")
with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
    for i, data in enumerate(payloads, start=1):
        executor.submit(send_request, i, data)

print("Done.")

Sending 1000 requests to http://localhost:3000 with concurrency 20...
Done.
