Add example deployment scripts (#553)
amrit110 committed Feb 16, 2024
1 parent 6243956 commit e1ca767
Showing 10 changed files with 1,686 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -28,7 +28,7 @@ repos:
        types_or: [python, jupyter]

  - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.7.1
+    rev: v1.8.0
    hooks:
      - id: mypy
        entry: python3 -m mypy --config-file pyproject.toml
53 changes: 53 additions & 0 deletions deploy/README.md
@@ -0,0 +1,53 @@
# Model Deployment with BentoML and Triton Inference Server

1. Install the required dependencies with poetry.
```bash
poetry install --with deploy
```
2. Serialize the trained model and move it to the `model_repo` directory. Then create
a `config.pbtxt` file for the model.

**Example - torchxrayvision model**
```python
import torch
import torchxrayvision as xrv


model = xrv.models.ResNet(weights="resnet50-res512-all").eval().cuda()

dummy_input = (-1024 - 1024) * torch.rand(1, 1, 512, 512) + 1024
dummy_input = dummy_input.cuda()

torch.jit.trace(model, dummy_input).save("model_repo/resnet50_res512_all/1/model.pt")
```
See `model_repo/resnet50_res512_all/config.pbtxt` for an example of a PyTorch model configuration file.

**Example - sklearn model**
```python
from skl2onnx import to_onnx


onnx_model = to_onnx(
    <sklearn_model>,
    <input_data>,
    options={"zipmap": False},
)
with open("model_repo/<model_name>/1/model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())
```
See `model_repo/heart_failure_prediction/config.pbtxt` for an example of an ONNX model configuration file.
3. Create a BentoML service with a Triton runner. See `service.py` for an example.
4. Define a bentofile to specify which files to include in the bento. See `bentofile.yaml` for an example.
5. Build a bento.
```bash
bentoml build --do-not-track
```
6. Containerize the bento.
```bash
bentoml containerize -t model-service:alpha --enable-features=triton --do-not-track model-service:latest
```

7. Run the container with Docker.
```bash
docker run -d --gpus=1 --rm -p 3000:3000 model-service:alpha
```
12 changes: 12 additions & 0 deletions deploy/bentofile.yaml
@@ -0,0 +1,12 @@
service: service:svc
include:
  - /model_repo
  - /*.py
exclude:
  - /__pycache__
python:
  packages:
    - bentoml[triton]
    - torchxrayvision==1.2.1
docker:
  base_image: nvcr.io/nvidia/tritonserver:24.01-py3
5 changes: 5 additions & 0 deletions deploy/deployment.yaml
@@ -0,0 +1,5 @@
monitoring:
  enabled: true
  type: default
  options:
    output_dir: ./monitoring
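
This enables BentoML's built-in monitoring with the default backend, which writes collected records under `./monitoring`. A hedged sketch of how inference data could be logged to that collector, assuming BentoML's `bentoml.monitor` API and hypothetical field names:
```python
import bentoml


def record_prediction(age: float, predicted_label: int) -> None:
    """Log one feature and one prediction for the monitoring collector (sketch)."""
    with bentoml.monitor("heart_failure_prediction") as mon:
        mon.log(age, name="age", role="feature", data_type="numerical")
        mon.log(predicted_label, name="label", role="prediction", data_type="categorical")
```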
41 changes: 41 additions & 0 deletions deploy/model_repo/densenet121_res224_all/config.pbtxt
@@ -0,0 +1,41 @@
backend: "pytorch"
name: "densenet121_res224_all"
max_batch_size: 64
dynamic_batching {
  max_queue_delay_microseconds: 100
}
input {
  name: "INPUT__0"
  data_type: TYPE_FP32
  dims: 1
  dims: 224
  dims: 224
}
output {
  name: "OUTPUT__0"
  data_type: TYPE_FP32
  dims: -1
  dims: 18
}
instance_group [
  {
    count: 1
    kind: KIND_GPU
    gpus: [0]
  }
]
model_warmup [
  {
    name: "random sample"
    count: 1
    batch_size: 1
    inputs {
      key: "INPUT__0"
      value: {
        data_type: TYPE_FP32
        dims: [1, 224, 224]
        random_data: true
      }
    }
  }
]
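
The README's serialization example only covers the ResNet variant; tracing the DenseNet model that this config describes would follow the same pattern. A sketch, assuming the torchxrayvision `densenet121-res224-all` weights and a 224×224 input to match the dims above:
```python
import torch
import torchxrayvision as xrv

model = xrv.models.DenseNet(weights="densenet121-res224-all").eval().cuda()

# Random input in the normalized range [-1024, 1024], matching INPUT__0 (1, 224, 224).
dummy_input = ((-1024 - 1024) * torch.rand(1, 1, 224, 224) + 1024).cuda()

traced = torch.jit.trace(model, dummy_input)
traced.save("model_repo/densenet121_res224_all/1/model.pt")

# Sanity check: 18 pathology scores per image, matching OUTPUT__0.
print(traced(dummy_input).shape)  # torch.Size([1, 18])
```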
24 changes: 24 additions & 0 deletions deploy/model_repo/heart_failure_prediction/config.pbtxt
@@ -0,0 +1,24 @@
backend: "onnxruntime"
name: "heart_failure_prediction"
max_batch_size: 0
input {
  name: "X"
  data_type: TYPE_FP32
  dims: [-1, 21]
}
output {
  name: "label"
  data_type: TYPE_INT64
  dims: -1
}
instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]
optimization {
  execution_accelerators {
    cpu_execution_accelerator: [
      {
        name: "openvino"
      }
    ]
  }
}
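
One way to confirm that the exported ONNX model matches this config (input `X` of shape `[-1, 21]`, output `label` of type INT64) is to run it locally with onnxruntime before packaging the bento. A minimal sketch, assuming the model file from the README's sklearn example is already in place:
```python
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("model_repo/heart_failure_prediction/1/model.onnx")

# Batch of four random feature vectors with the 21 features the config expects.
X = np.random.rand(4, 21).astype(np.float32)
(labels,) = sess.run(["label"], {"X": X})

print(labels.dtype, labels.shape)  # expect int64, (4,)
```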
38 changes: 38 additions & 0 deletions deploy/model_repo/resnet50_res512_all/config.pbtxt
@@ -0,0 +1,38 @@
backend: "pytorch"
name: "resnet50_res512_all"
max_batch_size: 32
dynamic_batching {
  max_queue_delay_microseconds: 100
}
input {
  name: "INPUT__0"
  data_type: TYPE_FP32
  dims: 1
  dims: 512
  dims: 512
}
output {
  name: "OUTPUT__0"
  data_type: TYPE_FP32
  dims: -1
  dims: 18
}
instance_group [
  {
    count: 1
    kind: KIND_GPU
    gpus: [0]
  }
]
model_warmup [
  {
    name: "random sample"
    batch_size: 1
    inputs {
      key: "INPUT__0"
      value: {
        data_type: TYPE_FP32
        dims: [1, 512, 512]
        random_data: true
      }
    }
  }
]
115 changes: 115 additions & 0 deletions deploy/service.py
@@ -0,0 +1,115 @@
"""Model serving service with Triton Inference Server as backend."""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, Literal

import bentoml
import numpy as np
import torchxrayvision as xrv
from torchvision import transforms


if TYPE_CHECKING:
    from PIL.Image import Image


def get_transform(image_size: int) -> transforms.Compose:
"""Get image transformation for model inference."""
return transforms.Compose(
[
xrv.datasets.XRayCenterCrop(),
xrv.datasets.XRayResizer(image_size),
],
)


triton_runner = bentoml.triton.Runner(
"triton_runner",
"src/model_repo",
tritonserver_type="http",
cli_args=[
"--exit-on-error=true", # exits if any error occurs during initialization
"--http-restricted-api=model-repository:access-key=admin", # restrict access to load/unload APIs
"--model-control-mode=explicit", # enable explicit model loading/unloading
"--load-model=resnet50_res512_all",
],
)
svc = bentoml.Service("model-service", runners=[triton_runner])


@svc.api( # type: ignore
    input=bentoml.io.Multipart(im=bentoml.io.Image(), model_name=bentoml.io.Text()),
    output=bentoml.io.JSON(),
)
async def classify_xray(im: Image, model_name: str) -> dict[str, float]:
"""Classify X-ray image using specified model."""
img = np.asarray(im)
img = xrv.datasets.normalize(
img,
img.max(),
reshape=True, # normalize image to [-1024, 1024]
)

model_repo_index = await triton_runner.get_model_repository_index()
available_models = [model["name"] for model in model_repo_index]
if model_name not in available_models:
raise bentoml.exceptions.InvalidArgument(
f"Expected model name to be one of {available_models}, but got {model_name}",
)

img_size = 224
if "resnet" in model_name:
img_size = 512

img = get_transform(img_size)(img)

if len(img.shape) == 3:
img = img[None] # add batch dimension

InferResult = await getattr(triton_runner, model_name).async_run(img) # noqa: N806
return dict(
zip(xrv.datasets.default_pathologies, InferResult.as_numpy("OUTPUT__0")[0]),
)


@svc.api( # type: ignore
    input=bentoml.io.NumpyNdarray(dtype="float32", shape=(-1, 21)),
    output=bentoml.io.NumpyNdarray(dtype="int64", shape=(-1,)),
)
async def predict_heart_failure(X: np.ndarray) -> np.ndarray: # type: ignore
"""Run inference on heart failure prediction model."""
InferResult = await triton_runner.heart_failure_prediction.async_run( # noqa: N806
X,
)
return InferResult.as_numpy("label") # type: ignore[no-any-return]


# Triton Model management API
@svc.api(input=bentoml.io.JSON(), output=bentoml.io.JSON()) # type: ignore
async def model_config(input_model: dict[Literal["model_name"], str]) -> dict[str, Any]:
"""Retrieve model configuration from Triton Inference Server."""
return await triton_runner.get_model_config(input_model["model_name"]) # type: ignore


@svc.api(input=bentoml.io.Text(), output=bentoml.io.JSON()) # type: ignore
async def unload_model(input_model: str, ctx: bentoml.Context) -> dict[str, str]:
"""Unload a model from memory."""
await triton_runner.unload_model(
input_model,
headers=ctx.request.headers,
) # noqa: E501
return {"unloaded": input_model}


@svc.api(input=bentoml.io.Text(), output=bentoml.io.JSON()) # type: ignore
async def load_model(input_model: str, ctx: bentoml.Context) -> dict[str, str]:
"""Load a model into memory."""
await triton_runner.load_model(input_model, headers=ctx.request.headers)
return {"loaded": input_model}


@svc.api(input=bentoml.io.Text(), output=bentoml.io.JSON()) # type: ignore
async def list_models(_: str, ctx: bentoml.Context) -> list[str]:
"""Return a list of models available in the model repository."""
return await triton_runner.get_model_repository_index(headers=ctx.request.headers) # type: ignore
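
Since the runner starts with `--model-control-mode=explicit` and `--http-restricted-api=model-repository:access-key=admin`, only `resnet50_res512_all` is loaded at startup, and the load/unload endpoints forward the caller's headers to Triton. A hedged sketch of a client driving these management endpoints, assuming the service is running on `localhost:3000`:
```python
import requests

BASE = "http://localhost:3000"
headers = {"access-key": "admin"}  # required by the restricted model-repository API

# Load the DenseNet model on demand, list the repository, then unload it again.
print(requests.post(f"{BASE}/load_model", data="densenet121_res224_all", headers=headers).json())
print(requests.post(f"{BASE}/list_models", data="", headers=headers).json())
print(requests.post(f"{BASE}/unload_model", data="densenet121_res224_all", headers=headers).json())

# Fetch Triton's configuration for a model.
print(requests.post(f"{BASE}/model_config", json={"model_name": "resnet50_res512_all"}).json())
```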