feat: keep_warm=1 for noromaid mixtral & bagel (#75)
* deps: bump modal to 0.61.30

* refactor: stop parametrizing the model container classes

* feat: add keep_warm=1 for noromaid & bagel

* feat: use @modal.enter decorator to cold start before marking warm
sambarnes committed Mar 8, 2024
1 parent db6ed5d commit c22a91c
Showing 8 changed files with 57 additions and 38 deletions.
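
Taken together, the changes in this commit follow one pattern: `keep_warm=1` keeps a single container of each large model resident, and the `@modal.enter` hook finishes the model load when that container boots, before it is marked warm, so the first real request never pays the cold start (the diff below also sets `is_first_request = False` on warm containers, so the gap between container start and the first request is not recorded as usage). A minimal sketch of the pattern, assuming Modal ~0.61 APIs (`modal.Stub`, `@modal.enter`); the class and method names are illustrative, not this repo's code:

```python
import time

import modal

stub = modal.Stub("keep-warm-sketch")


@stub.cls(gpu=modal.gpu.A100(count=1), keep_warm=1)  # keep one container resident
class WarmedModel:
    @modal.enter()
    def startup(self) -> None:
        # Runs once per container start, before the container serves traffic,
        # so the slow initialization is never charged to a user's first request.
        time.sleep(5)  # stand-in for loading model weights onto the GPU
        self.ready_at = time.time()

    @modal.method()
    def ping(self) -> float:
        return self.ready_at
```
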
6 changes: 3 additions & 3 deletions modal/runner/__init__.py
@@ -1,6 +1,6 @@
from modal import Secret, asgi_app

from runner.containers import DEFAULT_CONTAINERS
from runner.containers.vllm_unified import REGISTERED_CONTAINERS
from runner.shared.clean import clean_models_volume
from runner.shared.common import stub
from runner.shared.download import download_model, downloader_image
@@ -40,7 +40,7 @@ def completion(): # named for backwards compatibility with the Modal URL
def download(force: bool = False):
logger = get_logger("download")
logger.info("Downloading all models...")
for model in DEFAULT_CONTAINERS:
for model in REGISTERED_CONTAINERS:
# Can't be parallelized because of a modal volume corruption issue
download_model.local(model, force=force)
logger.info("ALL DONE!")
@@ -56,5 +56,5 @@ def download(force: bool = False):
def clean(all: bool = False, dry: bool = False):
logger = get_logger("clean")
logger.info(f"Cleaning models volume. ALL: {all}. DRY: {dry}")
remaining_models = [] if all else [m.lower() for m in DEFAULT_CONTAINERS]
remaining_models = [] if all else [m.lower() for m in REGISTERED_CONTAINERS]
clean_models_volume(remaining_models, dry)
17 changes: 0 additions & 17 deletions modal/runner/containers/__init__.py
@@ -1,17 +0,0 @@
from .vllm_unified import (
VllmContainer_IntelNeuralChat7B,
VllmContainer_JebCarterPsyfighter13B,
VllmContainer_JohnDurbinBagel34B,
VllmContainer_KoboldAIPsyfighter2,
VllmContainer_MicrosoftPhi2,
VllmContainer_NeverSleepNoromaidMixtral8x7B,
)

DEFAULT_CONTAINERS = {
"microsoft/phi-2": VllmContainer_MicrosoftPhi2,
"Intel/neural-chat-7b-v3-1": VllmContainer_IntelNeuralChat7B,
"jebcarter/Psyfighter-13B": VllmContainer_JebCarterPsyfighter13B,
"KoboldAI/LLaMA2-13B-Psyfighter2": VllmContainer_KoboldAIPsyfighter2,
"NeverSleep/Noromaid-v0.1-mixtral-8x7b-Instruct-v3": VllmContainer_NeverSleepNoromaidMixtral8x7B,
"jondurbin/bagel-34b-v0.2": VllmContainer_JohnDurbinBagel34B,
}
48 changes: 38 additions & 10 deletions modal/runner/containers/vllm_unified.py
@@ -1,25 +1,31 @@
import os
from pathlib import Path
from typing import Optional

import modal.gpu
import sentry_sdk

from runner.engines.vllm import VllmEngine, VllmParams, vllm_image
from runner.shared.common import stub
from shared.config import is_env_dev
from shared.logging import (
get_logger,
get_observability_secrets,
)
from shared.protocol import GPUType
from shared.volumes import does_model_exist, models_path, models_volume
from shared.volumes import (
does_model_exist,
get_model_path,
models_path,
models_volume,
)


def _make_container(
name: str,
model_name: str,
gpu: modal.gpu = modal.gpu.A100(count=1, memory=40),
concurrent_inputs: int = 8,
max_containers: int = None,
keep_warm: int = None,
):
"""Helper function to create a container with the given GPU configuration."""

@@ -31,14 +31,16 @@ def _make_container(
else:
raise ValueError(f"Unknown GPU type: {gpu}")

# Avoid wasting resources & money in dev
if keep_warm and is_env_dev():
print("Dev environment detected, disabling keep_warm for", name)
keep_warm = None

class _VllmContainer(VllmEngine):
def __init__(
self,
model_path: Path,
max_model_len: Optional[int] = None,
):
def __init__(self):
logger = get_logger(name)
try:
model_path = get_model_path(model_name=model_name)
if not does_model_exist(model_path):
raise Exception("Unable to locate model {}", model_path)

@@ -58,10 +66,16 @@ def __init__(
params=VllmParams(
model=str(model_path),
tensor_parallel_size=num_gpus,
max_model_len=max_model_len,
),
)

# For any containers with keep_warm, we need to skip cold-start usage
# billing. This is because the first request might be minutes after
# the container is started, and we don't want to record that time as
# usage.
if keep_warm:
self.is_first_request = False

# Performance improvement from https://github.com/vllm-project/vllm/issues/2073#issuecomment-1853422529
if num_gpus > 1:
import subprocess
@@ -91,39 +105,53 @@ def __init__(
timeout=10 * 60,
secrets=[*get_observability_secrets()],
concurrency_limit=max_containers,
keep_warm=keep_warm,
)
return wrap(_VllmContainer)
_cls = wrap(_VllmContainer)
REGISTERED_CONTAINERS[model_name] = _cls
return _cls


# A mapping of model names to their respective container classes.
REGISTERED_CONTAINERS = {}

VllmContainer_MicrosoftPhi2 = _make_container(
name="VllmContainer_MicrosoftPhi2",
model_name="microsoft/phi-2",
gpu=modal.gpu.A100(count=1, memory=40),
concurrent_inputs=120,
)
VllmContainer_IntelNeuralChat7B = _make_container(
name="VllmContainer_IntelNeuralChat7B",
model_name="Intel/neural-chat-7b-v3-1",
gpu=modal.gpu.A100(count=1, memory=40),
concurrent_inputs=100,
)
VllmContainer_JebCarterPsyfighter13B = _make_container(
"VllmContainer_JebCarterPsyfighter13B",
model_name="jebcarter/Psyfighter-13B",
gpu=modal.gpu.A100(count=1, memory=40),
concurrent_inputs=32,
)
VllmContainer_KoboldAIPsyfighter2 = _make_container(
name="VllmContainer_KoboldAIPsyfighter2",
model_name="KoboldAI/LLaMA2-13B-Psyfighter2",
gpu=modal.gpu.A100(count=1, memory=40),
concurrent_inputs=32,
)
VllmContainer_NeverSleepNoromaidMixtral8x7B = _make_container(
name="VllmContainer_NeverSleepNoromaidMixtral8x7B",
model_name="NeverSleep/Noromaid-v0.1-mixtral-8x7b-Instruct-v3",
gpu=modal.gpu.A100(count=2, memory=80),
concurrent_inputs=4,
max_containers=3,
keep_warm=1,
)
VllmContainer_JohnDurbinBagel34B = _make_container(
name="VllmContainer_JohnDurbinBagel34B",
model_name="jondurbin/bagel-34b-v0.2",
gpu=modal.gpu.A100(count=2, memory=80),
concurrent_inputs=4,
max_containers=1,
keep_warm=1,
)
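
The `_make_container` factory now replaces the hand-maintained `DEFAULT_CONTAINERS` dict deleted from `runner/containers/__init__.py`: every factory call registers the wrapped class in `REGISTERED_CONTAINERS` keyed by model name, so a new model cannot be added to one mapping and forgotten in the other. A stripped-down sketch of the registration pattern and the endpoint-side lookup (plain Python, illustrative names, no Modal wiring):

```python
REGISTERED_CONTAINERS: dict[str, type] = {}


def make_container(name: str, model_name: str) -> type:
    # Stand-in for wrap(_VllmContainer): build the class, register it, return it.
    cls = type(name, (), {"model_name": model_name})
    REGISTERED_CONTAINERS[model_name] = cls
    return cls


PhiContainer = make_container("PhiContainer", "microsoft/phi-2")

# Lookup mirrors runner/endpoints/completion.py below: containers are resolved
# by model name and constructed with no arguments, since each class now derives
# its own model path internally.
container_cls = REGISTERED_CONTAINERS.get("microsoft/phi-2")
assert container_cls is PhiContainer
runner = container_cls()  # previously: container_cls(model_path)
```
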
6 changes: 3 additions & 3 deletions modal/runner/endpoints/completion.py
@@ -1,7 +1,7 @@
from fastapi import Request, status
from fastapi.responses import StreamingResponse

from runner.containers import DEFAULT_CONTAINERS
from runner.containers.vllm_unified import REGISTERED_CONTAINERS
from runner.shared.common import BACKLOG_THRESHOLD
from runner.shared.sampling_params import SamplingParams
from shared.logging import get_logger
@@ -38,7 +38,7 @@ def completion(
f"Unable to locate model {payload.model}",
)

container = DEFAULT_CONTAINERS.get(payload.model)
container = REGISTERED_CONTAINERS.get(payload.model)
if container is None:
message = f"Unable to locate container type for model {payload.model}"
logger.error(message)
@@ -47,7 +47,7 @@ def completion(
f"Unable to locate container type for model {payload.model}",
)

runner = container(model_path)
runner = container()

stats = runner.generate.get_current_stats()
logger.info(stats)
5 changes: 4 additions & 1 deletion modal/runner/engines/vllm.py
@@ -1,7 +1,7 @@
import time
from typing import Optional

from modal import Image, method
from modal import Image, enter, method
from pydantic import BaseModel

from shared.logging import (
@@ -70,11 +70,14 @@ def __init__(
self.gpu_type = gpu_type
self.is_first_request = True
self.t_cold_start = time.time()
self.engine = None
self.engine_args = AsyncEngineArgs(
**params.dict(),
disable_log_requests=True,
)

@enter()
def startup(self):
with timer("engine init", model=self.engine_args.model):
self.engine = AsyncLLMEngine.from_engine_args(self.engine_args)

5 changes: 5 additions & 0 deletions modal/shared/config.py
@@ -8,6 +8,11 @@
_auth = HTTPBearer()


def is_env_dev() -> bool:
"""Returns whether this is running in a development environment."""
return os.getenv("DD_ENV", "development") == "development"


class Config(BaseModel):
name: str
api_key_id: str
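
One operational detail from the hunks above: `keep_warm` is dropped whenever `is_env_dev()` returns true, and `DD_ENV` defaults to `"development"`, so only explicitly configured deployments pay for an always-on GPU container. A small self-contained sketch of how the two pieces combine:

```python
import os


def is_env_dev() -> bool:
    """Mirrors shared/config.py: DD_ENV defaults to "development"."""
    return os.getenv("DD_ENV", "development") == "development"


keep_warm = 1
if keep_warm and is_env_dev():
    # Same guard as in _make_container: never keep warm GPU containers in dev.
    keep_warm = None
```
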
6 changes: 3 additions & 3 deletions poetry.lock

(Generated lockfile; diff not rendered.)

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -8,7 +8,7 @@ readme = "README.md"
[tool.poetry.dependencies]
python = ">=3.10,<3.12"
huggingface-hub = "^0.17.1"
modal = "^0.61.24"
modal = "^0.61.30"
scipy = "^1.11.3"
fastapi = "^0.108.0"
sentry-sdk = "1.39.1"