feat: keep_warm=1 for noromaid mixtral & bagel (#75)
* deps: bump modal to 0.61.30

* refactor: stop parametrizing the model container classes

* feat: add keep_warm=1 for noromaid & bagel

* feat: use @modal.enter decorator to cold start before marking warm
sambarnes committed Mar 8, 2024
1 parent db6ed5d commit c22a91c
Showing 8 changed files with 57 additions and 38 deletions.
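
Taken together, the changes in this commit follow one pattern: `keep_warm=1` keeps a single container of each large model resident, and the `@modal.enter` hook finishes the model load when that container boots, before it is marked warm, so the first real request never pays the cold start (the diff below also sets `is_first_request = False` on warm containers, so the gap between container start and the first request is not recorded as usage). A minimal sketch of the pattern, assuming Modal ~0.61 APIs (`modal.Stub`, `@modal.enter`); the class and method names are illustrative, not this repo's code:

```python
import time

import modal

stub = modal.Stub("keep-warm-sketch")


@stub.cls(gpu=modal.gpu.A100(count=1), keep_warm=1)  # keep one container resident
class WarmedModel:
    @modal.enter()
    def startup(self) -> None:
        # Runs once per container start, before the container serves traffic,
        # so the slow initialization is never charged to a user's first request.
        time.sleep(5)  # stand-in for loading model weights onto the GPU
        self.ready_at = time.time()

    @modal.method()
    def ping(self) -> float:
        return self.ready_at
```
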
6 changes: 3 additions & 3 deletions modal/runner/__init__.py
@@ -1,6 +1,6 @@
from modal import Secret, asgi_app

from runner.containers import DEFAULT_CONTAINERS
from runner.containers.vllm_unified import REGISTERED_CONTAINERS
from runner.shared.clean import clean_models_volume
from runner.shared.common import stub
from runner.shared.download import download_model, downloader_image
@@ -40,7 +40,7 @@ def completion(): # named for backwards compatibility with the Modal URL
def download(force: bool = False):
logger = get_logger("download")
logger.info("Downloading all models...")
for model in DEFAULT_CONTAINERS:
for model in REGISTERED_CONTAINERS:
# Can't be parallelized because of a modal volume corruption issue
download_model.local(model, force=force)
logger.info("ALL DONE!")
@@ -56,5 +56,5 @@ def download(force: bool = False):
def clean(all: bool = False, dry: bool = False):
logger = get_logger("clean")
logger.info(f"Cleaning models volume. ALL: {all}. DRY: {dry}")
remaining_models = [] if all else [m.lower() for m in DEFAULT_CONTAINERS]
remaining_models = [] if all else [m.lower() for m in REGISTERED_CONTAINERS]
clean_models_volume(remaining_models, dry)
17 changes: 0 additions & 17 deletions modal/runner/containers/__init__.py
@@ -1,17 +0,0 @@
from .vllm_unified import (
VllmContainer_IntelNeuralChat7B,
VllmContainer_JebCarterPsyfighter13B,
VllmContainer_JohnDurbinBagel34B,
VllmContainer_KoboldAIPsyfighter2,
VllmContainer_MicrosoftPhi2,
VllmContainer_NeverSleepNoromaidMixtral8x7B,
)

DEFAULT_CONTAINERS = {
"microsoft/phi-2": VllmContainer_MicrosoftPhi2,
"Intel/neural-chat-7b-v3-1": VllmContainer_IntelNeuralChat7B,
"jebcarter/Psyfighter-13B": VllmContainer_JebCarterPsyfighter13B,
"KoboldAI/LLaMA2-13B-Psyfighter2": VllmContainer_KoboldAIPsyfighter2,
"NeverSleep/Noromaid-v0.1-mixtral-8x7b-Instruct-v3": VllmContainer_NeverSleepNoromaidMixtral8x7B,
"jondurbin/bagel-34b-v0.2": VllmContainer_JohnDurbinBagel34B,
}
48 changes: 38 additions & 10 deletions modal/runner/containers/vllm_unified.py
@@ -1,25 +1,31 @@
import os
from pathlib import Path
from typing import Optional

import modal.gpu
import sentry_sdk

from runner.engines.vllm import VllmEngine, VllmParams, vllm_image
from runner.shared.common import stub
from shared.config import is_env_dev
from shared.logging import (
get_logger,
get_observability_secrets,
)
from shared.protocol import GPUType
from shared.volumes import does_model_exist, models_path, models_volume
from shared.volumes import (
does_model_exist,
get_model_path,
models_path,
models_volume,
)


def _make_container(
name: str,
model_name: str,
gpu: modal.gpu = modal.gpu.A100(count=1, memory=40),
concurrent_inputs: int = 8,
max_containers: int = None,
keep_warm: int = None,
):
"""Helper function to create a container with the given GPU configuration."""

@@ -31,14 +31,16 @@ def _make_container(
else:
raise ValueError(f"Unknown GPU type: {gpu}")

# Avoid wasting resources & money in dev
if keep_warm and is_env_dev():
print("Dev environment detected, disabling keep_warm for", name)
keep_warm = None

class _VllmContainer(VllmEngine):
def __init__(
self,
model_path: Path,
max_model_len: Optional[int] = None,
):
def __init__(self):
logger = get_logger(name)
try:
model_path = get_model_path(model_name=model_name)
if not does_model_exist(model_path):
raise Exception("Unable to locate model {}", model_path)

@@ -58,10 +66,16 @@ def __init__(
params=VllmParams(
model=str(model_path),
tensor_parallel_size=num_gpus,
max_model_len=max_model_len,
),
)

# For any containers with keep_warm, we need to skip cold-start usage
# billing. This is because the first request might be minutes after
# the container is started, and we don't want to record that time as
# usage.
if keep_warm:
self.is_first_request = False

# Performance improvement from https://github.com/vllm-project/vllm/issues/2073#issuecomment-1853422529
if num_gpus > 1:
import subprocess
@@ -91,39 +105,53 @@ def __init__(
timeout=10 * 60,
secrets=[*get_observability_secrets()],
concurrency_limit=max_containers,
keep_warm=keep_warm,
)
return wrap(_VllmContainer)
_cls = wrap(_VllmContainer)
REGISTERED_CONTAINERS[model_name] = _cls
return _cls


# A mapping of model names to their respective container classes.
REGISTERED_CONTAINERS = {}

VllmContainer_MicrosoftPhi2 = _make_container(
name="VllmContainer_MicrosoftPhi2",
model_name="microsoft/phi-2",
gpu=modal.gpu.A100(count=1, memory=40),
concurrent_inputs=120,
)
VllmContainer_IntelNeuralChat7B = _make_container(
name="VllmContainer_IntelNeuralChat7B",
model_name="Intel/neural-chat-7b-v3-1",
gpu=modal.gpu.A100(count=1, memory=40),
concurrent_inputs=100,
)
VllmContainer_JebCarterPsyfighter13B = _make_container(
"VllmContainer_JebCarterPsyfighter13B",
model_name="jebcarter/Psyfighter-13B",
gpu=modal.gpu.A100(count=1, memory=40),
concurrent_inputs=32,
)
VllmContainer_KoboldAIPsyfighter2 = _make_container(
name="VllmContainer_KoboldAIPsyfighter2",
model_name="KoboldAI/LLaMA2-13B-Psyfighter2",
gpu=modal.gpu.A100(count=1, memory=40),
concurrent_inputs=32,
)
VllmContainer_NeverSleepNoromaidMixtral8x7B = _make_container(
name="VllmContainer_NeverSleepNoromaidMixtral8x7B",
model_name="NeverSleep/Noromaid-v0.1-mixtral-8x7b-Instruct-v3",
gpu=modal.gpu.A100(count=2, memory=80),
concurrent_inputs=4,
max_containers=3,
keep_warm=1,
)
VllmContainer_JohnDurbinBagel34B = _make_container(
name="VllmContainer_JohnDurbinBagel34B",
model_name="jondurbin/bagel-34b-v0.2",
gpu=modal.gpu.A100(count=2, memory=80),
concurrent_inputs=4,
max_containers=1,
keep_warm=1,
)
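
The `_make_container` factory now replaces the hand-maintained `DEFAULT_CONTAINERS` dict deleted from `runner/containers/__init__.py`: every factory call registers the wrapped class in `REGISTERED_CONTAINERS` keyed by model name, so a new model cannot be added to one mapping and forgotten in the other. A stripped-down sketch of the registration pattern and the endpoint-side lookup (plain Python, illustrative names, no Modal wiring):

```python
REGISTERED_CONTAINERS: dict[str, type] = {}


def make_container(name: str, model_name: str) -> type:
    # Stand-in for wrap(_VllmContainer): build the class, register it, return it.
    cls = type(name, (), {"model_name": model_name})
    REGISTERED_CONTAINERS[model_name] = cls
    return cls


PhiContainer = make_container("PhiContainer", "microsoft/phi-2")

# Lookup mirrors runner/endpoints/completion.py below: containers are resolved
# by model name and constructed with no arguments, since each class now derives
# its own model path internally.
container_cls = REGISTERED_CONTAINERS.get("microsoft/phi-2")
assert container_cls is PhiContainer
runner = container_cls()  # previously: container_cls(model_path)
```
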
6 changes: 3 additions & 3 deletions modal/runner/endpoints/completion.py
@@ -1,7 +1,7 @@
from fastapi import Request, status
from fastapi.responses import StreamingResponse

from runner.containers import DEFAULT_CONTAINERS
from runner.containers.vllm_unified import REGISTERED_CONTAINERS
from runner.shared.common import BACKLOG_THRESHOLD
from runner.shared.sampling_params import SamplingParams
from shared.logging import get_logger
@@ -38,7 +38,7 @@ def completion(
f"Unable to locate model {payload.model}",
)

container = DEFAULT_CONTAINERS.get(payload.model)
container = REGISTERED_CONTAINERS.get(payload.model)
if container is None:
message = f"Unable to locate container type for model {payload.model}"
logger.error(message)
@@ -47,7 +47,7 @@ def completion(
f"Unable to locate container type for model {payload.model}",
)

runner = container(model_path)
runner = container()

stats = runner.generate.get_current_stats()
logger.info(stats)
5 changes: 4 additions & 1 deletion modal/runner/engines/vllm.py
@@ -1,7 +1,7 @@
import time
from typing import Optional

from modal import Image, method
from modal import Image, enter, method
from pydantic import BaseModel

from shared.logging import (
@@ -70,11 +70,14 @@ def __init__(
self.gpu_type = gpu_type
self.is_first_request = True
self.t_cold_start = time.time()
self.engine = None
self.engine_args = AsyncEngineArgs(
**params.dict(),
disable_log_requests=True,
)

@enter()
def startup(self):
with timer("engine init", model=self.engine_args.model):
self.engine = AsyncLLMEngine.from_engine_args(self.engine_args)

5 changes: 5 additions & 0 deletions modal/shared/config.py
@@ -8,6 +8,11 @@
_auth = HTTPBearer()


def is_env_dev() -> bool:
"""Returns whether this is running in a development environment."""
return os.getenv("DD_ENV", "development") == "development"


class Config(BaseModel):
name: str
api_key_id: str
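
One operational detail from the hunks above: `keep_warm` is dropped whenever `is_env_dev()` returns true, and `DD_ENV` defaults to `"development"`, so only explicitly configured deployments pay for an always-on GPU container. A small self-contained sketch of how the two pieces combine:

```python
import os


def is_env_dev() -> bool:
    """Mirrors shared/config.py: DD_ENV defaults to "development"."""
    return os.getenv("DD_ENV", "development") == "development"


keep_warm = 1
if keep_warm and is_env_dev():
    # Same guard as in _make_container: never keep warm GPU containers in dev.
    keep_warm = None
```
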
6 changes: 3 additions & 3 deletions poetry.lock

(Generated lockfile; diff not rendered.)

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -8,7 +8,7 @@ readme = "README.md"
[tool.poetry.dependencies]
python = ">=3.10,<3.12"
huggingface-hub = "^0.17.1"
modal = "^0.61.24"
modal = "^0.61.30"
scipy = "^1.11.3"
fastapi = "^0.108.0"
sentry-sdk = "1.39.1"