perf: serve quantized versions of noromaid mixtral & bagel (#77)
* perf: serve quantized versions of noromaid mixtral & bagel

* fix: A100 40G instead of 80G

* chore: reformat for readability

* refactor: dedupe model names
sambarnes committed Mar 11, 2024
1 parent 49f8622 commit 1a41b87
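A rough sanity check on the "A100 40G instead of 80G" fix (back-of-envelope only; the parameter count below is approximate and not part of this commit):

# Weights-only VRAM estimate; real usage adds KV cache and overhead.
MIXTRAL_8X7B_PARAMS = 46.7e9  # Mixtral 8x7B total parameters (approx.)

fp16_gb = MIXTRAL_8X7B_PARAMS * 2 / 1e9     # 2 bytes/param -> ~93 GB
gptq4_gb = MIXTRAL_8X7B_PARAMS * 0.5 / 1e9  # 4 bits/param  -> ~23 GB
print(f"fp16 ~{fp16_gb:.0f} GB, 4-bit GPTQ ~{gptq4_gb:.0f} GB")

# ~93 GB of fp16 weights needed 2x A100 80G with tensor parallelism;
# ~23 GB of GPTQ weights fits one A100 40G, with headroom left for
# the KV cache (hence bagel's reduced max_model_len below).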
Showing 2 changed files with 39 additions and 11 deletions.
modal/runner/containers/vllm_unified.py (24 additions, 4 deletions)
@@ -26,6 +26,7 @@ def _make_container(
     concurrent_inputs: int = 8,
     max_containers: int = None,
     keep_warm: int = None,
+    **vllm_opts,
 ):
     """Helper function to create a container with the given GPU configuration."""

@@ -66,6 +67,7 @@ def __init__(self):
             params=VllmParams(
                 model=str(model_path),
                 tensor_parallel_size=num_gpus,
+                **vllm_opts,
             ),
         )

@@ -113,6 +115,7 @@ def __init__(self):
 
 
 # A mapping of model names to their respective container classes.
+# Automatically populated by _make_container.
 REGISTERED_CONTAINERS = {}
 
 VllmContainer_MicrosoftPhi2 = _make_container(
@@ -139,19 +142,36 @@ def __init__(self):
     gpu=modal.gpu.A100(count=1, memory=40),
     concurrent_inputs=32,
 )
+
+_noromaid = "TheBloke/Noromaid-v0.1-mixtral-8x7b-Instruct-v3-GPTQ"
 VllmContainer_NeverSleepNoromaidMixtral8x7B = _make_container(
     name="VllmContainer_NeverSleepNoromaidMixtral8x7B",
-    model_name="NeverSleep/Noromaid-v0.1-mixtral-8x7b-Instruct-v3",
-    gpu=modal.gpu.A100(count=2, memory=80),
+    model_name=_noromaid,
+    gpu=modal.gpu.A100(count=1, memory=40),
     concurrent_inputs=4,
     max_containers=3,
     keep_warm=1,
+    quantization="GPTQ",
+    dtype="float16",  # vLLM errors when using dtype="auto" with this model
 )
+
+_bagel = "TheBloke/bagel-34b-v0.2-GPTQ"
 VllmContainer_JohnDurbinBagel34B = _make_container(
     name="VllmContainer_JohnDurbinBagel34B",
-    model_name="jondurbin/bagel-34b-v0.2",
-    gpu=modal.gpu.A100(count=2, memory=80),
+    model_name=_bagel,
+    gpu=modal.gpu.A100(count=1, memory=40),
     concurrent_inputs=4,
     max_containers=1,
     keep_warm=1,
+    max_model_len=8_000,  # Reduced from original 200k
+    quantization="GPTQ",
+    dtype="float16",  # vLLM errors when using dtype="auto" with this model
 )
+
+# A re-mapping of model names to their respective quantized models.
+# From the outside, the model name is the original, but internally,
+# we use the quantized model name.
+QUANTIZED_MODELS = {
+    "NeverSleep/Noromaid-v0.1-mixtral-8x7b-Instruct-v3": _noromaid,
+    "jondurbin/bagel-34b-v0.2": _bagel,
+}
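
The new **vllm_opts kwargs flow through _make_container into VllmParams, so per-model options like quantization, dtype, and max_model_len reach the vLLM engine untouched. As a standalone sketch of what those options mean at the vLLM layer (field names follow vLLM's AsyncEngineArgs as of early 2024; this is illustrative, not this repo's wiring):

# Sketch: build a vLLM engine with the same options the diff sets
# for bagel. Values mirror the container definition above.
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

engine_args = AsyncEngineArgs(
    model="TheBloke/bagel-34b-v0.2-GPTQ",
    tensor_parallel_size=1,  # a single A100 40G suffices once quantized
    quantization="gptq",     # load the 4-bit GPTQ weights
    dtype="float16",         # this checkpoint errors with dtype="auto"
    max_model_len=8_000,     # cap context so the KV cache fits in 40 GB
)
engine = AsyncLLMEngine.from_engine_args(engine_args)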
modal/runner/endpoints/completion.py (15 additions, 7 deletions)
@@ -1,7 +1,10 @@
 from fastapi import Request, status
 from fastapi.responses import StreamingResponse
 
-from runner.containers.vllm_unified import REGISTERED_CONTAINERS
+from runner.containers.vllm_unified import (
+    QUANTIZED_MODELS,
+    REGISTERED_CONTAINERS,
+)
 from runner.shared.common import BACKLOG_THRESHOLD
 from runner.shared.sampling_params import SamplingParams
 from shared.logging import get_logger
@@ -18,7 +21,12 @@ def completion(
     request: Request,
     payload: CompletionPayload,
 ):
-    model_path = get_model_path(payload.model)
+    # Some models are served quantized, so we try re-mapping them first
+    model_name = payload.model
+    if model_name in QUANTIZED_MODELS:
+        model_name = QUANTIZED_MODELS[model_name]
+
+    model_path = get_model_path(model_name)
     logger.info(
         "Received completion request",
         extra={
@@ -31,20 +39,20 @@
         },
     )  # use path to match runner
     if not does_model_exist(model_path):
-        message = f"Unable to locate model {payload.model}"
+        message = f"Unable to locate model {model_name}"
         logger.error(message)
         return create_error_response(
             status.HTTP_400_BAD_REQUEST,
-            f"Unable to locate model {payload.model}",
+            f"Unable to locate model {model_name}",
         )
 
-    container = REGISTERED_CONTAINERS.get(payload.model)
+    container = REGISTERED_CONTAINERS.get(model_name)
     if container is None:
-        message = f"Unable to locate container type for model {payload.model}"
+        message = f"Unable to locate container type for model {model_name}"
         logger.error(message)
         return create_error_response(
             status.HTTP_400_BAD_REQUEST,
-            f"Unable to locate container type for model {payload.model}",
+            f"Unable to locate container type for model {model_name}",
         )
 
     runner = container()
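Net effect of the re-mapping: clients keep requesting the original model names, and the endpoint silently resolves them to the GPTQ checkpoints. A self-contained sketch (resolve is a hypothetical helper equivalent to the if/in lookup above):

# The remapping reduced to a dict lookup with a pass-through default:
# unmapped names are served as-is, mapped ones get their GPTQ build.
QUANTIZED_MODELS = {
    "NeverSleep/Noromaid-v0.1-mixtral-8x7b-Instruct-v3":
        "TheBloke/Noromaid-v0.1-mixtral-8x7b-Instruct-v3-GPTQ",
    "jondurbin/bagel-34b-v0.2": "TheBloke/bagel-34b-v0.2-GPTQ",
}

def resolve(requested: str) -> str:
    return QUANTIZED_MODELS.get(requested, requested)

assert resolve("jondurbin/bagel-34b-v0.2") == "TheBloke/bagel-34b-v0.2-GPTQ"
assert resolve("microsoft/phi-2") == "microsoft/phi-2"  # unmapped: unchanged

Because the public name stays stable, API clients need no changes when a model's backend switches to a quantized build.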
