perf: serve quantized versions of noromaid mixtral & bagel (#77)
* perf: serve quantized versions of noromaid mixtral & bagel

* fix: A100 40G instead of 80G

* chore: reformat for readability

* refactor: dedupe model names
sambarnes committed Mar 11, 2024
1 parent 49f8622 commit 1a41b87
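A rough sanity check on the "A100 40G instead of 80G" fix (back-of-envelope only; the parameter count below is approximate and not part of this commit):

# Weights-only VRAM estimate; real usage adds KV cache and overhead.
MIXTRAL_8X7B_PARAMS = 46.7e9  # Mixtral 8x7B total parameters (approx.)

fp16_gb = MIXTRAL_8X7B_PARAMS * 2 / 1e9     # 2 bytes/param -> ~93 GB
gptq4_gb = MIXTRAL_8X7B_PARAMS * 0.5 / 1e9  # 4 bits/param  -> ~23 GB
print(f"fp16 ~{fp16_gb:.0f} GB, 4-bit GPTQ ~{gptq4_gb:.0f} GB")

# ~93 GB of fp16 weights needed 2x A100 80G with tensor parallelism;
# ~23 GB of GPTQ weights fits one A100 40G, with headroom left for
# the KV cache (hence bagel's reduced max_model_len below).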
Showing 2 changed files with 39 additions and 11 deletions.
modal/runner/containers/vllm_unified.py (24 additions, 4 deletions)
@@ -26,6 +26,7 @@ def _make_container(
     concurrent_inputs: int = 8,
     max_containers: int = None,
     keep_warm: int = None,
+    **vllm_opts,
 ):
     """Helper function to create a container with the given GPU configuration."""

@@ -66,6 +67,7 @@ def __init__(self):
             params=VllmParams(
                 model=str(model_path),
                 tensor_parallel_size=num_gpus,
+                **vllm_opts,
             ),
         )

@@ -113,6 +115,7 @@ def __init__(self):
 
 
 # A mapping of model names to their respective container classes.
+# Automatically populated by _make_container.
 REGISTERED_CONTAINERS = {}
 
 VllmContainer_MicrosoftPhi2 = _make_container(
@@ -139,19 +142,36 @@ def __init__(self):
     gpu=modal.gpu.A100(count=1, memory=40),
     concurrent_inputs=32,
 )
+
+_noromaid = "TheBloke/Noromaid-v0.1-mixtral-8x7b-Instruct-v3-GPTQ"
 VllmContainer_NeverSleepNoromaidMixtral8x7B = _make_container(
     name="VllmContainer_NeverSleepNoromaidMixtral8x7B",
-    model_name="NeverSleep/Noromaid-v0.1-mixtral-8x7b-Instruct-v3",
-    gpu=modal.gpu.A100(count=2, memory=80),
+    model_name=_noromaid,
+    gpu=modal.gpu.A100(count=1, memory=40),
     concurrent_inputs=4,
     max_containers=3,
     keep_warm=1,
+    quantization="GPTQ",
+    dtype="float16",  # vLLM errors when using dtype="auto" with this model
 )
+
+_bagel = "TheBloke/bagel-34b-v0.2-GPTQ"
 VllmContainer_JohnDurbinBagel34B = _make_container(
     name="VllmContainer_JohnDurbinBagel34B",
-    model_name="jondurbin/bagel-34b-v0.2",
-    gpu=modal.gpu.A100(count=2, memory=80),
+    model_name=_bagel,
+    gpu=modal.gpu.A100(count=1, memory=40),
     concurrent_inputs=4,
     max_containers=1,
     keep_warm=1,
+    max_model_len=8_000,  # Reduced from original 200k
+    quantization="GPTQ",
+    dtype="float16",  # vLLM errors when using dtype="auto" with this model
 )
+
+# A re-mapping of model names to their respective quantized models.
+# From the outside, the model name is the original, but internally,
+# we use the quantized model name.
+QUANTIZED_MODELS = {
+    "NeverSleep/Noromaid-v0.1-mixtral-8x7b-Instruct-v3": _noromaid,
+    "jondurbin/bagel-34b-v0.2": _bagel,
+}
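
The new **vllm_opts kwargs flow through _make_container into VllmParams, so per-model options like quantization, dtype, and max_model_len reach the vLLM engine untouched. As a standalone sketch of what those options mean at the vLLM layer (field names follow vLLM's AsyncEngineArgs as of early 2024; this is illustrative, not this repo's wiring):

# Sketch: build a vLLM engine with the same options the diff sets
# for bagel. Values mirror the container definition above.
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

engine_args = AsyncEngineArgs(
    model="TheBloke/bagel-34b-v0.2-GPTQ",
    tensor_parallel_size=1,  # a single A100 40G suffices once quantized
    quantization="gptq",     # load the 4-bit GPTQ weights
    dtype="float16",         # this checkpoint errors with dtype="auto"
    max_model_len=8_000,     # cap context so the KV cache fits in 40 GB
)
engine = AsyncLLMEngine.from_engine_args(engine_args)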
modal/runner/endpoints/completion.py (15 additions, 7 deletions)
@@ -1,7 +1,10 @@
 from fastapi import Request, status
 from fastapi.responses import StreamingResponse
 
-from runner.containers.vllm_unified import REGISTERED_CONTAINERS
+from runner.containers.vllm_unified import (
+    QUANTIZED_MODELS,
+    REGISTERED_CONTAINERS,
+)
 from runner.shared.common import BACKLOG_THRESHOLD
 from runner.shared.sampling_params import SamplingParams
 from shared.logging import get_logger
@@ -18,7 +21,12 @@ def completion(
     request: Request,
     payload: CompletionPayload,
 ):
-    model_path = get_model_path(payload.model)
+    # Some models are served quantized, so we try re-mapping them first
+    model_name = payload.model
+    if model_name in QUANTIZED_MODELS:
+        model_name = QUANTIZED_MODELS[model_name]
+
+    model_path = get_model_path(model_name)
     logger.info(
         "Received completion request",
         extra={
@@ -31,20 +39,20 @@
         },
     )  # use path to match runner
     if not does_model_exist(model_path):
-        message = f"Unable to locate model {payload.model}"
+        message = f"Unable to locate model {model_name}"
         logger.error(message)
         return create_error_response(
             status.HTTP_400_BAD_REQUEST,
-            f"Unable to locate model {payload.model}",
+            f"Unable to locate model {model_name}",
         )
 
-    container = REGISTERED_CONTAINERS.get(payload.model)
+    container = REGISTERED_CONTAINERS.get(model_name)
     if container is None:
-        message = f"Unable to locate container type for model {payload.model}"
+        message = f"Unable to locate container type for model {model_name}"
         logger.error(message)
         return create_error_response(
             status.HTTP_400_BAD_REQUEST,
-            f"Unable to locate container type for model {payload.model}",
+            f"Unable to locate container type for model {model_name}",
         )
 
     runner = container()
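Net effect of the re-mapping: clients keep requesting the original model names, and the endpoint silently resolves them to the GPTQ checkpoints. A self-contained sketch (resolve is a hypothetical helper equivalent to the if/in lookup above):

# The remapping reduced to a dict lookup with a pass-through default:
# unmapped names are served as-is, mapped ones get their GPTQ build.
QUANTIZED_MODELS = {
    "NeverSleep/Noromaid-v0.1-mixtral-8x7b-Instruct-v3":
        "TheBloke/Noromaid-v0.1-mixtral-8x7b-Instruct-v3-GPTQ",
    "jondurbin/bagel-34b-v0.2": "TheBloke/bagel-34b-v0.2-GPTQ",
}

def resolve(requested: str) -> str:
    return QUANTIZED_MODELS.get(requested, requested)

assert resolve("jondurbin/bagel-34b-v0.2") == "TheBloke/bagel-34b-v0.2-GPTQ"
assert resolve("microsoft/phi-2") == "microsoft/phi-2"  # unmapped: unchanged

Because the public name stays stable, API clients need no changes when a model's backend switches to a quantized build.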
