perf: serve quantized Psyfighter2 (#81)
* perf: serve quantized Psyfighter2

* fix: add GPTQ quant param
sambarnes committed Mar 19, 2024
1 parent 2b1cb4a commit 754d41f
Showing 1 changed file with 10 additions and 4 deletions.
14 changes: 10 additions & 4 deletions modal/runner/containers/vllm_unified.py
@@ -110,6 +110,7 @@ def __init__(self):
     gpu=modal.gpu.A10G(count=1),
     concurrent_inputs=4,
     max_containers=5,
+    quantization="GPTQ",
 )
 
 _neural_chat = "TheBloke/neural-chat-7b-v3-1-GPTQ"
@@ -119,6 +120,7 @@ def __init__(self):
     gpu=modal.gpu.A10G(count=1),
     concurrent_inputs=4,
     max_containers=5,
+    quantization="GPTQ",
 )
 
 _psyfighter = "TheBloke/Psyfighter-13B-GPTQ"
@@ -128,14 +130,17 @@ def __init__(self):
     gpu=modal.gpu.A10G(count=1),
     concurrent_inputs=4,
     max_containers=5,
+    quantization="GPTQ",
 )
 
-# TODO: quantize this one too. shipping the others first to limit blast radius
+_psyfighter2 = "TheBloke/LLaMA2-13B-Psyfighter2-GPTQ"
 VllmContainer_KoboldAIPsyfighter2 = _make_container(
     name="VllmContainer_KoboldAIPsyfighter2",
-    model_name="KoboldAI/LLaMA2-13B-Psyfighter2",
-    gpu=modal.gpu.A100(count=1, memory=40),
-    concurrent_inputs=32,
+    model_name=_psyfighter2,
+    gpu=modal.gpu.A10G(count=1),
+    concurrent_inputs=4,
+    max_containers=5,
+    quantization="GPTQ",
 )
 
 _noromaid = "TheBloke/Noromaid-v0.1-mixtral-8x7b-Instruct-v3-GPTQ"
@@ -174,6 +179,7 @@ def __init__(self):
     "microsoft/phi-2": _phi2,
     "Intel/neural-chat-7b-v3-1": _neural_chat,
     "jebcarter/Psyfighter-13B": _psyfighter,
+    "KoboldAI/LLaMA2-13B-Psyfighter2": _psyfighter2,
     "NeverSleep/Noromaid-v0.1-mixtral-8x7b-Instruct-v3": _noromaid,
     "jondurbin/bagel-34b-v0.2": _bagel,
 }
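
Note: the _make_container helper is defined elsewhere in this file and is not part of the diff, so the hunks above only show the new quantization kwarg at its call sites. Assuming the helper forwards that kwarg to vLLM's engine arguments, the change is roughly equivalent to the following sketch against vLLM's offline LLM entrypoint (the model repo is the one registered above; the prompt and sampling settings are illustrative only):

from vllm import LLM, SamplingParams

# Load the GPTQ-quantized checkpoint. 4-bit weights for a 13B model fit on
# a single 24 GB A10G, which is what lets the config drop the 40 GB A100.
# Sketch only: it assumes _make_container passes quantization= straight
# through to vLLM, which expects a quantization method name here.
llm = LLM(
    model="TheBloke/LLaMA2-13B-Psyfighter2-GPTQ",
    quantization="gptq",
)

outputs = llm.generate(
    ["Describe the character you are about to play."],
    SamplingParams(max_tokens=64, temperature=0.8),
)
print(outputs[0].outputs[0].text)

Because the final hunk also maps the original name "KoboldAI/LLaMA2-13B-Psyfighter2" to the quantized repo, callers keep requesting the model under its old name and are routed to the GPTQ weights transparently.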
