refactor: move all models to their own unique containers (#71)
* refactor: move all models to their own unique containers

* fix: bug in modal object initialization

* fix: logging bug

* chore: remove dead code
sambarnes committed Mar 6, 2024
1 parent a60f30d commit 8d140f8
Showing 9 changed files with 61 additions and 140 deletions.
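
Taken together, the change replaces the ContainerType enum and the get_container() dispatch with a direct mapping from model ID to a dedicated container class. A minimal sketch of the new lookup pattern (the classes below are stand-in stubs, not the real Modal container classes from modal/runner/containers/vllm_unified.py, and the model path is hypothetical):

from pathlib import Path


# Stand-in stubs for illustration; the real classes are produced by the
# _make_container factory in vllm_unified.py.
class VllmContainer_MicrosoftPhi2:
    def __init__(self, model_path: Path):
        self.model_path = model_path


class VllmContainer_IntelNeuralChat7B:
    def __init__(self, model_path: Path):
        self.model_path = model_path


# After this commit: model ID -> dedicated container class, no enum indirection.
DEFAULT_CONTAINERS = {
    "microsoft/phi-2": VllmContainer_MicrosoftPhi2,
    "Intel/neural-chat-7b-v3-1": VllmContainer_IntelNeuralChat7B,
}

container_cls = DEFAULT_CONTAINERS.get("microsoft/phi-2")
runner = container_cls(Path("/models/microsoft/phi-2"))  # replaces get_container(...)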
10 changes: 3 additions & 7 deletions modal/runner/__init__.py
@@ -1,8 +1,6 @@
 from modal import Secret, asgi_app
 
-from runner.containers import (
-    DEFAULT_CONTAINER_TYPES,
-)
+from runner.containers import DEFAULT_CONTAINERS
 from runner.shared.clean import clean_models_volume
 from runner.shared.common import stub
 from runner.shared.download import download_model, downloader_image
@@ -42,7 +40,7 @@ def completion():  # named for backwards compatibility with the Modal URL
 def download(force: bool = False):
     logger = get_logger("download")
     logger.info("Downloading all models...")
-    for model in DEFAULT_CONTAINER_TYPES:
+    for model in DEFAULT_CONTAINERS:
         # Can't be parallelized because of a modal volume corruption issue
         download_model.local(model, force=force)
     logger.info("ALL DONE!")
@@ -58,7 +56,5 @@ def download(force: bool = False):
 def clean(all: bool = False, dry: bool = False):
     logger = get_logger("clean")
     logger.info(f"Cleaning models volume. ALL: {all}. DRY: {dry}")
-    remaining_models = (
-        [] if all else [m.lower() for m in DEFAULT_CONTAINER_TYPES]
-    )
+    remaining_models = [] if all else [m.lower() for m in DEFAULT_CONTAINERS]
     clean_models_volume(remaining_models, dry)
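
Since iterating a dict yields its keys, swapping DEFAULT_CONTAINER_TYPES for DEFAULT_CONTAINERS leaves the download and clean loops behaviorally unchanged; a tiny standalone illustration (stub values stand in for the real container classes):

DEFAULT_CONTAINERS = {"microsoft/phi-2": object, "jondurbin/bagel-34b-v0.2": object}

for model in DEFAULT_CONTAINERS:  # iterates over the model IDs (the keys)
    print(model)

remaining_models = [m.lower() for m in DEFAULT_CONTAINERS]
print(remaining_models)  # ['microsoft/phi-2', 'jondurbin/bagel-34b-v0.2']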
56 changes: 13 additions & 43 deletions modal/runner/containers/__init__.py
@@ -1,47 +1,17 @@
-from pathlib import Path
-
-import modal
-
-from shared.protocol import ContainerType
-
 from .vllm_unified import (
-    VllmContainer_3B,
-    VllmContainer_7B,
-    VllmContainerA100_40G,
-    VllmContainerA100_80G,
-    VllmContainerA100_160G,
-    VllmContainerA100_160G_Isolated,
+    VllmContainer_IntelNeuralChat7B,
+    VllmContainer_JebCarterPsyfighter13B,
+    VllmContainer_JohnDurbinBagel34B,
+    VllmContainer_KoboldAIPsyfighter2,
+    VllmContainer_MicrosoftPhi2,
+    VllmContainer_NeverSleepNoromaidMixtral8x7B,
 )
 
-DEFAULT_CONTAINER_TYPES = {
-    "microsoft/phi-2": ContainerType.VllmContainer_3B,
-    "Intel/neural-chat-7b-v3-1": ContainerType.VllmContainer_7B,
-    # "PygmalionAI/mythalion-13b": ContainerType.VllmContainerA100_40G,
-    "jebcarter/Psyfighter-13B": ContainerType.VllmContainerA100_40G,
-    "KoboldAI/LLaMA2-13B-Psyfighter2": ContainerType.VllmContainerA100_40G,
-    # "Austism/chronos-hermes-13b-v2": ContainerType.VllmContainerA100_40G,
-    # "NeverSleep/Noromaid-20b-v0.1.1": ContainerType.VllmContainerA100_80G,
-    # "cognitivecomputations/dolphin-2.6-mixtral-8x7b": ContainerType.VllmContainerA100_160G,
-    "NeverSleep/Noromaid-v0.1-mixtral-8x7b-Instruct-v3": ContainerType.VllmContainerA100_160G,
-    "jondurbin/bagel-34b-v0.2": ContainerType.VllmContainerA100_160G_Isolated,
+DEFAULT_CONTAINERS = {
+    "microsoft/phi-2": VllmContainer_MicrosoftPhi2,
+    "Intel/neural-chat-7b-v3-1": VllmContainer_IntelNeuralChat7B,
+    "jebcarter/Psyfighter-13B": VllmContainer_JebCarterPsyfighter13B,
+    "KoboldAI/LLaMA2-13B-Psyfighter2": VllmContainer_KoboldAIPsyfighter2,
+    "NeverSleep/Noromaid-v0.1-mixtral-8x7b-Instruct-v3": VllmContainer_NeverSleepNoromaidMixtral8x7B,
+    "jondurbin/bagel-34b-v0.2": VllmContainer_JohnDurbinBagel34B,
 }
-
-
-def get_container(
-    model_path: Path, container_type: ContainerType
-) -> modal.cls.Obj:
-    match container_type:
-        case ContainerType.VllmContainer_3B:
-            return VllmContainer_3B(model_path)
-        case ContainerType.VllmContainer_7B:
-            return VllmContainer_7B(model_path)
-        case ContainerType.VllmContainerA100_40G:
-            return VllmContainerA100_40G(model_path)
-        case ContainerType.VllmContainerA100_80G:
-            return VllmContainerA100_80G(model_path)
-        case ContainerType.VllmContainerA100_80G_32K:
-            return VllmContainerA100_80G(model_path, max_model_len=32_000)
-        case ContainerType.VllmContainerA100_160G:
-            return VllmContainerA100_160G(model_path)
-        case ContainerType.VllmContainerA100_160G_Isolated:
-            return VllmContainerA100_160G_Isolated(model_path)
38 changes: 23 additions & 15 deletions modal/runner/containers/vllm_unified.py
@@ -96,26 +96,34 @@ def __init__(
     return wrap(_VllmContainer)


-VllmContainer_3B = _make_container(
-    "VllmContainer_3B", num_gpus=1, concurrent_inputs=120
+VllmContainer_MicrosoftPhi2 = _make_container(
+    name="VllmContainer_MicrosoftPhi2",
+    num_gpus=1,
+    concurrent_inputs=120,
 )
-
-VllmContainer_7B = _make_container(
-    "VllmContainer_7B", num_gpus=1, concurrent_inputs=100
+VllmContainer_IntelNeuralChat7B = _make_container(
+    name="VllmContainer_IntelNeuralChat7B",
+    num_gpus=1,
+    concurrent_inputs=100,
 )
-VllmContainerA100_40G = _make_container(
-    "VllmContainerA100_40G", num_gpus=1, concurrent_inputs=32
+VllmContainer_JebCarterPsyfighter13B = _make_container(
+    "VllmContainer_JebCarterPsyfighter13B",
+    num_gpus=1,
+    concurrent_inputs=32,
 )
-VllmContainerA100_80G = _make_container(
-    "VllmContainerA100_80G", num_gpus=1, memory=80
+VllmContainer_KoboldAIPsyfighter2 = _make_container(
+    name="VllmContainer_KoboldAIPsyfighter2",
+    num_gpus=1,
+    concurrent_inputs=32,
 )
-VllmContainerA100_160G = _make_container(
-    "VllmContainerA100_160G", num_gpus=2, memory=80, concurrent_inputs=4
+VllmContainer_NeverSleepNoromaidMixtral8x7B = _make_container(
+    name="VllmContainer_NeverSleepNoromaidMixtral8x7B",
+    num_gpus=2,
+    memory=80,
+    concurrent_inputs=4,
 )
-
-# Allow new models to be tested on the isolated container
-VllmContainerA100_160G_Isolated = _make_container(
-    "VllmContainerA100_160G_Isolated",
+VllmContainer_JohnDurbinBagel34B = _make_container(
+    name="VllmContainer_JohnDurbinBagel34B",
     num_gpus=2,
     memory=80,
     concurrent_inputs=4,
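
All of the per-model classes above come out of the same _make_container factory, whose body is collapsed in this diff. The following is only a rough sketch of the general shape such a factory can take; the class body and attribute names are assumptions, not the repository's actual implementation:

def _make_container(name, num_gpus=1, memory=40, concurrent_inputs=8):
    """Build a uniquely named container class bound to one GPU/concurrency profile."""

    class _VllmContainer:
        NUM_GPUS = num_gpus
        MEMORY_GB = memory
        CONCURRENT_INPUTS = concurrent_inputs

        def __init__(self, model_path):
            self.model_path = model_path

    # Give each model's class a distinct, inspectable name.
    _VllmContainer.__name__ = name
    _VllmContainer.__qualname__ = name
    return _VllmContainer


# Mirrors one of the real definitions above:
VllmContainer_MicrosoftPhi2 = _make_container(
    name="VllmContainer_MicrosoftPhi2", num_gpus=1, concurrent_inputs=120
)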
24 changes: 9 additions & 15 deletions modal/runner/endpoints/completion.py
@@ -1,10 +1,10 @@
 from fastapi import Request, status
 from fastapi.responses import StreamingResponse
 
-from runner.containers import DEFAULT_CONTAINER_TYPES, get_container
+from runner.containers import DEFAULT_CONTAINERS
 from runner.shared.common import BACKLOG_THRESHOLD
 from runner.shared.sampling_params import SamplingParams
-from shared.logging import get_logger, timer
+from shared.logging import get_logger
 from shared.protocol import (
     CompletionPayload,
     create_error_response,
@@ -38,21 +38,16 @@ def completion(
             f"Unable to locate model {payload.model}",
         )
 
-    container_type = (
-        payload.runner.container
-        if payload.runner
-        else DEFAULT_CONTAINER_TYPES.get(payload.model)
-    )
-
-    if container_type is None:
+    container = DEFAULT_CONTAINERS.get(payload.model)
+    if container is None:
         message = f"Unable to locate container type for model {payload.model}"
         logger.error(message)
         return create_error_response(
             status.HTTP_400_BAD_REQUEST,
             f"Unable to locate container type for model {payload.model}",
         )
 
-    runner = get_container(model_path, container_type)
+    runner = container(model_path)
 
     stats = runner.generate.get_current_stats()
     logger.info(stats)
@@ -96,11 +91,10 @@ def completion(
         return create_error_response(status.HTTP_400_BAD_REQUEST, str(e))
 
     async def generate():
-        with timer("runner.generate", str(model_path), container_type):
-            async for text in runner.generate.remote_gen.aio(
-                payload, sampling_params
-            ):
-                yield text
+        async for text in runner.generate.remote_gen.aio(
+            payload, sampling_params
+        ):
+            yield text
 
     return StreamingResponse(
         generate(),
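
Condensed, the endpoint's routing is now a dictionary lookup followed by direct instantiation. A simplified sketch of the flow above (DEFAULT_CONTAINERS, create_error_response, and sampling_params are assumed to be in scope as in the full file; request validation and response headers are trimmed):

from fastapi import status
from fastapi.responses import StreamingResponse


def completion(payload, model_path):
    container = DEFAULT_CONTAINERS.get(payload.model)  # direct class lookup by model ID
    if container is None:
        return create_error_response(
            status.HTTP_400_BAD_REQUEST,
            f"Unable to locate container type for model {payload.model}",
        )

    runner = container(model_path)  # instantiate the per-model container

    async def generate():
        async for text in runner.generate.remote_gen.aio(payload, sampling_params):
            yield text

    return StreamingResponse(generate())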
5 changes: 5 additions & 0 deletions modal/runner/engines/vllm.py
@@ -71,6 +71,10 @@ def __init__(
     def gpu_count(self) -> int:
         return self.engine_args.tensor_parallel_size
 
+    @property
+    def cost_per_second(self) -> float:
+        return self.gpu_count * self.gpu_type.cost_per_second
+
     # @method()
     # async def tokenize_prompt(self, payload: Payload) -> List[int]:
     #     return self.tokenizer(payload.prompt).input_ids
@@ -147,6 +151,7 @@ async def generate(self, payload: CompletionPayload, params):
                     "tokens": resp.usage.completion_tokens,
                     "tps": resp.usage.completion_tokens / t_start_inference,
                     "duration": resp.usage.duration,
+                    "cost": resp.usage.duration * self.cost_per_second,
                 },
             )
         except Exception as err:
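
The new cost field is plain arithmetic: per-request cost = generation duration x (GPU count x per-GPU rate). A worked example with placeholder numbers (the real rate constants live in modal/shared/protocol.py and are not shown in this diff):

A100_80G_RATE = 0.0015  # placeholder USD per second, not Modal's quoted price

gpu_count = 2  # e.g. a two-GPU (2 x 80GB) container
cost_per_second = gpu_count * A100_80G_RATE

duration = 12.5  # seconds spent generating, per resp.usage.duration
request_cost = duration * cost_per_second
print(f"~${request_cost:.4f} per request")  # ~$0.0375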
7 changes: 0 additions & 7 deletions modal/shared/logging.py
@@ -14,8 +14,6 @@
 from modal import Image, Secret
 from sentry_sdk.scrubber import DEFAULT_DENYLIST, EventScrubber
 
-from shared.protocol import ContainerType
-
 _sentry_denylist = DEFAULT_DENYLIST + ["prompt"]
 sentry_sdk.init(
     dsn=os.environ.get("SENTRY_DSN"),
@@ -43,7 +41,6 @@ def add_observability(image: Image):
 def timer(
     action: str,
     model: str = None,
-    container_type: ContainerType = None,
     tags: dict[str, str | int] = None,
 ) -> None:
     """
@@ -52,7 +49,6 @@ def timer(
     Args:
         action: The noun being timed
         model: Optional, used as a tag
-        container_type: Optional, used as a tag and to estimate GPU cost
         tags: Any additional tags to include in the structured log
     """
     start = time.perf_counter()
@@ -68,9 +64,6 @@ def timer(
     extra = (tags or {}) | {"duration": elapsed}
     if model:
         extra["model"] = model
-    if container_type:
-        extra["container_type"] = container_type.value
-        extra["gpu_cost"] = elapsed * container_type.gpu_cost_per_second
 
     logging.info(f"{action} execution profiled", extra=extra)

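
With the ContainerType tag gone, timer() now reports only the action, an optional model, the duration, and caller-supplied tags. A minimal sketch of the slimmed-down helper, reconstructed from the visible hunks (so the decorator and surrounding details are approximate):

import logging
import time
from contextlib import contextmanager


@contextmanager
def timer(action: str, model: str = None, tags: dict[str, str | int] = None):
    """Time the wrapped block and emit one structured log line."""
    start = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - start
        extra = (tags or {}) | {"duration": elapsed}
        if model:
            extra["model"] = model
        logging.info(f"{action} execution profiled", extra=extra)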
46 changes: 5 additions & 41 deletions modal/shared/protocol.py
@@ -12,44 +12,13 @@ class GPUType(Enum):
     A100_40G = "A100_40G"
     A100_80G = "A100_80G"
 
-
-class ContainerType(Enum):
-    VllmContainer_3B = "VllmContainer_3B"
-    VllmContainer_7B = "VllmContainer_7B"
-
-    VllmContainerA100_40G = "VllmContainerA100_40G"
-
-    VllmContainerA100_80G = "VllmContainerA100_80G"
-    VllmContainerA100_80G_32K = "VllmContainerA100_80G_32K"
-
-    VllmContainerA100_160G = "VllmContainerA100_160G"
-    VllmContainerA100_160G_Isolated = "VllmContainerA100_160G_Isolated"
-
     @property
-    def gpu_cost_per_second(self) -> float:
-        """
-        Returns:
-            The quoted GPU compute cost per second for the container,
-            as found on https://modal.com/pricing
-        """
-
-        # TODO: might be better to put this on the container class itself,
-        # but this is good enough(tm) for now
+    def cost_per_second(self) -> float:
         match self:
-            case ContainerType.VllmContainer_3B:
-                return _COST_PER_SECOND_A100_40G * 1
-            case ContainerType.VllmContainer_7B:
-                return _COST_PER_SECOND_A100_40G * 1
-            case ContainerType.VllmContainerA100_40G:
-                return _COST_PER_SECOND_A100_40G * 1
-            case ContainerType.VllmContainerA100_80G:
-                return _COST_PER_SECOND_A100_80G * 1
-            case ContainerType.VllmContainerA100_80G_32K:
-                return _COST_PER_SECOND_A100_80G * 1
-            case ContainerType.VllmContainerA100_160G:
-                return _COST_PER_SECOND_A100_80G * 2
-            case ContainerType.VllmContainerA100_160G_Isolated:
-                return _COST_PER_SECOND_A100_80G * 2
+            case GPUType.A100_40G:
+                return _COST_PER_SECOND_A100_40G
+            case GPUType.A100_80G:
+                return _COST_PER_SECOND_A100_80G
 
 
 # https://github.com/vllm-project/vllm/blob/320a622ec4d098f2da5d097930f4031517e7327b/vllm/sampling_params.py#L7-L52
@@ -77,17 +46,12 @@ class Params(BaseModel):
     skip_special_tokens: bool = True
 
 
-class RunnerConfiguration(BaseModel):
-    container: ContainerType
-
-
 class CompletionPayload(BaseModel):
     id: str
     prompt: str
     stream: bool = False
     params: Params
     model: str
-    runner: RunnerConfiguration | None = None
 
 
 class Usage(BaseModel):
8 changes: 1 addition & 7 deletions scripts/shared.ts
@@ -7,7 +7,6 @@ const envFile = `.env.dev`;
 config({ path: envFile });
 
 export const defaultModel = process.env.MODEL || 'microsoft/phi-2';
-export const defaultContainer = process.env.CONTAINER_TYPE;
 
 export function getApiUrl(path: string) {
   const url = process.env.API_URL;
@@ -37,8 +36,7 @@ export async function completion(
     stream = false,
     stop = ['</s>'],
     apiKey = undefined as string | undefined,
-    quiet = false,
-    container = defaultContainer
+    quiet = false
   } = {}
 ) {
   const apiUrl = getApiUrl('');
@@ -54,10 +52,6 @@ export async function completion(
     stream
   };
 
-  if (container) {
-    bodyPayload['runner'] = { container };
-  }
-
   const p = await fetch(apiUrl, {
     method: 'POST',
     headers: getAuthHeaders(apiKey),
7 changes: 2 additions & 5 deletions scripts/test-dynamic-model.ts
@@ -1,15 +1,13 @@
 import {
   completion,
-  defaultContainer,
   defaultModel,
   enqueueAddModel,
   pollForJobCompletion,
   runIfCalledAsScript
 } from 'scripts/shared';
 
 async function main(
-  modelName = defaultModel,
-  containerType = defaultContainer
+  modelName = defaultModel
 ) {
   console.log(`Test adding model ${modelName}`);
   const body = await enqueueAddModel(modelName);
@@ -28,8 +26,7 @@ async function main(
     model: modelName,
     max_tokens: 1024,
     stop: ['</s>'],
-    stream: false,
-    container: containerType
+    stream: false
   });
 
 }
