refactor: move all models to their own unique containers (#71)
* refactor: move all models to their own unique containers

* fix: bug in modal object initialization

* fix: logging bug

* chore: remove dead code
sambarnes committed Mar 6, 2024
1 parent a60f30d commit 8d140f8
Showing 9 changed files with 61 additions and 140 deletions.
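
Taken together, the change replaces the ContainerType enum and the get_container() dispatch with a direct mapping from model ID to a dedicated container class. A minimal sketch of the new lookup pattern (the classes below are stand-in stubs, not the real Modal container classes from modal/runner/containers/vllm_unified.py, and the model path is hypothetical):

from pathlib import Path


# Stand-in stubs for illustration; the real classes are produced by the
# _make_container factory in vllm_unified.py.
class VllmContainer_MicrosoftPhi2:
    def __init__(self, model_path: Path):
        self.model_path = model_path


class VllmContainer_IntelNeuralChat7B:
    def __init__(self, model_path: Path):
        self.model_path = model_path


# After this commit: model ID -> dedicated container class, no enum indirection.
DEFAULT_CONTAINERS = {
    "microsoft/phi-2": VllmContainer_MicrosoftPhi2,
    "Intel/neural-chat-7b-v3-1": VllmContainer_IntelNeuralChat7B,
}

container_cls = DEFAULT_CONTAINERS.get("microsoft/phi-2")
runner = container_cls(Path("/models/microsoft/phi-2"))  # replaces get_container(...)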
10 changes: 3 additions & 7 deletions modal/runner/__init__.py
@@ -1,8 +1,6 @@
 from modal import Secret, asgi_app
 
-from runner.containers import (
-    DEFAULT_CONTAINER_TYPES,
-)
+from runner.containers import DEFAULT_CONTAINERS
 from runner.shared.clean import clean_models_volume
 from runner.shared.common import stub
 from runner.shared.download import download_model, downloader_image
@@ -42,7 +40,7 @@ def completion():  # named for backwards compatibility with the Modal URL
 def download(force: bool = False):
     logger = get_logger("download")
     logger.info("Downloading all models...")
-    for model in DEFAULT_CONTAINER_TYPES:
+    for model in DEFAULT_CONTAINERS:
         # Can't be parallelized because of a modal volume corruption issue
         download_model.local(model, force=force)
     logger.info("ALL DONE!")
@@ -58,7 +56,5 @@ def download(force: bool = False):
 def clean(all: bool = False, dry: bool = False):
     logger = get_logger("clean")
     logger.info(f"Cleaning models volume. ALL: {all}. DRY: {dry}")
-    remaining_models = (
-        [] if all else [m.lower() for m in DEFAULT_CONTAINER_TYPES]
-    )
+    remaining_models = [] if all else [m.lower() for m in DEFAULT_CONTAINERS]
     clean_models_volume(remaining_models, dry)
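
Since iterating a dict yields its keys, swapping DEFAULT_CONTAINER_TYPES for DEFAULT_CONTAINERS leaves the download and clean loops behaviorally unchanged; a tiny standalone illustration (stub values stand in for the real container classes):

DEFAULT_CONTAINERS = {"microsoft/phi-2": object, "jondurbin/bagel-34b-v0.2": object}

for model in DEFAULT_CONTAINERS:  # iterates over the model IDs (the keys)
    print(model)

remaining_models = [m.lower() for m in DEFAULT_CONTAINERS]
print(remaining_models)  # ['microsoft/phi-2', 'jondurbin/bagel-34b-v0.2']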
56 changes: 13 additions & 43 deletions modal/runner/containers/__init__.py
@@ -1,47 +1,17 @@
-from pathlib import Path
-
-import modal
-
-from shared.protocol import ContainerType
-
 from .vllm_unified import (
-    VllmContainer_3B,
-    VllmContainer_7B,
-    VllmContainerA100_40G,
-    VllmContainerA100_80G,
-    VllmContainerA100_160G,
-    VllmContainerA100_160G_Isolated,
+    VllmContainer_IntelNeuralChat7B,
+    VllmContainer_JebCarterPsyfighter13B,
+    VllmContainer_JohnDurbinBagel34B,
+    VllmContainer_KoboldAIPsyfighter2,
+    VllmContainer_MicrosoftPhi2,
+    VllmContainer_NeverSleepNoromaidMixtral8x7B,
 )
 
-DEFAULT_CONTAINER_TYPES = {
-    "microsoft/phi-2": ContainerType.VllmContainer_3B,
-    "Intel/neural-chat-7b-v3-1": ContainerType.VllmContainer_7B,
-    # "PygmalionAI/mythalion-13b": ContainerType.VllmContainerA100_40G,
-    "jebcarter/Psyfighter-13B": ContainerType.VllmContainerA100_40G,
-    "KoboldAI/LLaMA2-13B-Psyfighter2": ContainerType.VllmContainerA100_40G,
-    # "Austism/chronos-hermes-13b-v2": ContainerType.VllmContainerA100_40G,
-    # "NeverSleep/Noromaid-20b-v0.1.1": ContainerType.VllmContainerA100_80G,
-    # "cognitivecomputations/dolphin-2.6-mixtral-8x7b": ContainerType.VllmContainerA100_160G,
-    "NeverSleep/Noromaid-v0.1-mixtral-8x7b-Instruct-v3": ContainerType.VllmContainerA100_160G,
-    "jondurbin/bagel-34b-v0.2": ContainerType.VllmContainerA100_160G_Isolated,
+DEFAULT_CONTAINERS = {
+    "microsoft/phi-2": VllmContainer_MicrosoftPhi2,
+    "Intel/neural-chat-7b-v3-1": VllmContainer_IntelNeuralChat7B,
+    "jebcarter/Psyfighter-13B": VllmContainer_JebCarterPsyfighter13B,
+    "KoboldAI/LLaMA2-13B-Psyfighter2": VllmContainer_KoboldAIPsyfighter2,
+    "NeverSleep/Noromaid-v0.1-mixtral-8x7b-Instruct-v3": VllmContainer_NeverSleepNoromaidMixtral8x7B,
+    "jondurbin/bagel-34b-v0.2": VllmContainer_JohnDurbinBagel34B,
 }
-
-
-def get_container(
-    model_path: Path, container_type: ContainerType
-) -> modal.cls.Obj:
-    match container_type:
-        case ContainerType.VllmContainer_3B:
-            return VllmContainer_3B(model_path)
-        case ContainerType.VllmContainer_7B:
-            return VllmContainer_7B(model_path)
-        case ContainerType.VllmContainerA100_40G:
-            return VllmContainerA100_40G(model_path)
-        case ContainerType.VllmContainerA100_80G:
-            return VllmContainerA100_80G(model_path)
-        case ContainerType.VllmContainerA100_80G_32K:
-            return VllmContainerA100_80G(model_path, max_model_len=32_000)
-        case ContainerType.VllmContainerA100_160G:
-            return VllmContainerA100_160G(model_path)
-        case ContainerType.VllmContainerA100_160G_Isolated:
-            return VllmContainerA100_160G_Isolated(model_path)
38 changes: 23 additions & 15 deletions modal/runner/containers/vllm_unified.py
@@ -96,26 +96,34 @@ def __init__(
     return wrap(_VllmContainer)


-VllmContainer_3B = _make_container(
-    "VllmContainer_3B", num_gpus=1, concurrent_inputs=120
+VllmContainer_MicrosoftPhi2 = _make_container(
+    name="VllmContainer_MicrosoftPhi2",
+    num_gpus=1,
+    concurrent_inputs=120,
 )
-
-VllmContainer_7B = _make_container(
-    "VllmContainer_7B", num_gpus=1, concurrent_inputs=100
+VllmContainer_IntelNeuralChat7B = _make_container(
+    name="VllmContainer_IntelNeuralChat7B",
+    num_gpus=1,
+    concurrent_inputs=100,
 )
-VllmContainerA100_40G = _make_container(
-    "VllmContainerA100_40G", num_gpus=1, concurrent_inputs=32
+VllmContainer_JebCarterPsyfighter13B = _make_container(
+    "VllmContainer_JebCarterPsyfighter13B",
+    num_gpus=1,
+    concurrent_inputs=32,
 )
-VllmContainerA100_80G = _make_container(
-    "VllmContainerA100_80G", num_gpus=1, memory=80
+VllmContainer_KoboldAIPsyfighter2 = _make_container(
+    name="VllmContainer_KoboldAIPsyfighter2",
+    num_gpus=1,
+    concurrent_inputs=32,
 )
-VllmContainerA100_160G = _make_container(
-    "VllmContainerA100_160G", num_gpus=2, memory=80, concurrent_inputs=4
+VllmContainer_NeverSleepNoromaidMixtral8x7B = _make_container(
+    name="VllmContainer_NeverSleepNoromaidMixtral8x7B",
+    num_gpus=2,
+    memory=80,
+    concurrent_inputs=4,
 )
-
-# Allow new models to be tested on the isolated container
-VllmContainerA100_160G_Isolated = _make_container(
-    "VllmContainerA100_160G_Isolated",
+VllmContainer_JohnDurbinBagel34B = _make_container(
+    name="VllmContainer_JohnDurbinBagel34B",
     num_gpus=2,
     memory=80,
     concurrent_inputs=4,
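
All of the per-model classes above come out of the same _make_container factory, whose body is collapsed in this diff. The following is only a rough sketch of the general shape such a factory can take; the class body and attribute names are assumptions, not the repository's actual implementation:

def _make_container(name, num_gpus=1, memory=40, concurrent_inputs=8):
    """Build a uniquely named container class bound to one GPU/concurrency profile."""

    class _VllmContainer:
        NUM_GPUS = num_gpus
        MEMORY_GB = memory
        CONCURRENT_INPUTS = concurrent_inputs

        def __init__(self, model_path):
            self.model_path = model_path

    # Give each model's class a distinct, inspectable name.
    _VllmContainer.__name__ = name
    _VllmContainer.__qualname__ = name
    return _VllmContainer


# Mirrors one of the real definitions above:
VllmContainer_MicrosoftPhi2 = _make_container(
    name="VllmContainer_MicrosoftPhi2", num_gpus=1, concurrent_inputs=120
)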
24 changes: 9 additions & 15 deletions modal/runner/endpoints/completion.py
@@ -1,10 +1,10 @@
 from fastapi import Request, status
 from fastapi.responses import StreamingResponse
 
-from runner.containers import DEFAULT_CONTAINER_TYPES, get_container
+from runner.containers import DEFAULT_CONTAINERS
 from runner.shared.common import BACKLOG_THRESHOLD
 from runner.shared.sampling_params import SamplingParams
-from shared.logging import get_logger, timer
+from shared.logging import get_logger
 from shared.protocol import (
     CompletionPayload,
     create_error_response,
@@ -38,21 +38,16 @@ def completion(
             f"Unable to locate model {payload.model}",
         )
 
-    container_type = (
-        payload.runner.container
-        if payload.runner
-        else DEFAULT_CONTAINER_TYPES.get(payload.model)
-    )
-
-    if container_type is None:
+    container = DEFAULT_CONTAINERS.get(payload.model)
+    if container is None:
         message = f"Unable to locate container type for model {payload.model}"
         logger.error(message)
         return create_error_response(
             status.HTTP_400_BAD_REQUEST,
             f"Unable to locate container type for model {payload.model}",
         )
 
-    runner = get_container(model_path, container_type)
+    runner = container(model_path)
 
     stats = runner.generate.get_current_stats()
     logger.info(stats)
@@ -96,11 +91,10 @@ def completion(
         return create_error_response(status.HTTP_400_BAD_REQUEST, str(e))
 
     async def generate():
-        with timer("runner.generate", str(model_path), container_type):
-            async for text in runner.generate.remote_gen.aio(
-                payload, sampling_params
-            ):
-                yield text
+        async for text in runner.generate.remote_gen.aio(
+            payload, sampling_params
+        ):
+            yield text
 
     return StreamingResponse(
         generate(),
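
Condensed, the endpoint's routing is now a dictionary lookup followed by direct instantiation. A simplified sketch of the flow above (DEFAULT_CONTAINERS, create_error_response, and sampling_params are assumed to be in scope as in the full file; request validation and response headers are trimmed):

from fastapi import status
from fastapi.responses import StreamingResponse


def completion(payload, model_path):
    container = DEFAULT_CONTAINERS.get(payload.model)  # direct class lookup by model ID
    if container is None:
        return create_error_response(
            status.HTTP_400_BAD_REQUEST,
            f"Unable to locate container type for model {payload.model}",
        )

    runner = container(model_path)  # instantiate the per-model container

    async def generate():
        async for text in runner.generate.remote_gen.aio(payload, sampling_params):
            yield text

    return StreamingResponse(generate())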
5 changes: 5 additions & 0 deletions modal/runner/engines/vllm.py
@@ -71,6 +71,10 @@ def __init__(
     def gpu_count(self) -> int:
         return self.engine_args.tensor_parallel_size
 
+    @property
+    def cost_per_second(self) -> float:
+        return self.gpu_count * self.gpu_type.cost_per_second
+
     # @method()
     # async def tokenize_prompt(self, payload: Payload) -> List[int]:
     #     return self.tokenizer(payload.prompt).input_ids
@@ -147,6 +151,7 @@ async def generate(self, payload: CompletionPayload, params):
                     "tokens": resp.usage.completion_tokens,
                     "tps": resp.usage.completion_tokens / t_start_inference,
                     "duration": resp.usage.duration,
+                    "cost": resp.usage.duration * self.cost_per_second,
                 },
             )
         except Exception as err:
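
The new cost field is plain arithmetic: per-request cost = generation duration x (GPU count x per-GPU rate). A worked example with placeholder numbers (the real rate constants live in modal/shared/protocol.py and are not shown in this diff):

A100_80G_RATE = 0.0015  # placeholder USD per second, not Modal's quoted price

gpu_count = 2  # e.g. a two-GPU (2 x 80GB) container
cost_per_second = gpu_count * A100_80G_RATE

duration = 12.5  # seconds spent generating, per resp.usage.duration
request_cost = duration * cost_per_second
print(f"~${request_cost:.4f} per request")  # ~$0.0375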
7 changes: 0 additions & 7 deletions modal/shared/logging.py
@@ -14,8 +14,6 @@
 from modal import Image, Secret
 from sentry_sdk.scrubber import DEFAULT_DENYLIST, EventScrubber
 
-from shared.protocol import ContainerType
-
 _sentry_denylist = DEFAULT_DENYLIST + ["prompt"]
 sentry_sdk.init(
     dsn=os.environ.get("SENTRY_DSN"),
@@ -43,7 +41,6 @@ def add_observability(image: Image):
 def timer(
     action: str,
     model: str = None,
-    container_type: ContainerType = None,
     tags: dict[str, str | int] = None,
 ) -> None:
     """
@@ -52,7 +49,6 @@ def timer(
     Args:
         action: The noun being timed
         model: Optional, used as a tag
-        container_type: Optional, used as a tag and to estimate GPU cost
         tags: Any additional tags to include in the structured log
     """
     start = time.perf_counter()
@@ -68,9 +64,6 @@ def timer(
     extra = (tags or {}) | {"duration": elapsed}
     if model:
         extra["model"] = model
-    if container_type:
-        extra["container_type"] = container_type.value
-        extra["gpu_cost"] = elapsed * container_type.gpu_cost_per_second
 
     logging.info(f"{action} execution profiled", extra=extra)

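
With the ContainerType tag gone, timer() now reports only the action, an optional model, the duration, and caller-supplied tags. A minimal sketch of the slimmed-down helper, reconstructed from the visible hunks (so the decorator and surrounding details are approximate):

import logging
import time
from contextlib import contextmanager


@contextmanager
def timer(action: str, model: str = None, tags: dict[str, str | int] = None):
    """Time the wrapped block and emit one structured log line."""
    start = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - start
        extra = (tags or {}) | {"duration": elapsed}
        if model:
            extra["model"] = model
        logging.info(f"{action} execution profiled", extra=extra)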
46 changes: 5 additions & 41 deletions modal/shared/protocol.py
@@ -12,44 +12,13 @@ class GPUType(Enum):
     A100_40G = "A100_40G"
     A100_80G = "A100_80G"
 
-
-class ContainerType(Enum):
-    VllmContainer_3B = "VllmContainer_3B"
-    VllmContainer_7B = "VllmContainer_7B"
-
-    VllmContainerA100_40G = "VllmContainerA100_40G"
-
-    VllmContainerA100_80G = "VllmContainerA100_80G"
-    VllmContainerA100_80G_32K = "VllmContainerA100_80G_32K"
-
-    VllmContainerA100_160G = "VllmContainerA100_160G"
-    VllmContainerA100_160G_Isolated = "VllmContainerA100_160G_Isolated"
-
     @property
-    def gpu_cost_per_second(self) -> float:
-        """
-        Returns:
-            The quoted GPU compute cost per second for the container,
-            as found on https://modal.com/pricing
-        """
-
-        # TODO: might be better to put this on the container class itself,
-        # but this is good enough(tm) for now
+    def cost_per_second(self) -> float:
         match self:
-            case ContainerType.VllmContainer_3B:
-                return _COST_PER_SECOND_A100_40G * 1
-            case ContainerType.VllmContainer_7B:
-                return _COST_PER_SECOND_A100_40G * 1
-            case ContainerType.VllmContainerA100_40G:
-                return _COST_PER_SECOND_A100_40G * 1
-            case ContainerType.VllmContainerA100_80G:
-                return _COST_PER_SECOND_A100_80G * 1
-            case ContainerType.VllmContainerA100_80G_32K:
-                return _COST_PER_SECOND_A100_80G * 1
-            case ContainerType.VllmContainerA100_160G:
-                return _COST_PER_SECOND_A100_80G * 2
-            case ContainerType.VllmContainerA100_160G_Isolated:
-                return _COST_PER_SECOND_A100_80G * 2
+            case GPUType.A100_40G:
+                return _COST_PER_SECOND_A100_40G
+            case GPUType.A100_80G:
+                return _COST_PER_SECOND_A100_80G
 
 
 # https://github.com/vllm-project/vllm/blob/320a622ec4d098f2da5d097930f4031517e7327b/vllm/sampling_params.py#L7-L52
@@ -77,17 +46,12 @@ class Params(BaseModel):
     skip_special_tokens: bool = True
 
 
-class RunnerConfiguration(BaseModel):
-    container: ContainerType
-
-
 class CompletionPayload(BaseModel):
     id: str
     prompt: str
     stream: bool = False
     params: Params
     model: str
-    runner: RunnerConfiguration | None = None
 
 
 class Usage(BaseModel):
8 changes: 1 addition & 7 deletions scripts/shared.ts
@@ -7,7 +7,6 @@ const envFile = `.env.dev`;
 config({ path: envFile });
 
 export const defaultModel = process.env.MODEL || 'microsoft/phi-2';
-export const defaultContainer = process.env.CONTAINER_TYPE;
 
 export function getApiUrl(path: string) {
   const url = process.env.API_URL;
@@ -37,8 +36,7 @@ export async function completion(
     stream = false,
     stop = ['</s>'],
     apiKey = undefined as string | undefined,
-    quiet = false,
-    container = defaultContainer
+    quiet = false
   } = {}
 ) {
   const apiUrl = getApiUrl('');
@@ -54,10 +52,6 @@ export async function completion(
     stream
   };
 
-  if (container) {
-    bodyPayload['runner'] = { container };
-  }
-
   const p = await fetch(apiUrl, {
     method: 'POST',
     headers: getAuthHeaders(apiKey),
7 changes: 2 additions & 5 deletions scripts/test-dynamic-model.ts
@@ -1,15 +1,13 @@
 import {
   completion,
-  defaultContainer,
   defaultModel,
   enqueueAddModel,
   pollForJobCompletion,
   runIfCalledAsScript
 } from 'scripts/shared';
 
 async function main(
-  modelName = defaultModel,
-  containerType = defaultContainer
+  modelName = defaultModel
 ) {
   console.log(`Test adding model ${modelName}`);
   const body = await enqueueAddModel(modelName);
@@ -28,8 +26,7 @@ async function main(
     model: modelName,
     max_tokens: 1024,
     stop: ['</s>'],
-    stream: false,
-    container: containerType
+    stream: false
   });
 
 }
