# vLLM Handoff - BYOP with Custom Config

## Imports

In [1]:
import base64
import wallaroo
import pyarrow as pa
import pandas as pd
from wallaroo.deployment_config import DeploymentConfigBuilder
from wallaroo.framework import Framework
from wallaroo.engine_config import Acceleration
from wallaroo.continuous_batching_config import ContinuousBatchingConfig
from wallaroo.object import EntityNotFoundError

In [2]:
# Open a Wallaroo client session; the output below shows the browser-based login prompt.
wl = wallaroo.Client()

Please log into the following URL in a web browser:

	https://autoscale-uat-gcp.wallaroo.dev/auth/realms/master/device?user_code=RRFM-MPVG

Login successful!


In [3]:
# Display the installed Wallaroo SDK version this notebook was run with.
wallaroo.__version__

'2025.1.0+4be069be7'

## Define Schemas & Upload model

In [4]:
# Arrow schema of an inference request: the prompt text and a generation token limit.
input_schema = pa.schema(
    [
        pa.field("prompt", pa.string()),
        pa.field("max_tokens", pa.int64()),
    ]
)

# Arrow schema of the model's reply: the generated text and the number of tokens produced.
output_schema = pa.schema(
    [
        pa.field("generated_text", pa.string()),
        pa.field("num_output_tokens", pa.int64()),
    ]
)

### Upload model via API

In [None]:
# Base64 text of the serialized input schema, for the `input_schema` field of the API upload metadata.
base64.b64encode(bytes(input_schema.serialize())).decode("utf8")

In [None]:
# Base64 text of the serialized output schema, for the `output_schema` field of the API upload metadata.
base64.b64encode(bytes(output_schema.serialize())).decode("utf8")

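Run the following command to upload the model via the **API**. Replace `<your-auth-token-here>` and `<your-workspace-id-here>` with your own values and point the URL at your own Wallaroo instance; the `input_schema` and `output_schema` fields take base64-encoded Arrow schemas such as those produced by the two cells above: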

```bash
# Multipart upload: a JSON metadata part plus the model archive; output is piped through cat.
curl --progress-bar -X POST \
  -H "Content-Type: multipart/form-data" \
  -H "Authorization: Bearer <your-auth-token-here>" \
  -F 'metadata={"name": "byop-vllm-tinyllama-async-fc-v3", "visibility": "private", "workspace_id": <your-workspace-id-here>, "conversion": {"framework": "custom", "python_version": "3.8", "requirements": [], "framework_config": {"config": {"gpu_memory_utilization": 0.9, "max_model_len": 128}, "framework": "custom"}}, "input_schema": "/////7AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAIAAABUAAAABAAAAMT///8AAAECEAAAACQAAAAEAAAAAAAAAAoAAABtYXhfdG9rZW5zAAAIAAwACAAHAAgAAAAAAAABQAAAABAAFAAIAAYABwAMAAAAEAAQAAAAAAABBRAAAAAcAAAABAAAAAAAAAAGAAAAcHJvbXB0AAAEAAQABAAAAA==", "output_schema": "/////8AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAIAAABcAAAABAAAALz///8AAAECEAAAACwAAAAEAAAAAAAAABEAAABudW1fb3V0cHV0X3Rva2VucwAAAAgADAAIAAcACAAAAAAAAAFAAAAAEAAUAAgABgAHAAwAAAAQABAAAAAAAAEFEAAAACQAAAAEAAAAAAAAAA4AAABnZW5lcmF0ZWRfdGV4dAAABAAEAAQAAAA="};type=application/json' \
  -F "file=@byop-tinyllama-custom-config.zip;type=application/octet-stream" \
  https://benchmarkscluster.wallaroocommunity.ninja/v1/api/models/upload_and_convert | cat
```

In [None]:
# Retrieve the model registered by the API upload above, looked up by the
# same name that was passed in the request metadata.
model = wl.get_model("byop-vllm-tinyllama-async-fc-v3")
model

### Upload model via SDK

In [5]:
from wallaroo.framework import CustomConfig


In [6]:
# Upload the BYOP archive through the SDK as a custom-framework model with CUDA acceleration.
# NOTE(review): the CustomConfig values are presumably consumed by the vLLM engine
# inside the BYOP package — confirm against the archive's code.
model = wl.upload_model(
    "byop-vllm-tinyllama-ynsv5",
    "./byop_tinyllama_vllm_v4.zip",
    input_schema=input_schema,
    output_schema=output_schema,
    framework=Framework.CUSTOM,
    framework_config=CustomConfig(gpu_memory_utilization=0.9, max_model_len=128),
    accel=Acceleration.CUDA,
)
model

Waiting for model loading - this will take up to 10min.
.odel is pending loading to a container runtime.
.............................successfulner runtime.

Ready


0,1
Name,byop-vllm-tinyllama-ynsv5
Version,4b40ba86-8af1-4945-bde6-137245d5e618
File Name,byop_tinyllama_vllm_v4.zip
SHA,5e244d5ab73cf718256d1d08b7c0553102215f69c3d70936b2d4b89043499a2e
Status,ready
Image Path,proxy.replicated.com/proxy/wallaroo/ghcr.io/wallaroolabs/mac-deploy:v2025.1.0-main-6132
Architecture,x86
Acceleration,cuda
Updated At,2025-08-May 18:22:35
Workspace id,60


In [7]:
# Define continuous batching for the async vLLM model; max_concurrent_batch_size
# is the number of concurrent connections and can be tuned as needed.
cbc = ContinuousBatchingConfig(max_concurrent_batch_size = 256)

In [8]:
# Configure the uploaded model with the continuous batching settings and the
# same input/output schemas; `batch` is the configured model used for deployment.
batch = model.configure(
    continuous_batching_config=cbc,
    input_schema=input_schema,
    output_schema=output_schema,
)
batch

0,1
Name,byop-vllm-tinyllama-ynsv5
Version,4b40ba86-8af1-4945-bde6-137245d5e618
File Name,byop_tinyllama_vllm_v4.zip
SHA,5e244d5ab73cf718256d1d08b7c0553102215f69c3d70936b2d4b89043499a2e
Status,ready
Image Path,proxy.replicated.com/proxy/wallaroo/ghcr.io/wallaroolabs/mac-deploy:v2025.1.0-main-6132
Architecture,x86
Acceleration,cuda
Updated At,2025-08-May 18:22:35
Workspace id,60


## Deployment

In [15]:
# Resource plan for the deployment: 1 CPU / 1Gi at the top level, and for the
# model's sidekick container 1 CPU, 10Gi of memory and 1 GPU.
# NOTE(review): the deployment label presumably targets the shared T4 accelerator
# node pool — confirm it matches the cluster's node labels.
deployment_config = (
    DeploymentConfigBuilder()
    .cpus(1.)
    .memory('1Gi')
    .sidekick_cpus(batch, 1.)
    .sidekick_memory(batch, '10Gi')
    .sidekick_gpus(batch, 1)
    .deployment_label("wallaroo.ai/accelerator:t4-shared")
    .build()
)

In [None]:
# Build the pipeline by name, then undeploy and clear it before use.
# NOTE(review): undeploy()/clear() presumably reset any earlier deployment and
# steps left under this pipeline name so the cell can be re-run — confirm.
pipeline = wl.build_pipeline("byop-tinyllama-cutom-vllm")
pipeline.undeploy()
pipeline.clear()

# Add the configured model as the only step and deploy with the resource plan defined above.
pipeline.add_model_step(batch)
pipeline.deploy(deployment_config=deployment_config)

In [17]:
# Check the deployment; the output lists the status of the engine, load balancer and model sidekick.
pipeline.status()

{'status': 'Running',
 'details': [],
 'engines': [{'ip': '10.4.7.8',
   'name': 'engine-65bc55d64f-mdrnh',
   'status': 'Running',
   'reason': None,
   'details': [],
   'pipeline_statuses': {'pipelines': [{'id': 'byop-tinyllama-cutom-vllm',
      'status': 'Running',
      'version': '95a07681-e434-4108-8e9c-01c052b7b5ec'}]},
   'model_statuses': {'models': [{'model_version_id': 434,
      'name': 'byop-vllm-tinyllama-ynsv5',
      'sha': '5e244d5ab73cf718256d1d08b7c0553102215f69c3d70936b2d4b89043499a2e',
      'status': 'Running',
      'version': '4b40ba86-8af1-4945-bde6-137245d5e618'}]}}],
 'engine_lbs': [{'ip': '10.4.1.15',
   'name': 'engine-lb-5cf49f9d5f-dkvsz',
   'status': 'Running',
   'reason': None,
   'details': []}],
 'sidekicks': [{'ip': '10.4.7.9',
   'name': 'engine-sidekick-byop-vllm-tinyllama-ynsv5-434-5cc6f466fc-zqzbk',
   'status': 'Running',
   'reason': None,
   'details': [],
   'statuses': '\n'}]}

## Inference

In [18]:
# Single-row request whose columns match the input schema: a prompt and its token limit.
data = pd.DataFrame(
    {
        "prompt": ["What is Wallaroo.AI?"],
        "max_tokens": [200],
    }
)

In [19]:
# Run inference on the request frame with an extended timeout
# (600, presumably seconds — confirm against the SDK documentation).
pipeline.infer(data, timeout=600)

Unnamed: 0,time,in.max_tokens,in.prompt,out.generated_text,out.num_output_tokens,anomaly.count
0,2025-05-08 18:41:35.436,200,What is Wallaroo.AI?,\n2.2 How does Wallaroo.AI's Asset Composition...,200,0


## Undeploy

In [14]:
# Undeploy the pipeline once inference is finished.
pipeline.undeploy()

Waiting for undeployment - this will take up to 45s ..................................... ok


0,1
name,byop-tinyllama-demo-yns-cudafix
created,2025-05-08 18:23:23.012161+00:00
last_updated,2025-05-08 18:23:23.094326+00:00
deployed,False
workspace_id,60
workspace_name,younes.amar@wallaroo.ai - Default Workspace
arch,x86
accel,cuda
tags,
versions,"2ae66497-d235-44b5-8be5-52a6b83cf945, 2c8d7c28-1702-4e6a-9805-c8f5b918ab36"
