# vLLM Handoff

## Imports

In [1]:
import base64
import wallaroo
import pyarrow as pa
import pandas as pd
from wallaroo.deployment_config import DeploymentConfigBuilder
from wallaroo.framework import Framework
from wallaroo.engine_config import Acceleration
from wallaroo.continuous_batching_config import ContinuousBatchingConfig
from wallaroo.object import EntityNotFoundError

In [2]:
# Create the Wallaroo SDK client; the later cells (model upload/lookup, pipeline build) all go through `wl`.
wl = wallaroo.Client()

## Download & zip model

```bash
git lfs install
```

```bash
git clone https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
```

```bash
cd ./Llama-3.1-8B-Instruct && rm -rf .git
```

```bash
# `&&` (not `&`): only zip after the shell has actually moved up to the parent directory.
cd .. && zip -r llama-31-8b-instruct.zip ./Llama-3.1-8B-Instruct
```

## Define Schemas & Upload model

In [3]:
# Arrow schema for inference requests: the prompt text and the max_tokens value.
# Sampling options (top_p, ignore_eos, temperature) are not declared in this schema.
input_schema = pa.schema([
    pa.field(name, dtype)
    for name, dtype in (
        ("prompt", pa.string()),
        ("max_tokens", pa.int64()),
    )
])

# Arrow schema for inference results: the generated text and the output token count.
output_schema = pa.schema([
    pa.field(name, dtype)
    for name, dtype in (
        ("generated_text", pa.string()),
        ("num_output_tokens", pa.int64()),
    )
])

### Upload model via API

In [None]:
# Base64 text of the serialized input schema, i.e. the value to paste into the
# "input_schema" field of the upload request metadata.
base64.b64encode(input_schema.serialize().to_pybytes()).decode("utf8")

In [None]:
# Base64 text of the serialized output schema, i.e. the value to paste into the
# "output_schema" field of the upload request metadata.
base64.b64encode(output_schema.serialize().to_pybytes()).decode("utf8")

Run the following command in order to upload the model via **API**:

```bash
curl --progress-bar -X POST   -H "Content-Type: multipart/form-data"   -H "Authorization: Bearer <your-auth-token-here>"   -F 'metadata={"name": "vllm-llama31-8b-async-fc-v3", "visibility": "private", "workspace_id": <your-workspace-id-here>, "conversion": {"framework": "vllm", "python_version": "3.8", "requirements": [], "framework_config": {"config": {"gpu_memory_utilization": 0.9, "max_model_len": 128}, "framework": "vllm"}}, "input_schema": "/////7AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAIAAABUAAAABAAAAMT///8AAAECEAAAACQAAAAEAAAAAAAAAAoAAABtYXhfdG9rZW5zAAAIAAwACAAHAAgAAAAAAAABQAAAABAAFAAIAAYABwAMAAAAEAAQAAAAAAABBRAAAAAcAAAABAAAAAAAAAAGAAAAcHJvbXB0AAAEAAQABAAAAA==", "output_schema": "/////8AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAIAAABcAAAABAAAALz///8AAAECEAAAACwAAAAEAAAAAAAAABEAAABudW1fb3V0cHV0X3Rva2VucwAAAAgADAAIAAcACAAAAAAAAAFAAAAAEAAUAAgABgAHAAwAAAAQABAAAAAAAAEFEAAAACQAAAAEAAAAAAAAAA4AAABnZW5lcmF0ZWRfdGV4dAAABAAEAAQAAAA="};type=application/json'   -F "file=@llama-31-8b-instruct.zip;type=application/octet-stream"   https://benchmarkscluster.wallaroocommunity.ninja/v1/api/models/upload_and_convert | cat
```

In [None]:
# Retrieve the model uploaded through the API, using the name given in the curl request's metadata.
model = wl.get_model("vllm-llama31-8b-async-fc-v3")
# Bare last expression so the notebook renders the model summary.
model

### Upload model via SDK

In [4]:
# VLLMConfig holds the vLLM framework settings passed to `wl.upload_model` in the upload cell.
# NOTE(review): this import sits mid-notebook; consider moving it into the imports cell at the top.
from wallaroo.framework import VLLMConfig

In [17]:
# Development aid: show the signature/source of VLLMConfig to see which framework options exist.
??VLLMConfig

Init signature:
VLLMConfig(
    *,
    device_group: Union[List[int], NoneType, wallaroo.wallaroo_ml_ops_api_client.types.Unset] = <wallaroo.wallaroo_ml_ops_api_client.types.Unset object at 0x7b569d49cca0>,
    gpu_memory_utilization: Union[wallaroo.wallaroo_ml_ops_api_client.types.Unset, float] = 0.9,
    kv_cache_dtype: Union[wall

In [6]:
# Upload the zipped model through the SDK as a vLLM framework model with CUDA acceleration.
# NOTE(review): the zip step earlier in this notebook produces `llama-31-8b-instruct.zip`,
# while this cell reads `./vLLM_llama-31-8b.zip` — confirm which archive is intended.
model = wl.upload_model(
    "vllm-llama31-8b-async-demo", 
    "./vLLM_llama-31-8b.zip",
    framework=Framework.VLLM,
    framework_config=VLLMConfig(
        gpu_memory_utilization=0.9, 
        # NOTE(review): 128 is lower than the max_tokens=200 requested in the inference cell — confirm the intended limit.
        max_model_len=128
    ),
    input_schema=input_schema, 
    output_schema=output_schema,
    accel=Acceleration.CUDA
)
# Bare last expression so the notebook renders the uploaded model's summary.
model

Please log into the following URL in a web browser:

	https://autoscale-uat-gcp.wallaroo.dev/auth/realms/master/device?user_code=VPKN-MCIQ

Login successful!
Waiting for model loading - this will take up to 10min.
Model is pending loading to a container runtime.
.............................................successful

Ready


0,1
Name,vllm-llama31-8b-async-demo
Version,422d3ad9-1bc7-40c1-99af-0ba109964bfd
File Name,vLLM_llama-31-8b.zip
SHA,62c338e77c031d7c071fe25e1d202fcd1ded052377a007ebd18cb63eadddf838
Status,ready
Image Path,proxy.replicated.com/proxy/wallaroo/ghcr.io/wallaroolabs/mac-deploy:v2025.1.0-main-6132
Architecture,x86
Acceleration,cuda
Updated At,2025-08-May 19:24:36
Workspace id,60


In [7]:
# Re-display the uploaded model's summary.
model

0,1
Name,vllm-llama31-8b-async-demo
Version,422d3ad9-1bc7-40c1-99af-0ba109964bfd
File Name,vLLM_llama-31-8b.zip
SHA,62c338e77c031d7c071fe25e1d202fcd1ded052377a007ebd18cb63eadddf838
Status,ready
Image Path,proxy.replicated.com/proxy/wallaroo/ghcr.io/wallaroolabs/mac-deploy:v2025.1.0-main-6132
Architecture,x86
Acceleration,cuda
Updated At,2025-08-May 19:24:36
Workspace id,60


In [8]:
# Define continuous batching for Async vLLM (you can choose the number of connections you want)
cbc = ContinuousBatchingConfig(max_concurrent_batch_size = 100)

In [9]:
# Attach the schemas and the continuous batching settings to the uploaded model.
# The configured result (`batch`) is what the deployment config and pipeline step reference below.
batch = model.configure(
    input_schema = input_schema,
    output_schema = output_schema,
    continuous_batching_config = cbc
)

In [10]:
# Display the configured model's summary.
batch

0,1
Name,vllm-llama31-8b-async-demo
Version,422d3ad9-1bc7-40c1-99af-0ba109964bfd
File Name,vLLM_llama-31-8b.zip
SHA,62c338e77c031d7c071fe25e1d202fcd1ded052377a007ebd18cb63eadddf838
Status,ready
Image Path,proxy.replicated.com/proxy/wallaroo/ghcr.io/wallaroolabs/mac-deploy:v2025.1.0-main-6132
Architecture,x86
Acceleration,cuda
Updated At,2025-08-May 19:24:36
Workspace id,60


## Deployment

In [11]:
# Resource plan for the deployment: 1 CPU / 1Gi for the engine, and
# 1 CPU / 10Gi / 1 GPU for the sidekick container that runs the configured model.
deployment_config = (
    DeploymentConfigBuilder()
    .cpus(1.0)
    .memory("1Gi")
    .sidekick_cpus(batch, 1.0)
    .sidekick_memory(batch, "10Gi")
    .sidekick_gpus(batch, 1)
    # Presumably selects the A100 accelerator nodes — TODO confirm the label matches the target cluster.
    .deployment_label("wallaroo.ai/accelerator:a100")
    .build()
)

In [None]:
# Build the pipeline object by name.
pipeline = wl.build_pipeline("llama-31-8b-vllm-demo")
# Reset before (re)deploying so that re-running this cell starts from a clean state.
# Presumably clear() drops existing steps and undeploy() stops a previous deployment — confirm against the SDK docs.
pipeline.clear()
pipeline.undeploy()

# Single-step pipeline: the configured vLLM model, deployed with the resource plan built in the Deployment section.
pipeline.add_model_step(batch)
pipeline.deploy(deployment_config=deployment_config)

In [13]:
# Inspect the deployment status; the result reports the engine, engine load balancer and sidekick containers.
pipeline.status()

{'status': 'Running',
 'details': [],
 'engines': [{'ip': '10.4.8.2',
   'name': 'engine-8558f6576d-8h7pc',
   'status': 'Running',
   'reason': None,
   'details': [],
   'pipeline_statuses': {'pipelines': [{'id': 'llama-31-8b-vllm-demo',
      'status': 'Running',
      'version': '62806288-5f42-44b8-9345-bb4dfb613801'}]},
   'model_statuses': {'models': [{'model_version_id': 443,
      'name': 'vllm-llama31-8b-async-demo',
      'sha': '62c338e77c031d7c071fe25e1d202fcd1ded052377a007ebd18cb63eadddf838',
      'status': 'Running',
      'version': '422d3ad9-1bc7-40c1-99af-0ba109964bfd'}]}}],
 'engine_lbs': [{'ip': '10.4.1.17',
   'name': 'engine-lb-5cf49f9d5f-sqr4f',
   'status': 'Running',
   'reason': None,
   'details': []}],
 'sidekicks': [{'ip': '10.4.8.7',
   'name': 'engine-sidekick-vllm-llama31-8b-async-demo-443-75d58845c-svvll',
   'status': 'Running',
   'reason': None,
   'details': [],
   'statuses': '\n'}]}

## Inference

In [14]:
# Single-row request frame, one dict per request; the keys match the `input_schema` field names.
data = pd.DataFrame(
    [
        {"prompt": "What is Wallaroo.AI?", "max_tokens": 200},
    ]
)

In [15]:
# Run one inference through the deployed pipeline; the result frame shows the inputs (in.*) next to the outputs (out.*).
pipeline.infer(data)

Unnamed: 0,time,in.max_tokens,in.prompt,out.generated_text,out.num_output_tokens,anomaly.count
0,2025-05-08 19:42:06.259,200,What is Wallaroo.AI?,Cloud and AutoML with Python\nWallaroo.AI is a...,122,0


In [16]:
# Publish the pipeline with the same deployment configuration; the output lists the registry URLs
# plus the docker and helm commands for running the published pipeline.
pipeline.publish(deployment_config=deployment_config)

Waiting for pipeline publish... It may take up to 600 sec.
............................................... Published.


0,1
ID,36
Pipeline Name,llama-31-8b-vllm-demo
Pipeline Version,a5b7a202-9923-4d8d-ba4c-31e22a83cddc
Status,Published
Workspace Id,60
Workspace Name,younes.amar@wallaroo.ai - Default Workspace
Edges,
Engine URL,us-central1-docker.pkg.dev/wallaroo-dev-253816/uat/engines/proxy/wallaroo/ghcr.io/wallaroolabs/fitzroy-mini-cuda:v2025.1.0-main-6132
Pipeline URL,us-central1-docker.pkg.dev/wallaroo-dev-253816/uat/pipelines/llama-31-8b-vllm-demo:a5b7a202-9923-4d8d-ba4c-31e22a83cddc
Helm Chart URL,oci://us-central1-docker.pkg.dev/wallaroo-dev-253816/uat/charts/llama-31-8b-vllm-demo

0
docker run \  -p $EDGE_PORT:8080 \  -e OCI_USERNAME=$OCI_USERNAME \  -e OCI_PASSWORD=$OCI_PASSWORD \  -e PIPELINE_URL=us-central1-docker.pkg.dev/wallaroo-dev-253816/uat/pipelines/llama-31-8b-vllm-demo:a5b7a202-9923-4d8d-ba4c-31e22a83cddc \  -e CONFIG_CPUS=1.0 --gpus all --cpus=2.0 --memory=11g \  us-central1-docker.pkg.dev/wallaroo-dev-253816/uat/engines/proxy/wallaroo/ghcr.io/wallaroolabs/fitzroy-mini-cuda:v2025.1.0-main-6132

0
helm install --atomic $HELM_INSTALL_NAME \  oci://us-central1-docker.pkg.dev/wallaroo-dev-253816/uat/charts/llama-31-8b-vllm-demo \  --namespace $HELM_INSTALL_NAMESPACE \  --version 0.0.1-a5b7a202-9923-4d8d-ba4c-31e22a83cddc \  --set ociRegistry.username=$OCI_USERNAME \  --set ociRegistry.password=$OCI_PASSWORD


## Undeploy

In [91]:
# Undeploy the pipeline once finished.
# NOTE(review): the recorded output below is from a different run (cell In [91], pipeline
# `llama-31-8b-vllm-ynsv5`) — re-run this cell to refresh it.
pipeline.undeploy()

Please log into the following URL in a web browser:

	https://autoscale-uat-gcp.wallaroo.dev/auth/realms/master/device?user_code=XXND-IZZK

Login successful!
 ok


0,1
name,llama-31-8b-vllm-ynsv5
created,2025-05-06 12:31:40.360907+00:00
last_updated,2025-05-06 19:51:47.490400+00:00
deployed,False
workspace_id,60
workspace_name,younes.amar@wallaroo.ai - Default Workspace
arch,x86
accel,cuda
tags,
versions,"b82ed30f-e937-4b49-94d5-63e6e798cc4b, b0a4ab4d-28ee-4470-9391-888a486375d2, 47760536-b263-428d-a9eb-f763c84f8920, 632917ff-0ffd-49be-abca-5a69a6432f93, 18cc0cad-cf6c-4abf-9083-ee90c2e704e2"
