# Task 12.2: vLLM Deployment - Solutions

This notebook provides solutions for the exercises in the vLLM deployment notebook.

## Exercise 1: Find the Optimal Batch Size

Test different `--max-num-seqs` values (8, 16, 32, 64, 128) and find the optimal setting.

In [None]:
# Solution: Testing different batch sizes
import asyncio
import aiohttp
import time
import json
from typing import List, Dict

# Base URL of the vLLM server; endpoint paths such as /v1/chat/completions
# are appended to it when the request URLs are built.
VLLM_URL = "http://localhost:8000"

async def run_load_test(prompts: List[str], concurrency: int) -> Dict:
    """
    Run a load test with specified concurrency.

    Each prompt is sent as its own non-streaming chat-completion request,
    with at most ``concurrency`` requests in flight at any time.

    Args:
        prompts: User messages to send, one request per prompt.
        concurrency: Maximum number of simultaneous requests (must be >= 1).

    Returns:
        A dict with the keys:
            throughput_rps: Successful requests per second of wall-clock time.
            avg_latency_ms: Mean latency of the successful requests, in ms.
            p90_latency_ms: 90th-percentile latency of the successful
                requests, in ms.
            success_rate: Fraction of requests that succeeded.
        Every value is 0 when there is nothing to compute it from
        (no prompts, or no successful requests).

    Raises:
        ValueError: If ``concurrency`` is less than 1.
    """
    # Semaphore(0) would block every request forever, so reject it up front.
    if concurrency < 1:
        raise ValueError(f"concurrency must be >= 1, got {concurrency}")

    semaphore = asyncio.Semaphore(concurrency)

    async def single_request(prompt):
        """Send one prompt; return (elapsed_seconds, succeeded)."""
        async with semaphore:
            start = time.perf_counter()
            try:
                async with aiohttp.ClientSession() as session:
                    async with session.post(
                        f"{VLLM_URL}/v1/chat/completions",
                        json={
                            "model": "meta-llama/Llama-3.1-8B-Instruct",
                            "messages": [{"role": "user", "content": prompt}],
                            "max_tokens": 100,
                            "stream": False
                        },
                        timeout=aiohttp.ClientTimeout(total=60)
                    ) as response:
                        # A 4xx/5xx reply can still carry a JSON body; without
                        # this check it would be counted as a success.
                        response.raise_for_status()
                        await response.json()
                        return time.perf_counter() - start, True
            except Exception:
                # Deliberately broad: any failure (connection error, timeout,
                # bad status, invalid JSON) is recorded as a failed request
                # so that one bad request does not abort the whole test.
                return time.perf_counter() - start, False

    start_time = time.perf_counter()
    tasks = [single_request(p) for p in prompts]
    results = await asyncio.gather(*tasks)
    total_time = time.perf_counter() - start_time

    # Latency statistics only consider requests that succeeded.
    latencies = sorted(r[0] * 1000 for r in results if r[1])  # ms
    successful = len(latencies)

    return {
        "throughput_rps": successful / total_time if total_time > 0 else 0,
        "avg_latency_ms": sum(latencies) / len(latencies) if latencies else 0,
        "p90_latency_ms": latencies[int(len(latencies) * 0.9)] if latencies else 0,
        # Guard against an empty prompt list (no requests were made).
        "success_rate": successful / len(results) if results else 0
    }

# Workload for the load test: four short questions repeated for ten rounds,
# giving 40 prompts in total.
test_prompts = [
    question
    for _ in range(10)
    for question in (
        "What is machine learning?",
        "Explain the water cycle.",
        "How does a computer work?",
        "What is photosynthesis?",
    )
]

# Procedure for comparing max-num-seqs values:
#   1. Launch vLLM with --max-num-seqs 8
#   2. Run the load test at concurrency 8, 16 and 32
#   3. Relaunch vLLM with --max-num-seqs 16 and repeat
#   4. Compare throughput and latency across the runs
#
# Example of how the results might be read:
#   max-num-seqs=8:  throughput=6.2 rps,  latency_p90=1200ms
#   max-num-seqs=16: throughput=10.5 rps, latency_p90=1500ms  (optimal)
#   max-num-seqs=32: throughput=12.1 rps, latency_p90=2500ms  (latency too high)
#   max-num-seqs=64: throughput=11.8 rps, latency_p90=4200ms  (diminishing returns)

# Emit the guidance as one write; the output is the same four lines.
print("\n".join([
    "Optimal max-num-seqs depends on your latency requirements:",
    "- For low latency (interactive): use 8-16",
    "- For high throughput (batch): use 32-64",
    "- For DGX Spark with 128GB: 32 is usually a good balance",
]))

## Exercise 2: Compare Streaming vs Non-Streaming

Measure the throughput difference between streaming and non-streaming requests.

In [None]:
# Solution: Streaming vs Non-Streaming comparison
import requests
import time
import json

def test_non_streaming(prompt: str, max_tokens: int = 100) -> dict:
    """Test non-streaming request.

    Sends a single chat-completion request with ``"stream": False`` and
    times it from just before the POST until the call returns.

    Args:
        prompt: User message to send.
        max_tokens: Value passed as ``max_tokens`` in the request body.

    Returns:
        A dict with ``total_time`` (seconds), ``ttft`` (always ``None``,
        since it is not available for non-streaming) and ``mode``
        (``"non-streaming"``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    start = time.perf_counter()
    response = requests.post(
        f"{VLLM_URL}/v1/chat/completions",
        json={
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": max_tokens,
            "stream": False
        },
        timeout=60
    )
    end = time.perf_counter()
    # Timing an error reply would produce a misleading measurement, so fail
    # loudly instead of reporting it as a valid result.
    response.raise_for_status()
    return {
        "total_time": end - start,
        "ttft": None,  # Not available for non-streaming
        "mode": "non-streaming"
    }

def test_streaming(prompt: str, max_tokens: int = 100) -> dict:
    """Test streaming request.

    Sends a single chat-completion request with ``"stream": True``, reads
    the response line by line and records when the first chunk carrying
    non-empty ``delta.content`` arrives.

    Args:
        prompt: User message to send.
        max_tokens: Value passed as ``max_tokens`` in the request body.

    Returns:
        A dict with ``total_time`` (seconds until the stream is fully read),
        ``ttft`` (seconds until the first content chunk, or ``None`` if no
        such chunk was seen) and ``mode`` (``"streaming"``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    start = time.perf_counter()
    first_token_time = None

    # The context manager releases the connection even if reading fails.
    with requests.post(
        f"{VLLM_URL}/v1/chat/completions",
        json={
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": max_tokens,
            "stream": True
        },
        stream=True,
        timeout=60
    ) as response:
        # An error reply has no token stream; do not time it as if it had.
        response.raise_for_status()

        for line in response.iter_lines():
            # After the first token is found the loop keeps draining the
            # stream so that total_time covers the whole response.
            if line and first_token_time is None:
                line_str = line.decode()
                if line_str.startswith("data: ") and "[DONE]" not in line_str:
                    try:
                        chunk = json.loads(line_str[6:])
                        if chunk.get("choices", [{}])[0].get("delta", {}).get("content"):
                            first_token_time = time.perf_counter()
                    except (ValueError, LookupError, AttributeError, TypeError):
                        # Malformed or unexpectedly shaped chunk: skip it.
                        # Narrower than a bare except, so Ctrl-C still works.
                        pass

        end = time.perf_counter()

    return {
        "total_time": end - start,
        "ttft": (first_token_time - start) if first_token_time else None,
        "mode": "streaming"
    }

# Run comparison: send the same prompt once in each mode and print timings.
prompt = "Explain the theory of relativity in detail."

# Non-streaming
ns_result = test_non_streaming(prompt, max_tokens=200)
print(f"Non-streaming: Total time = {ns_result['total_time']*1000:.0f}ms")

# Streaming
s_result = test_streaming(prompt, max_tokens=200)
# test_streaming returns ttft=None when no content chunk was seen; multiplying
# None by 1000 would raise TypeError, so format that case separately.
ttft = s_result['ttft']
ttft_text = f"{ttft*1000:.0f}ms" if ttft is not None else "n/a"
print(f"Streaming: TTFT = {ttft_text}, Total = {s_result['total_time']*1000:.0f}ms")

print("\nConclusions:")
print("- Streaming has similar total time but provides TTFT (user sees response faster)")
print("- Use streaming for interactive chat (better UX)")
print("- Use non-streaming for batch processing (simpler code)")

## Key Takeaways

1. **Optimal batch size** depends on your latency vs throughput requirements
2. **Streaming** provides better user experience but similar throughput
3. **Continuous batching** automatically adjusts the running batch as requests arrive and complete, but you should still test to find the optimal settings for your workload