# Lab 3.3.3 Solutions: vLLM Continuous Batching

Complete solutions to all exercises from the vLLM continuous batching lab.

## Setup

In [None]:
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
import requests

sys.path.insert(0, str(Path("../scripts").resolve()))

from benchmark_utils import InferenceBenchmark

---

## Exercise 1: Analyze PagedAttention Memory Efficiency

**Task**: Calculate theoretical memory savings from PagedAttention vs. traditional pre-allocation.

In [None]:
@dataclass
class ModelConfig:
    """Model configuration for KV-cache memory calculations.

    The fields are plain architecture hyperparameters. ``dtype_bytes`` is the
    storage width of one cached element and defaults to 2 (bf16/fp16), so
    existing positional construction keeps producing the same numbers; pass
    ``dtype_bytes=1`` to model an 8-bit KV cache.
    """
    name: str
    hidden_size: int
    num_layers: int
    num_kv_heads: int
    head_dim: int
    max_seq_len: int
    dtype_bytes: int = 2  # bytes per cached element (2 == bf16/fp16)

    @property
    def kv_cache_per_token_bytes(self) -> int:
        """Return the KV-cache footprint of a single token, in bytes."""
        # One K and one V vector (factor 2) per layer, per KV head.
        elements_per_token = 2 * self.num_layers * self.num_kv_heads * self.head_dim
        return elements_per_token * self.dtype_bytes

# Common model configurations, keyed by a short identifier.
# Each entry supplies the hyperparameters ModelConfig needs to derive the
# per-token KV-cache size; callers look a config up with MODELS[key].
MODELS = {
    "llama-3.1-8b": ModelConfig(
        name="Llama 3.1 8B",
        hidden_size=4096,
        num_layers=32,
        num_kv_heads=8,  # GQA
        head_dim=128,
        max_seq_len=131072
    ),
    "llama-3.1-70b": ModelConfig(
        name="Llama 3.1 70B",
        hidden_size=8192,
        num_layers=80,
        num_kv_heads=8,  # GQA
        head_dim=128,
        max_seq_len=131072
    ),
}


def calculate_memory_comparison(model: "ModelConfig",
                                batch_size: int,
                                avg_seq_len: int,
                                paged_block_size: int = 16) -> dict:
    """
    Compare memory usage: traditional pre-allocation vs PagedAttention.

    Traditional approach: pre-allocate ``model.max_seq_len`` tokens of KV
    cache for every request.
    PagedAttention: allocate fixed-size blocks on demand, so each request
    only holds ``ceil(avg_seq_len / paged_block_size)`` blocks plus an
    8-byte block-table entry per block.

    Args:
        model: Object exposing ``name``, ``max_seq_len`` and
            ``kv_cache_per_token_bytes``.
        batch_size: Number of concurrent requests.
        avg_seq_len: Average number of tokens actually held per request.
        paged_block_size: Tokens per PagedAttention block.

    Returns:
        A dict with both memory footprints in GiB, the absolute and relative
        savings, the number of extra requests that fit into the saved memory
        (never negative) and the resulting throughput multiplier.

    Raises:
        ValueError: If any size argument (or the model's ``max_seq_len`` /
            per-token byte count) is not positive; the ratios below would
            otherwise divide by zero.
    """
    bytes_per_token = model.kv_cache_per_token_bytes

    must_be_positive = {
        "batch_size": batch_size,
        "avg_seq_len": avg_seq_len,
        "paged_block_size": paged_block_size,
        "model.max_seq_len": model.max_seq_len,
        "model.kv_cache_per_token_bytes": bytes_per_token,
    }
    for label, value in must_be_positive.items():
        if value <= 0:
            raise ValueError(f"{label} must be positive, got {value}")

    # Traditional: pre-allocate max length for all requests
    traditional_memory = batch_size * model.max_seq_len * bytes_per_token

    # PagedAttention: round the used length up to whole blocks ...
    blocks_needed = (avg_seq_len + paged_block_size - 1) // paged_block_size
    kv_bytes_per_request = blocks_needed * paged_block_size * bytes_per_token
    # ... plus the (minimal) block table: one 8-byte pointer per block
    block_table_bytes_per_request = blocks_needed * 8
    per_request_memory = kv_bytes_per_request + block_table_bytes_per_request
    paged_memory = batch_size * per_request_memory

    savings = traditional_memory - paged_memory
    savings_ratio = savings / traditional_memory

    # How many more requests could we serve with the saved memory?
    # Clamp at zero: when avg_seq_len exceeds max_seq_len the "savings" are
    # negative and a negative capacity would be meaningless.
    additional_requests = max(0, savings // per_request_memory)

    return {
        "model": model.name,
        "batch_size": batch_size,
        "avg_seq_len": avg_seq_len,
        "max_seq_len": model.max_seq_len,
        "traditional_memory_gb": traditional_memory / (1024**3),
        "paged_memory_gb": paged_memory / (1024**3),
        "savings_gb": savings / (1024**3),
        "savings_percent": savings_ratio * 100,
        "additional_capacity": int(additional_requests),
        "throughput_multiplier": (batch_size + additional_requests) / batch_size
    }


def analyze_paged_attention_benefits():
    """Print a PagedAttention vs. pre-allocation comparison for several scenarios.

    Each scenario is a (model key, batch size, average sequence length)
    triple; the model is looked up in ``MODELS`` and the numbers come from
    ``calculate_memory_comparison``. Output goes to stdout; nothing is
    returned.
    """

    print("📊 PagedAttention Memory Efficiency Analysis")
    print("=" * 70)

    scenarios = [
        # (model_key, batch_size, avg_seq_len)
        ("llama-3.1-8b", 32, 512),    # Short responses
        ("llama-3.1-8b", 32, 2048),   # Medium responses
        ("llama-3.1-8b", 32, 8192),   # Long responses
        ("llama-3.1-70b", 8, 512),    # Larger model, fewer concurrent
        ("llama-3.1-70b", 8, 4096),   # Larger model, medium length
    ]

    for model_key, batch_size, avg_len in scenarios:
        model = MODELS[model_key]
        result = calculate_memory_comparison(model, batch_size, avg_len)

        print(f"\n{model.name} | Batch: {batch_size} | Avg Length: {avg_len:,}")
        print("-" * 50)
        print(f"  Traditional (pre-alloc {model.max_seq_len:,} tokens):")
        print(f"    Memory: {result['traditional_memory_gb']:.1f} GB")
        print("  PagedAttention (actual usage):")
        print(f"    Memory: {result['paged_memory_gb']:.2f} GB")
        print(f"  Savings: {result['savings_gb']:.1f} GB ({result['savings_percent']:.1f}%)")
        print(f"  Throughput potential: {result['throughput_multiplier']:.1f}x")
        print(f"    (Could serve {result['additional_capacity']} additional requests)")

    print("\n" + "=" * 70)
    print("💡 Key Insights:")
    print("  • Shorter responses = more memory savings")
    print("  • Savings increase with max_seq_len / avg_seq_len ratio")
    print("  • PagedAttention enables higher concurrent request capacity")
    print("  • Memory fragmentation eliminated by block-based allocation")

analyze_paged_attention_benefits()

---

## Exercise 2: Implement Batch Size Optimizer

**Task**: Create a system that dynamically finds the optimal batch size for maximum throughput.

In [None]:
@dataclass
class BatchTestResult:
    """Metrics recorded for one benchmarked batch size."""
    # Concurrency level that was exercised.
    batch_size: int
    # Completion tokens generated per second of wall-clock time.
    throughput_tps: float
    # Mean and 99th-percentile request latency, in milliseconds.
    avg_latency_ms: float
    p99_latency_ms: float
    # Fraction of requests that failed (0.0 - 1.0).
    error_rate: float
    # GPU memory in use during the run, in GB.
    gpu_memory_used_gb: float


class BatchSizeOptimizer:
    """
    Find optimal batch size for vLLM through binary search.

    The optimal batch size maximizes throughput while keeping:
    - Latency below a target threshold
    - Error rate at zero
    - GPU memory below limit

    Every tested configuration is appended to ``results_history`` so that
    ``generate_report`` can summarise the whole search afterwards.
    """

    def __init__(self,
                 base_url: str = "http://localhost:8000",
                 max_latency_ms: float = 5000,
                 max_memory_gb: float = 100):
        self.base_url = base_url
        self.max_latency_ms = max_latency_ms
        self.max_memory_gb = max_memory_gb
        self.results_history: List[BatchTestResult] = []

    def test_batch_size(self, batch_size: int,
                        test_prompt: str = "Explain AI in one sentence.",
                        max_tokens: int = 50,
                        num_requests: int = 100) -> BatchTestResult:
        """
        Test a specific batch size and measure performance.

        Sends ``num_requests`` completion requests to
        ``{base_url}/v1/completions`` using ``batch_size`` worker threads and
        records latency, token counts and failures.

        Args:
            batch_size: Number of concurrent worker threads.
            test_prompt: Prompt sent with every request.
            max_tokens: Generation limit; also used as the token count when
                the response carries no ``usage.completion_tokens``.
            num_requests: Total number of requests to issue.

        Returns:
            The measured ``BatchTestResult`` (also stored in
            ``results_history``).

        Raises:
            ValueError: If ``batch_size`` or ``num_requests`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")
        if num_requests <= 0:
            raise ValueError(f"num_requests must be positive, got {num_requests}")

        latencies = []
        errors = 0
        total_tokens = 0

        start_time = time.time()

        def make_request():
            """Issue one request; return (latency_ms, tokens, error-or-None)."""
            try:
                req_start = time.time()
                response = requests.post(
                    f"{self.base_url}/v1/completions",
                    json={
                        # NOTE(review): assumes the server accepts the model
                        # name "default" - confirm against the served model.
                        "model": "default",
                        "prompt": test_prompt,
                        "max_tokens": max_tokens,
                        "temperature": 0.7
                    },
                    timeout=30
                )
                latency = (time.time() - req_start) * 1000

                if response.status_code == 200:
                    data = response.json()
                    tokens = data.get("usage", {}).get("completion_tokens", max_tokens)
                    return latency, tokens, None
                else:
                    return latency, 0, response.status_code
            except Exception as e:
                # Per-request boundary: any failure is reported to the
                # caller as an error marker instead of aborting the run.
                return 0, 0, str(e)

        # Run concurrent requests
        with ThreadPoolExecutor(max_workers=batch_size) as executor:
            futures = [executor.submit(make_request) for _ in range(num_requests)]

            for future in as_completed(futures):
                latency, tokens, error = future.result()
                # Compare against None: an exception with an empty message
                # yields "" which is falsy but still a failure.
                if error is not None:
                    errors += 1
                else:
                    latencies.append(latency)
                    total_tokens += tokens

        total_time = time.time() - start_time

        # Calculate metrics
        result = BatchTestResult(
            batch_size=batch_size,
            throughput_tps=total_tokens / total_time if total_time > 0 else 0,
            avg_latency_ms=np.mean(latencies) if latencies else 0,
            p99_latency_ms=np.percentile(latencies, 99) if latencies else 0,
            error_rate=errors / num_requests,
            gpu_memory_used_gb=0  # Would need nvidia-smi or pynvml
        )

        self.results_history.append(result)
        return result

    def is_acceptable(self, result: BatchTestResult) -> bool:
        """Check if a batch size result meets our requirements."""
        return (
            result.error_rate == 0 and
            result.p99_latency_ms < self.max_latency_ms and
            result.gpu_memory_used_gb < self.max_memory_gb
        )

    def find_optimal(self, min_batch: int = 1,
                     max_batch: int = 128) -> Tuple[int, Optional[BatchTestResult]]:
        """
        Find optimal batch size using binary search.

        Algorithm:
        1. Start with a range [min_batch, max_batch]
        2. Test the midpoint
        3. If acceptable and throughput increasing, search higher
        4. If not acceptable or throughput decreasing, search lower
        5. Continue until range is small

        Returns:
            ``(best_batch, best_result)``. When no tested batch size was
            acceptable the result is ``(min_batch, None)``.
        """
        best_batch = min_batch
        best_result = None
        best_throughput = 0

        print(f"🔍 Searching for optimal batch size in [{min_batch}, {max_batch}]")
        print("-" * 60)

        while min_batch <= max_batch:
            mid = (min_batch + max_batch) // 2

            print(f"Testing batch_size={mid}...", end=" ")
            result = self.test_batch_size(mid)

            acceptable = self.is_acceptable(result)
            print(f"TPS: {result.throughput_tps:.1f}, "
                  f"P99: {result.p99_latency_ms:.0f}ms, "
                  f"OK: {acceptable}")

            if acceptable and result.throughput_tps > best_throughput:
                best_batch = mid
                best_result = result
                best_throughput = result.throughput_tps
                min_batch = mid + 1  # Try higher
            else:
                max_batch = mid - 1  # Try lower

        return best_batch, best_result

    def generate_report(self) -> str:
        """Generate a report of all tested configurations."""
        lines = ["\n📊 Batch Size Optimization Report", "=" * 60]
        lines.append(f"{'Batch':<8} {'TPS':>10} {'Avg Lat':>12} {'P99 Lat':>12} {'Errors':>8}")
        lines.append("-" * 60)

        for r in sorted(self.results_history, key=lambda x: x.batch_size):
            lines.append(
                f"{r.batch_size:<8} {r.throughput_tps:>10.1f} "
                f"{r.avg_latency_ms:>10.0f}ms {r.p99_latency_ms:>10.0f}ms "
                f"{r.error_rate:>7.1%}"
            )

        # Find optimal: highest throughput among the acceptable runs
        valid = [r for r in self.results_history if self.is_acceptable(r)]
        if valid:
            best = max(valid, key=lambda x: x.throughput_tps)
            lines.append("\n" + "=" * 60)
            lines.append(f"✅ Optimal batch size: {best.batch_size}")
            lines.append(f"   Throughput: {best.throughput_tps:.1f} tok/s")
            lines.append(f"   P99 Latency: {best.p99_latency_ms:.0f}ms")

        return "\n".join(lines)


# Example usage (with simulated results for demo)
print("🔧 Batch Size Optimizer")
print("   Run after starting vLLM server:")
print("   vllm serve meta-llama/Llama-3.1-8B-Instruct --enforce-eager")
print("")

# Simulated optimization results
print("Simulated optimization run:")
print("-" * 60)

simulated_results = [
    BatchTestResult(8, 450.5, 142, 285, 0, 12.3),
    BatchTestResult(16, 782.3, 165, 412, 0, 18.7),
    BatchTestResult(32, 1105.8, 234, 678, 0, 28.4),
    BatchTestResult(48, 1289.2, 312, 892, 0, 38.1),
    BatchTestResult(64, 1356.7, 428, 1456, 0, 48.2),
    BatchTestResult(96, 1298.4, 687, 2845, 0, 62.5),
    BatchTestResult(128, 1189.2, 1024, 4521, 0.02, 78.3),
]

# Derive the optimum from the data instead of hard-coding it: the
# highest-throughput configuration among the error-free runs.
best_simulated = max(
    (res for res in simulated_results if res.error_rate == 0),
    key=lambda res: res.throughput_tps,
)

print(f"{'Batch':<8} {'TPS':>10} {'Avg Lat':>12} {'P99 Lat':>12}")
print("-" * 45)
for r in simulated_results:
    marker = "←" if r is best_simulated else ""
    print(f"{r.batch_size:<8} {r.throughput_tps:>10.1f} {r.avg_latency_ms:>10.0f}ms {r.p99_latency_ms:>10.0f}ms {marker}")

print(f"\n✅ Optimal batch size: {best_simulated.batch_size}")
print("   - Highest throughput with acceptable latency")
print(f"   - Beyond {best_simulated.batch_size}, latency increases faster than throughput")

---

## Exercise 3: Implement Request Priority Queue

**Task**: Create a priority-based request routing system for vLLM.

In [None]:
import heapq
import threading
from enum import IntEnum
from dataclasses import dataclass, field
import uuid


class Priority(IntEnum):
    """Priority levels for queued requests; a smaller value is served first."""
    # Health checks, system requests
    CRITICAL = 0
    # Premium users, time-sensitive
    HIGH = 1
    # Standard requests
    NORMAL = 2
    # Batch jobs, background tasks
    LOW = 3
    # Large batch processing
    BULK = 4


@dataclass(order=True)
class PrioritizedRequest:
    """Queue entry ordered by ``(priority, timestamp)``.

    Only those two fields take part in comparisons; the remaining fields are
    payload and are ignored when ordering or testing equality.
    """
    # --- ordering keys -----------------------------------------------------
    priority: int
    timestamp: float  # earlier timestamp wins within the same priority (FIFO)
    # --- payload (excluded from comparisons) -------------------------------
    request_id: str = field(default_factory=lambda: uuid.uuid4().hex[:8], compare=False)
    prompt: str = field(default="", compare=False)
    max_tokens: int = field(default=100, compare=False)
    user_tier: str = field(default="standard", compare=False)
    

class PriorityRequestQueue:
    """
    Thread-safe priority queue for LLM requests.

    Features:
    - Multi-level priority support
    - Fair scheduling within priority levels (FIFO by enqueue timestamp)
    - Priority boosting for aging requests: a waiting request moves up one
      level each time it has waited longer than ``max_age_boost_sec`` since
      it was enqueued or last boosted
    - Per-priority enqueued/processed counters, both keyed by the priority
      the request was enqueued with
    """

    def __init__(self, max_age_boost_sec: float = 30):
        self._queue: List[PrioritizedRequest] = []
        self._lock = threading.Lock()
        self._max_age_boost = max_age_boost_sec
        # Side tables keyed by id(request). Entries exist only while the
        # request object is held in self._queue, so the ids are unique.
        self._enqueued_as: Dict[int, Priority] = {}   # priority at enqueue time
        self._last_boost: Dict[int, float] = {}       # time of most recent boost
        self._stats = {
            Priority.CRITICAL: {"enqueued": 0, "processed": 0},
            Priority.HIGH: {"enqueued": 0, "processed": 0},
            Priority.NORMAL: {"enqueued": 0, "processed": 0},
            Priority.LOW: {"enqueued": 0, "processed": 0},
            Priority.BULK: {"enqueued": 0, "processed": 0},
        }

    def enqueue(self, prompt: str, priority: Priority = Priority.NORMAL,
                max_tokens: int = 100, user_tier: str = "standard") -> str:
        """Add a request to the queue and return its generated request id."""
        request = PrioritizedRequest(
            priority=priority,
            timestamp=time.time(),
            prompt=prompt,
            max_tokens=max_tokens,
            user_tier=user_tier
        )

        with self._lock:
            heapq.heappush(self._queue, request)
            self._enqueued_as[id(request)] = Priority(priority)
            self._stats[priority]["enqueued"] += 1

        return request.request_id

    def dequeue(self) -> Optional[PrioritizedRequest]:
        """Remove and return the highest priority request, or None if empty."""
        with self._lock:
            if not self._queue:
                return None

            # Apply priority boosting for aged requests
            self._apply_aging_boost()

            request = heapq.heappop(self._queue)
            key = id(request)
            self._last_boost.pop(key, None)
            # Count under the priority the request was enqueued with so that
            # "enqueued" and "processed" reconcile even after boosting.
            enqueued_as = self._enqueued_as.pop(key, None)
            if enqueued_as is None:
                enqueued_as = Priority(request.priority)
            self._stats[enqueued_as]["processed"] += 1
            return request

    def _apply_aging_boost(self):
        """Boost priority of requests that have been waiting too long.

        Must be called with ``self._lock`` held. A request is raised by one
        level when more than ``max_age_boost_sec`` has passed since it was
        enqueued or last boosted, so the boost rate does not depend on how
        often ``dequeue`` is called.
        """
        current_time = time.time()
        boosted = False

        for request in self._queue:
            if request.priority <= Priority.CRITICAL:
                continue
            waiting_since = self._last_boost.get(id(request), request.timestamp)
            if current_time - waiting_since > self._max_age_boost:
                # Boost by one level
                request.priority = max(Priority.CRITICAL, request.priority - 1)
                self._last_boost[id(request)] = current_time
                boosted = True

        if boosted:
            # Priorities were mutated in place; restore the heap invariant.
            heapq.heapify(self._queue)

    def get_queue_depth(self) -> Dict[Priority, int]:
        """Get current queue depth by (possibly boosted) priority."""
        with self._lock:
            depths = {p: 0 for p in Priority}
            for req in self._queue:
                depths[Priority(req.priority)] += 1
            return depths

    def get_stats(self) -> dict:
        """Get a snapshot of the queue statistics.

        The per-priority counter dicts are copied so callers cannot mutate
        (or observe later changes to) the queue's internal state.
        """
        with self._lock:
            return {
                "queue_size": len(self._queue),
                "by_priority": {p: dict(counts) for p, counts in self._stats.items()}
            }


class PriorityRouter:
    """
    Routes requests to vLLM with priority handling.

    Implements:
    - Tier-based priority assignment on submission
    - Load shedding: lower priorities are rejected first as load rises

    ``PRIORITY_WEIGHTS`` holds weights intended for weighted fair queuing;
    no method of this class reads them. Likewise nothing in this class
    changes ``_active_requests``, so whoever dispatches requests is expected
    to maintain it (NOTE(review): confirm against the dispatching code).
    """

    # Weights for weighted fair queuing (higher = more share)
    PRIORITY_WEIGHTS = {
        Priority.CRITICAL: 100,
        Priority.HIGH: 50,
        Priority.NORMAL: 20,
        Priority.LOW: 5,
        Priority.BULK: 1,
    }

    # Load ratio at or above which a priority is rejected.
    _LOAD_SHED_THRESHOLDS = {
        Priority.CRITICAL: float("inf"),  # Always accept
        Priority.HIGH: 0.95,              # Reject at 95% load
        Priority.NORMAL: 0.85,            # Reject at 85% load
        Priority.LOW: 0.70,               # Reject at 70% load
        Priority.BULK: 0.50,              # Reject at 50% load
    }

    def __init__(self, vllm_url: str = "http://localhost:8000",
                 max_concurrent: int = 64):
        """
        Args:
            vllm_url: Base URL of the vLLM server.
            max_concurrent: Capacity used as the denominator of the load ratio.

        Raises:
            ValueError: If ``max_concurrent`` is not positive (the load ratio
                would divide by zero).
        """
        if max_concurrent <= 0:
            raise ValueError(f"max_concurrent must be positive, got {max_concurrent}")
        self.vllm_url = vllm_url
        self.max_concurrent = max_concurrent
        self.queue = PriorityRequestQueue()
        self._active_requests = 0
        self._lock = threading.Lock()

    def submit(self, prompt: str, priority: Priority = Priority.NORMAL,
               max_tokens: int = 100, user_tier: str = "standard") -> str:
        """Submit a request for processing and return its request id.

        A NORMAL priority is treated as "not specified" and replaced by the
        tier default: HIGH for ``"premium"``, LOW for ``"batch"``. Any other
        priority is used as given.
        """
        if user_tier == "premium" and priority == Priority.NORMAL:
            priority = Priority.HIGH
        elif user_tier == "batch" and priority == Priority.NORMAL:
            priority = Priority.LOW

        return self.queue.enqueue(prompt, priority, max_tokens, user_tier)

    def should_accept_request(self, priority: Priority) -> bool:
        """
        Determine if we should accept a new request based on load.

        Under high load, lower priority requests are rejected. CRITICAL
        requests are always accepted.
        """
        with self._lock:
            load_ratio = self._active_requests / self.max_concurrent

        return load_ratio < self._LOAD_SHED_THRESHOLDS[priority]

    def get_load_status(self) -> dict:
        """Get current load status.

        The active-request count is read once under the lock and every
        derived value (load percentage, per-priority accept flags) is
        computed from that single snapshot.
        """
        with self._lock:
            active = self._active_requests

        load = active / self.max_concurrent

        return {
            "active_requests": active,
            "max_concurrent": self.max_concurrent,
            "load_percent": load * 100,
            "queue_depth": self.queue.get_queue_depth(),
            "accepting": {
                p.name: load < self._LOAD_SHED_THRESHOLDS[p]
                for p in Priority
            }
        }


# Demonstrate the priority system
print("🎯 Priority Request Queue Demo")
print("=" * 50)

queue = PriorityRequestQueue()

# Simulate incoming requests with different priorities
requests_to_add = [
    ("What is AI?", Priority.NORMAL, "standard"),
    ("Health check", Priority.CRITICAL, "system"),
    ("Process this batch", Priority.BULK, "batch"),
    ("Urgent query", Priority.HIGH, "premium"),
    ("Another normal request", Priority.NORMAL, "standard"),
    ("Background job", Priority.LOW, "batch"),
]

print("\nEnqueuing requests:")
for prompt, priority, tier in requests_to_add:
    req_id = queue.enqueue(prompt, priority, user_tier=tier)
    print(f"  [{priority.name:<10}] {prompt[:30]} → {req_id}")

print(f"\nQueue depth by priority: {queue.get_queue_depth()}")

print("\nProcessing order (highest priority first):")
# dequeue() returns None once the queue is drained, which ends the loop.
for request in iter(queue.dequeue, None):
    priority_name = Priority(request.priority).name
    print(f"  [{priority_name:<10}] {request.prompt[:30]}")

print("\n✅ All requests processed in priority order")

---

## Exercise 4: Compare Batching Strategies

**Task**: Implement and compare static batching vs continuous batching throughput.

In [None]:
import random

class BatchingSimulator:
    """
    Simulate and compare different batching strategies.

    This helps understand WHY continuous batching is better.

    Requests are dicts with integer ``"input_tokens"`` and
    ``"output_tokens"`` entries. All times are in milliseconds.
    """

    def __init__(self,
                 decode_time_per_token_ms: float = 10,
                 prefill_time_per_token_ms: float = 0.5):
        self.decode_time = decode_time_per_token_ms
        self.prefill_time = prefill_time_per_token_ms

    def simulate_static_batching(self,
                                  requests: List[Dict],
                                  batch_size: int) -> dict:
        """
        Simulate static batching: wait for full batch, process together.

        Problem: Short requests wait for long ones to complete.

        Each batch costs the prefill time of its longest input plus the
        decode time of its longest output, and every request in the batch is
        charged that full batch time as latency.

        Raises:
            ValueError: If ``requests`` is empty or ``batch_size`` is not
                positive (the metrics would otherwise divide by zero).
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")
        if not requests:
            raise ValueError("requests must not be empty")

        total_time = 0
        total_tokens = 0
        latencies = []

        # Process in batches
        for start in range(0, len(requests), batch_size):
            batch = requests[start:start + batch_size]

            # Batch time = time for the longest request in the batch
            prefill_time = max(r["input_tokens"] for r in batch) * self.prefill_time
            decode_time = max(r["output_tokens"] for r in batch) * self.decode_time
            batch_time = prefill_time + decode_time

            total_time += batch_time

            # All requests in batch have same latency (wait for longest)
            latencies.extend([batch_time] * len(batch))
            total_tokens += sum(r["output_tokens"] for r in batch)

        return {
            "strategy": "Static Batching",
            "total_time_ms": total_time,
            "throughput_tps": total_tokens / (total_time / 1000),
            "avg_latency_ms": np.mean(latencies),
            "p99_latency_ms": np.percentile(latencies, 99),
            "efficiency": total_tokens / (total_time * batch_size) * 1000
        }

    def simulate_continuous_batching(self,
                                      requests: List[Dict],
                                      max_batch_size: int) -> dict:
        """
        Simulate continuous batching: requests exit as they complete.

        Advantage: Short requests don't wait for long ones.

        Requests are admitted shortest-output-first whenever a slot is free;
        every step decodes one token for each active request. Only decode
        time is modelled here (prefill cost is not charged, unlike the
        static simulation), latency is measured from admission, and the
        reported ``"efficiency"`` is a fixed illustrative constant rather
        than a measured value.

        Raises:
            ValueError: If ``requests`` is empty or ``max_batch_size`` is
                not positive (the metrics would otherwise divide by zero).
        """
        if max_batch_size <= 0:
            raise ValueError(f"max_batch_size must be positive, got {max_batch_size}")
        if not requests:
            raise ValueError("requests must not be empty")

        # Sort by output length to simulate scheduling
        pending = sorted(requests, key=lambda x: x["output_tokens"])
        next_pending = 0  # index of the next request to admit

        total_time = 0
        total_tokens = 0
        latencies = []

        # Simulate token-by-token generation with dynamic batching
        active_requests = []

        while next_pending < len(pending) or active_requests:
            # Add new requests up to batch size
            while len(active_requests) < max_batch_size and next_pending < len(pending):
                req = pending[next_pending]
                next_pending += 1
                active_requests.append({
                    "remaining": req["output_tokens"],
                    "total": req["output_tokens"],
                    "start_time": total_time
                })

            # Generate one token for all active requests (parallel decode)
            total_time += self.decode_time

            # Update all requests; finished ones leave and free their slot
            still_running = []
            for req in active_requests:
                req["remaining"] -= 1
                total_tokens += 1

                if req["remaining"] <= 0:
                    latencies.append(total_time - req["start_time"])
                else:
                    still_running.append(req)
            active_requests = still_running

        return {
            "strategy": "Continuous Batching",
            "total_time_ms": total_time,
            "throughput_tps": total_tokens / (total_time / 1000),
            "avg_latency_ms": np.mean(latencies),
            "p99_latency_ms": np.percentile(latencies, 99),
            "efficiency": 0.95  # Near-optimal GPU utilization
        }


# Run comparison
print("📊 Batching Strategy Comparison")
print("=" * 60)

# Generate realistic request distribution
# Mix of short (10-50), medium (50-200), and long (200-500) responses.
# random.choice picks uniformly among the four candidates below, so the mix
# is 50% short / 25% medium / 25% long.
# The list is named sim_requests so it does not rebind the imported
# `requests` HTTP module.
random.seed(42)
sim_requests = []
for _ in range(100):
    output_len = random.choice([
        random.randint(10, 50),    # short
        random.randint(10, 50),    # short
        random.randint(50, 200),   # medium
        random.randint(200, 500),  # long
    ])
    sim_requests.append({
        "input_tokens": random.randint(50, 200),
        "output_tokens": output_len
    })

simulator = BatchingSimulator()

# Compare strategies
static_result = simulator.simulate_static_batching(sim_requests, batch_size=16)
continuous_result = simulator.simulate_continuous_batching(sim_requests, max_batch_size=16)

print(f"\nTest: {len(sim_requests)} requests with mixed output lengths")
print("Output length distribution: 10-500 tokens\n")

print(f"{'Metric':<25} {'Static':<20} {'Continuous':<20}")
print("-" * 65)
print(f"{'Total Time':<25} {static_result['total_time_ms']:>15,.0f}ms {continuous_result['total_time_ms']:>15,.0f}ms")
print(f"{'Throughput':<25} {static_result['throughput_tps']:>15,.1f} t/s {continuous_result['throughput_tps']:>15,.1f} t/s")
print(f"{'Avg Latency':<25} {static_result['avg_latency_ms']:>15,.0f}ms {continuous_result['avg_latency_ms']:>15,.0f}ms")
print(f"{'P99 Latency':<25} {static_result['p99_latency_ms']:>15,.0f}ms {continuous_result['p99_latency_ms']:>15,.0f}ms")

improvement = (continuous_result['throughput_tps'] / static_result['throughput_tps'] - 1) * 100
latency_improvement = (1 - continuous_result['avg_latency_ms'] / static_result['avg_latency_ms']) * 100

print("\n✅ Continuous Batching Improvements:")
print(f"   • {improvement:.1f}% higher throughput")
print(f"   • {latency_improvement:.1f}% lower average latency")
print("\n💡 Key Insight: Short requests don't wait for long ones!")

---

## Key Takeaways

1. **PagedAttention saves memory**: Only allocate what you need, when you need it

2. **Batch size optimization**: Binary search to find the sweet spot between throughput and latency

3. **Priority queuing**: Critical for production - premium users shouldn't wait behind batch jobs

4. **Continuous batching wins**: 
   - Higher throughput (GPU always busy)
   - Lower latency (short requests exit early)
   - Better user experience

5. **vLLM on DGX Spark**:
   - Use `--enforce-eager` for ARM64 compatibility
   - `--dtype bfloat16` for optimal performance
   - Monitor GPU memory to tune batch sizes