# Task 12.1 Solutions: Engine Benchmark

This notebook contains solutions to the exercises from the Engine Benchmark task.

## Exercise 1: Test Different Prompt Lengths

Test how different engines handle prompts of varying lengths.

In [None]:
import sys
from pathlib import Path
sys.path.insert(0, str(Path("../scripts").resolve()))

from benchmark_utils import InferenceBenchmark
import numpy as np

# Create prompts of different lengths
def generate_prompt(target_tokens: int) -> str:
    """Build a summarization prompt sized roughly to ``target_tokens``.

    The prompt is a fixed instruction followed by one copy of a pangram
    sentence for every full 10 tokens requested, so targets below 10 yield
    the instruction alone.
    """
    sentence = "The quick brown fox jumps over the lazy dog. "
    repeats = target_tokens // 10
    padding = sentence * repeats
    return f"Summarize the following text: {padding}"

# Labelled prompts, each built from an approximate token target.
prompt_lengths = {
    label: generate_prompt(token_target)
    for label, token_target in (("short", 20), ("medium", 100), ("long", 500))
}

# Benchmark each length
def benchmark_prompt_lengths(engine: str, model: str):
    """Measure average time-to-first-token for each entry in ``prompt_lengths``.

    After ``warmup(3)``, every prompt is sent three times with streaming
    enabled. Runs whose result carries an error are left out of the average.

    Returns:
        A dict keyed by prompt label. Each value holds ``avg_ttft_ms`` (mean
        of the collected TTFT samples, each scaled by 1000 to milliseconds)
        and ``prompt_length`` (whitespace-separated word count of the
        prompt). Labels for which no run succeeded are absent.
    """
    runs_per_prompt = 3
    bench = InferenceBenchmark(engine=engine, model=model)
    bench.warmup(3)

    summary = {}
    for label, text in prompt_lengths.items():
        samples_ms = []
        for _ in range(runs_per_prompt):
            outcome = bench.run_single(text, max_tokens=50, stream=True)
            if outcome.error is not None:
                continue  # failed run: contributes nothing to the average
            samples_ms.append(outcome.time_to_first_token * 1000)

        if not samples_ms:
            continue  # nothing succeeded for this prompt; omit it entirely

        summary[label] = {
            "avg_ttft_ms": np.mean(samples_ms),
            "prompt_length": len(text.split()),
        }

    return summary

# Run benchmarks (if Ollama is running)
# Run the prompt-length benchmark against a local Ollama server and print a
# small table of word count vs. average TTFT for each prompt label.
try:
    results = benchmark_prompt_lengths("ollama", "llama3.1:8b")

    # Header emoji restored: the literal had been stored as UTF-8 bytes
    # decoded with a legacy code page, which printed as garbage.
    print("📊 Prompt Length vs TTFT (Ollama)")
    print("=" * 50)
    print(f"{'Length':<12} {'Words':<10} {'Avg TTFT (ms)'}")
    print("-" * 40)
    for name, data in results.items():
        print(f"{name:<12} {data['prompt_length']:<10} {data['avg_ttft_ms']:.1f}")

    # Analysis
    print("\n💡 Insights:")
    print("   - TTFT increases with prompt length (prefill takes longer)")
    print("   - The relationship is roughly linear for typical prompts")
    print("   - TensorRT-LLM would show the smallest increase (best prefill)")

except Exception as e:
    # Deliberately broad: this is the notebook-cell boundary, and the cell
    # should degrade to a hint instead of a traceback when the server is down.
    print(f"Could not run benchmark: {e}")
    print("Make sure Ollama is running with: ollama serve")

## Exercise 2: Find the Saturation Point

Find the concurrency level where latency starts to degrade significantly.

In [None]:
from benchmark_utils import InferenceBenchmark

def find_saturation_point(
    engine: str,
    model: str,
    max_concurrency: int = 64,
    min_gain: float = 0.2,
):
    """
    Find the concurrency level where throughput stops scaling.

    The "knee" in the curve is where:
    - Throughput gains diminish
    - Latency starts increasing faster than linearly

    Concurrency is swept over powers of two (1..64, capped at
    ``max_concurrency``). The saturation point is the first level whose
    throughput improves by less than ``min_gain`` relative to the previous
    level tested.

    Args:
        engine: Engine name passed to ``InferenceBenchmark``.
        model: Model name passed to ``InferenceBenchmark``.
        max_concurrency: Highest concurrency level to include in the sweep.
        min_gain: Minimum relative throughput gain (0.2 == 20%) a level must
            achieve over the previous one to count as still scaling.

    Returns:
        A ``(results, saturation_point)`` tuple. ``results`` maps each tested
        concurrency level to a dict with ``throughput``, ``p90_latency_ms``
        and ``avg_ttft_ms``. ``saturation_point`` is the first saturated
        level, or ``None`` if every level kept gaining at least ``min_gain``.
    """
    benchmark = InferenceBenchmark(engine=engine, model=model)
    benchmark.warmup(3)

    # Test concurrency levels
    concurrency_levels = [c for c in (1, 2, 4, 8, 16, 32, 64) if c <= max_concurrency]

    # Use one batch size for every level so throughput is comparable, and make
    # it at least as large as the highest level. With a fixed 20 prompts,
    # levels 32 and 64 could never have more than 20 requests in flight, so
    # throughput would plateau by construction and fake a saturation point.
    num_prompts = max(20, max(concurrency_levels, default=0))
    prompts = ["What is AI?"] * num_prompts

    results = {}
    previous_throughput = 0
    saturation_point = None

    for concurrency in concurrency_levels:
        # NOTE(review): stream=False here, yet avg_ttft is read below —
        # confirm run_batch still reports a meaningful TTFT when not streaming.
        result = benchmark.run_batch(
            prompts=prompts,
            max_tokens=50,
            concurrency=concurrency,
            stream=False,
        )

        # The * 1000 scaling presumably converts seconds to milliseconds,
        # matching the "_ms" key names.
        results[concurrency] = {
            "throughput": result.throughput_rps,
            "p90_latency_ms": result.p90_latency * 1000,
            "avg_ttft_ms": result.avg_ttft * 1000,
        }

        # Check for saturation (relative throughput gain below min_gain).
        # Skipped for the first level and after a zero-throughput level, where
        # a relative gain is undefined.
        if previous_throughput > 0:
            gain = (result.throughput_rps - previous_throughput) / previous_throughput
            if gain < min_gain and saturation_point is None:
                saturation_point = concurrency

        previous_throughput = result.throughput_rps

    return results, saturation_point

# Run the analysis
# Sweep concurrency against a local Ollama server (capped at 16) and print
# throughput, P90 latency and average TTFT per level, then the verdict.
try:
    results, saturation = find_saturation_point("ollama", "llama3.1:8b", max_concurrency=16)

    # Header emoji restored: the literals had been stored as UTF-8 bytes
    # decoded with a legacy code page, which printed as garbage.
    print("📊 Concurrency Saturation Analysis")
    print("=" * 60)
    print(f"{'Concurrency':<12} {'Throughput':>12} {'P90 Latency':>15} {'Avg TTFT':>12}")
    print("-" * 60)
    for conc, data in results.items():
        print(f"{conc:<12} {data['throughput']:>10.2f}/s {data['p90_latency_ms']:>12.0f}ms {data['avg_ttft_ms']:>10.0f}ms")

    # Explicit None check: "no saturation found" is signalled by None.
    if saturation is not None:
        print(f"\n🎯 Saturation Point: {saturation} concurrent requests")
        print("   Beyond this, throughput gains diminish significantly.")
        print(f"   Recommended operating concurrency: {saturation // 2} - {saturation}")
    else:
        print("\n💡 No clear saturation point found within tested range.")
        print("   The engine may be able to handle higher concurrency.")

except Exception as e:
    # Deliberately broad: this is the notebook-cell boundary, so report the
    # failure instead of raising a traceback.
    print(f"Could not run analysis: {e}")

## Key Takeaways

1. **Prompt Length Impact**: Longer prompts increase TTFT because the engine must process the entire prompt (the prefill phase) before it can emit the first token. TensorRT-LLM handles this best.

2. **Saturation Point**: Every engine has a point where adding more concurrency provides diminishing returns. Operating below this point ensures stable latency.

3. **Trade-offs**:
   - Low concurrency: Best per-request latency
   - High concurrency (below saturation): Best throughput
   - Above saturation: Degraded performance for everyone

4. **Engine Comparison**:
   - Ollama: Great for development, saturates early
   - vLLM: Handles high concurrency well with continuous batching
   - TensorRT-LLM: Best prefill speed, moderate decode
   - SGLang: Good all-rounder with speculative decoding