# Lab 1.1.5: Ollama Benchmarking - SOLUTIONS

This notebook contains solutions to the exercises in the Ollama Benchmarking notebook.

---

## Challenge Solution

**Task:** Create a benchmark that tests different quantization levels, context lengths, and batch processing.

In [None]:
import requests
import time
import statistics
from typing import Dict, List, Optional
from dataclasses import dataclass

# Base URL of the Ollama HTTP server; every request in this notebook
# (/api/tags, /api/generate) is built from it.
OLLAMA_BASE_URL = "http://localhost:11434"

@dataclass
class BenchmarkResult:
    """Container for the metrics of one benchmark run.

    NOTE(review): the solution functions in this section return plain dicts
    and do not construct this class; confirm whether it is still needed.
    """
    # Name of the benchmarked model (presumably an Ollama model tag).
    model: str
    # Label of the configuration under test - presumably free-form; TODO confirm.
    config: str
    # Prompt-processing (prefill) throughput; tokens per second by naming convention.
    prefill_tps: float
    # Generation (decode) throughput; tokens per second by naming convention.
    decode_tps: float
    # Number of prompt tokens evaluated.
    prompt_tokens: int
    # Number of tokens generated.
    generated_tokens: int
    # Total run time; seconds according to the `_s` suffix.
    total_time_s: float

In [None]:
# Solution Part 1: Quantization Level Comparison

def benchmark_quantization_levels(
    quant_models: Optional[List[str]] = None,
    num_runs: int = 3,
    num_predict: int = 100,
) -> List[Dict]:
    """
    Compare decode speed across quantization levels of the same model.

    You'll need to pull these models first:
        ollama pull llama3.1:8b-q4_0
        ollama pull llama3.1:8b-q5_0
        ollama pull llama3.1:8b-q8_0
        ollama pull llama3.1:8b-fp16 (if available)

    Args:
        quant_models: Model tags to compare. Defaults to the q4_0, q5_0 and
            q8_0 tags of llama3.1:8b.
        num_runs: Number of timed generations per model (one untimed warmup
            request is always sent first).
        num_predict: Value sent as the "num_predict" option of each timed
            generation.

    Returns:
        A list with one ``{"model": ..., "decode_tps": ...}`` dict per model
        that was benchmarked successfully. The list is empty when the server
        cannot be queried or none of the requested models is installed.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    # Models to compare
    if quant_models is None:
        quant_models = [
            "llama3.1:8b-q4_0",   # 4-bit quantization
            "llama3.1:8b-q5_0",   # 5-bit quantization
            "llama3.1:8b-q8_0",   # 8-bit quantization
        ]

    def _generate(payload: dict, timeout: int) -> dict:
        """POST to /api/generate and return the decoded body, raising on server errors."""
        resp = requests.post(f"{OLLAMA_BASE_URL}/api/generate",
                             json=payload, timeout=timeout)
        data = resp.json()
        if not isinstance(data, dict):
            raise RuntimeError("unexpected response from Ollama server")
        # An error body carries no timing fields; without this check it would
        # be silently counted as a 0 tok/s run.
        if not resp.ok or "error" in data:
            raise RuntimeError(data.get("error", f"HTTP {resp.status_code}"))
        return data

    # Check which models are available
    try:
        response = requests.get(f"{OLLAMA_BASE_URL}/api/tags", timeout=30)
        available = {m["name"] for m in response.json().get("models", [])}
    except requests.exceptions.Timeout:
        print("Error: Ollama server connection timed out")
        return []
    except requests.exceptions.ConnectionError:
        print("Error: Could not connect to Ollama server")
        return []
    except (requests.exceptions.RequestException, ValueError, KeyError) as exc:
        # ValueError covers a non-JSON body on requests versions that do not
        # wrap JSON decoding errors in a RequestException subclass.
        print(f"Error: Could not list Ollama models: {exc}")
        return []

    models_to_test = [m for m in quant_models if m in available]

    if not models_to_test:
        print("No quantized models found. Pull them first:")
        for m in quant_models:
            print(f"  ollama pull {m}")
        return []

    print("Quantization Level Comparison")
    print("=" * 60)

    results = []
    prompt = "Explain the concept of machine learning in detail."

    for model in models_to_test:
        print(f"\nBenchmarking {model}...")

        try:
            # Warmup: the first request after a model switch is excluded from
            # the timed runs.
            _generate({"model": model, "prompt": "Hi", "stream": False},
                      timeout=60)

            # Benchmark
            speeds = []
            for _ in range(num_runs):
                data = _generate(
                    {"model": model, "prompt": prompt, "stream": False,
                     "options": {"num_predict": num_predict}},
                    timeout=120)

                eval_count = data.get("eval_count", 0)
                # The duration is divided by 1e9 to express it in seconds.
                eval_duration = data.get("eval_duration", 1) / 1e9
                speeds.append(eval_count / eval_duration if eval_duration > 0 else 0)
        except requests.exceptions.Timeout:
            print(f"  Timeout while benchmarking {model}")
            continue
        except requests.exceptions.ConnectionError:
            print(f"  Connection error while benchmarking {model}")
            continue
        except (requests.exceptions.RequestException, ValueError, RuntimeError) as exc:
            # One failing model must not abort the comparison of the others.
            print(f"  Error while benchmarking {model}: {exc}")
            continue

        avg_speed = statistics.mean(speeds)
        print(f"  Decode speed: {avg_speed:.1f} tok/s")

        results.append({
            "model": model,
            "decode_tps": avg_speed
        })

    return results

# Run quantization comparison (yields [] if the server is unreachable or
# none of the listed models has been pulled)
quant_results = benchmark_quantization_levels()

In [None]:
# Solution Part 2: Context Length Comparison

def benchmark_context_lengths(model: Optional[str] = None):
    """
    Compare performance at different context lengths.

    Longer contexts = slower prefill but same decode speed.

    Args:
        model: Model tag to benchmark. When None, the first model returned by
            /api/tags is used.

    Returns:
        A list with one dict per successfully tested configuration, holding
        "context_length", "label", "prompt_tokens", "prefill_tps" and
        "decode_tps". Configurations that fail are reported on stdout and
        skipped. The list is empty when no model can be selected.
    """
    # Get first available model if not specified
    if model is None:
        try:
            response = requests.get(f"{OLLAMA_BASE_URL}/api/tags", timeout=30)
            models = response.json().get("models", [])
            if not models:
                print("No models available!")
                return []
            model = models[0]["name"]
        except requests.exceptions.Timeout:
            print("Error: Ollama server connection timed out")
            return []
        except requests.exceptions.ConnectionError:
            print("Error: Could not connect to Ollama server")
            return []
        except (requests.exceptions.RequestException, ValueError, KeyError) as exc:
            # Covers non-JSON bodies and malformed listings, which previously
            # escaped as an unhandled exception.
            print(f"Error: Could not list Ollama models: {exc}")
            return []

    # Context lengths to test
    context_configs = [
        (512, "short"),
        (2048, "medium"),
        (4096, "long"),
        (8192, "very_long"),
    ]

    # Generate prompts of different lengths
    base_text = "Please analyze and respond to this text. " * 50

    print(f"\nContext Length Comparison: {model}")
    print("=" * 60)

    results = []

    for num_ctx, label in context_configs:
        print(f"\nTesting context length: {num_ctx} ({label})")

        try:
            # Use shorter prompt for shorter contexts.
            # NOTE: the slice length is in characters, not tokens, so the
            # prompt fills only a fraction of the num_ctx window.
            prompt_len = min(num_ctx // 4, len(base_text))
            prompt = base_text[:prompt_len]

            resp = requests.post(f"{OLLAMA_BASE_URL}/api/generate",
                json={
                    "model": model,
                    "prompt": prompt,
                    "stream": False,
                    "options": {
                        "num_ctx": num_ctx,
                        "num_predict": 50
                    }
                },
                timeout=120
            )
            data = resp.json()

            # An error body has no timing fields; report it instead of
            # recording a misleading 0 tok/s measurement.
            if not isinstance(data, dict):
                print("  Error: unexpected response from Ollama server")
                continue
            if not resp.ok or "error" in data:
                message = data.get("error", f"HTTP {resp.status_code}")
                print(f"  Error: {message}")
                continue

            prompt_count = data.get("prompt_eval_count", 0)
            # Durations are divided by 1e9 to express them in seconds.
            prompt_duration = data.get("prompt_eval_duration", 1) / 1e9
            eval_count = data.get("eval_count", 0)
            eval_duration = data.get("eval_duration", 1) / 1e9

            prefill_tps = prompt_count / prompt_duration if prompt_duration > 0 else 0
            decode_tps = eval_count / eval_duration if eval_duration > 0 else 0

            print(f"  Prompt tokens: {prompt_count}")
            print(f"  Prefill: {prefill_tps:.1f} tok/s")
            print(f"  Decode: {decode_tps:.1f} tok/s")

            results.append({
                "context_length": num_ctx,
                "label": label,
                "prompt_tokens": prompt_count,
                "prefill_tps": prefill_tps,
                "decode_tps": decode_tps
            })

        except requests.exceptions.Timeout:
            print(f"  Timeout for context length {num_ctx}")
        except requests.exceptions.ConnectionError:
            print(f"  Connection error for context length {num_ctx}")
        except Exception as e:
            # Best-effort benchmark: one failing configuration must not stop
            # the remaining ones.
            print(f"  Error: {e}")

    return results

# Run context length comparison (no model given, so the first model
# reported by /api/tags is used)
ctx_results = benchmark_context_lengths()

In [None]:
# Solution Part 3: Concurrent Request Simulation

import concurrent.futures
import threading

def benchmark_concurrent_requests(model: Optional[str] = None, num_concurrent: int = 4):
    """
    Test performance under concurrent load.

    Note: depending on the Ollama version and its OLLAMA_NUM_PARALLEL
    setting, requests may be queued rather than processed in parallel, so
    this can measure queuing behavior more than true parallelism.

    Args:
        model: Model tag to benchmark. When None, the first model returned by
            /api/tags is used.
        num_concurrent: Number of requests submitted at the same time.

    Returns:
        ``{"concurrent_requests", "overall_time_s", "results"}`` where
        "results" holds one dict per request, ordered by request id.
        Successful entries carry "tokens", failed entries carry "error".
        An empty dict is returned when no model can be selected.

    Raises:
        ValueError: If ``num_concurrent`` is smaller than 1.
    """
    if num_concurrent < 1:
        raise ValueError("num_concurrent must be at least 1")

    if model is None:
        try:
            response = requests.get(f"{OLLAMA_BASE_URL}/api/tags", timeout=30)
            models = response.json().get("models", [])
            if not models:
                print("No models available!")
                return {}
            model = models[0]["name"]
        except requests.exceptions.Timeout:
            print("Error: Ollama server connection timed out")
            return {}
        except requests.exceptions.ConnectionError:
            print("Error: Could not connect to Ollama server")
            return {}
        except (requests.exceptions.RequestException, ValueError, KeyError) as exc:
            print(f"Error: Could not list Ollama models: {exc}")
            return {}

    print(f"\nConcurrent Request Test: {model}")
    print(f"Simulating {num_concurrent} concurrent requests")
    print("=" * 60)

    prompt = "What is machine learning?"

    def make_request(request_id: int) -> dict:
        """Send one generate request and return its outcome record."""
        # perf_counter is monotonic, so elapsed times are immune to
        # wall-clock adjustments during the run.
        start = time.perf_counter()

        def failure(message: str) -> dict:
            return {
                "request_id": request_id,
                "success": False,
                "time_s": time.perf_counter() - start,
                "error": message
            }

        try:
            resp = requests.post(f"{OLLAMA_BASE_URL}/api/generate",
                json={"model": model, "prompt": prompt, "stream": False,
                      "options": {"num_predict": 50}},
                timeout=60
            )
            elapsed = time.perf_counter() - start
            data = resp.json()
        except requests.exceptions.Timeout:
            return failure("Request timed out")
        except requests.exceptions.ConnectionError:
            return failure("Connection error")
        except Exception as e:
            # Never let a worker raise: every request must yield a record.
            return failure(str(e))

        # An error body would otherwise be counted as a successful request
        # that generated 0 tokens.
        if not isinstance(data, dict):
            return failure("Unexpected response from Ollama server")
        if not resp.ok or "error" in data:
            return failure(str(data.get("error", f"HTTP {resp.status_code}")))

        return {
            "request_id": request_id,
            "success": True,
            "time_s": elapsed,
            "tokens": data.get("eval_count", 0)
        }

    # Launch concurrent requests
    overall_start = time.perf_counter()

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_concurrent) as executor:
        futures = [executor.submit(make_request, i) for i in range(num_concurrent)]
        # Collecting return values (instead of appending to a shared list
        # under a lock) keeps the results ordered by request id.
        results = [future.result() for future in futures]

    overall_time = time.perf_counter() - overall_start

    # Analyze results
    successful = [r for r in results if r["success"]]
    failed = [r for r in results if not r["success"]]

    if successful:
        avg_time = statistics.mean(r["time_s"] for r in successful)
        total_tokens = sum(r["tokens"] for r in successful)

        print(f"\nResults:")
        print(f"  Successful requests: {len(successful)}/{num_concurrent}")
        print(f"  Total time: {overall_time:.2f}s")
        print(f"  Average request time: {avg_time:.2f}s")
        print(f"  Total tokens generated: {total_tokens}")
        print(f"  Overall throughput: {total_tokens/overall_time:.1f} tok/s")

    if failed:
        # Report failures so a fully failed run does not end silently.
        print(f"\nFailed requests: {len(failed)}/{num_concurrent}")
        for r in failed:
            print(f"  Request {r['request_id']}: {r['error']}")

    return {
        "concurrent_requests": num_concurrent,
        "overall_time_s": overall_time,
        "results": results
    }

# Run concurrent test with 4 simultaneous requests (yields {} if no model
# could be selected)
concurrent_results = benchmark_concurrent_requests(num_concurrent=4)

In [None]:
# Solution Part 4: Comprehensive Report

def generate_comprehensive_report():
    """Generate a full benchmark report.

    Lists the installed models, runs the context-length and the
    concurrent-request benchmarks against the first one, and prints a summary
    of both. Everything is written to stdout; nothing is returned. When no
    model can be listed the benchmarks are skipped.
    """
    from datetime import datetime

    print("="*70)
    print("          DGX SPARK COMPREHENSIVE BENCHMARK REPORT")
    print("="*70)
    print(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print()

    # Run all benchmarks
    print("\n[1/3] Testing available models...")
    try:
        response = requests.get(f"{OLLAMA_BASE_URL}/api/tags", timeout=30)
        models = [m["name"] for m in response.json().get("models", [])]
        print(f"Found {len(models)} models: {', '.join(models[:5])}{'...' if len(models) > 5 else ''}")
    except requests.exceptions.Timeout:
        print("Error: Ollama server connection timed out")
        models = []
    except requests.exceptions.ConnectionError:
        print("Error: Could not connect to Ollama server")
        models = []
    except (requests.exceptions.RequestException, ValueError, KeyError) as exc:
        print(f"Error: Could not list Ollama models: {exc}")
        models = []

    if not models:
        # Calling the benchmarks with model=None would only repeat the same
        # failing lookup (and its timeout) twice more.
        print("\nNo models available; skipping benchmarks.")
        return

    print("\n[2/3] Running context length tests...")
    ctx_results = benchmark_context_lengths(models[0])

    print("\n[3/3] Running concurrent request tests...")
    conc_results = benchmark_concurrent_requests(models[0], num_concurrent=3)

    print("\n" + "="*70)
    print("                        SUMMARY")
    print("="*70)

    if ctx_results:
        print("\nContext Length Impact:")
        for r in ctx_results:
            print(f"  {r['context_length']:>5} tokens: Prefill {r['prefill_tps']:>7.1f} tok/s, Decode {r['decode_tps']:>6.1f} tok/s")

    if conc_results:
        # The concurrent benchmark result was previously computed but never
        # included in the summary.
        outcomes = conc_results.get("results", [])
        succeeded = [r for r in outcomes if r.get("success")]
        overall_time = conc_results.get("overall_time_s", 0)
        total_tokens = sum(r.get("tokens", 0) for r in succeeded)

        print("\nConcurrent Requests:")
        print(f"  {len(succeeded)}/{conc_results.get('concurrent_requests', len(outcomes))} succeeded in {overall_time:.2f}s")
        if succeeded and overall_time > 0:
            print(f"  Overall throughput: {total_tokens/overall_time:.1f} tok/s")

    print("\n" + "="*70)
    print("Report complete!")

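# Generate report (disabled by default: uncomment the call below to run it; it sends live requests to the Ollama server)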
# generate_comprehensive_report()

---

## Key Insights from Solutions

### Quantization Trade-offs
| Quantization | Speed | Quality | Memory |
|--------------|-------|---------|--------|
| Q4 | Fastest | Lower | Smallest |
| Q5 | Fast | Good | Small |
| Q8 | Medium | Very Good | Medium |
| FP16 | Slower | Best | Large |

### Context Length Impact
- **Prefill time** scales roughly linearly with context length (the number of prompt tokens actually processed, not the configured `num_ctx` limit)
- **Decode speed** stays relatively constant
- **Memory usage** increases with context (KV cache)

### Concurrent Requests
- Ollama processes requests serially by default
- Multiple requests queue up, increasing latency
- For true parallelism, raise `OLLAMA_NUM_PARALLEL` (where your Ollama version supports it), use multiple Ollama instances, or use vLLM

---

## Cleanup

In [None]:
# Cleanup resources
# Run an explicit garbage-collection pass, then confirm on stdout.
import gc
gc.collect()
print("Cleanup complete!")