# Blackwell Inference Benchmark Analysis

This notebook analyzes benchmark results from the vLLM inference testbench.

**Metrics analyzed:**
- Time to First Token (TTFT)
- Inter-Token Latency (ITL)
- Throughput (tokens/sec)
- GPU Telemetry (utilization, VRAM, power, temperature)

**Usage:**
1. Run benchmarks on the server
2. Copy results to local `../results/` directory
3. Optionally update `RESULT_FILE` below (it defaults to the last `*_detailed.json` file in sorted filename order)
4. Run all cells

## Setup and Imports

In [None]:
import os
import json
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pathlib import Path
from datetime import datetime

# Styling: make "plotly_white" the default template for every figure
# created later in this notebook.
import plotly.io as pio
pio.templates.default = "plotly_white"

# Confirmation that every import above succeeded.
print("✓ Imports successful")

## Load Benchmark Results

In [None]:
# Directory (relative to this notebook) that holds the benchmark output
RESULTS_DIR = Path('../results')

# Collect the detailed per-request result files, ordered by filename,
# and show them with the index used to pick one in the next cell.
json_files = sorted(RESULTS_DIR.glob('*_detailed.json'))
file_listing = [f"{idx}: {path.name}" for idx, path in enumerate(json_files)]

print(f"Found {len(json_files)} result files:\n")
if file_listing:
    print("\n".join(file_listing))

In [None]:
# Select result file (change index or filename).
# json_files is sorted by filename, so [-1] is the last name in that order.
RESULT_FILE = json_files[-1] if json_files else None

if RESULT_FILE is None:
    raise FileNotFoundError("No result files found in ../results/")

# Accept a plain string filename as well as a Path, since later cells rely
# on Path attributes such as .name and .parent.
RESULT_FILE = Path(RESULT_FILE)

print(f"Loading: {RESULT_FILE.name}")

# Load JSON data. The encoding is passed explicitly so the read does not
# depend on the platform's default locale encoding.
with open(RESULT_FILE, 'r', encoding='utf-8') as f:
    results = json.load(f)

# Convert to DataFrame (one row per request record)
df = pd.DataFrame(results)
if df.empty or 'success' not in df.columns:
    # Fail with an explicit message instead of a bare KeyError further down.
    raise ValueError(
        f"{RESULT_FILE.name} contains no request records with a 'success' field"
    )

# Build a strict boolean mask once. `.eq(True)` also copes with an
# object-dtype column (e.g. nulls mixed in), where `~column` would not
# produce a usable boolean inverse.
succeeded = df['success'].eq(True)

print(f"\n✓ Loaded {len(df)} requests")
print(f"  Successful: {succeeded.sum()}")
print(f"  Failed: {(~succeeded).sum()}")

# Filter successful requests only; every later cell analyses this frame
df_success = df[succeeded].copy()

# Display sample
df_success.head()

## Data Overview

In [None]:
# Summary statistics for the successful requests
print("Dataset Info:")
print(f"  Total requests: {len(df_success)}")
# 'context_length' is treated as optional by the other cells in this
# notebook, so guard it here too rather than raising KeyError.
if 'context_length' in df_success.columns:
    print(f"  Context lengths: {sorted(df_success['context_length'].unique())}")
print(f"  Prompt tokens: {df_success['prompt_tokens'].min()}-{df_success['prompt_tokens'].max()}")
print(f"  Tokens generated: {df_success['tokens_generated'].sum()}")

# Basic statistics (left as the final expression so the notebook renders it)
df_success[['ttft_ms', 'itl_ms', 'throughput_tokens_per_sec', 'total_latency_ms']].describe()

## Time to First Token (TTFT) Analysis

In [None]:
# TTFT distribution: 50-bin histogram with a box-plot marginal
ttft_hist_options = dict(
    x='ttft_ms',
    nbins=50,
    title='Time to First Token Distribution',
    labels={'ttft_ms': 'TTFT (ms)', 'count': 'Frequency'},
    marginal='box',
)
fig = px.histogram(df_success, **ttft_hist_options)
fig.update_layout(showlegend=False, height=500)
fig.show()

# Headline statistics, printed in milliseconds with two decimals
ttft_series = df_success['ttft_ms']
ttft_stats = [
    ("Mean", ttft_series.mean()),
    ("Median (P50)", ttft_series.median()),
    ("P95", ttft_series.quantile(0.95)),
    ("P99", ttft_series.quantile(0.99)),
    ("Max", ttft_series.max()),
]
print("\nTTFT Statistics:")
for stat_name, stat_value in ttft_stats:
    print(f"  {stat_name}: {stat_value:.2f} ms")

In [None]:
# TTFT by context length (skipped when the column is absent)
has_context_length = 'context_length' in df_success.columns
if has_context_length:
    ttft_box_labels = {'context_length': 'Context Length', 'ttft_ms': 'TTFT (ms)'}
    # One box per context length; only outlier points are drawn individually
    fig = px.box(
        df_success,
        x='context_length',
        y='ttft_ms',
        points='outliers',
        labels=ttft_box_labels,
        title='TTFT Distribution by Context Length',
    )
    fig.update_layout(height=500)
    fig.show()

## Inter-Token Latency (ITL) Analysis

In [None]:
# ITL distribution: 50-bin histogram with a box-plot marginal
itl_hist_options = dict(
    x='itl_ms',
    nbins=50,
    title='Inter-Token Latency Distribution',
    labels={'itl_ms': 'ITL (ms)', 'count': 'Frequency'},
    marginal='box',
)
fig = px.histogram(df_success, **itl_hist_options)
fig.update_layout(showlegend=False, height=500)
fig.show()

# Headline statistics, printed in milliseconds with two decimals
itl_series = df_success['itl_ms']
itl_stats = [
    ("Mean", itl_series.mean()),
    ("Median (P50)", itl_series.median()),
    ("P95", itl_series.quantile(0.95)),
    ("Min", itl_series.min()),
    ("Max", itl_series.max()),
]
print("\nITL Statistics:")
for stat_name, stat_value in itl_stats:
    print(f"  {stat_name}: {stat_value:.2f} ms")

In [None]:
# ITL over time (request sequence).
# Points are coloured by context length only when that column exists.
if 'context_length' in df_success.columns:
    itl_colour_column = 'context_length'
else:
    itl_colour_column = None

itl_scatter_labels = {'request_id': 'Request ID', 'itl_ms': 'ITL (ms)'}
fig = px.scatter(
    df_success,
    x='request_id',
    y='itl_ms',
    color=itl_colour_column,
    opacity=0.6,
    labels=itl_scatter_labels,
    title='Inter-Token Latency Over Request Sequence',
)
fig.update_layout(height=500)
fig.show()

## Throughput Analysis

In [None]:
# Throughput distribution: 50-bin histogram with a box-plot marginal
throughput_hist_options = dict(
    x='throughput_tokens_per_sec',
    nbins=50,
    title='Throughput Distribution',
    labels={'throughput_tokens_per_sec': 'Throughput (tokens/sec)', 'count': 'Frequency'},
    marginal='box',
)
fig = px.histogram(df_success, **throughput_hist_options)
fig.update_layout(showlegend=False, height=500)
fig.show()

# Headline statistics for per-request throughput plus the overall token count
throughput_series = df_success['throughput_tokens_per_sec']
print("\nThroughput Statistics:")
for stat_name, stat_value in (
    ("Mean", throughput_series.mean()),
    ("Median", throughput_series.median()),
):
    print(f"  {stat_name}: {stat_value:.2f} tokens/sec")
print(f"  Total tokens generated: {df_success['tokens_generated'].sum()}")

## Context Length Comparison

In [None]:
def p95(values):
    """Return the 95th percentile of *values*.

    A named function is used instead of a lambda so the flattened summary
    columns read e.g. ``ttft_ms_p95`` rather than an auto-generated lambda name.
    """
    return values.quantile(0.95)


if 'context_length' in df_success.columns:
    # Aggregate each metric per context length, rounded to two decimals
    ctx_summary = df_success.groupby('context_length').agg({
        'ttft_ms': ['mean', 'median', p95],
        'itl_ms': ['mean', 'median', p95],
        'throughput_tokens_per_sec': ['mean', 'median'],
        'total_latency_ms': ['mean', 'median'],
        'request_id': 'count'
    }).round(2)

    # Flatten the (metric, statistic) column pairs into "metric_statistic"
    ctx_summary.columns = ['_'.join(col).strip() for col in ctx_summary.columns.values]
    ctx_summary.rename(columns={'request_id_count': 'num_requests'}, inplace=True)

    print("\nPerformance by Context Length:")
    display(ctx_summary)

In [None]:
# Multi-metric comparison across context lengths: a 2x2 grid of box plots,
# one panel per metric and one box per context length.
if 'context_length' in df_success.columns:
    panel_titles = (
        'TTFT by Context Length',
        'ITL by Context Length',
        'Throughput by Context Length',
        'Total Latency by Context Length',
    )
    # Metrics in panel order: left-to-right, then top-to-bottom
    panel_metrics = ['ttft_ms', 'itl_ms', 'throughput_tokens_per_sec', 'total_latency_ms']

    fig = make_subplots(rows=2, cols=2, subplot_titles=panel_titles)
    ctx_levels = sorted(df_success['context_length'].unique())

    for panel_idx, metric in enumerate(panel_metrics):
        grid_row, grid_col = divmod(panel_idx, 2)
        row, col = grid_row + 1, grid_col + 1
        # Legend entries come from the first panel only, so each context
        # length is listed once
        in_first_panel = panel_idx == 0
        for ctx_len in ctx_levels:
            in_group = df_success['context_length'] == ctx_len
            box = go.Box(
                y=df_success.loc[in_group, metric],
                name=str(ctx_len),
                showlegend=in_first_panel,
            )
            fig.add_trace(box, row=row, col=col)

    fig.update_layout(height=800, title_text="Performance Metrics Across Context Lengths")
    fig.show()

## GPU Telemetry Analysis

In [None]:
# Load telemetry data if available. df_telem stays None unless valid samples
# are found; later cells check for that.
df_telem = None
telemetry_file = RESULT_FILE.parent / RESULT_FILE.name.replace('_detailed.json', '_telemetry.json')

# If the result file name does not contain '_detailed.json', the replacement
# above is a no-op and the "telemetry" path would be the result file itself.
if telemetry_file == RESULT_FILE or not telemetry_file.exists():
    print("No telemetry file found")
else:
    print(f"Loading telemetry: {telemetry_file.name}")

    # Explicit encoding so the read does not depend on the platform locale
    with open(telemetry_file, 'r', encoding='utf-8') as f:
        telemetry = json.load(f)

    telem_samples = pd.DataFrame(telemetry)

    # Filter out error entries (rows without a timestamp). An empty file or
    # one made up only of such entries has no 'timestamp' column at all, so
    # check for it instead of raising KeyError.
    if 'timestamp' in telem_samples.columns:
        telem_samples = telem_samples[telem_samples['timestamp'].notna()].copy()
    else:
        telem_samples = telem_samples.iloc[0:0]

    if len(telem_samples) > 0:
        # Convert timestamp and derive seconds elapsed since the first sample
        telem_samples['timestamp'] = pd.to_datetime(telem_samples['timestamp'])
        telem_samples['elapsed_sec'] = (
            telem_samples['timestamp'] - telem_samples['timestamp'].min()
        ).dt.total_seconds()
        df_telem = telem_samples

        print(f"\n✓ Loaded {len(df_telem)} telemetry samples")
        print(f"  Duration: {df_telem['elapsed_sec'].max():.1f} seconds")

        # Display sample
        display(df_telem.head())
    else:
        print("No valid telemetry data found")

In [None]:
# GPU metrics over time: four stacked line charts against elapsed seconds
if df_telem is not None and len(df_telem) > 0:
    elapsed = df_telem['elapsed_sec']
    # (subplot title, series, trace name, line colour, y-axis unit), top to bottom.
    # VRAM is converted from MB to GB for display.
    telemetry_panels = [
        ('GPU Utilization', df_telem['gpu_utilization_percent'], 'GPU Util %', 'blue', '%'),
        ('VRAM Usage', df_telem['memory_used_mb'] / 1024, 'VRAM (GB)', 'green', 'GB'),
        ('Temperature', df_telem['temperature_c'], 'Temp (C)', 'red', '°C'),
        ('Power Draw', df_telem['power_draw_w'], 'Power (W)', 'orange', 'Watts'),
    ]

    fig = make_subplots(
        rows=4, cols=1,
        subplot_titles=tuple(panel[0] for panel in telemetry_panels),
        vertical_spacing=0.08
    )

    for row_no, (_, series, trace_name, colour, unit) in enumerate(telemetry_panels, start=1):
        line_trace = go.Scatter(
            x=elapsed, y=series,
            mode='lines', name=trace_name, line=dict(color=colour),
        )
        fig.add_trace(line_trace, row=row_no, col=1)
        fig.update_yaxes(title_text=unit, row=row_no, col=1)

    # Only the bottom panel carries the x-axis title
    fig.update_xaxes(title_text="Time (seconds)", row=4, col=1)

    fig.update_layout(height=1000, title_text="GPU Telemetry Over Time", showlegend=False)
    fig.show()

    # Summary statistics
    vram_mb = df_telem['memory_used_mb']
    gpu_util = df_telem['gpu_utilization_percent']
    temperature = df_telem['temperature_c']
    power = df_telem['power_draw_w']

    print("\nGPU Telemetry Summary:")
    print(f"  Peak VRAM: {vram_mb.max()/1024:.2f} GB")
    print(f"  Avg GPU Util: {gpu_util.mean():.1f}%")
    print(f"  Peak Temperature: {temperature.max():.1f}°C")
    print(f"  Avg Power Draw: {power.mean():.1f}W")
    print(f"  Peak Power Draw: {power.max():.1f}W")

## Summary Report

In [None]:
# Generate comprehensive summary: a plain-text report of the headline numbers
report_rule = "=" * 60
ttft_col = df_success['ttft_ms']
itl_col = df_success['itl_ms']
throughput_col = df_success['throughput_tokens_per_sec']

print(report_rule)
print("BENCHMARK SUMMARY REPORT")
print(report_rule)
print(f"\nFile: {RESULT_FILE.name}")
print(f"Total Requests: {len(df)}")
print(f"Successful: {len(df_success)} ({len(df_success)/len(df)*100:.1f}%)")

# Each section: heading, then (padded label, value, unit suffix) rows
report_sections = [
    ("\n--- TIME TO FIRST TOKEN (TTFT) ---", [
        ("Mean:   ", ttft_col.mean(), " ms"),
        ("Median: ", ttft_col.median(), " ms"),
        ("P95:    ", ttft_col.quantile(0.95), " ms"),
        ("P99:    ", ttft_col.quantile(0.99), " ms"),
    ]),
    ("\n--- INTER-TOKEN LATENCY (ITL) ---", [
        ("Mean:   ", itl_col.mean(), " ms"),
        ("Median: ", itl_col.median(), " ms"),
        ("P95:    ", itl_col.quantile(0.95), " ms"),
    ]),
    ("\n--- THROUGHPUT ---", [
        ("Mean:   ", throughput_col.mean(), " tokens/sec"),
        ("Median: ", throughput_col.median(), " tokens/sec"),
    ]),
]
for heading, rows in report_sections:
    print(heading)
    for row_label, row_value, unit in rows:
        print(f"{row_label}{row_value:.2f}{unit}")
# The token total belongs to the throughput section and is printed unformatted
print(f"Total tokens: {df_success['tokens_generated'].sum()}")

# GPU section only when telemetry was loaded
telemetry_available = df_telem is not None and len(df_telem) > 0
if telemetry_available:
    print("\n--- GPU TELEMETRY ---")
    print(f"Peak VRAM:      {df_telem['memory_used_mb'].max()/1024:.2f} GB")
    print(f"Avg GPU Util:   {df_telem['gpu_utilization_percent'].mean():.1f}%")
    print(f"Peak Temp:      {df_telem['temperature_c'].max():.1f}°C")
    print(f"Avg Power:      {df_telem['power_draw_w'].mean():.1f}W")

print("\n" + report_rule)

## Compare Multiple Benchmark Runs

In [None]:
# Load and compare multiple result files
def load_benchmark(file_path):
    """Load a benchmark result file and return summary statistics.

    Args:
        file_path: Path (``str`` or ``pathlib.Path``) to a JSON file holding
            a list of per-request records.

    Returns:
        dict with keys ``filename``, ``total_requests``, ``successful``,
        ``ttft_mean``, ``ttft_p95``, ``itl_mean``, ``itl_p95``,
        ``throughput_mean`` and ``total_tokens``. Statistics cover only
        records whose ``success`` field is true. For a file with no records
        the counts and token total are 0 and the latency/throughput
        statistics are NaN.
    """
    path = Path(file_path)  # accept plain strings as well as Path objects

    # Explicit encoding so the read does not depend on the platform locale
    with open(path, 'r', encoding='utf-8') as f:
        records = json.load(f)

    requests = pd.DataFrame(records)
    if requests.empty:
        # An empty record list yields a frame without columns; substitute a
        # typed, empty schema so the lookups below work instead of raising
        # KeyError and aborting the comparison of the other runs.
        requests = pd.DataFrame({
            'success': pd.Series(dtype='bool'),
            'ttft_ms': pd.Series(dtype='float64'),
            'itl_ms': pd.Series(dtype='float64'),
            'throughput_tokens_per_sec': pd.Series(dtype='float64'),
            'tokens_generated': pd.Series(dtype='int64'),
        })

    # `.eq(True)` gives a strict boolean mask even for an object-dtype column
    succeeded = requests[requests['success'].eq(True)]

    return {
        'filename': path.name,
        'total_requests': len(requests),
        'successful': len(succeeded),
        'ttft_mean': succeeded['ttft_ms'].mean(),
        'ttft_p95': succeeded['ttft_ms'].quantile(0.95),
        'itl_mean': succeeded['itl_ms'].mean(),
        'itl_p95': succeeded['itl_ms'].quantile(0.95),
        'throughput_mean': succeeded['throughput_tokens_per_sec'].mean(),
        'total_tokens': succeeded['tokens_generated'].sum()
    }

# Compare the most recent runs (at most five, taken from the end of the
# filename-sorted list); a comparison needs at least two result files.
if len(json_files) <= 1:
    print("Only one benchmark result available. Run more benchmarks to enable comparison.")
else:
    recent_files = json_files[-5:]  # Last 5 runs
    comparison_data = list(map(load_benchmark, recent_files))
    df_comparison = pd.DataFrame(comparison_data)

    print("\nComparison of Recent Benchmark Runs:")
    display(df_comparison.round(2))

## Export Summary

Export key metrics to a `*_analysis_summary.json` file, written next to the result file, for tracking over time.

In [None]:
# Create summary export: headline metrics as plain Python floats/ints so
# they serialise to JSON.
summary = {
    'benchmark_file': RESULT_FILE.name,
    'timestamp': datetime.now().isoformat(),
    'total_requests': len(df),
    'successful_requests': len(df_success),
    'success_rate': len(df_success) / len(df),
    'ttft': {
        'mean_ms': float(df_success['ttft_ms'].mean()),
        'median_ms': float(df_success['ttft_ms'].median()),
        'p95_ms': float(df_success['ttft_ms'].quantile(0.95)),
        'p99_ms': float(df_success['ttft_ms'].quantile(0.99))
    },
    'itl': {
        'mean_ms': float(df_success['itl_ms'].mean()),
        'median_ms': float(df_success['itl_ms'].median()),
        'p95_ms': float(df_success['itl_ms'].quantile(0.95))
    },
    'throughput': {
        'mean_tokens_per_sec': float(df_success['throughput_tokens_per_sec'].mean()),
        'total_tokens': int(df_success['tokens_generated'].sum())
    }
}

# GPU section only when telemetry was loaded
if df_telem is not None and len(df_telem) > 0:
    summary['gpu'] = {
        'peak_vram_gb': float(df_telem['memory_used_mb'].max() / 1024),
        'avg_utilization_percent': float(df_telem['gpu_utilization_percent'].mean()),
        'peak_temperature_c': float(df_telem['temperature_c'].max()),
        'avg_power_w': float(df_telem['power_draw_w'].mean())
    }

# Save summary next to the result file.
summary_name = RESULT_FILE.name.replace('_detailed.json', '_analysis_summary.json')
if summary_name == RESULT_FILE.name:
    # The name did not contain '_detailed.json', so the replacement was a
    # no-op; writing to it would overwrite the raw benchmark results.
    # Derive a distinct name from the stem instead.
    summary_name = f"{RESULT_FILE.stem}_analysis_summary.json"
summary_path = RESULT_FILE.parent / summary_name

with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

print(f"✓ Summary exported to {summary_path.name}")