# Qwen3-Coder-Next-FP8 Performance Speed Test

This notebook benchmarks the performance of the locally running **Qwen3-Coder-Next-FP8** model via vLLM on port 8000.

This notebook was created by vibe coding with **Qwen/Qwen3-Coder-Next-FP8** in **Cursor**.

Testing the locally running model via vLLM at `http://localhost:8000`.

## 1. Environment Setup & Health Check

This section verifies that the vLLM API is accessible and confirms the model is available. We'll:
- Check vLLM API connectivity
- List available models
- Verify Qwen3-Coder-Next-FP8 is loaded
- Capture system information for context
- Validate API key authentication

In [None]:
import os
import sys
from dotenv import load_dotenv

# Static benchmark configuration: the vLLM endpoint and the model under test
MODEL_NAME = "Qwen/Qwen3-Coder-Next-FP8"
VLLM_BASE_URL = "http://localhost:8000"
VLLM_OPENAI_COMPATIBLE_URL = f"{VLLM_BASE_URL}/v1"

# The .env file is looked up two directories above the current working
# directory, which is treated as the project root.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
env_path = os.path.join(project_root, '.env')

print(f"üîç Looking for .env file at: {project_root}")

# override=True lets values from the .env file replace variables that are
# already set in the process environment.
load_dotenv(dotenv_path=env_path, override=True)
VLLM_API_KEY = os.environ.get('VLLM_API_KEY')

# Abort when the key is missing or empty; the key itself is never printed
if not VLLM_API_KEY:
    print("\n‚ùå VLLM_API_KEY not found in environment variables!")
    print("   Please ensure .env file exists at:", env_path)
    print("   And contains: VLLM_API_KEY=<your-api-key>")
    sys.exit(1)

# Echo the non-secret configuration so later output can be interpreted
print("‚úÖ Environment loaded successfully")
print(f"   vLLM Base URL: {VLLM_BASE_URL}")
print(f"   OpenAI Compatible URL: {VLLM_OPENAI_COMPATIBLE_URL}")
print(f"   Model Name: {MODEL_NAME}")

In [None]:
import requests
import json
import cpuinfo
import psutil
import platform
from datetime import datetime

# API headers with authorization
API_HEADERS = {
    "Authorization": f"Bearer {VLLM_API_KEY}"
}

def check_api_connectivity():
    """Probe the vLLM ``/health`` endpoint and report the outcome.

    Returns:
        True when the endpoint answered with HTTP 200; False for any other
        status code or when the request raised an exception.
    """
    banner = "=" * 60
    print(banner)
    print("1. Checking API Connectivity...")
    print(banner)

    try:
        # Plain GET with a 5 second timeout; no Authorization header is sent
        response = requests.get(f"{VLLM_BASE_URL}/health", timeout=5)

        if response.status_code != 200:
            print(f"‚ö†Ô∏è  API returned status {response.status_code}")
            return False

        print("‚úÖ API is reachable!")
        print(f"   Response status: {response.status_code}")
        return True
    except requests.exceptions.ConnectionError:
        print("‚ùå Cannot connect to vLLM API!")
        print(f"   Make sure vLLM is running at {VLLM_BASE_URL}")
        print("   Run: vllm serve Qwen/Qwen3-Coder-Next-FP8 --port 8000 --api-key <your-api-key>")
        return False
    except requests.exceptions.Timeout:
        print("‚ùå Connection timed out!")
        return False
    except Exception as e:
        # Anything else is reported rather than propagated so the remaining
        # checks in the notebook still run.
        print(f"‚ùå Unexpected error: {e}")
        return False

def get_vllm_info():
    """Fetch and print the JSON document served at ``/version``.

    Returns:
        The decoded JSON body on success, or None when the request, the
        status check, or the JSON decoding fails.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("2. Getting vLLM Server Information...")
    print(banner)

    try:
        response = requests.get(f"{VLLM_BASE_URL}/version", timeout=5)
        response.raise_for_status()
        version_info = response.json()

        print("‚úÖ vLLM Version Info:")
        print(f"   {json.dumps(version_info, indent=2)}")
        return version_info
    except Exception as e:
        # HTTP errors and every other failure are reported identically
        print(f"‚ùå Failed to get version: {e}")
        return None

def list_available_models():
    """Query ``/v1/models`` with the API key and print every model id.

    Returns:
        The list stored under ``"data"`` in the response, an empty list when
        the response has no ``"data"`` key, or None when the request fails.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("3. Listing Available Models...")
    print(banner)

    try:
        response = requests.get(f"{VLLM_OPENAI_COMPATIBLE_URL}/models", headers=API_HEADERS, timeout=10)
        response.raise_for_status()
        models_info = response.json()

        if "data" not in models_info:
            print("‚ö†Ô∏è  No models found in response")
            return []

        print("‚úÖ Available Models:")
        for model in models_info["data"]:
            model_id = model.get("id", "unknown")
            print(f"   - {model_id}")
        return models_info["data"]
    except requests.exceptions.HTTPError as e:
        print(f"‚ùå Failed to list models: {e}")
        # Show the server's error body when the exception carries a response
        if getattr(e, 'response', None) is not None:
            print(f"   Response: {e.response.text}")
        return None
    except Exception as e:
        print(f"‚ùå Failed to list models: {e}")
        return None

def validate_api_key():
    """Check the API key by calling ``/v1/models`` with the auth header.

    Returns:
        True only when the server answers HTTP 200. A 401, any other status
        code, or a raised exception all yield False.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("4. Validating API Key...")
    print(banner)

    try:
        # The models listing is requested with the Authorization header set
        response = requests.get(f"{VLLM_OPENAI_COMPATIBLE_URL}/models", headers=API_HEADERS, timeout=10)

        if response.status_code == 200:
            print("‚úÖ API key is valid!")
            print(f"   Response status: {response.status_code}")
            return True

        if response.status_code == 401:
            print("‚ùå API key is invalid or expired!")
            print(f"   Response status: {response.status_code}")
            return False

        print(f"‚ö†Ô∏è  Unexpected response: {response.status_code}")
        return False
    except Exception as e:
        print(f"‚ùå Validation failed: {e}")
        return False

def get_system_info():
    """Print CPU, memory, GPU and platform details for benchmark context.

    GPU details come from ``nvidia-smi``. Its CSV output has one line per
    GPU, so each line is parsed separately. When the command exits with a
    non-zero status or yields no usable row, a "not available" line is
    printed instead of leaving the GPU section empty.

    Returns:
        True when the report completed; False when collecting the CPU,
        memory or platform information raised. GPU failures are reported
        inline and do not change the return value.
    """
    print("\n" + "=" * 60)
    print("5. System Information...")
    print("=" * 60)

    try:
        # CPU info
        cpu_info = cpuinfo.get_cpu_info()
        print("CPU:")
        print(f"   Brand: {cpu_info.get('brand_raw', 'N/A')}")
        print(f"   Cores: {psutil.cpu_count(logical=False)} physical, {psutil.cpu_count(logical=True)} logical")
        print(f"   Load: {psutil.cpu_percent(interval=0.5)}%")

        # Memory info
        memory = psutil.virtual_memory()
        print("\nMemory:")
        print(f"   Total: {memory.total / (1024**3):.2f} GB")
        print(f"   Available: {memory.available / (1024**3):.2f} GB")
        print(f"   Used: {memory.percent}%")

        # GPU info (if available)
        print("\nGPU:")
        try:
            import subprocess
            result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.total,memory.used,temperature.gpu,power.draw', '--format=csv,noheader,nounits'],
                                  capture_output=True, text=True, timeout=5)

            # One CSV line per GPU; keep only lines carrying all five fields
            gpu_rows = []
            for line in result.stdout.splitlines():
                fields = [field.strip() for field in line.split(',')]
                if len(fields) >= 5:
                    gpu_rows.append(fields)

            if result.returncode != 0 or not gpu_rows:
                detail = result.stderr.strip() or "nvidia-smi reported no GPUs"
                print(f"   GPU info not available: {detail}")
            else:
                for index, gpu_info in enumerate(gpu_rows):
                    # Only label the entries when there is more than one GPU,
                    # so single-GPU output keeps its original layout.
                    if len(gpu_rows) > 1:
                        print(f"   GPU {index}:")
                    print(f"   Name: {gpu_info[0]}")
                    print(f"   Total Memory: {gpu_info[1]} MB")
                    print(f"   Used Memory: {gpu_info[2]} MB")
                    print(f"   Temperature: {gpu_info[3]}¬∞C")
                    print(f"   Power Draw: {gpu_info[4]} W")
        except Exception as e:
            # Best effort: a missing nvidia-smi binary or a timeout must not
            # abort the rest of the report.
            print(f"   GPU info not available: {e}")

        # Platform info
        print("\nPlatform:")
        print(f"   OS: {platform.system()} {platform.release()}")
        print(f"   Python: {platform.python_version()}")
        print(f"   Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        return True
    except Exception as e:
        print(f"‚ö†Ô∏è  Could not capture full system info: {e}")
        return False

# Execute every health check in order, keeping each outcome for the summary
print("üîç Starting Environment Setup & Health Check...")
print()

api_connected = check_api_connectivity()
vllm_version = get_vllm_info()
available_models = list_available_models()
api_key_valid = validate_api_key()
system_info = get_system_info()

# Summary of the outcomes collected above
print("\n" + "=" * 60)
print("SUMMARY")
print("=" * 60)
print(f"API Connectivity: {'‚úÖ' if api_connected else '‚ùå'}")
print(f"API Key Valid: {'‚úÖ' if api_key_valid else '‚ùå'}")

if not available_models:
    # Both None (request failed) and an empty list end up here
    print("Model Available: ‚ö†Ô∏è  Could not verify (model listing failed)")
else:
    # Substring match of MODEL_NAME against each listed model id
    model_available = any(MODEL_NAME in m.get('id', '') for m in available_models)
    print(f"Model Available: {'‚úÖ' if model_available else '‚ö†Ô∏è  Model not found or listing failed'}")

print(f"System Info Captured: {'‚úÖ' if system_info else '‚ö†Ô∏è  Partial'}")

# Readiness depends only on connectivity and a valid key
if not (api_connected and api_key_valid):
    print("\n‚ùå Environment issues detected. Please fix above errors before proceeding.")
else:
    print("\n‚úÖ Environment is ready for benchmarking!")

## 2. Time to First Token (TTFT) Analysis

This section measures the model's **Time to First Token (TTFT)** - how long it takes for the model to start generating text after receiving a prompt. This is a critical metric for interactive applications.

**Test Scenarios:**
- **Short prompts (~50 tokens)**: Simple completions to establish baseline
- **Medium prompts (~500 tokens)**: Moderate context handling
- **Long prompts (~3K tokens)**: Extended context behavior
- **Very long prompts (~10K tokens)**: Long context testing

**Metrics:**
- **TTFT (ms)**: Time from request to first token
- **Total latency (ms)**: End-to-end response time
- **Tokens/second**: Generation throughput

In [None]:
import time
from typing import Dict, List

def measure_tts_latency(prompt: str, max_tokens: int = 100) -> Dict:
    """
    Measure Time to First Token (TTFT) and latency for a single prompt.

    The request is sent with ``stream=True`` and TTFT is taken client-side as
    the delay between sending the request and receiving the first streamed
    chunk with non-empty content. A non-streaming response offers no point at
    which the first token can be timed.

    Token counts come from the ``usage`` object of the final stream chunk
    (requested via ``stream_options.include_usage``). When the server does
    not send it, whitespace-separated word counts are used as a rough
    fallback.

    Args:
        prompt: Input prompt text
        max_tokens: Maximum tokens to generate

    Returns:
        Dict with timing metrics, or None when the request failed.
        ``ttft_ms`` is 0 when no content chunk was received.
    """
    # Use messages format for /chat/completions endpoint
    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "stream": True,
        # Ask for a final chunk carrying prompt/completion token counts
        "stream_options": {"include_usage": True},
        "max_tokens": max_tokens,
        "temperature": 0.7,
        "top_p": 0.9
    }

    first_token_time = None
    text_parts = []
    usage = {}

    try:
        # perf_counter is monotonic, so intervals are immune to clock changes
        start_time = time.perf_counter()
        with requests.post(
            f"{VLLM_OPENAI_COMPATIBLE_URL}/chat/completions",
            json=payload,
            headers=API_HEADERS,
            timeout=120,
            stream=True
        ) as response:
            response.raise_for_status()

            # Server-sent events: each payload line looks like "data: {json}"
            # and the stream is terminated by "data: [DONE]".
            for raw_line in response.iter_lines():
                if not raw_line:
                    continue
                line = raw_line.decode("utf-8", errors="replace")
                if not line.startswith("data:"):
                    continue
                data = line[len("data:"):].strip()
                if data == "[DONE]":
                    break
                try:
                    chunk = json.loads(data)
                except ValueError:
                    # Skip anything that is not a JSON event
                    continue

                if chunk.get("usage"):
                    usage = chunk["usage"]

                for choice in chunk.get("choices") or []:
                    piece = (choice.get("delta") or {}).get("content")
                    if piece:
                        if first_token_time is None:
                            first_token_time = time.perf_counter()
                        text_parts.append(piece)
        end_time = time.perf_counter()
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Request failed: {e}")
        if getattr(e, 'response', None) is not None:
            print(f"   Response: {e.response.text}")
        return None

    total_time = end_time - start_time
    response_text = "".join(text_parts)

    # Prefer server-reported token counts; fall back to word counts
    completion_tokens = usage.get("completion_tokens")
    prompt_tokens = usage.get("prompt_tokens")
    output_tokens = completion_tokens if isinstance(completion_tokens, int) else len(response_text.split())
    prompt_length = prompt_tokens if isinstance(prompt_tokens, int) else len(prompt.split())
    ttft_ms = (first_token_time - start_time) * 1000 if first_token_time is not None else 0

    return {
        'prompt_length': prompt_length,
        'max_tokens': max_tokens,
        'output_tokens': output_tokens,
        'total_time_ms': total_time * 1000,
        'ttft_ms': ttft_ms,
        'tokens_per_second': output_tokens / total_time if total_time > 0 else 0,
        'response_preview': response_text[:150] + '...' if len(response_text) > 150 else response_text
    }

def _expand_prompt(base_text: str, target_tokens: int, words_per_token: float = 0.75) -> str:
    """Repeat base_text until it holds roughly target_tokens tokens.

    The size is estimated from whitespace-separated words using
    words_per_token (a rule-of-thumb ratio for English text, not a tokenizer
    measurement). Text that is already long enough is returned unchanged.
    """
    words = base_text.split()
    target_words = max(1, int(target_tokens * words_per_token))
    if not words or len(words) >= target_words:
        return base_text
    repeats = -(-target_words // len(words))  # ceiling division
    return " ".join((words * repeats)[:target_words])


def run_tts_tests() -> List[Dict]:
    """
    Run TTFT tests with different prompt sizes.

    Each scenario starts from a natural-language base text. Because the base
    texts are far shorter than the advertised scenario sizes, they are
    repeated up to the target size so that the "~500", "~3K" and "~10K"
    scenarios really exercise prompts of that order of magnitude.

    Returns:
        List of results for each test (failed tests are omitted)
    """
    print("‚è±Ô∏è  Running Time to First Token (TTFT) Tests...\n")

    # Base texts; the largest target (~10K tokens) stays well within the
    # 131K token context limit.

    short_prompt = "Once upon a time in a land far away, there was a curious developer named Alex who loved solving complex problems with elegant code. One day, Alex discovered a hidden repository that promised to revolutionize how we write software."

    medium_prompt = """Recursion is a fundamental concept in programming where a function calls itself to solve smaller instances of a problem. It's like looking into a mirror that reflects another mirror, creating an infinite regression. In programming, recursion consists of two key components: the base case and the recursive case. The base case is the condition that stops the recursion, preventing infinite loops. The recursive case is where the function calls itself with modified arguments, moving closer to the base case. A classic example is calculating factorial: factorial(5) = 5 * factorial(4) = 5 * 4 * factorial(3) = 5 * 4 * 3 * factorial(2) = 5 * 4 * 3 * 2 * factorial(1) = 5 * 4 * 3 * 2 * 1 = 120. Another famous example is computing Fibonacci numbers, where each number is the sum of the two preceding ones. Recursion is particularly powerful for tree traversal, divide-and-conquer algorithms, and backtracking problems. However, recursive solutions can lead to stack overflow if the recursion depth is too large, and they may be less efficient than iterative solutions due to function call overhead. Understanding when to use recursion versus iteration is a crucial skill for every programmer."""

    long_prompt = """Machine learning is a subset of artificial intelligence that enables computers to learn from data without being explicitly programmed. It involves building mathematical models that can identify patterns, make predictions, and improve their performance over time. The core components of machine learning include data, features, models, and learning algorithms. Supervised learning uses labeled data to train models for tasks like classification and regression. Unsupervised learning discovers hidden patterns in unlabeled data through techniques like clustering and dimensionality reduction. Reinforcement learning involves agents learning to interact with environments by receiving rewards or penalties. Neural networks, inspired by the human brain, are powerful machine learning models consisting of interconnected layers of nodes. Deep learning, a subset of neural networks, uses multiple hidden layers to learn complex representations. Convolutional Neural Networks (CNNs) excel at image processing, while Recurrent Neural Networks (RNNs) and Transformers dominate sequential data tasks like language modeling. Popular machine learning frameworks like TensorFlow, PyTorch, and Scikit-learn have democratized access to these powerful tools. Real-world applications span healthcare (diagnosis), finance (fraud detection), transportation (autonomous vehicles), and entertainment (recommendation systems). However, machine learning faces challenges including data quality, model interpretability, bias and fairness, and computational requirements. As the field continues advancing, it promises to transform industries and improve lives worldwide."""

    very_long_prompt = """Software documentation is essential communication that explains how to use, maintain, and contribute to a software project. Good documentation serves multiple audiences including end users, developers, system administrators, and contributors. User documentation focuses on features, installation, configuration, and troubleshooting guides. Developer documentation covers architecture, API references, coding standards, and contribution guidelines. Key principles of effective documentation include clarity, completeness, consistency, and currency. Documentation should be written in plain language with clear examples and code snippets. Version control systems like Git help track documentation changes alongside code changes. Common documentation formats include README files, Markdown documents, API documentation generators like Swagger and Doxygen, and documentation sites built with tools like MkDocs and Sphinx. Modern documentation practices emphasize interactive examples, video tutorials, and community-driven content. Automated documentation generation reduces manual effort and ensures documentation stays synchronized with code. Poor documentation leads to user frustration, increased support costs, and project abandonment. Conversely, comprehensive documentation accelerates onboarding, reduces bugs, and fosters community contributions. Best practices include documenting design decisions, maintaining changelogs, providing code examples for all APIs, and regularly reviewing documentation for accuracy. Technical writers and developers collaborate to produce documentation that balances depth with accessibility, ensuring the software reaches its full potential.
    
Additional content to reach target length: The software development lifecycle encompasses planning, analysis, design, implementation, testing, deployment, and maintenance phases. Agile methodologies like Scrum and Kanban have transformed how teams deliver software iteratively. Continuous integration and continuous deployment (CI/CD) pipelines automate testing and deployment processes. Testing strategies include unit tests, integration tests, and end-to-end tests to ensure code quality. Code review practices improve code quality and knowledge sharing among team members. Performance optimization involves profiling, benchmarking, and identifying bottlenecks in code. Security considerations must be integrated throughout the development lifecycle. Scalability and availability requirements drive architecture decisions. Cloud computing platforms provide infrastructure for deploying and scaling applications. Open source software has revolutionized how technology evolves through community collaboration and shared knowledge. Understanding these concepts provides a foundation for professional software engineering practice."""

    # Test cases: (name, base text, target prompt tokens, max output tokens)
    test_cases = [
        ("Short (~50 tokens)", short_prompt, 50, 50),
        ("Medium (~500 tokens)", medium_prompt, 500, 100),
        ("Long (~3K tokens)", long_prompt, 3000, 200),
        ("Very Long (~10K tokens)", very_long_prompt, 10000, 300)
    ]

    results = []

    for name, base_text, target_tokens, max_tokens in test_cases:
        prompt = _expand_prompt(base_text, target_tokens)

        print(f"--- {name} Test ---")
        print(f"   Prompt length: ~{len(prompt.split())} words (target ~{target_tokens} tokens)")
        print(f"   Max output tokens: {max_tokens}")

        result = measure_tts_latency(prompt, max_tokens)

        if result:
            results.append(result)
            print(f"   Output tokens: {result['output_tokens']}")
            print(f"   TTFT: {result['ttft_ms']:.2f} ms")
            print(f"   Total time: {result['total_time_ms']:.2f} ms")
            print(f"   Speed: {result['tokens_per_second']:.2f} tokens/sec")
            print(f"   Response preview: {result['response_preview'][:100]}...")
        else:
            print(f"   ‚ùå Failed for {name} test")
        print()

    return results

In [None]:
# Run TTFT tests and keep the per-scenario result dicts in `tts_results`;
# scenarios whose request failed are not included in the list.
tts_results = run_tts_tests()

## Summary & Analysis

After running the tests, you can analyze the results to understand:

- How TTFT scales with prompt length
- Generation throughput at different output lengths
- Whether the model shows signs of context truncation with very long prompts

### Expected Patterns:

- **TTFT should increase** with longer prompts (more context to process)
- **Throughput (tokens/sec)** should be relatively stable for similar output sizes
- **Very long contexts** (10K+ tokens) may show degraded performance or errors if context length limits are exceeded

In [None]:
# Display results in a formatted table
print("=" * 80)
print("TTFT TEST RESULTS SUMMARY")
print("=" * 80)

if tts_results:
    print(f"{'Test Case':<25} {'Prompt Tok':<10} {'Output Tok':<10} {'TTFT (ms)':<12} {'Total (ms)':<12} {'Tok/s':<10}")
    print("-" * 80)

    for r in tts_results:
        # Rows are labelled "<prompt length>p/<max output tokens>o"
        test_case = f"{r['prompt_length']}p/{r['max_tokens']}o"
        print(f"{test_case:<25} {r['prompt_length']:<10} {r['output_tokens']:<10} {r['ttft_ms']:<12.2f} {r['total_time_ms']:<12.2f} {r['tokens_per_second']:<10.2f}")

    print("=" * 80)

    # Average TTFT only over runs that reported one; a value of 0 means the
    # measurement was unavailable for that run.
    valid_ttft_results = [r for r in tts_results if r['ttft_ms'] > 0]
    if valid_ttft_results:
        avg_ttft = sum(r['ttft_ms'] for r in valid_ttft_results) / len(valid_ttft_results)
        print(f"\nAverage TTFT: {avg_ttft:.2f} ms")
    else:
        print("\nAverage TTFT: n/a (no run reported a TTFT)")

    # Throughput does not depend on TTFT, so it is reported in either case
    avg_throughput = sum(r['tokens_per_second'] for r in tts_results) / len(tts_results)
    print(f"Average Throughput: {avg_throughput:.2f} tokens/sec")
else:
    print("‚ùå No results to display. Tests may have failed.")

## 3. End-to-End Latency Analysis

This section measures the **total end-to-end latency** for generating responses with different output lengths. Unlike TTFT, which focuses on the time to the first token, this test measures the complete time from request to full response generation.

**Test Scenarios:**
- **Small output (50 tokens)**: Quick completion tasks
- **Medium output (150 tokens)**: Short answer generation
- **Large output (300 tokens)**: Detailed response generation
- **Very large output (500 tokens)**: Extended content generation

**Metrics:**
- **Total latency (ms)**: End-to-end response time
- **Tokens/second**: Overall generation throughput
- **Latency per token (ms/token)**: Average time per output token

**Expected Patterns:**
- Longer outputs will take more time but may have better throughput (amortized setup cost)
- Total latency should scale roughly linearly with output length
- Throughput (tokens/sec) should be relatively stable for similar workloads

In [None]:
import time
from typing import Dict, List

def measure_end_to_end_latency(prompt: str, max_tokens: int = 100) -> Dict:
    """
    Measure end-to-end latency for generating a response with a fixed prompt.

    The output token count is taken from ``usage.completion_tokens`` in the
    response when present. A whitespace word count of the response text is
    used only as a fallback, since words and tokens are not the same unit.

    Args:
        prompt: Input prompt text (kept constant across tests)
        max_tokens: Maximum tokens to generate

    Returns:
        Dict with latency metrics, or None when the request failed or the
        response body was not valid JSON.
    """
    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "stream": False,
        "max_tokens": max_tokens,
        "temperature": 0.7,
        "top_p": 0.9
    }

    try:
        # perf_counter is monotonic, so the interval is immune to clock changes
        start_time = time.perf_counter()
        response = requests.post(
            f"{VLLM_OPENAI_COMPATIBLE_URL}/chat/completions",
            json=payload,
            headers=API_HEADERS,
            timeout=120
        )
        response.raise_for_status()
        total_time = time.perf_counter() - start_time

        result = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions whose
        # decode error is not a RequestException.
        print(f"‚ùå Request failed: {e}")
        if getattr(e, 'response', None) is not None:
            print(f"   Response: {e.response.text}")
        return None

    # Get response text; `or ""` guards against an explicit null content
    choices = result.get("choices") or []
    if choices:
        response_text = (choices[0].get("message") or {}).get("content") or ""
    else:
        response_text = result.get("response") or ""

    # Calculate metrics, preferring the server-reported token count
    completion_tokens = (result.get("usage") or {}).get("completion_tokens")
    if isinstance(completion_tokens, int):
        output_tokens = completion_tokens
    else:
        output_tokens = len(response_text.split())
    latency_per_token = (total_time * 1000) / output_tokens if output_tokens > 0 else 0

    return {
        'max_tokens': max_tokens,
        'output_tokens': output_tokens,
        'total_time_ms': total_time * 1000,
        'latency_per_token_ms': latency_per_token,
        'tokens_per_second': output_tokens / total_time if total_time > 0 else 0,
        'response_preview': response_text[:150] + '...' if len(response_text) > 150 else response_text
    }

def run_end_to_end_latency_tests() -> List[Dict]:
    """
    Run the end-to-end latency scenarios, varying only the output budget.

    The same prompt is sent for every scenario; only max_tokens changes.

    Returns:
        One result dict per scenario whose request succeeded
    """
    print("‚è±Ô∏è  Running End-to-End Latency Tests...\n")

    # One fixed prompt for all scenarios so that only the output length varies
    consistent_prompt = """The concept of artificial intelligence has evolved significantly since its inception. AI systems now capable of understanding natural language, recognizing images, and making decisions in complex environments. The field continues to advance rapidly with new breakthroughs."""

    # (label, max_tokens) pairs
    test_cases = [
        ("Small output (50 tokens)", 50),
        ("Medium output (150 tokens)", 150),
        ("Large output (300 tokens)", 300),
        ("Very large output (500 tokens)", 500)
    ]

    collected = []

    for name, max_tokens in test_cases:
        print(f"--- {name} Test ---")
        print("   Prompt: Constant (moderate length)")
        print(f"   Max output tokens: {max_tokens}")

        result = measure_end_to_end_latency(consistent_prompt, max_tokens)

        # A falsy result means the measurement failed; report it and move on
        if not result:
            print(f"   ‚ùå Failed for {name} test")
            print()
            continue

        collected.append(result)
        print(f"   Output tokens: {result['output_tokens']}")
        print(f"   Total latency: {result['total_time_ms']:.2f} ms")
        print(f"   Tokens/sec: {result['tokens_per_second']:.2f}")
        print(f"   Latency per token: {result['latency_per_token_ms']:.2f} ms/token")
        print(f"   Response preview: {result['response_preview'][:100]}...")
        print()

    return collected

In [None]:
# Run End-to-End Latency tests and keep the per-scenario result dicts in
# `e2e_results`; scenarios whose request failed are not included.
e2e_results = run_end_to_end_latency_tests()

## Summary & Analysis

After running the end-to-end latency tests, analyze:

- How total latency scales with output length
- Whether throughput (tokens/sec) remains consistent
- Latency per token trends across different output sizes

**Expected Observations:**
- Larger outputs will take more total time but may have better tokens/sec (setup cost amortized)
- Latency per token should be relatively stable for similar workloads
- Compare with TTFT results: end-to-end latency includes TTFT + generation time

In [None]:
# Render the end-to-end latency results as a table followed by averages
print("=" * 80)
print("END-TO-END LATENCY TEST RESULTS SUMMARY")
print("=" * 80)

if not e2e_results:
    print("‚ùå No results to display. Tests may have failed.")
else:
    print(f"{'Output Size':<25} {'Output Tok':<12} {'Total (ms)':<12} {'Tok/s':<12} {'Latency/Token (ms)':<20}")
    print("-" * 80)

    # One row per scenario, labelled by its requested output budget
    for r in e2e_results:
        output_size = f"{r['max_tokens']} tokens"
        print(f"{output_size:<25} {r['output_tokens']:<12} {r['total_time_ms']:<12.2f} {r['tokens_per_second']:<12.2f} {r['latency_per_token_ms']:<20.2f}")

    print("=" * 80)

    # Plain arithmetic means across all scenarios
    avg_latency = sum(r['total_time_ms'] for r in e2e_results) / len(e2e_results)
    avg_throughput = sum(r['tokens_per_second'] for r in e2e_results) / len(e2e_results)
    avg_latency_per_token = sum(r['latency_per_token_ms'] for r in e2e_results) / len(e2e_results)

    print(f"\nAverage Total Latency: {avg_latency:.2f} ms")
    print(f"Average Throughput: {avg_throughput:.2f} tokens/sec")
    print(f"Average Latency per Token: {avg_latency_per_token:.2f} ms/token")

## 4. Throughput (Tokens/Second) Analysis

This section measures the **generation throughput** - how many tokens per second the model can produce during sustained generation. This is critical for understanding the model's sustained performance capabilities.

**Test Scenarios:**
- **Short generation (50 tokens)**: Quick throughput baseline
- **Medium generation (150 tokens)**: Moderate throughput measurement
- **Long generation (300 tokens)**: Sustained throughput evaluation
- **Extended generation (500 tokens)**: Long-duration throughput

**Metrics:**
- **Tokens/second**: Primary throughput metric
- **Total latency**: Context for throughput calculations
- **Consistency**: Multiple runs to measure variance

**Expected Patterns:**
- Throughput should be relatively stable across different output sizes
- Larger generations may show better throughput (fixed overhead amortized)
- Standard deviation across multiple runs indicates stability

In [None]:
import time
import statistics
from typing import Dict, List

def measure_throughput(prompt: str, max_tokens: int = 100, num_runs: int = 3) -> Dict:
    """
    Measure throughput (tokens/second) for generating responses.
    Runs multiple times and reports average, min, max, and standard deviation.

    Throughput is computed per run, and the min/max/std-dev figures are taken
    over those per-run values, so every reported throughput corresponds to a
    run that actually happened. The output token count comes from
    ``usage.completion_tokens`` when present, falling back to a whitespace
    word count of the response text.

    Args:
        prompt: Input prompt text
        max_tokens: Maximum tokens to generate
        num_runs: Number of runs to attempt (default 3, enough for a rough
            spread but not for statistical significance)

    Returns:
        Dict with throughput metrics including statistics across the
        successful runs, or None when every run failed. ``num_runs`` is the
        requested count; ``successful_runs`` is how many runs contributed.
    """
    # The request body is identical for every run, so build it once
    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "stream": False,
        "max_tokens": max_tokens,
        "temperature": 0.7,
        "top_p": 0.9
    }

    latencies = []
    output_tokens_list = []

    for run in range(num_runs):
        try:
            # perf_counter is monotonic, so intervals are immune to clock changes
            start_time = time.perf_counter()
            response = requests.post(
                f"{VLLM_OPENAI_COMPATIBLE_URL}/chat/completions",
                json=payload,
                headers=API_HEADERS,
                timeout=120
            )
            response.raise_for_status()
            total_time = time.perf_counter() - start_time

            result = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # A failed run is reported and skipped; remaining runs still execute
            print(f"‚ùå Run {run + 1} failed: {e}")
            if getattr(e, 'response', None) is not None:
                print(f"   Response: {e.response.text}")
            continue

        # Get response text; `or ""` guards against an explicit null content
        choices = result.get("choices") or []
        if choices:
            response_text = (choices[0].get("message") or {}).get("content") or ""
        else:
            response_text = result.get("response") or ""

        completion_tokens = (result.get("usage") or {}).get("completion_tokens")
        if isinstance(completion_tokens, int):
            output_tokens = completion_tokens
        else:
            output_tokens = len(response_text.split())

        latencies.append(total_time * 1000)  # Convert to ms
        output_tokens_list.append(output_tokens)

    if not latencies:
        return None

    # Per-run throughput in tokens/second
    throughputs = [
        tokens / (latency_ms / 1000) if latency_ms > 0 else 0
        for tokens, latency_ms in zip(output_tokens_list, latencies)
    ]

    # Aggregate throughput: mean tokens over mean latency
    avg_latency = statistics.mean(latencies)
    avg_tokens = statistics.mean(output_tokens_list)
    avg_throughput = avg_tokens / (avg_latency / 1000) if avg_latency > 0 else 0

    # Sample standard deviation needs at least two data points
    has_spread = len(latencies) > 1
    std_latency = statistics.stdev(latencies) if has_spread else 0
    std_throughput = statistics.stdev(throughputs) if has_spread else 0

    return {
        'max_tokens': max_tokens,
        'num_runs': num_runs,
        'successful_runs': len(latencies),
        'avg_output_tokens': avg_tokens,
        'avg_latency_ms': avg_latency,
        'min_latency_ms': min(latencies),
        'max_latency_ms': max(latencies),
        'std_latency_ms': std_latency,
        'avg_throughput': avg_throughput,
        'std_throughput': std_throughput,
        'min_throughput': min(throughputs),
        'max_throughput': max(throughputs)
    }

def run_throughput_tests(num_runs: int = 3) -> List[Dict]:
    """
    Run throughput tests with different output lengths.

    The same prompt is sent for every test case so that only the
    `max_tokens` cap varies between them.

    Args:
        num_runs: Number of requests per test case. The value is passed to
            measure_throughput and echoed in the progress output, so the
            printed run count always matches the one actually used.

    Returns:
        List of result dicts, one per test case for which measure_throughput
        returned a result. Failed cases are reported and left out.
    """
    print("‚ö° Running Throughput (Tokens/Second) Tests...\n")

    # Use a moderate prompt for consistent comparison
    consistent_prompt = """Machine learning is a subset of artificial intelligence that enables computers to learn from data without being explicitly programmed. It involves building mathematical models that can identify patterns and make predictions."""

    # Test cases: (display name, max_tokens cap)
    test_cases = [
        ("Short (50 tokens)", 50),
        ("Medium (150 tokens)", 150),
        ("Long (300 tokens)", 300),
        ("Extended (500 tokens)", 500)
    ]

    results = []

    for name, max_tokens in test_cases:
        print(f"--- {name} Test ---")
        print("   Prompt: Constant (moderate length)")
        print(f"   Max output tokens: {max_tokens}")
        print(f"   Number of runs: {num_runs}")

        result = measure_throughput(consistent_prompt, max_tokens, num_runs=num_runs)

        if result:
            results.append(result)
            print(f"   Avg output tokens: {result['avg_output_tokens']:.1f}")
            print(f"   Avg latency: {result['avg_latency_ms']:.2f} ms")
            print(f"   Latency range: {result['min_latency_ms']:.2f} - {result['max_latency_ms']:.2f} ms")
            print(f"   Std dev latency: {result['std_latency_ms']:.2f} ms")
            print(f"   Avg throughput: {result['avg_throughput']:.2f} tokens/sec")
            print(f"   Throughput range: {result['min_throughput']:.2f} - {result['max_throughput']:.2f} tokens/sec")
            print(f"   Std dev throughput: {result['std_throughput']:.2f} tokens/sec")
        else:
            # measure_throughput returns None when every run failed
            print(f"   ‚ùå Failed for {name} test")
        print()

    return results

In [None]:
# Run the throughput tests and keep the list of per-test result dicts
# in `throughput_results`, which the summary cell reads.
throughput_results = run_throughput_tests()

## Summary & Analysis

After running the throughput tests, analyze:

- **Average throughput** across different output sizes
- **Consistency** (standard deviation) - lower is better
- **Throughput range** (min to max) - indicates stability

**Key Metrics to Compare:**
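- Average tokens/second: Higher is better (token counts here are approximated by counting whitespace-separated words in the response, so treat the figures as relative rather than exact)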
- Standard deviation: Lower indicates more consistent performance
- Min/Max range: Narrower range indicates more predictable performance

**Expected Observations:**
- Throughput should be relatively stable across output sizes
- Larger generations may show better throughput (fixed overhead amortized)
- Low standard deviation indicates consistent model performance

In [None]:
# Print the throughput summary table, followed by overall averages and
# a short interpretation of how consistent the throughput was.
print("=" * 90)
print("THROUGHPUT TEST RESULTS SUMMARY")
print("=" * 90)

if not throughput_results:
    # Nothing was collected (empty list or None)
    print("‚ùå No results to display. Tests may have failed.")
else:
    # Column headers, left-aligned to the same widths as the data rows
    print(
        "Output Size".ljust(20),
        "Avg Tok".ljust(10),
        "Avg Tok/s".ljust(12),
        "Min Tok/s".ljust(12),
        "Max Tok/s".ljust(12),
        "Std Dev".ljust(12),
    )
    print("-" * 90)

    for r in throughput_results:
        output_size = f"{r['max_tokens']} tokens"
        print(
            "{:<20} {:<10.1f} {:<12.2f} {:<12.2f} {:<12.2f} {:<12.2f}".format(
                output_size,
                r['avg_output_tokens'],
                r['avg_throughput'],
                r['min_throughput'],
                r['max_throughput'],
                r['std_throughput'],
            )
        )

    print("=" * 90)

    # Overall averages across all test cases
    avg_throughput = sum(r['avg_throughput'] for r in throughput_results) / len(throughput_results)
    avg_std = sum(r['std_throughput'] for r in throughput_results) / len(throughput_results)

    print(f"\nAverage Throughput: {avg_throughput:.2f} tokens/sec")
    print(f"Average Standard Deviation: {avg_std:.2f} tokens/sec")
    print("\nInterpretation:")
    print(f"  - The model generates an average of {avg_throughput:.0f} tokens per second")
    print(f"  - Throughput varies by ¬±{avg_std:.1f} tokens/sec across tests")
    if avg_std < 2:
        print("  - ‚úÖ Throughput is very consistent")
    elif avg_std < 5:
        print("  - ‚ö†Ô∏è  Throughput has moderate variance")
    else:
        print("  - ‚ö†Ô∏è  Throughput has high variance")

## 5. Context Window Pressure Test

This section measures how the model performs under **different context window pressures**. With a configured context length of **131,072 tokens**, the test sends progressively larger prompts so we can see how performance changes as the context approaches that limit.

**Test Scenarios:**
- **Small context (1K tokens)**: Baseline with minimal context
- **Medium context (8K tokens)**: Moderate context pressure
- **Large context (32K tokens)**: Significant context pressure
- **Very large context (64K tokens)**: High context pressure (half of max)
- **Near limit context (100K tokens)**: Near maximum context pressure

**Metrics:**
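- **TTFT (ms)**: Time to first token with large contexts (the requests in this section are non-streaming, so TTFT is not recorded separately; total latency is the closest available proxy)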
- **Total latency (ms)**: End-to-end response time
- **Tokens/second**: Generation throughput under pressure
- **Success rate**: Percentage of successful requests

**Expected Patterns:**
- TTFT should increase with context length (more tokens to process before first output)
- Throughput may decrease as context size grows (more KV cache to manage)
- Near context limit, may see errors or degradation
- Model should handle up to 131K tokens without truncation

In [None]:
import time
from typing import Dict, List

def measure_context_window(prompt: str, max_tokens: int = 100) -> Dict:
    """
    Send one non-streaming chat completion and measure its performance.

    Args:
        prompt: Input prompt text (full context), sent as a single user message
        max_tokens: Maximum tokens to generate

    Returns:
        On success, a dict with:
            'prompt_tokens', 'output_tokens', 'total_tokens': token counts.
                Taken from the response's `usage` object when the server
                includes one; otherwise approximated by counting
                whitespace-separated words.
            'total_time_ms': request round-trip time in milliseconds
            'tokens_per_second': output_tokens divided by the round-trip time
            'success': True
            'response_preview': first 150 characters of the response text
        On failure (request error, HTTP error status or a body that is not
        valid JSON), a dict with 'success': False and 'error': message.
    """
    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "stream": False,
        "max_tokens": max_tokens,
        "temperature": 0.7,
        "top_p": 0.9
    }

    try:
        # perf_counter is monotonic, so the measured interval cannot be
        # skewed by wall-clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        response = requests.post(
            f"{VLLM_OPENAI_COMPATIBLE_URL}/chat/completions",
            json=payload,
            headers=API_HEADERS,
            timeout=120
        )
        response.raise_for_status()
        total_time = time.perf_counter() - start_time
        # Older requests versions raise a plain ValueError for a non-JSON
        # body, hence the extra exception type below.
        result = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"‚ùå Request failed: {e}")
        error_response = getattr(e, 'response', None)
        if error_response is not None:
            print(f"   Response: {error_response.text}")
        return {
            'success': False,
            'error': str(e)
        }

    # Get response text. `content` can be present but null, so fall back
    # to an empty string instead of calling .split() on None.
    choices = result.get("choices") or []
    if choices:
        message = choices[0].get("message") or {}
        response_text = message.get("content") or ""
    else:
        response_text = result.get("response") or ""

    # Prefer the counts reported by the server; a whitespace word count is
    # only a rough stand-in for real tokenizer output.
    usage = result.get("usage") or {}
    prompt_tokens = usage.get("prompt_tokens")
    if prompt_tokens is None:
        prompt_tokens = len(prompt.split())
    output_tokens = usage.get("completion_tokens")
    if output_tokens is None:
        output_tokens = len(response_text.split())
    total_tokens = prompt_tokens + output_tokens

    return {
        'prompt_tokens': prompt_tokens,
        'output_tokens': output_tokens,
        'total_tokens': total_tokens,
        'total_time_ms': total_time * 1000,
        'tokens_per_second': output_tokens / total_time if total_time > 0 else 0,
        'success': True,
        'response_preview': response_text[:150] + '...' if len(response_text) > 150 else response_text
    }

In [None]:
# Rule-of-thumb ratio used only to size prompts; the real ratio depends on
# the model's tokenizer and on the text.
_TOKENS_PER_WORD = 1.3


def _build_context_prompt(label: str, paragraphs: List[str], target_tokens: int) -> str:
    """
    Assemble a prompt of roughly `target_tokens` tokens from seed paragraphs.

    The paragraphs are cycled in order, each copy prefixed with its own
    "Section N." marker, until the estimated size reaches the target.

    Args:
        label: Test label, written into the first line of the prompt
        paragraphs: Non-empty list of seed paragraphs
        target_tokens: Approximate prompt size to reach, in tokens

    Returns:
        The assembled prompt text

    Raises:
        ValueError: If `paragraphs` is empty
    """
    if not paragraphs:
        raise ValueError("paragraphs must not be empty")

    target_words = int(target_tokens / _TOKENS_PER_WORD)
    # A label-specific first line means no two test prompts begin with the
    # same text, so a server that reuses cached prompt prefixes cannot
    # carry work over from the previous (smaller) test.
    parts = [f"Reference material for the {label} context test."]
    word_count = len(parts[0].split())
    section = 0
    while word_count < target_words:
        chunk = f"Section {section + 1}. {paragraphs[section % len(paragraphs)]}"
        section += 1
        parts.append(chunk)
        word_count += len(chunk.split())
    return "\n\n".join(parts)


def run_context_window_tests() -> List[Dict]:
    """
    Run context window pressure tests with different context sizes.

    Each prompt is built from the seed documents below and grown to
    approximately the size named in its label (1K to 100K tokens), so the
    request really carries the amount of context the test reports. Sizes
    are estimated from word counts and are therefore approximate.

    Model max context: 131,072 tokens
    Configured via: --max-model-len 131072

    Returns:
        List of result dicts from measure_context_window, one per context
        size that completed successfully
    """
    print("üìè Running Context Window Pressure Tests...\n")
    print("Model max context: 131,072 tokens")
    print("Configured context: 131,072 tokens")
    print()

    # Seed documents: coherent prose used as raw material for the prompts.
    # Every non-blank line is one paragraph.
    seed_documents = [
        """Machine learning is a subset of artificial intelligence that enables computers to learn from data without explicit programming. It involves building mathematical models that can identify patterns, make predictions, and improve performance over time. The field has revolutionized many industries including healthcare, finance, and transportation.

Supervised learning uses labeled data to train models for classification and regression tasks. Unsupervised learning discovers hidden patterns in unlabeled data through clustering and dimensionality reduction techniques. Reinforcement learning involves agents learning to interact with environments by receiving rewards or penalties.

Neural networks, inspired by the human brain, consist of interconnected layers of nodes that process information. Deep learning, a subset of neural networks, uses multiple hidden layers to learn complex representations. Convolutional Neural Networks (CNNs) excel at image processing while Recurrent Neural Networks (RNNs) and Transformers dominate sequential data tasks like language modeling.

Popular machine learning frameworks like TensorFlow, PyTorch, and Scikit-learn have democratized access to these powerful tools. Real-world applications span healthcare diagnosis, fraud detection in finance, autonomous vehicles in transportation, and recommendation systems in entertainment.

However, machine learning faces challenges including data quality issues, model interpretability concerns, bias and fairness problems, and significant computational requirements. As the field continues advancing, it promises to transform industries and improve lives worldwide through intelligent automation and data-driven decision making.""",
        """Machine learning has transformed the landscape of artificial intelligence and data-driven decision making. At its core, machine learning enables computers to automatically learn and improve from experience without being explicitly programmed for specific tasks. This capability stems from the ability of algorithms to identify patterns in data and make predictions based on those patterns.

The history of machine learning dates back to the mid-20th century, with significant milestones including the development of the perceptron in the 1950s, the backpropagation algorithm in the 1980s, and the rise of deep learning in the 2010s. Each of these developments built upon previous work to create more powerful and flexible models.

Supervised learning remains one of the most widely used approaches, where models are trained on labeled datasets to make predictions. Classification tasks categorize data into predefined classes, while regression tasks predict continuous values. Common algorithms include logistic regression, support vector machines, decision trees, and random forests.

Unsupervised learning discovers hidden structures in unlabeled data. Clustering algorithms group similar data points together, while dimensionality reduction techniques like PCA and t-SNE reduce feature space complexity. Autoencoders and generative models learn to reconstruct and generate new data samples.

Reinforcement learning involves agents learning optimal policies through trial and error interactions with environments. The agent receives rewards or penalties for actions, gradually learning strategies that maximize cumulative reward. This approach has achieved remarkable success in games like chess and Go, as well as robotics and navigation.

Deep learning has revolutionized machine learning with neural networks containing many layers. Convolutional Neural Networks process grid-like data such as images through convolutional layers that detect local patterns. Recurrent Neural Networks handle sequential data by maintaining hidden states that capture temporal dependencies. Transformers use attention mechanisms to process sequences in parallel, achieving state-of-the-art results in natural language processing.

Feature engineering remains crucial for effective machine learning, involving selection, transformation, and creation of input variables that improve model performance. Domain knowledge informs feature engineering, helping models capture relevant patterns while reducing noise and irrelevant information.

Model evaluation and validation ensure generalization to new data. Cross-validation, holdout sets, and metrics like accuracy, precision, recall, and F1 score assess model performance. Regularization techniques prevent overfitting by penalizing complex models.

Hyperparameter tuning optimizes model configuration through grid search, random search, or Bayesian optimization. Learning rates, batch sizes, network architectures, and regularization strengths significantly impact final performance.

Scalability challenges arise with large datasets and complex models. Distributed computing frameworks like Spark and specialized hardware including GPUs and TPUs accelerate training and inference. Cloud platforms provide scalable infrastructure for machine learning workflows.

Ethical considerations including bias, fairness, transparency, and privacy require careful attention. Models can perpetuate and amplify biases present in training data. Explainable AI methods help users understand and trust model predictions. Data protection regulations like GDPR influence how data is collected and used.

The future of machine learning includes advances in self-supervised learning, meta-learning, federated learning, and neuro-symbolic AI combining neural networks with symbolic reasoning. These approaches address current limitations and expand the scope of what machine learning can accomplish.""",
        """Machine learning continues to evolve rapidly, pushing boundaries of what artificial intelligence can achieve. The field encompasses numerous algorithms, methodologies, and applications that transform how we process information and make decisions.

Supervised learning algorithms learn from labeled training data to make predictions on new inputs. Classification algorithms categorize inputs into discrete classes, from simple binary classification to complex multi-class problems. Regression algorithms predict continuous numerical values, modeling relationships between variables. Ensemble methods combine multiple models to improve performance and robustness.

Unsupervised learning discovers patterns without labeled examples. Clustering algorithms group similar instances, revealing natural structures in data. Dimensionality reduction techniques compress high-dimensional data while preserving essential features. Anomaly detection identifies unusual patterns that deviate from normal behavior.

Reinforcement learning agents learn optimal behaviors through interaction with environments. Markov Decision Processes provide theoretical frameworks for sequential decision making. Policy gradient methods directly optimize reward accumulation. Deep reinforcement learning combines neural networks with reinforcement learning for complex tasks.

Deep learning architectures have become standard for many machine learning problems. Convolutional layers detect local patterns in images through learned filters. Recurrent layers maintain memory of previous inputs through hidden state propagation. Attention mechanisms weigh different parts of input sequences differently based on relevance.

Transformers dominate natural language processing with self-attention mechanisms that process entire sequences in parallel. BERT and GPT architectures demonstrate remarkable capabilities in understanding and generating human language. Fine-tuning pre-trained models on specific tasks achieves state-of-the-art results with less training data.

Computer vision applications include image classification, object detection, semantic segmentation, and image generation. Convolutional architectures process visual information through hierarchical feature extraction. Generative models create realistic images, videos, and animations.

Natural language processing enables machines to understand, generate, and translate human language. Language models predict word sequences and capture semantic relationships. Machine translation systems convert text between languages. Speech recognition transcribes spoken language to text.

Time series forecasting predicts future values based on historical patterns. ARIMA and SARIMA models capture temporal dependencies in stationary data. LSTM and GRU recurrent networks handle long-term dependencies in sequential data. Transformer-based models process time series with attention mechanisms.

Recommender systems predict user preferences for items. Collaborative filtering recommends items based on similar users' preferences. Content-based filtering recommends items similar to those users liked before. Hybrid approaches combine multiple recommendation strategies.

Natural language understanding enables machines to comprehend human language meaning. Named entity recognition identifies and classifies entities in text. Sentiment analysis determines emotional tone of text. Question answering systems respond to natural language queries.

Generative models create new data samples resembling training data. Variational autoencoders learn latent representations and generate new samples. Generative adversarial networks pit generators against discriminators in adversarial training. Diffusion models gradually denoise data to generate samples.

Graph neural networks process graph-structured data with nodes and edges. Node classification predicts labels for graph vertices. Graph classification categorizes entire graphs. Message passing algorithms propagate information across graph structures.

Transfer learning leverages knowledge from one domain to improve performance in another. Pre-training on large datasets provides general feature representations. Fine-tuning adapts pre-trained models to specific tasks with limited data. Domain adaptation addresses distribution differences between source and target domains.

Federated learning trains models across decentralized devices while preserving privacy. Devices train locally on private data and share model updates. Central server aggregates updates to create global model. Privacy-preserving techniques protect sensitive information.

Model compression techniques reduce computational requirements. Quantization reduces numerical precision of weights and activations. Pruning removes unnecessary connections from neural networks. Knowledge distillation transfers knowledge from large to small models.

Explainable AI provides insights into model decision-making processes. Feature importance methods identify influential input variables. Local interpretable explanations approximate complex models locally. Saliency maps highlight important regions in input data.

Ethical AI development addresses fairness, accountability, transparency, and privacy. Bias detection identifies unfair treatment of demographic groups. Fairness metrics quantify equity in model predictions. Explainability enables accountability for algorithmic decisions.""",
        """Machine learning systems continue advancing toward greater capability, efficiency, and accessibility. The integration of machine learning into everyday applications transforms user experiences and business operations.

Model architecture innovation drives performance improvements. Vision Transformers have challenged Convolutional Neural Networks for image tasks. Large language models demonstrate unprecedented capabilities in text generation and understanding. Graph neural networks extend machine learning to relational data domains.

Training optimization techniques enable more efficient model development. Mixed precision training reduces memory usage and accelerates computation. Gradient accumulation enables larger effective batch sizes with limited memory. Learning rate schedulers dynamically adjust optimization dynamics.

Data augmentation improves model generalization by creating modified training samples. Image augmentations include rotations, flips, color changes, and cropping. Text augmentations include synonym replacement, back translation, and dropout. Augmentation creates more diverse training data.

Batch normalization stabilizes training by normalizing layer inputs. Layer normalization adapts normalization to individual samples. Instance normalization applies normalization to individual channels. Normalization techniques enable deeper network training.

Dropout regularization prevents overfitting by randomly deactivating neurons during training. Label smoothing softens target labels to prevent overconfidence. Weight decay penalizes large weights through L2 regularization. Regularization improves generalization to unseen data.

Gradient clipping prevents exploding gradients in deep networks. Adaptive gradient algorithms adjust learning rates per parameter. Second-order optimization methods use Hessian information. Optimizer choice significantly impacts training dynamics.

Distribution shift between training and deployment data causes performance degradation. Covariate shift involves input distribution changes. Label shift involves output distribution changes. Concept drift involves relationship changes over time. Domain adaptation addresses distribution mismatches.

Active learning reduces labeling costs by selecting informative samples. Uncertainty sampling selects samples where model is uncertain. Query-by-committee uses multiple models to identify informative samples. Diversity sampling selects representative samples from clusters.

Self-supervised learning creates pseudo-labels from raw data without human annotation. Contrastive learning trains models to distinguish similar from dissimilar samples. Masked language modeling predicts masked tokens in text. Autoencoding reconstructs corrupted inputs.

Meta-learning trains models to learn faster on new tasks. Model-agnostic meta-learning adapts initial weights for quick learning. Learning to optimize updates optimization algorithms themselves. Meta-reinforcement learning optimizes reward acquisition strategies.

Causal inference goes beyond correlation to identify cause-effect relationships. Directed acyclic graphs represent causal relationships. do-calculus manipulates causal diagrams for inference. Instrumental variables identify causal effects in observational studies.

Multimodal learning integrates information from multiple modalities. Image captioning connects visual and language modalities. Visual question answering combines image understanding with question comprehension. Cross-modal retrieval finds similar items across different modalities.

Continual learning enables models to acquire new skills without forgetting old ones. Elastic weight consolidation protects important weights during learning. Experience replay stores and reuses past experiences. Dynamic architectures add capacity for new tasks.

Robustness to adversarial attacks ensures reliable performance. Adversarial training exposes vulnerabilities during training. Defensive distillation softens model outputs. Certification provides guarantees against certain attacks.

Neuro-symbolic AI combines neural networks with symbolic reasoning. Neural theorem provers verify logical statements. Differentiable programming blends gradient-based and discrete optimization. Hybrid architectures integrate neural and symbolic components.

Edge machine learning deploys models on resource-constrained devices. Model quantization reduces computational requirements. Edge inference enables real-time decision making without cloud dependency. Federated learning preserves data privacy on edge devices.

AI safety addresses alignment between AI goals and human values. Reward hacking occurs when agents exploit reward function flaws. Scalable oversight provides supervision for complex tasks. Interference with monitoring systems undermines safety measures.""",
        """Machine learning has fundamentally transformed how we process information and make decisions across industries. The technology enables computers to learn patterns from data without explicit programming, creating systems that improve automatically through experience.

Deep learning architectures have achieved remarkable success across diverse domains. Convolutional neural networks process visual information through hierarchical feature extraction, enabling breakthroughs in image classification, object detection, and medical imaging analysis. Recurrent neural networks maintain internal states to process sequential data, revolutionizing speech recognition and time series forecasting. Transformers use self-attention mechanisms to process sequences in parallel, achieving state-of-the-art results in natural language processing tasks including translation, summarization, and question answering.

Natural language processing has experienced dramatic improvements with large language models. Models like GPT, BERT, and T5 understand and generate human-like text, enabling applications in customer service, content creation, and code generation. Fine-tuning pre-trained models on specific tasks achieves excellent performance with relatively little task-specific data. Prompt engineering guides model behavior through carefully crafted input patterns, enabling few-shot and zero-shot learning capabilities.

Computer vision applications continue expanding in scope and sophistication. Autonomous vehicles use cameras and deep learning for perception, navigation, and decision making. Medical imaging analysis assists radiologists in detecting diseases like cancer and cardiovascular conditions. Surveillance systems monitor video feeds for security threats and unusual activities. Industrial quality control uses vision systems to inspect products for defects.

Reinforcement learning has achieved superhuman performance in complex games. AlphaGo defeated world champions in Go, demonstrating machine learning capabilities in strategic reasoning. Robotics applications use reinforcement learning for manipulation, navigation, and human-robot interaction. Game-playing agents develop sophisticated strategies through self-play and reward maximization.

Recommendation systems enhance user experiences across digital platforms. E-commerce platforms recommend products based on user preferences and behavior. Streaming services suggest movies, music, and shows using collaborative filtering and content-based methods. Social media feeds prioritize content likely to engage users based on interaction patterns.

Time series forecasting enables prediction of future values based on historical patterns. Financial institutions forecast stock prices, currency exchange rates, and market trends. Energy companies predict electricity demand for grid management. Supply chain systems forecast inventory needs and optimize logistics. Accurate forecasting improves decision making and resource allocation.

Anomaly detection identifies unusual patterns indicating potential issues. Fraud detection systems identify suspicious transactions in financial networks. Network security monitors detect intrusion attempts and malware communication. Industrial systems detect equipment failures before they occur. Early anomaly detection prevents significant losses and damage.

Healthcare applications transform patient care and research. Diagnostic systems assist physicians in identifying diseases from medical images. Drug discovery platforms predict molecular properties and identify promising candidates. Personalized medicine tailors treatments based on patient genomics and health history. Clinical decision support provides evidence-based recommendations at point of care.

Autonomous systems operate with minimal human supervision. Drones perform inspections, deliveries, and surveillance tasks. Self-driving cars navigate complex environments safely. Industrial robots execute precise manufacturing tasks. Autonomous systems increase efficiency and reduce human risk in dangerous environments.

Ethical considerations guide responsible machine learning development. Bias detection identifies unfair treatment of demographic groups. Fairness metrics quantify equity in model predictions across groups. Explainable AI methods provide insights into model decision processes. Privacy-preserving techniques protect sensitive data during model training and inference.

MLOps practices standardize machine learning operations. Model versioning tracks different versions for reproducibility. Automated testing validates model performance before deployment. Continuous monitoring tracks performance degradation over time. Pipeline automation streamlines data preparation, training, and deployment workflows.

Scalability challenges require distributed computing solutions. Data parallelism divides training data across multiple devices. Model parallelism divides large models across devices. Distributed optimizers synchronize gradients across computing nodes. Cloud platforms provide elastic infrastructure for scaling machine learning workloads.

Model compression techniques enable deployment on resource-constrained devices. Quantization reduces numerical precision from 32-bit to 8-bit or lower. Pruning removes redundant connections, reducing model size. Knowledge distillation transfers knowledge from large teacher models to smaller student models. Edge deployment brings intelligence to local devices.""",
    ]

    paragraphs = [
        line.strip()
        for document in seed_documents
        for line in document.splitlines()
        if line.strip()
    ]

    # (label, approximate prompt size in tokens, max output tokens)
    context_sizes = [
        ("1K tokens", 1_000, 50),
        ("8K tokens", 8_000, 100),
        ("32K tokens", 32_000, 150),
        ("64K tokens", 64_000, 200),
        ("100K tokens", 100_000, 250)
    ]

    results = []

    for name, target_tokens, max_tokens in context_sizes:
        prompt = _build_context_prompt(name, paragraphs, target_tokens)

        print(f"--- {name} Context Test ---")
        # Estimate only: derived from the word count, not from a tokenizer
        prompt_tokens = round(len(prompt.split()) * _TOKENS_PER_WORD)
        print(f"   Prompt tokens: ~{prompt_tokens}")
        print(f"   Max output tokens: {max_tokens}")

        result = measure_context_window(prompt, max_tokens)

        if result and result['success']:
            results.append(result)
            print(f"   Output tokens: {result['output_tokens']}")
            print(f"   Total tokens: {result['total_tokens']}")
            print(f"   Total time: {result['total_time_ms']:.2f} ms")
            print(f"   Tokens/sec: {result['tokens_per_second']:.2f}")
            print(f"   Response preview: {result['response_preview'][:100]}...")
        else:
            print(f"   ‚ùå Failed for {name} test")
            if result:
                print(f"   Error: {result.get('error', 'Unknown')}")
        print()

    return results

In [None]:
# Run the context window tests; `context_results` receives the list
# returned by run_context_window_tests (one result dict per successful test).
context_results = run_context_window_tests()

## Summary & Analysis

After running the context window pressure tests, analyze:

- **TTFT vs Context Size**: How does time to first token scale with context length?
- **Throughput under pressure**: Does tokens/second degrade as context grows?
- **Context limits**: Does the model handle near-131K token contexts without errors?
- **Success rate**: Are all context sizes processed successfully?

**Key Questions:**
- At what context size does performance start to degrade?
- How close to 131K tokens can the model handle while maintaining good performance?
- Are there any errors or truncation issues?

**Expected Observations:**
- TTFT should increase roughly linearly with context size
- Throughput may decrease for larger contexts (more KV cache management)
- Near 131K limit, may see increased latency or errors
- Model should successfully process up to configured context limit

## 6. Tool Calling Performance

This section measures the model's **tool calling performance** - its ability to correctly invoke and use tools when enabled. With the vLLM server configured with `--enable-auto-tool-choice` and `--tool-call-parser qwen3_coder`, this test evaluates the model's tool invocation capabilities.

**Test Scenarios:**
- **Simple tool call**: Single tool invocation with straightforward parameters
- **Multiple tool calls**: Sequential tool invocations in one response
- **Complex tool parameters**: Tools with nested/complex parameter structures
- **Tool selection accuracy**: Model's ability to choose the right tool for the task

**Metrics:**
- **Tool invocation success rate**: Percentage of successful tool calls
- **Parameter correctness**: Accuracy of tool parameter values
- **Response latency**: Time to complete tool invocation and response
- **Tool selection accuracy**: Correct tool chosen for the task

**Expected Patterns:**
- Tool calls should complete with valid JSON structure
- Parameters should match expected types and values
- Latency should be consistent with non-tool responses
- Model should correctly identify which tool to use based on user intent

In [None]:
import time
import json
from typing import Dict, List, Optional

# Tool schemas (OpenAI function-calling format) offered to the model in the tests below
def _function_tool(name, description, properties, required):
    """Wrap a JSON-schema parameter spec in the "function" tool envelope."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


AVAILABLE_TOOLS = [
    # Single required string argument
    _function_tool(
        "get_weather",
        "Get the current weather for a city",
        {
            "city": {"type": "string", "description": "The city name to get weather for"},
        },
        ["city"],
    ),
    # Single required string argument
    _function_tool(
        "calculate_math",
        "Perform a mathematical calculation",
        {
            "expression": {"type": "string", "description": "The mathematical expression to evaluate"},
        },
        ["expression"],
    ),
    # One required argument plus an optional integer with a default
    _function_tool(
        "search_web",
        "Search the web for information",
        {
            "query": {"type": "string", "description": "The search query"},
            "limit": {
                "type": "integer",
                "description": "Maximum number of results to return",
                "default": 5,
            },
        },
        ["query"],
    ),
]

def measure_tool_calling(prompt: str, expected_tool: Optional[str] = None, max_tokens: int = 150) -> Dict:
    """
    Measure tool calling performance for a given prompt.

    Sends one non-streaming chat completion request with AVAILABLE_TOOLS
    attached and tool_choice="auto", then inspects the first choice of the
    response for tool calls and/or plain text.

    Args:
        prompt: User prompt that should trigger tool calling
        expected_tool: Expected tool name to be called (optional)
        max_tokens: Maximum tokens to generate

    Returns:
        Dict with tool calling metrics. When the request completes:
        'prompt' (truncated to 100 chars), 'response_text' (truncated to 200
        chars), 'tool_call_count', 'tool_names', 'tool_arguments' (parsed
        arguments per call, None where parsing failed), 'total_time_ms',
        'success' (True if the model produced a tool call or non-empty text),
        'expected_tool' and 'tool_call_match'.
        When the request fails: 'success' (False) and 'error', plus 'prompt'
        and 'expected_tool' so the failure can still be attributed to a test.
    """
    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "tools": AVAILABLE_TOOLS,
        "tool_choice": "auto",
        "stream": False,
        "max_tokens": max_tokens,
        "temperature": 0.7,
        "top_p": 0.9
    }
    prompt_preview = prompt[:100] + '...' if len(prompt) > 100 else prompt

    try:
        start_time = time.time()
        response = requests.post(
            f"{VLLM_OPENAI_COMPATIBLE_URL}/chat/completions",
            json=payload,
            headers=API_HEADERS,
            timeout=120
        )
        end_time = time.time()
        response.raise_for_status()
        result = response.json()
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Request failed: {e}")
        if getattr(e, 'response', None) is not None:
            print(f"   Response: {e.response.text}")
        return {
            'prompt': prompt_preview,
            'expected_tool': expected_tool,
            'success': False,
            'error': str(e)
        }

    total_time = end_time - start_time

    # The API returns `"content": null` for tool-call-only replies and may
    # return `"tool_calls": null` for text-only replies, so a plain
    # .get(key, default) is not enough: the key exists but holds None.
    choices = result.get("choices") or []
    message = (choices[0].get("message") or {}) if choices else {}
    response_text = message.get("content") or ""
    tool_calls = message.get("tool_calls") or []

    # Analyze tool calls
    functions = [(tc.get("function") or {}) for tc in tool_calls]
    tool_names = [fn.get("name", "") for fn in functions]

    # Arguments normally arrive as a JSON-encoded string; keep an entry per
    # call so tool_names and tool_arguments stay index-aligned.
    parsed_arguments = []
    for fn in functions:
        raw_args = fn.get("arguments", "")
        if isinstance(raw_args, dict):
            # Already decoded by the server/parser - use as is
            parsed_arguments.append(raw_args)
            continue
        try:
            parsed_arguments.append(json.loads(raw_args) if raw_args else {})
        except (json.JSONDecodeError, TypeError):
            parsed_arguments.append(None)

    tool_call_count = len(tool_calls)

    return {
        'prompt': prompt_preview,
        'response_text': response_text[:200],
        'tool_call_count': tool_call_count,
        'tool_names': tool_names,
        'tool_arguments': parsed_arguments,
        'total_time_ms': total_time * 1000,
        # A reply counts as successful if the model either called a tool or answered in text
        'success': tool_call_count > 0 or response_text.strip() != "",
        'expected_tool': expected_tool,
        # With no expectation there is nothing to mismatch
        'tool_call_match': expected_tool is None or expected_tool in tool_names
    }

def run_tool_calling_tests() -> List[Dict]:
    """
    Run tool calling tests with different scenarios.

    Each scenario is a (name, prompt, expected tool, max tokens) tuple passed
    to measure_tool_calling(). Progress is printed per scenario.

    Returns:
        List with one result dict per test case, in test order. Failed tests
        are included (with 'success' False) so that totals and success rates
        computed from the list reflect every test that was run.
    """
    print("üîß Running Tool Calling Performance Tests...\n")

    def preview(text: str, limit: int) -> str:
        # Only show an ellipsis when something was actually cut off
        return text[:limit] + "..." if len(text) > limit else text

    test_cases = [
        (
            "Simple Weather Query",
            "What's the weather like in New York City right now?",
            "get_weather",
            100
        ),
        (
            "Math Calculation",
            "Please calculate the result of 25 * 17 + 123",
            "calculate_math",
            100
        ),
        (
            "Web Search Query",
            "Find information about the latest developments in quantum computing",
            "search_web",
            150
        ),
        (
            "Multi-tool Scenario",
            "Get the weather in London AND search for the best restaurants there",
            None,
            200
        ),
        (
            "No Tool Needed",
            "Explain the concept of machine learning in simple terms",
            None,
            150
        )
    ]

    results = []

    for name, prompt, expected_tool, max_tokens in test_cases:
        print(f"--- {name} Test ---")
        print(f"   Prompt: {preview(prompt, 80)}")
        print(f"   Expected tool: {expected_tool or 'None'}")

        result = measure_tool_calling(prompt, expected_tool, max_tokens)

        # Keep failures too: the results table has a dedicated row format for
        # them, and dropping them would make every run look 100% successful.
        if result:
            results.append(result)

        if result and result.get('success', False):
            print(f"   Tool calls: {result['tool_call_count']}")
            print(f"   Tool names: {result['tool_names']}")
            print(f"   Tool arguments: {result['tool_arguments']}")
            print(f"   Response: {preview(result['response_text'], 100)}")
            print(f"   Time: {result['total_time_ms']:.2f} ms")
            print(f"   Tool match: {'‚úÖ' if result.get('tool_call_match', False) else '‚ùå'}")
        else:
            print(f"   ‚ùå Failed for {name} test")
            if result and 'error' in result:
                print(f"   Error: {result['error']}")
        print()

    return results

In [None]:
# Display context window test results
print("=" * 90)
print("CONTEXT WINDOW PRESSURE TEST RESULTS SUMMARY")
print("=" * 90)

if context_results:
    print(f"{'Context Size':<15} {'Prompt Tok':<12} {'Output Tok':<12} {'Total Tok':<12} {'Time (ms)':<12} {'Tok/s':<12}")
    print("-" * 90)

    # Failed runs may lack the metric keys, so the table and the statistics
    # below are both built from the successful subset only.
    successful_results = [r for r in context_results if r.get('success')]

    for r in successful_results:
        context_size = f"{r['prompt_tokens'] // 1000}K"
        print(f"{context_size:<15} {r['prompt_tokens']:<12} {r['output_tokens']:<12} {r['total_tokens']:<12} {r['total_time_ms']:<12.2f} {r['tokens_per_second']:<12.2f}")

    print("=" * 90)

    # Calculate statistics
    if successful_results:
        avg_throughput = sum(r['tokens_per_second'] for r in successful_results) / len(successful_results)
        print(f"\nAverage Throughput: {avg_throughput:.2f} tokens/sec")

        # Find trend: needs two data points and a non-zero baseline to divide by
        if len(successful_results) >= 2:
            first_result = successful_results[0]
            last_result = successful_results[-1]
            baseline_throughput = first_result['tokens_per_second']

            if baseline_throughput > 0:
                throughput_change = (last_result['tokens_per_second'] - baseline_throughput) / baseline_throughput * 100

                # Label both ends from the data: the smallest size may have
                # failed, in which case the baseline is not the first test.
                first_label = f"{first_result['prompt_tokens'] // 1000}K"
                last_label = f"{last_result['prompt_tokens'] // 1000}K"
                print(f"Throughput change ({first_label} ‚Üí {last_label}): {throughput_change:+.1f}%")

                print(f"\nInterpretation:")
                if throughput_change > -20:
                    print("  ‚úÖ Throughput remains stable across context sizes")
                elif throughput_change > -50:
                    print("  ‚ö†Ô∏è  Throughput degrades moderately with context size")
                else:
                    print("  ‚ö†Ô∏è  Throughput significantly degrades with context size")
            else:
                print("Throughput change: n/a (baseline throughput is 0)")

        # Check context limit (meaningful even with a single successful run)
        max_context_tokens = 131072
        max_prompt = max(r['prompt_tokens'] for r in successful_results)
        print(f"\nLargest context processed: {max_prompt:,} tokens")
        print(f"Model max context: {max_context_tokens:,} tokens")
        print(f"Utilization: {max_prompt / max_context_tokens * 100:.1f}% of max context")
else:
    print("‚ùå No results to display. Tests may have failed.")

In [None]:
# Run Tool Calling tests.
# `tool_results` is the list read by the tool calling results-display cell in this notebook.
tool_results = run_tool_calling_tests()

## Summary & Analysis

After running the tool calling tests, analyze:

- **Tool invocation success rate**: How often did the model successfully invoke tools?
- **Tool selection accuracy**: Did the model choose the correct tools?
- **Parameter correctness**: Were the tool parameters correctly formatted and valid?
- **Response quality**: Did the model provide appropriate responses when tools weren't needed?

**Key Metrics to Evaluate:**

| Metric | What to Look For |
|--------|------------------|
| Tool Call Count | Should match expected number of tools |
| Tool Name Match | Should match expected tool names |
| Parameter Parsing | Arguments should be valid JSON |
| Response Quality | Should be relevant to the task |
| Latency | Should be reasonable for tool calls |

**Expected Observations:**
- Simple queries (weather, math) should trigger single tool calls
- Complex queries may trigger multiple tool calls
- No-tool queries should result in direct responses
- Tool arguments should be properly formatted JSON
- Latency should be consistent with chat completions

In [None]:
# Render the tool calling results as a fixed-width table followed by a short summary
print("=" * 90)
print("TOOL CALLING PERFORMANCE TEST RESULTS")
print("=" * 90)

if not tool_results:
    print("‚ùå No results to display. Tests may have failed.")
else:
    print(f"{'Test Case':<25} {'Tool Call':<10} {'Tools':<20} {'Time (ms)':<12} {'Match':<8} {'Success':<8}")
    print("-" * 90)

    # Running tallies for the summary printed under the table
    success_count = 0
    tool_match_count = 0

    for r in tool_results:
        # Failed entries get a placeholder row and are excluded from the tallies
        if not r.get('success', False):
            print(f"{r.get('prompt', 'Failed')[:25]:<25} {'-':<10} {'-':<20} {'-':<12} {'-':<8} {'‚ùå':<8}")
            continue

        success_count += 1

        # The first column shows the (shortened) prompt text
        if len(r['prompt']) > 25:
            test_case = r['prompt'][:22] + "..."
        else:
            test_case = r['prompt']

        tool_call = str(r['tool_call_count'])
        tools = ", ".join(r['tool_names']) if r['tool_names'] else "none"

        tool_match = r.get('tool_call_match', False)
        if tool_match:
            tool_match_count += 1

        print(f"{test_case:<25} {tool_call:<10} {tools:<20} {r['total_time_ms']:<12.2f} {'‚úÖ' if tool_match else '‚ùå':<8} {'‚úÖ':<8}")

    print("=" * 90)

    print(f"\nSummary:")
    print(f"  Total tests: {len(tool_results)}")
    print(f"  Successful: {success_count}")
    # Denominators are clamped to 1 so an all-failed run does not divide by zero
    print(f"  Tool match accuracy: {tool_match_count}/{max(success_count, 1)}")

    avg_time = sum(r['total_time_ms'] for r in tool_results if r.get('success')) / max(success_count, 1)
    print(f"  Average latency: {avg_time:.2f} ms")

## 7. Memory Usage Analysis

This section monitors **GPU and CPU memory usage** during inference to understand the model's memory efficiency and resource requirements.

**Test Scenarios:**
- **Baseline memory**: Memory usage before inference
- **Small generation (50 tokens)**: Memory with minimal generation
- **Medium generation (150 tokens)**: Moderate memory usage
- **Large generation (300 tokens)**: Higher memory pressure
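- **Large context (32K tokens)**: Memory with significant context *(listed for completeness; `run_memory_tests()` below only varies the output length and does not run this scenario)*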

**Metrics:**
- **GPU Memory Usage**: Total, used, and free memory
- **GPU Memory Percentage**: Utilization percentage
- **CPU Memory Usage**: System memory consumption
- **Memory delta**: Change from baseline during inference

**Expected Patterns:**
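- Larger generations should increase GPU memory usage
- Context window size directly impacts KV cache memory
- Memory should be released after inference completes
- Memory usage should remain stable across repeated runs
- Note: vLLM reserves its GPU memory (weights plus KV cache pool) at server startup, controlled by `--gpu-memory-utilization`, so the `nvidia-smi` readings taken here will usually stay almost flat between tests; a near-zero delta is the normal outcome rather than a measurement failure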

In [None]:
import time
import statistics
from typing import Dict, List, Tuple

def get_memory_usage() -> Dict:
    """
    Get current GPU and CPU memory usage.

    CPU figures come from psutil.virtual_memory(). GPU figures come from one
    nvidia-smi query; this is best-effort, and any failure is reported in the
    returned dict instead of being raised.

    Returns:
        Dict with memory statistics:
        - 'cpu_total_gb', 'cpu_used_gb', 'cpu_free_gb' (taken from psutil's
          `available` figure), 'cpu_percent': always present.
        - 'gpu_available': True when at least one GPU row was read, else False.
        - 'gpu_error': failure description, only when 'gpu_available' is False.
        - 'gpu_count': number of GPU rows read.
        - 'gpu_total_mb', 'gpu_used_mb', 'gpu_free_mb', 'gpu_power_w': summed
          over all GPUs.
        - 'gpu_util_percent': mean over all GPUs.
        - 'gpu_temp_c': hottest GPU.
        A GPU key is omitted when no GPU reported a numeric value for it, so
        callers should read GPU keys with .get().
    """
    import subprocess

    import psutil

    bytes_per_gb = 1024**3

    # CPU memory
    cpu_mem = psutil.virtual_memory()
    memory = {
        'cpu_total_gb': cpu_mem.total / bytes_per_gb,
        'cpu_used_gb': cpu_mem.used / bytes_per_gb,
        'cpu_free_gb': cpu_mem.available / bytes_per_gb,
        'cpu_percent': cpu_mem.percent,
    }

    # Result keys, in the same order as the columns requested from nvidia-smi
    gpu_fields = (
        'gpu_total_mb',
        'gpu_used_mb',
        'gpu_free_mb',
        'gpu_util_percent',
        'gpu_temp_c',
        'gpu_power_w',
    )

    # GPU memory (if available)
    try:
        result = subprocess.run([
            'nvidia-smi',
            '--query-gpu=memory.total,memory.used,memory.free,utilization.gpu,temperature.gpu,power.draw',
            '--format=csv,noheader,nounits'
        ], capture_output=True, text=True, timeout=5)
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # nvidia-smi missing, not executable, timed out, or produced undecodable output
        memory['gpu_available'] = False
        memory['gpu_error'] = str(e)
        return memory

    if result.returncode != 0:
        memory['gpu_available'] = False
        memory['gpu_error'] = result.stderr.strip() or f"nvidia-smi exited with code {result.returncode}"
        return memory

    # nvidia-smi prints one CSV row per GPU, so parse line by line instead of
    # splitting the whole output at once.
    readings = {name: [] for name in gpu_fields}
    gpu_count = 0
    for line in result.stdout.splitlines():
        cells = [cell.strip() for cell in line.split(',')]
        if len(cells) < len(gpu_fields):
            continue
        gpu_count += 1
        for name, cell in zip(gpu_fields, cells):
            try:
                readings[name].append(float(cell))
            except ValueError:
                # Non-numeric placeholder (e.g. "[N/A]") for a metric the
                # device does not report: skip this value, keep the rest.
                continue

    if gpu_count == 0:
        memory['gpu_available'] = False
        memory['gpu_error'] = "nvidia-smi returned no GPU rows"
        return memory

    memory['gpu_available'] = True
    memory['gpu_count'] = gpu_count
    for name, values in readings.items():
        if not values:
            continue
        if name == 'gpu_util_percent':
            memory[name] = sum(values) / len(values)
        elif name == 'gpu_temp_c':
            memory[name] = max(values)
        else:
            memory[name] = sum(values)

    return memory

def format_memory_diff(before: float, after: float) -> Tuple[float, str]:
    """
    Describe how a memory reading changed between two measurements.

    Args:
        before: Earlier reading, in GB
        after: Later reading, in GB

    Returns:
        Tuple of (change in GB, label showing the change in MB with a trend indicator)
    """
    change_gb = after - before
    change_mb = change_gb * 1024  # label is expressed in MB

    # Growth gets an explicit plus sign; the minus sign of a drop comes from the number itself
    if change_mb > 0:
        return change_gb, f"+{change_mb:.1f} MB üìà"
    if change_mb < 0:
        return change_gb, f"{change_mb:.1f} MB üìâ"
    return change_gb, "‚âà0 MB"

def measure_memory_with_generation(prompt: str, max_tokens: int = 50, num_runs: int = 3) -> Dict:
    """
    Measure memory usage during text generation.

    Takes one memory snapshot before the first request and one after the last
    request has returned, and sends `num_runs` identical non-streaming chat
    completion requests in between. The memory delta therefore compares the
    state before and after the runs; it is not a peak-during-inference figure.

    Args:
        prompt: Input prompt
        max_tokens: Maximum tokens to generate
        num_runs: Number of runs to average

    Returns:
        Dict with memory metrics (latency/token averages are taken over the
        runs that succeeded; 'num_runs' is the requested count and
        'successful_runs' the number that completed), or None if every run
        failed.
    """
    # Get baseline memory
    baseline = get_memory_usage()
    baseline_gpu_used_mb = baseline.get('gpu_used_mb', 0)
    baseline_cpu_used_gb = baseline['cpu_used_gb']

    # Every run sends the same request, so build it once
    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "stream": False,
        "max_tokens": max_tokens,
        "temperature": 0.7,
        "top_p": 0.9
    }

    latencies = []
    output_tokens_list = []

    for run in range(num_runs):
        try:
            start_time = time.time()
            response = requests.post(
                f"{VLLM_OPENAI_COMPATIBLE_URL}/chat/completions",
                json=payload,
                headers=API_HEADERS,
                timeout=120
            )
            end_time = time.time()
            response.raise_for_status()
            result = response.json()
        except requests.exceptions.RequestException as e:
            print(f"‚ùå Run {run + 1} failed: {e}")
            continue

        # Prefer the server's own token accounting; counting whitespace-separated
        # words under-reports tokens and skews the tokens/sec figure.
        usage = result.get("usage") or {}
        output_tokens = usage.get("completion_tokens")
        if output_tokens is None:
            # Fallback approximation when the server reports no usage block
            choices = result.get("choices") or []
            message = (choices[0].get("message") or {}) if choices else {}
            # `content` can be null in the response, hence `or ""`
            response_text = message.get("content") or ""
            output_tokens = len(response_text.split())

        latencies.append((end_time - start_time) * 1000)
        output_tokens_list.append(output_tokens)

    if not latencies:
        return None

    # Get post-inference memory
    post_memory = get_memory_usage()
    post_gpu_used_mb = post_memory.get('gpu_used_mb', 0)
    post_cpu_used_gb = post_memory['cpu_used_gb']

    # Calculate averages
    avg_latency = statistics.mean(latencies)
    avg_output_tokens = statistics.mean(output_tokens_list)

    # Calculate memory deltas (format_memory_diff expects GB, GPU readings are in MB)
    gpu_delta_gb, gpu_delta_str = format_memory_diff(baseline_gpu_used_mb / 1024, post_gpu_used_mb / 1024)
    cpu_delta_gb, cpu_delta_str = format_memory_diff(baseline_cpu_used_gb, post_cpu_used_gb)

    return {
        'max_tokens': max_tokens,
        'num_runs': num_runs,
        'successful_runs': len(latencies),
        'avg_latency_ms': avg_latency,
        'avg_output_tokens': avg_output_tokens,
        'avg_throughput': avg_output_tokens / (avg_latency / 1000) if avg_latency > 0 else 0,
        'baseline_gpu_used_mb': baseline_gpu_used_mb,
        'post_gpu_used_mb': post_gpu_used_mb,
        'gpu_delta_gb': gpu_delta_gb,
        'gpu_delta_str': gpu_delta_str,
        'baseline_cpu_used_gb': baseline_cpu_used_gb,
        'post_cpu_used_gb': post_cpu_used_gb,
        'cpu_delta_gb': cpu_delta_gb,
        'cpu_delta_str': cpu_delta_str,
        'gpu_util_percent': post_memory.get('gpu_util_percent', 0),
        'gpu_temp_c': post_memory.get('gpu_temp_c', 0)
    }

def run_memory_tests() -> List[Dict]:
    """
    Run the memory usage test at several output lengths.

    The prompt is held constant and only the generation budget changes.
    Progress and per-test metrics are printed as each test finishes.

    Returns:
        List of result dicts, one per test that produced a result
    """
    print("üíæ Running Memory Usage Analysis...\n")

    consistent_prompt = """The concept of artificial intelligence has evolved significantly since its inception. AI systems are now capable of understanding natural language and making decisions in complex environments."""

    # (label shown in the log, max_tokens sent to the model)
    generation_sizes = (
        ("Small (50 tokens)", 50),
        ("Medium (150 tokens)", 150),
        ("Large (300 tokens)", 300),
    )

    collected = []

    for label, token_budget in generation_sizes:
        print(f"--- {label} Memory Test ---")
        print(f"   Prompt: Constant (moderate length)")
        print(f"   Max output tokens: {token_budget}")
        print(f"   Number of runs: 3")

        outcome = measure_memory_with_generation(consistent_prompt, token_budget, num_runs=3)

        # A falsy outcome means no run of this test produced a measurement
        if not outcome:
            print(f"   ‚ùå Failed for {label} test")
            print()
            continue

        collected.append(outcome)
        print(f"   Latency: {outcome['avg_latency_ms']:.2f} ms")
        print(f"   Output tokens: {outcome['avg_output_tokens']:.1f}")
        print(f"   Throughput: {outcome['avg_throughput']:.2f} tokens/sec")
        print(f"   GPU Memory: {outcome['post_gpu_used_mb']:.0f} MB ({outcome['gpu_delta_str']})")
        print(f"   GPU Utilization: {outcome['gpu_util_percent']:.1f}%")
        print(f"   GPU Temperature: {outcome['gpu_temp_c']:.0f}¬∞C")
        print(f"   CPU Memory: {outcome['post_cpu_used_gb']:.2f} GB ({outcome['cpu_delta_str']})")
        print()

    return collected

In [None]:
# Run Memory tests.
# `memory_results` is the list read by the memory results-display cell in this notebook.
memory_results = run_memory_tests()

## Summary & Analysis

After running the memory usage tests, analyze:

- **GPU Memory Impact**: How much GPU memory is consumed per generation size?
- **Memory Delta**: Does memory increase during inference and release afterward?
- **GPU Utilization**: How heavily is the GPU being utilized?
- **Temperature Impact**: Does prolonged inference cause temperature spikes?

**Key Observations:**
- Memory delta indicates actual memory overhead of generation
- GPU utilization shows how well the model saturates the GPU
- Temperature trends indicate thermal management
- Compare memory efficiency across different generation sizes

**Expected Patterns:**
- Larger generations should show higher memory usage
- Memory should stabilize after initial allocation
- GPU utilization should be high for efficient generation

In [None]:
# Render the memory test results: a table, a per-test summary, then an overall GPU drift check
print("=" * 90)
print("MEMORY USAGE ANALYSIS RESULTS")
print("=" * 90)

if not memory_results:
    print("‚ùå No results to display. Tests may have failed.")
else:
    print(f"{'Generation':<20} {'Latency (ms)':<15} {'Output Tok':<12} {'GPU Mem (MB)':<15} {'Œî GPU':<12} {'GPU %':<10}")
    print("-" * 90)

    for r in memory_results:
        gen_size = f"{r['max_tokens']} tokens"
        print(f"{gen_size:<20} {r['avg_latency_ms']:<15.2f} {r['avg_output_tokens']:<12.1f} {r['post_gpu_used_mb']:<15.0f} {r['gpu_delta_str']:<12} {r['gpu_util_percent']:<10.1f}")

    print("=" * 90)

    print(f"\nMemory Summary:")
    for r in memory_results:
        print(f"  {r['max_tokens']} tokens: GPU {r['post_gpu_used_mb']:.0f} MB ({r['gpu_delta_str']})")
        print(f"    CPU: {r['post_cpu_used_gb']:.2f} GB ({r['cpu_delta_str']})")
        print(f"    GPU Temp: {r['gpu_temp_c']:.0f}¬∞C, GPU Util: {r['gpu_util_percent']:.1f}%")

    # Drift check: GPU memory before the first test vs. after the last one
    first_result = memory_results[0]
    last_result = memory_results[-1]
    total_gpu_delta = last_result['post_gpu_used_mb'] - first_result['baseline_gpu_used_mb']
    print(f"\nTotal GPU Memory Change: {total_gpu_delta:+.0f} MB")
    if abs(total_gpu_delta) < 100:  # anything under 100 MB is treated as noise
        print("  ‚úÖ No significant memory leak detected")
    else:
        print(f"  ‚ö†Ô∏è  Significant memory change detected")

## 8. Long-Context Retrieval

This section tests the model's ability to **retrieve and use information from long contexts**. With a maximum context of **131,072 tokens**, this test evaluates how well the model can find and answer questions based on information embedded in long documents.

**Test Scenarios:**
- **Short context (1K tokens)**: Basic retrieval with minimal context
- **Medium context (8K tokens)**: Moderate retrieval challenge
- **Long context (32K tokens)**: Significant retrieval task
- **Very long context (64K tokens)**: Advanced retrieval challenge
- **Near limit context (100K tokens)**: Maximum retrieval challenge

**Metrics:**
- **Answer Accuracy**: Correctness of retrieved information
- **Response Relevance**: How well the answer addresses the question
- **Latency**: Time to process long context and retrieve answer
- **Success Rate**: Percentage of successful retrievals

**Expected Patterns:**
- Answer accuracy should remain high even with long contexts
- Latency should increase with context size (more tokens to process)
- Model should maintain consistent retrieval quality across context sizes

In [None]:
import time
from typing import Dict, List, Tuple

def measure_retrieval(context: str, question: str, expected_answer: str, max_tokens: int = 150) -> Dict:
    """
    Measure retrieval performance with a given context and question.

    Builds a single prompt from the context and question, sends one
    non-streaming chat completion request, and scores the reply against
    `expected_answer` with a simple text heuristic (see below).

    Args:
        context: Long context containing information
        question: Question to ask about the context
        expected_answer: Expected answer (for verification)
        max_tokens: Maximum tokens to generate

    Returns:
        Dict with retrieval metrics. When the request completes:
        'context_tokens' (whitespace word count of the context, an
        approximation rather than a tokenizer count), 'question', 'answer' and
        'expected_answer' (truncated for display), 'total_time_ms',
        'relevance_score' (fraction of the first three expected words found in
        the answer), 'is_correct' and 'success' (True).
        When the request fails: 'success' (False) and 'error'.
    """
    prompt = f"""Based on the following context, answer the question:

Context:
{context}

Question:
{question}

Answer:"""

    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "stream": False,
        "max_tokens": max_tokens,
        "temperature": 0.7,
        "top_p": 0.9
    }

    try:
        start_time = time.time()
        response = requests.post(
            f"{VLLM_OPENAI_COMPATIBLE_URL}/chat/completions",
            json=payload,
            headers=API_HEADERS,
            timeout=120
        )
        end_time = time.time()
        response.raise_for_status()
        result = response.json()
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Request failed: {e}")
        if getattr(e, 'response', None) is not None:
            print(f"   Response: {e.response.text}")
        return {
            'success': False,
            'error': str(e)
        }

    total_time = end_time - start_time

    # `content` can be null in the response, hence `or ""` instead of a .get() default
    choices = result.get("choices") or []
    message = (choices[0].get("message") or {}) if choices else {}
    response_text = (message.get("content") or "").strip()

    # Check if answer contains expected information
    answer_lower = response_text.lower()
    expected_lower = expected_answer.lower()

    # Simple relevance check - does the answer mention key terms?
    key_words = expected_answer.split()[:3]  # First 3 words as key terms
    if key_words:
        relevance_score = sum(1 for word in key_words if word.lower() in answer_lower) / len(key_words)
    else:
        # Blank expected answer: nothing to look for, and avoids dividing by zero
        relevance_score = 0.0

    # Accuracy check (exact or partial match).
    # An empty string is a substring of every string, so an empty answer (or a
    # blank expectation) must be ruled out first or it would always "match".
    is_correct = bool(answer_lower) and bool(key_words) and (
        expected_lower in answer_lower
        or answer_lower in expected_lower
        or relevance_score >= 0.5
    )

    return {
        'context_tokens': len(context.split()),
        'question': question[:80] + '...' if len(question) > 80 else question,
        'answer': response_text[:150] + '...' if len(response_text) > 150 else response_text,
        'expected_answer': expected_answer[:80] + '...' if len(expected_answer) > 80 else expected_answer,
        'total_time_ms': total_time * 1000,
        'relevance_score': relevance_score,
        'is_correct': is_correct,
        'success': True
    }

def run_retrieval_tests() -> List[Dict]:
    """
    Run the retrieval test suite, one request per embedded context.

    Each test case pairs a context passage with a question whose answer is
    stated in that passage, then delegates the request and scoring to
    ``measure_retrieval``. Progress and per-test outcomes are printed as the
    suite runs.

    NOTE(review): the test labels ("1K tokens" ... "100K tokens") are nominal.
    The contexts defined below are only a few paragraphs each, so the
    "Context tokens" figure printed per test (a whitespace word count) is the
    actual size of the context that is sent. Confirm whether padding the
    contexts up to the labelled sizes was intended.

    Returns:
        List of result dicts for the tests that succeeded, in test order.
        Each dict is the one returned by ``measure_retrieval`` with an added
        ``'test_name'`` key holding the test label. Failed tests are reported
        on stdout and left out of the list.
    """
    print("🔍 Running Long-Context Retrieval Tests...\n")

    # Contexts with embedded information for retrieval
    # Each context contains specific information that can be queried

    context_1k = """Machine learning is a subset of artificial intelligence that enables computers to learn from data without explicit programming. It involves building mathematical models that can identify patterns, make predictions, and improve performance over time. The field has revolutionized many industries including healthcare, finance, and transportation.
    
Supervised learning uses labeled data to train models for classification and regression tasks. Unsupervised learning discovers hidden patterns in unlabeled data through clustering and dimensionality reduction techniques. Reinforcement learning involves agents learning to interact with environments by receiving rewards or penalties.
    
Neural networks, inspired by the human brain, consist of interconnected layers of nodes that process information. Deep learning, a subset of neural networks, uses multiple hidden layers to learn complex representations. Convolutional Neural Networks (CNNs) excel at image processing while Recurrent Neural Networks (RNNs) and Transformers dominate sequential data tasks like language modeling."""

    question_1k = "What type of learning uses labeled data for training models?"
    answer_1k = "Supervised learning"

    context_8k = """Machine learning has transformed the landscape of artificial intelligence and data-driven decision making. At its core, machine learning enables computers to automatically learn and improve from experience without being explicitly programmed for specific tasks. This capability stems from the ability of algorithms to identify patterns in data and make predictions based on those patterns.
    
The history of machine learning dates back to the mid-20th century, with significant milestones including the development of the perceptron in the 1950s, the backpropagation algorithm in the 1980s, and the rise of deep learning in the 2010s. Each of these developments built upon previous work to create more powerful and flexible models.
    
Supervised learning remains one of the most widely used approaches, where models are trained on labeled datasets to make predictions. Classification tasks categorize data into predefined classes, while regression tasks predict continuous values. Common algorithms include logistic regression, support vector machines, decision trees, and random forests.
    
Unsupervised learning discovers hidden structures in unlabeled data. Clustering algorithms group similar data points together, while dimensionality reduction techniques like PCA and t-SNE reduce feature space complexity. Autoencoders and generative models learn to reconstruct and generate new data samples.
    
Reinforcement learning involves agents learning optimal policies through trial and error interactions with environments. The agent receives rewards or penalties for actions, gradually learning strategies that maximize cumulative reward. This approach has achieved remarkable success in games like chess and Go, as well as robotics and navigation.
    
Deep learning has revolutionized machine learning with neural networks containing many layers. Convolutional Neural Networks process grid-like data such as images through convolutional layers that detect local patterns. Recurrent Neural Networks handle sequential data by maintaining hidden states that capture temporal dependencies. Transformers use attention mechanisms to process sequences in parallel, achieving state-of-the-art results in natural language processing."""

    question_8k = "In which decade was the backpropagation algorithm developed?"
    answer_8k = "1980s"

    context_32k = """Machine learning continues to evolve rapidly, pushing boundaries of what artificial intelligence can achieve. The field encompasses numerous algorithms, methodologies, and applications that transform how we process information and make decisions.
    
Supervised learning algorithms learn from labeled training data to make predictions on new inputs. Classification algorithms categorize inputs into discrete classes, from simple binary classification to complex multi-class problems. Regression algorithms predict continuous numerical values, modeling relationships between variables. Ensemble methods combine multiple models to improve performance and robustness.
    
Unsupervised learning discovers patterns without labeled examples. Clustering algorithms group similar instances, revealing natural structures in data. Dimensionality reduction techniques compress high-dimensional data while preserving essential features. Anomaly detection identifies unusual patterns that deviate from normal behavior.
    
Reinforcement learning agents learn optimal behaviors through interaction with environments. Markov Decision Processes provide theoretical frameworks for sequential decision making. Policy gradient methods directly optimize reward accumulation. Deep reinforcement learning combines neural networks with reinforcement learning for complex tasks.
    
Deep learning architectures have become standard for many machine learning problems. Convolutional layers detect local patterns in images through learned filters. Recurrent layers maintain memory of previous inputs through hidden state propagation. Attention mechanisms weigh different parts of input sequences differently based on relevance.
    
Transformers dominate natural language processing with self-attention mechanisms that process entire sequences in parallel. BERT and GPT architectures demonstrate remarkable capabilities in understanding and generating human language. Fine-tuning pre-trained models on specific tasks achieves state-of-the-art results with less training data."""

    question_32k = "Which architectures are mentioned as demonstrating remarkable capabilities in understanding and generating human language?"
    answer_32k = "BERT and GPT"

    context_64k = """Machine learning systems continue advancing toward greater capability, efficiency, and accessibility. The integration of machine learning into everyday applications transforms user experiences and business operations.
    
Model architecture innovation drives performance improvements. Vision Transformers have challenged Convolutional Neural Networks for image tasks. Large language models demonstrate unprecedented capabilities in text generation and understanding. Graph neural networks extend machine learning to relational data domains.
    
Training optimization techniques enable more efficient model development. Mixed precision training reduces memory usage and accelerates computation. Gradient accumulation enables larger effective batch sizes with limited memory. Learning rate schedulers dynamically adjust optimization dynamics.
    
Data augmentation improves model generalization by creating modified training samples. Image augmentations include rotations, flips, color changes, and cropping. Text augmentations include synonym replacement, back translation, and dropout. Augmentation creates more diverse training data.
    
Batch normalization stabilizes training by normalizing layer inputs. Layer normalization adapts normalization to individual samples. Instance normalization applies normalization to individual channels. Normalization techniques enable deeper network training.
    
Dropout regularization prevents overfitting by randomly deactivating neurons during training. Label smoothing softens target labels to prevent overconfidence. Weight decay penalizes large weights through L2 regularization. Regularization improves generalization to unseen data."""

    question_64k = "What technique reduces memory usage and accelerates computation in model development?"
    answer_64k = "Mixed precision training"

    context_100k = """Machine learning has fundamentally transformed how we process information and make decisions across industries. The technology enables computers to learn patterns from data without explicit programming, creating systems that improve automatically through experience.
    
Deep learning architectures have achieved remarkable success across diverse domains. Convolutional neural networks process visual information through hierarchical feature extraction, enabling breakthroughs in image classification, object detection, and medical imaging analysis. Recurrent neural networks maintain internal states to process sequential data, revolutionizing speech recognition and time series forecasting. Transformers use self-attention mechanisms to process sequences in parallel, achieving state-of-the-art results in natural language processing tasks including translation, summarization, and question answering.
    
Natural language processing has experienced dramatic improvements with large language models. Models like GPT, BERT, and T5 understand and generate human-like text, enabling applications in customer service, content creation, and code generation. Fine-tuning pre-trained models on specific tasks achieves excellent performance with relatively little task-specific data. Prompt engineering guides model behavior through carefully crafted input patterns, enabling few-shot and zero-shot learning capabilities.
    
Computer vision applications continue expanding in scope and sophistication. Autonomous vehicles use cameras and deep learning for perception, navigation, and decision making. Medical imaging analysis assists radiologists in detecting diseases like cancer and cardiovascular conditions. Surveillance systems monitor video feeds for security threats and unusual activities. Industrial quality control uses vision systems to inspect products for defects."""

    question_100k = "What type of neural networks process visual information through hierarchical feature extraction?"
    answer_100k = "Convolutional neural networks"

    # (label, context, question, expected answer, max_tokens passed to measure_retrieval)
    test_cases = [
        ("1K tokens", context_1k, question_1k, answer_1k, 100),
        ("8K tokens", context_8k, question_8k, answer_8k, 100),
        ("32K tokens", context_32k, question_32k, answer_32k, 150),
        ("64K tokens", context_64k, question_64k, answer_64k, 150),
        ("100K tokens", context_100k, question_100k, answer_100k, 200),
    ]

    results = []

    for name, context, question, expected_answer, max_tokens in test_cases:
        print(f"--- {name} Retrieval Test ---")
        # Whitespace word count, used as a rough stand-in for the token count.
        prompt_tokens = len(context.split())
        print(f"   Context tokens: ~{prompt_tokens}")
        # Only mark the preview as truncated when text was actually cut off.
        question_preview = question[:60] + '...' if len(question) > 60 else question
        print(f"   Question: {question_preview}")

        result = measure_retrieval(context, question, expected_answer, max_tokens)

        if result and result.get('success', False):
            # Tag the result with its test label so later reporting can
            # identify which test case produced it.
            result['test_name'] = name
            results.append(result)

            answer = result['answer']
            answer_preview = answer[:80] + '...' if len(answer) > 80 else answer
            print(f"   Answer: {answer_preview}")
            print(f"   Expected: {expected_answer}")
            print(f"   Is correct: {'✅' if result['is_correct'] else '❌'}")
            print(f"   Relevance score: {result['relevance_score']:.2f}")
            print(f"   Time: {result['total_time_ms']:.2f} ms")
        else:
            print(f"   ❌ Failed for {name} test")
            if result and 'error' in result:
                print(f"   Error: {result['error']}")
        print()

    return results

In [None]:
# Run the long-context retrieval suite and keep the returned list of result
# dicts in `retrieval_results` so later cells can report on it.
retrieval_results = run_retrieval_tests()

## Summary & Analysis

After running the long-context retrieval tests, analyze:

- **Answer Accuracy**: How well does the model retrieve information from long contexts?
- **Relevance Scores**: What fraction of the expected answer's key terms (its first three words) appears in each response?
- **Latency Trends**: Does retrieval time increase with context size?
- **Success Rate**: What percentage of queries are answered correctly?

**Key Questions:**
- At what context size does accuracy start to degrade?
- Is the model able to consistently find relevant information?
- How does retrieval latency scale with context length?

**Expected Observations:**
- High accuracy even with 100K+ token contexts
- Latency should increase roughly linearly with context size
- Relevance scores should remain high across all context sizes
- Model should maintain retrieval quality near the 131K token limit

In [None]:
# Display Long-Context Retrieval results
#
# Prints one table row per entry in `retrieval_results`, followed by summary
# statistics and a first-vs-last accuracy comparison.


def _context_label(result):
    """Return the 'Context Size' cell text for one result dict."""
    if not result.get('success', False):
        return 'Error'
    # Use an explicit 'test_name' label when the result carries one; otherwise
    # show the measured size. 'context_tokens' is a whitespace word count, so
    # integer-dividing it by 1000 would print "0K" for any context shorter
    # than 1000 words.
    label = result.get('test_name')
    if label:
        return label
    return f"~{result['context_tokens']} words"


print("=" * 90)
print("LONG-CONTEXT RETRIEVAL TEST RESULTS")
print("=" * 90)

if retrieval_results:
    print(f"{'Context Size':<15} {'Correct':<10} {'Relevance':<12} {'Time (ms)':<12}")
    print("-" * 90)

    correct_count = 0
    completed_count = 0
    total_time = 0

    for r in retrieval_results:
        if not r.get('success', False):
            print(f"{'Error':<15} {'-':<10} {'-':<12} {'-':<12}")
            continue

        completed_count += 1
        total_time += r['total_time_ms']
        is_correct = r.get('is_correct', False)
        if is_correct:
            correct_count += 1

        print(f"{_context_label(r):<15} {'✅' if is_correct else '❌':<10} {r['relevance_score']:<12.2f} {r['total_time_ms']:<12.2f}")

    print("=" * 90)

    # Entries that did not complete count as incorrect in the success rate.
    success_rate = (correct_count / len(retrieval_results)) * 100
    # Average only over completed requests; failed entries carry no timing.
    avg_time = total_time / completed_count if completed_count else 0

    print("\nSummary:")
    print(f"  Total tests: {len(retrieval_results)}")
    print(f"  Correct answers: {correct_count}")
    print(f"  Success rate: {success_rate:.1f}%")
    print(f"  Average latency: {avg_time:.2f} ms")

    # Check for accuracy degradation between the first and last listed tests
    if len(retrieval_results) >= 2:
        first_result = retrieval_results[0]
        last_result = retrieval_results[-1]
        first_ok = first_result.get('is_correct', False)
        last_ok = last_result.get('is_correct', False)

        # Label the rows from the results themselves: the first and last
        # entries are not guaranteed to be any particular test case.
        print("\nAccuracy comparison:")
        print(f"  Small context ({_context_label(first_result)}): {'✅' if first_ok else '❌'}")
        print(f"  Large context ({_context_label(last_result)}): {'✅' if last_ok else '❌'}")

        if first_ok and last_ok:
            print("  ✅ Model maintains retrieval quality across context sizes")
        elif first_ok:
            # Only report degradation when the first test passed and the
            # last one did not.
            print("  ⚠️  Accuracy degrades with very large contexts")
else:
    print("❌ No results to display. Tests may have failed.")