# Lab 3.2.8: TensorRT-LLM Engine - Solutions

This notebook contains solutions for all exercises in Lab 3.2.8.

In [None]:
import numpy as np
from pathlib import Path

## Exercise 1 Solution: Build Custom Engine

Create a custom TensorRT-LLM engine configuration.

In [None]:
from dataclasses import dataclass
from typing import Optional

@dataclass
class CustomTRTLLMConfig:
    """
    Custom configuration for TensorRT-LLM engine building.

    Stores model, quantization, batching, KV-cache and parallelism settings
    and renders them as multi-line shell commands (checkpoint conversion,
    engine build, inference run). The commands are only generated as text;
    nothing is executed.
    """
    # Model (passed as --model_dir and --tokenizer_dir)
    model_name: str

    # Quantization
    quantization: str = "fp8"  # none, int8, fp8, int4_awq, fp4

    # Batch settings
    max_batch_size: int = 8
    max_input_len: int = 2048
    max_output_len: int = 512

    # KV-cache (not read by the command generators below; the build command
    # always emits "--paged_kv_cache enable")
    kv_cache_type: str = "paged"
    max_num_tokens: int = 8192

    # Parallelism (only tp_size is emitted; pp_size is not read by the generators)
    tp_size: int = 1
    pp_size: int = 1

    # Optimization level (0-5)
    opt_level: int = 3

    @staticmethod
    def _join_command(parts) -> str:
        """Join command fragments with a shell line continuation.

        Each fragment after the first starts on a new line, indented by four
        spaces, with the previous line ending in " \\".
        """
        return " \\\n    ".join(parts)

    def generate_convert_command(self, output_dir: str) -> str:
        """Generate checkpoint conversion command.

        Args:
            output_dir: Base directory; the checkpoint goes to
                "<output_dir>/checkpoint".

        Returns:
            The multi-line shell command as a string. A quantization value
            without an entry in the flag table (e.g. "none") adds no flag.
        """
        # Extra flag(s) appended for each supported quantization mode
        quantization_flags = {
            "fp8": "--use_fp8",
            "int8": "--int8_kv_cache",
            "int4_awq": "--use_weight_only --weight_only_precision int4_awq",
            "fp4": "--use_fp4",  # Blackwell native
        }

        parts = [
            "python -m tensorrt_llm.commands.convert_checkpoint",
            f"--model_dir {self.model_name}",
            f"--output_dir {output_dir}/checkpoint",
            "--dtype float16",
            f"--tp_size {self.tp_size}",
        ]

        flag = quantization_flags.get(self.quantization)
        if flag is not None:
            parts.append(flag)

        return self._join_command(parts)

    def generate_build_command(self, output_dir: str) -> str:
        """Generate engine build command.

        Args:
            output_dir: Base directory; reads "<output_dir>/checkpoint" and
                writes "<output_dir>/engine".

        Returns:
            The multi-line shell command as a string.
        """
        # The engine must hold the prompt plus everything generated after it
        max_seq_len = self.max_input_len + self.max_output_len

        return self._join_command([
            "trtllm-build",
            f"--checkpoint_dir {output_dir}/checkpoint",
            f"--output_dir {output_dir}/engine",
            f"--max_batch_size {self.max_batch_size}",
            f"--max_input_len {self.max_input_len}",
            f"--max_seq_len {max_seq_len}",
            "--gemm_plugin float16",
            "--gpt_attention_plugin float16",
            "--paged_kv_cache enable",
            "--remove_input_padding enable",
            f"--builder_opt {self.opt_level}",
        ])

    def generate_run_command(self, output_dir: str) -> str:
        """Generate inference run command.

        Args:
            output_dir: Base directory; the engine is read from
                "<output_dir>/engine".

        Returns:
            The multi-line shell command as a string, using a fixed sample
            prompt.
        """
        return self._join_command([
            "python -m tensorrt_llm.commands.run",
            f"--engine_dir {output_dir}/engine",
            f"--tokenizer_dir {self.model_name}",
            f"--max_output_len {self.max_output_len}",
            '--input_text "Hello, how are you?"',
        ])

    def print_full_workflow(self, output_dir: str):
        """Print complete build and run workflow."""
        print(f"TensorRT-LLM Build Workflow for: {self.model_name}")
        print(f"Quantization: {self.quantization}")
        print("="*70)

        print("\n# Step 1: Convert checkpoint")
        print(self.generate_convert_command(output_dir))

        print("\n# Step 2: Build engine")
        print(self.generate_build_command(output_dir))

        print("\n# Step 3: Run inference")
        print(self.generate_run_command(output_dir))


# Example configurations: build both configs first, then print each workflow
chatbot_config = CustomTRTLLMConfig(
    model_name="Qwen/Qwen3-4B-Instruct",
    quantization="fp8",
    max_batch_size=16,
    max_input_len=2048,
    max_output_len=512,
    opt_level=4
)

code_config = CustomTRTLLMConfig(
    model_name="codellama/CodeLlama-7b-Instruct-hf",
    quantization="fp4",
    max_batch_size=8,
    max_input_len=4096,
    max_output_len=1024,
    opt_level=5
)

# (banner, configuration, engine output directory) for every demo run
for banner, example_config, engine_root in (
    ("Example 1: High-throughput chatbot (FP8)", chatbot_config, "./engines/chatbot"),
    ("\n\nExample 2: Code generation (FP4 for speed)", code_config, "./engines/codegen"),
):
    print(banner)
    print("="*70)
    example_config.print_full_workflow(engine_root)

## Exercise 2 Solution: Batch Size Optimization

Find the optimal batch size for a given latency requirement.

In [None]:
def simulate_batch_performance(
    batch_sizes: list,
    base_latency_ms: float = 50,
    tokens_per_request: int = 100,
    max_memory_gb: float = 20,
    base_memory_gb: float = 3.0,
    kv_per_request_gb: float = 0.5
) -> dict:
    """
    Simulate performance at different batch sizes.

    Models typical TensorRT-LLM behavior:
    - Latency increases sub-linearly with batch size
    - Throughput increases near-linearly until memory bound

    This is a synthetic model, not a measurement.

    Args:
        batch_sizes: List of batch sizes to test (each must be >= 1)
        base_latency_ms: Single-request latency (must be > 0)
        tokens_per_request: Tokens generated per request
        max_memory_gb: Maximum GPU memory
        base_memory_gb: Memory taken by the model weights
        kv_per_request_gb: KV-cache memory added per request in the batch

    Returns:
        Performance results per batch size: a dict keyed by batch size whose
        values hold 'batch_latency_ms', 'per_request_latency_ms',
        'tokens_per_second', 'requests_per_second', 'memory_gb' and
        'gpu_utilization'.

    Raises:
        ValueError: If base_latency_ms is not positive or any batch size is
            below 1 (either would yield NaN/inf metrics).
    """
    # Validate everything up front so a bad entry never yields partial results
    if base_latency_ms <= 0:
        raise ValueError(f"base_latency_ms must be > 0, got {base_latency_ms}")
    for batch_size in batch_sizes:
        if batch_size < 1:
            raise ValueError(f"batch sizes must be >= 1, got {batch_size}")

    results = {}

    for batch_size in batch_sizes:
        # Latency model: sub-linear (square-root) growth with batch size
        latency_ms = base_latency_ms * (1 + 0.1 * np.sqrt(batch_size - 1))

        # Memory model: model weights plus a linear per-request KV-cache cost
        memory_gb = base_memory_gb + kv_per_request_gb * batch_size

        # Over the memory limit: double the latency and clamp reported memory
        if memory_gb > max_memory_gb:
            latency_ms *= 2
            memory_gb = max_memory_gb

        # Throughput calculations
        latency_s = latency_ms / 1000
        tokens_per_sec = (batch_size * tokens_per_request) / latency_s
        requests_per_sec = batch_size / latency_s

        results[batch_size] = {
            'batch_latency_ms': latency_ms,
            # Every request in a batch completes with the batch
            'per_request_latency_ms': latency_ms,
            'tokens_per_second': tokens_per_sec,
            'requests_per_second': requests_per_sec,
            'memory_gb': memory_gb,
            'gpu_utilization': min(100, batch_size * 15)  # Rough estimate
        }

    return results


def find_optimal_batch_size(
    results: dict,
    max_latency_ms: float = 100,
    min_throughput: float = 0
) -> tuple:
    """
    Pick the highest-throughput batch size that satisfies both constraints.

    Args:
        results: Mapping of batch size to a metrics dict, as produced by
            simulate_batch_performance
        max_latency_ms: Upper bound on 'per_request_latency_ms'
        min_throughput: Lower bound on 'tokens_per_second'

    Returns:
        Tuple of (optimal_batch_size, reason); the batch size is None when
        no entry satisfies the constraints.
    """
    def within_limits(stats: dict) -> bool:
        # Latency is checked first; throughput only matters if latency passes
        return (stats['per_request_latency_ms'] <= max_latency_ms
                and stats['tokens_per_second'] >= min_throughput)

    candidates = [
        (size, stats) for size, stats in results.items() if within_limits(stats)
    ]

    if len(candidates) == 0:
        return None, "No configuration meets requirements"

    # Among the survivors, the winner is the one with the most tokens/second
    winner_size, winner_stats = max(
        candidates, key=lambda pair: pair[1]['tokens_per_second']
    )
    throughput = winner_stats['tokens_per_second']

    return winner_size, f"Max throughput ({throughput:.0f} tok/s) within {max_latency_ms}ms latency"


# Simulate each candidate batch size and tabulate the resulting metrics
print("Batch Size Optimization Analysis")
print("="*60)

batch_sizes = [1, 2, 4, 8, 16, 32, 64]
results = simulate_batch_performance(batch_sizes, base_latency_ms=40)

print(f"{'Batch':<8} {'Latency':<12} {'Tok/s':<12} {'Req/s':<10} {'Memory':<10} {'GPU%':<8}")
print("-"*60)

for size in batch_sizes:
    metrics = results[size]
    # One left-aligned column per metric, separated by single spaces
    columns = [
        f"{size:<8}",
        f"{metrics['per_request_latency_ms']:<12.1f}",
        f"{metrics['tokens_per_second']:<12.0f}",
        f"{metrics['requests_per_second']:<10.1f}",
        f"{metrics['memory_gb']:<10.1f}",
        f"{metrics['gpu_utilization']:<8.0f}",
    ]
    print(" ".join(columns))

# Report the best batch size under several latency/throughput constraints
print("\nOptimal Batch Sizes:")
print("-"*60)

# Each scenario: (label, max latency in ms, min throughput in tok/s)
scenarios = [
    ("Low latency (<50ms)", 50, 0),
    ("Balanced (<100ms)", 100, 0),
    ("High throughput (<200ms)", 200, 0),
    ("Minimum 5000 tok/s", float('inf'), 5000),
]

for label, latency_cap, throughput_floor in scenarios:
    chosen, explanation = find_optimal_batch_size(results, latency_cap, throughput_floor)
    # Without a match only the explanation is shown
    summary = f"batch_size={chosen} - {explanation}" if chosen else explanation
    print(f"  {label}: {summary}")

## Exercise 3 Solution: Triton Deployment

Create a complete Triton deployment configuration.

In [None]:
def _preferred_batch_sizes(max_batch_size: int) -> list:
    """Return ascending, de-duplicated preferred batch sizes capped at max_batch_size."""
    return sorted({size for size in (1, 2, 4, max_batch_size) if size <= max_batch_size})


def create_triton_deployment(
    model_name: str,
    engine_dir: str,
    output_dir: str,
    max_batch_size: int = 8
):
    """
    Create complete Triton deployment files.

    Writes the following under output_dir (created if missing):
    - model_repository/<model_name>/config.pbtxt plus an empty version dir "1"
    - docker-compose.yml
    - client.py (an HTTP client that targets <model_name>)
    - README.md
    and then prints every file found under output_dir.

    Args:
        model_name: Model identifier
        engine_dir: Path to TensorRT-LLM engine
        output_dir: Output directory for deployment files
        max_batch_size: Maximum batch size (must be >= 1)

    Raises:
        ValueError: If max_batch_size is below 1.
    """
    if max_batch_size < 1:
        raise ValueError(f"max_batch_size must be >= 1, got {max_batch_size}")

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Model repository structure
    model_repo = output_path / "model_repository" / model_name
    (model_repo / "1").mkdir(parents=True, exist_ok=True)

    # Never list a preferred size above max_batch_size, and never list one twice
    preferred_sizes = _preferred_batch_sizes(max_batch_size)

    # Config.pbtxt
    config_content = f'''name: "{model_name}"
backend: "tensorrtllm"
max_batch_size: {max_batch_size}

model_transaction_policy {{
  decoupled: True
}}

input [
  {{
    name: "text_input"
    data_type: TYPE_STRING
    dims: [ -1 ]
  }},
  {{
    name: "max_tokens"
    data_type: TYPE_INT32
    dims: [ 1 ]
  }},
  {{
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  }},
  {{
    name: "top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  }},
  {{
    name: "stream"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  }}
]

output [
  {{
    name: "text_output"
    data_type: TYPE_STRING
    dims: [ -1 ]
  }}
]

instance_group [
  {{
    count: 1
    kind: KIND_GPU
  }}
]

parameters: {{
  key: "gpt_model_type"
  value: {{
    string_value: "inflight_fused_batching"
  }}
}}

parameters: {{
  key: "engine_dir"
  value: {{
    string_value: "{engine_dir}"
  }}
}}

parameters: {{
  key: "kv_cache_type"
  value: {{
    string_value: "paged"
  }}
}}

dynamic_batching {{
  preferred_batch_size: {preferred_sizes}
  max_queue_delay_microseconds: 100
}}
'''

    (model_repo / "config.pbtxt").write_text(config_content, encoding="utf-8")

    # Docker compose
    compose_content = f'''version: '3.8'

services:
  triton:
    image: nvcr.io/nvidia/tritonserver:24.11-trtllm-python-py3
    container_name: triton-{model_name}
    ports:
      - "8000:8000"  # HTTP
      - "8001:8001"  # gRPC
      - "8002:8002"  # Metrics
    volumes:
      - ./model_repository:/models
      - {engine_dir}:{engine_dir}
    environment:
      - CUDA_VISIBLE_DEVICES=0
    command: tritonserver --model-repository=/models
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/v2/health/ready"]
      interval: 30s
      timeout: 10s
      retries: 3
'''

    (output_path / "docker-compose.yml").write_text(compose_content, encoding="utf-8")

    # Client script. Kept as a plain (non-f) string because it is full of
    # literal braces; the deployed model name is substituted afterwards so the
    # client calls the model that was actually written to the repository.
    client_template = '''#!/usr/bin/env python3
"""Simple Triton client for TensorRT-LLM model."""

import requests
import json
import argparse

# Name of the model directory inside the Triton model repository
MODEL_NAME = __MODEL_NAME__

def generate(
    prompt: str,
    max_tokens: int = 100,
    temperature: float = 0.7,
    server_url: str = "http://localhost:8000"
) -> str:
    """Generate text from prompt."""
    
    payload = {
        "inputs": [
            {"name": "text_input", "shape": [1, 1], "datatype": "BYTES", 
             "data": [prompt]},
            {"name": "max_tokens", "shape": [1, 1], "datatype": "INT32",
             "data": [max_tokens]},
            {"name": "temperature", "shape": [1, 1], "datatype": "FP32",
             "data": [temperature]}
        ]
    }
    
    response = requests.post(
        f"{server_url}/v2/models/{MODEL_NAME}/infer",
        json=payload
    )
    
    if response.status_code == 200:
        result = response.json()
        return result["outputs"][0]["data"][0]
    else:
        raise Exception(f"Error: {response.status_code} - {response.text}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--prompt", default="Hello, how are you?")
    parser.add_argument("--max-tokens", type=int, default=100)
    parser.add_argument("--temperature", type=float, default=0.7)
    args = parser.parse_args()
    
    result = generate(args.prompt, args.max_tokens, args.temperature)
    print(result)
'''
    # repr() yields a valid, properly escaped Python string literal
    client_content = client_template.replace("__MODEL_NAME__", repr(model_name))

    (output_path / "client.py").write_text(client_content, encoding="utf-8")

    # README
    readme_content = f'''# Triton Deployment for {model_name}

## Quick Start

1. Start the server:
```bash
docker-compose up -d
```

2. Check health:
```bash
curl localhost:8000/v2/health/ready
```

3. Generate text:
```bash
python client.py --prompt "Hello, how are you?"
```

## Endpoints

- HTTP: http://localhost:8000
- gRPC: localhost:8001
- Metrics: http://localhost:8002/metrics

## Configuration

- Engine: {engine_dir}
- Max batch size: {max_batch_size}
- Dynamic batching: enabled
- KV-cache: paged
'''

    (output_path / "README.md").write_text(readme_content, encoding="utf-8")

    print(f"Created Triton deployment in: {output_path}")
    print("\nFiles created:")
    # Sorted so the listing is stable across runs and filesystems
    for created in sorted(output_path.rglob("*")):
        if created.is_file():
            print(f"  {created.relative_to(output_path)}")


# Create deployment
print("Creating Triton Deployment")
print("="*60)

create_triton_deployment(
    model_name="llama-3b-fp8",
    engine_dir="/engines/llama-3b-fp8",
    output_dir="../data/triton_deployment",
    max_batch_size=16
)

## Summary

Key findings:

1. **Custom configurations** allow tuning for specific use cases
2. **Batch size optimization** balances latency vs throughput
3. **Triton deployment** provides production-ready serving with:
   - Dynamic batching
   - Health checks
   - Metrics monitoring
   - gRPC and HTTP endpoints

### Production Recommendations

| Use Case | Quantization | Batch Size | Notes |
|----------|-------------|------------|-------|
| Chatbot (low latency) | FP8 | 8-16 | Prioritize response time |
| Batch processing | FP4 | 32-64 | Maximum throughput |
| Code generation | FP8 | 16-32 | Balance quality/speed |
| Edge deployment | INT4-AWQ | 4-8 | Memory constrained |