In [1]:
import time
import numpy as np
import pandas as pd
import torch
import torchvision
import torch.jit # Needed for tracing, a key step in TensorRT preparation
from IPython.display import display

# --- Global Configuration ---
# Benchmark shape: samples per forward pass, untimed warm-up passes, timed passes.
BATCH_SIZE = 64
NUM_WARMUP = 10
NUM_BENCHMARK = 100

# The FP16 path is GPU-only, so use the first CUDA device when one is visible
# and fall back to the CPU otherwise.
TARGET_DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
MODEL_NAME = "MobileNetV2 (FP16 Optimized)"

# Hardware label shown in the results table: the GPU's reported name
# (e.g. "NVIDIA Jetson Nano", "RTX 3090") or a fixed CPU notice.
DEVICE_NAME = (
    f"{torch.cuda.get_device_name(0)} (TensorRT/FP16 Sim.)"
    if TARGET_DEVICE.type == 'cuda'
    else "CPU (No TensorRT Possible)"
)


# -------------------------------------------------------------
# TENSORRT-STYLE COMPILATION (FP16 Conversion and JIT Trace)
# -------------------------------------------------------------

def compile_to_tensorrt_fp16(model, dummy_input):
    """
    Cast ``model`` to half precision (FP16) and trace it into a TorchScript module.

    This only simulates the preparation for TensorRT (FP16 weights plus a
    static graph); no TensorRT engine is built.

    Args:
        model: Module to convert. ``model.half()`` casts the module in place,
            so the caller's object holds FP16 weights afterwards.
        dummy_input: Example input used for tracing. The device it lives on
            decides whether the FP16 path is taken.

    Returns:
        A ``(model, compile_time_ms)`` tuple. When ``dummy_input`` is on a
        CUDA device this is the traced FP16 module and the tracing time in
        milliseconds; otherwise the untouched ``model`` and ``0.0``.
    """
    # Decide from the tensor that will actually be traced instead of a module
    # global, so the function behaves correctly for whatever input it is given.
    input_device = dummy_input.device
    if input_device.type != 'cuda':
        print("Warning: Cannot perform FP16 optimization or TensorRT simulation on CPU.")
        return model, 0.0

    # 1. Convert weights and example input to half precision (FP16).
    model_fp16 = model.half()
    dummy_input_fp16 = dummy_input.half()

    # CUDA work is launched asynchronously. Drain the queue first so the cast
    # above is not billed to the trace, and again afterwards so the forward
    # pass executed by the tracer is fully included in the measurement.
    torch.cuda.synchronize(input_device)
    trace_started = time.perf_counter()

    # 2. Trace the model to obtain a static computation graph.
    with torch.no_grad():
        traced_model = torch.jit.trace(model_fp16, dummy_input_fp16)

    torch.cuda.synchronize(input_device)
    compile_time_ms = (time.perf_counter() - trace_started) * 1000

    return traced_model, compile_time_ms


# -------------------------------------------------------------
# METRIC 1 & 2: MODEL LOAD TIME & MODEL SIZE CALCULATION
# -------------------------------------------------------------

def calculate_static_metrics(dummy_input):
    """
    Load MobileNetV2, compile it, and collect the static (non-latency) metrics.

    Args:
        dummy_input: Example input forwarded to ``compile_to_tensorrt_fp16``
            for tracing.

    Returns:
        A 5-tuple ``(model, model_load_time_ms, model_size_mb, total_params,
        compile_time_ms)`` where ``model`` is whatever
        ``compile_to_tensorrt_fp16`` returned.
    """
    # Time model construction, weight loading and the move to TARGET_DEVICE.
    # NOTE(review): on a cold cache this presumably also includes the weight
    # download; confirm if load time is compared across machines.
    load_started = time.perf_counter()

    torch_model = torchvision.models.mobilenet_v2(
        weights=torchvision.models.MobileNet_V2_Weights.IMAGENET1K_V1
    )
    torch_model.eval()
    torch_model.to(TARGET_DEVICE)

    model_load_time_ms = (time.perf_counter() - load_started) * 1000

    total_params = sum(p.numel() for p in torch_model.parameters())

    # Compile the model to the TensorRT/FP16 equivalent (a no-op on CPU).
    tensorrt_model, compile_time_ms = compile_to_tensorrt_fp16(torch_model, dummy_input)

    # Measure the size from the parameters of the model that will actually be
    # benchmarked: 2 bytes each after FP16 conversion, 4 bytes each when the
    # CPU fallback left the model in FP32. Assuming 2 bytes unconditionally
    # under-reported the CPU case by half.
    param_bytes = sum(p.numel() * p.element_size() for p in tensorrt_model.parameters())
    model_size_mb = param_bytes / (1024 * 1024)

    return tensorrt_model, model_load_time_ms, model_size_mb, total_params, compile_time_ms


# -------------------------------------------------------------
# BENCHMARKING FUNCTION
# -------------------------------------------------------------

def _time_cpu_runs(model, input_tensor, num_warmup, num_benchmark):
    """Return per-call wall-clock latencies (ms) of ``model`` for a non-CUDA input."""
    print("Running non-optimized benchmark on CPU...")

    with torch.no_grad():
        # Warm-up passes are executed but not timed.
        for _ in range(num_warmup):
            model(input_tensor)

        timings = []
        for _ in range(num_benchmark):
            started = time.perf_counter()
            model(input_tensor)
            timings.append((time.perf_counter() - started) * 1000)
    return timings


def _time_cuda_runs(model, input_tensor, num_warmup, num_benchmark):
    """Return per-call CUDA-event latencies (ms) of ``model`` fed an FP16 copy of the input."""
    # The input tensor must be FP16 to match the compiled model.
    input_tensor_fp16 = input_tensor.half()

    starter = torch.cuda.Event(enable_timing=True)
    ender = torch.cuda.Event(enable_timing=True)

    print(f"Warming up FP16 optimized model for {num_warmup} iterations on CUDA...")
    with torch.no_grad():
        for _ in range(num_warmup):
            model(input_tensor_fp16)
    torch.cuda.synchronize()
    print("Warm-up complete. Starting benchmark...")

    timings = []
    with torch.no_grad():
        for _ in range(num_benchmark):
            starter.record()
            model(input_tensor_fp16)
            ender.record()
            # Both events must have completed before elapsed_time() is read.
            torch.cuda.synchronize()
            timings.append(starter.elapsed_time(ender))
    return timings


def benchmark_model(model, input_tensor, num_warmup, num_benchmark):
    """
    Benchmark the inference latency of ``model`` on ``input_tensor``.

    The device of ``input_tensor`` selects the timing method: CUDA events with
    an FP16 copy of the input on CUDA, ``time.perf_counter`` with the input
    as given otherwise.

    Args:
        model: Callable model; on CUDA it is assumed to accept FP16 input.
        input_tensor: Batch to feed the model. Its first dimension is taken
            as the batch size for the throughput figure.
        num_warmup: Number of untimed warm-up passes.
        num_benchmark: Number of timed passes; must be at least 1.

    Returns:
        dict with ``mean_time_ms``, ``std_time_ms``, ``median_latency``,
        ``p90_latency``, ``p99_latency`` (all ms per batch),
        ``throughput_fps``, ``device_type`` and ``batch_size``.

    Raises:
        ValueError: If ``num_benchmark`` is less than 1, since no statistics
            can be computed from zero samples.
    """
    if num_benchmark < 1:
        raise ValueError("num_benchmark must be at least 1")

    device_type = input_tensor.device.type
    # Use the batch that is actually benchmarked, not a module-level constant,
    # so throughput stays correct for any input size.
    batch_size = input_tensor.shape[0]

    if device_type != 'cuda':
        timings = _time_cpu_runs(model, input_tensor, num_warmup, num_benchmark)
    else:
        timings = _time_cuda_runs(model, input_tensor, num_warmup, num_benchmark)

    # --- STATISTICAL CALCULATIONS ---
    timings_np = np.array(timings)

    mean_time_ms = timings_np.mean()
    std_time_ms = timings_np.std()
    median_latency, p90_latency, p99_latency = (
        np.percentile(timings_np, q) for q in (50, 90, 99)
    )

    # METRIC 4: THROUGHPUT (FPS) = samples per batch / seconds per batch
    throughput_fps = (batch_size / mean_time_ms) * 1000

    print(f"\n--- Benchmark Results ({device_type.upper()} @ BATCH={batch_size}, Precision: FP16/TensorRT) ---")
    print(f"Inference Time (Avg over {num_benchmark} runs): {mean_time_ms:.3f} ms")
    print(f"Throughput (FPS): {throughput_fps:.2f} FPS")
    print("--------------------------------------------------")

    return {
        'mean_time_ms': mean_time_ms,
        'std_time_ms': std_time_ms,
        'median_latency': median_latency,
        'p90_latency': p90_latency,
        'p99_latency': p99_latency,
        'throughput_fps': throughput_fps,
        'device_type': device_type,
        'batch_size': batch_size
    }


# -------------------------------------------------------------
# PANDAS TABLE GENERATION
# -------------------------------------------------------------

def generate_presentation_tables(static_metrics, dynamic_metrics):
    """
    Build and display the two presentation tables.

    Table 1 lists the measured metrics of this run; Table 2 is the framework
    comparison plan with this run's latency and throughput filled in.

    Args:
        static_metrics: dict providing 'model_size_mb', 'total_params',
            'model_load_time_ms' and 'compile_time_ms'.
        dynamic_metrics: dict providing 'device_type', 'batch_size',
            'median_latency', 'mean_time_ms', 'p99_latency' and
            'throughput_fps'.
    """
    # --- TABLE 1: DETAILED BASELINE METRICS (FINDINGS) ---
    # One (metric, value, unit) triple per table row.
    metric_rows = [
        ('Target Hardware', DEVICE_NAME, 'N/A'),
        ('Inference Device', dynamic_metrics['device_type'].upper(), 'N/A'),
        ('Batch Size', dynamic_metrics['batch_size'], 'Samples'),
        ('Model Size (FP16)', f"{static_metrics['model_size_mb']:.2f}", 'MB'),
        ('Total Parameters', f"{static_metrics['total_params']:,}", 'Params'),
        ('Model Load Time', f"{static_metrics['model_load_time_ms']:.2f}", 'ms'),
        ('TensorRT Compile Time', f"{static_metrics['compile_time_ms']:.2f}", 'ms'),
        ('Avg. Latency (P50)', f"{dynamic_metrics['median_latency']:.3f}", 'ms/batch'),
        ('Avg. Latency (Mean)', f"{dynamic_metrics['mean_time_ms']:.3f}", 'ms/batch'),
        ('Worst-Case Latency (P99)', f"{dynamic_metrics['p99_latency']:.3f}", 'ms/batch'),
        ('Throughput (FPS)', f"{dynamic_metrics['throughput_fps']:.2f}", 'FPS'),
    ]
    # Pivot the rows into the column-oriented lists the DataFrame is built from.
    detail_table = pd.DataFrame({
        'Metric': [row[0] for row in metric_rows],
        'Value': [row[1] for row in metric_rows],
        'Unit': [row[2] for row in metric_rows],
    })
    print("\n\n--- TABLE 1: DETAILED TENSORRT-STYLE (FP16) METRICS ---")
    display(detail_table)

    # --- TABLE 2: PROJECT PLAN (NEXT STEPS) ---
    # Only the second row (the current benchmark) carries measured values.
    measured_latency = f'{dynamic_metrics["mean_time_ms"]:.2f}'
    measured_throughput = f'{dynamic_metrics["throughput_fps"]:.2f}'

    plan_columns = [
        'Framework',
        'Precision',
        'Measured Latency (ms)',
        'Measured Throughput (FPS)',
        'Status',
    ]
    plan_rows = [
        ('PyTorch (Native, FP32)', 'FP32',
         'N/A (Baseline)', 'N/A (Baseline)', 'Completed (Previous Run)'),
        ('PyTorch (TensorRT/FP16 Sim.)', 'FP16',
         measured_latency, measured_throughput, 'Complete (This Run)'),
        ('TensorRT (Full INT8)', 'INT8',
         'N/A (Planned)', 'N/A (Planned)', 'In Progress'),
        ('TFLite (INT8)', 'INT8',
         'N/A (Planned)', 'N/A (Planned)', 'Planned'),
        ('OpenVINO (FP32/FP16)', 'FP32/FP16',
         'N/A (Planned)', 'N/A (Planned)', 'Planned'),
        ('ExecuTorch (INT8)', 'INT8',
         'N/A (Planned)', 'N/A (Planned)', 'Planned'),
    ]
    plan_table = pd.DataFrame({
        column: [row[position] for row in plan_rows]
        for position, column in enumerate(plan_columns)
    })
    print("\n\n--- TABLE 2: PROJECT PLAN AND NEXT STEPS ---")
    display(plan_table)

# MAIN
if __name__ == '__main__':

    # Step 1: create the example batch in FP32 on the target device; the
    # compile and benchmark steps do their own FP16 conversion.
    dummy_input_fp32 = torch.randn(BATCH_SIZE, 3, 224, 224).to(TARGET_DEVICE)

    # Step 2: load the model, compile it and gather the static metrics.
    (
        tensorrt_model,
        model_load_time_ms,
        model_size_mb,
        total_params,
        compile_time_ms,
    ) = calculate_static_metrics(dummy_input_fp32)

    static_metrics = dict(
        model_load_time_ms=model_load_time_ms,
        model_size_mb=model_size_mb,
        total_params=total_params,
        compile_time_ms=compile_time_ms,
    )

    print(f"Model compiled and dummy input moved to {TARGET_DEVICE} successfully.")

    # Step 3: time inference with the same FP32 batch.
    dynamic_metrics = benchmark_model(
        tensorrt_model, dummy_input_fp32, NUM_WARMUP, NUM_BENCHMARK
    )

    # Step 4: render the result tables.
    generate_presentation_tables(static_metrics, dynamic_metrics)

Model compiled and dummy input moved to cuda:0 successfully.
Warming up FP16 optimized model for 10 iterations on CUDA...
Warm-up complete. Starting benchmark...

--- Benchmark Results (CUDA @ BATCH=64, Precision: FP16/TensorRT) ---
Inference Time (Avg over 100 runs): 23.341 ms
Throughput (FPS): 2741.90 FPS
--------------------------------------------------


--- TABLE 1: DETAILED TENSORRT-STYLE (FP16) METRICS ---


Unnamed: 0,Metric,Value,Unit
0,Target Hardware,NVIDIA GeForce RTX 3060 Laptop GPU (TensorRT/F...,
1,Inference Device,CUDA,
2,Batch Size,64,Samples
3,Model Size (FP16),6.69,MB
4,Total Parameters,3504872,Params
5,Model Load Time,129.34,ms
6,TensorRT Compile Time,2629.61,ms
7,Avg. Latency (P50),23.097,ms/batch
8,Avg. Latency (Mean),23.341,ms/batch
9,Worst-Case Latency (P99),24.762,ms/batch




--- TABLE 2: PROJECT PLAN AND NEXT STEPS ---


Unnamed: 0,Framework,Precision,Measured Latency (ms),Measured Throughput (FPS),Status
0,"PyTorch (Native, FP32)",FP32,N/A (Baseline),N/A (Baseline),Completed (Previous Run)
1,PyTorch (TensorRT/FP16 Sim.),FP16,23.34,2741.90,Complete (This Run)
2,TensorRT (Full INT8),INT8,N/A (Planned),N/A (Planned),In Progress
3,TFLite (INT8),INT8,N/A (Planned),N/A (Planned),Planned
4,OpenVINO (FP32/FP16),FP32/FP16,N/A (Planned),N/A (Planned),Planned
5,ExecuTorch (INT8),INT8,N/A (Planned),N/A (Planned),Planned
