In [7]:
import time
import numpy as np
import pandas as pd
import torch
import torchvision
from PIL import Image
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from IPython.display import display

# --- Global Configuration ---
# Benchmark workload: images per batch, untimed warm-up passes, timed passes.
BATCH_SIZE = 64
NUM_WARMUP = 10
NUM_BENCHMARK = 100
MODEL_NAME = "MobileNetV2"

# Prefer the first CUDA device when one is available; otherwise run on CPU.
TARGET_DEVICE = (
    torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
)

# Human-readable hardware label: the GPU's reported name (e.g. "RTX 3090")
# when running on CUDA, or the literal "CPU" otherwise.
DEVICE_NAME = (
    torch.cuda.get_device_name(0) if TARGET_DEVICE.type == 'cuda' else "CPU"
)


# -------------------------------------------------------------
# METRIC 1 & 2: MODEL LOAD TIME & MODEL SIZE CALCULATION
# -------------------------------------------------------------

def calculate_static_metrics():
    """Load MobileNetV2 and report its load time and size.

    The timed region covers constructing the model with the IMAGENET1K_V1
    weights, switching it to eval mode and moving it to TARGET_DEVICE.

    Returns:
        A 4-tuple ``(model, load_time_ms, size_mb, total_params)`` where
        ``size_mb`` assumes 4 bytes (FP32) per parameter and is expressed
        in units of 1024 * 1024 bytes.
    """
    load_started = time.perf_counter()

    # Build the pretrained network, put it in inference mode and place it
    # on the target device; eval() and to() both return the module itself.
    pretrained = torchvision.models.MobileNet_V2_Weights.IMAGENET1K_V1
    torch_model = torchvision.models.mobilenet_v2(weights=pretrained)
    torch_model = torch_model.eval().to(TARGET_DEVICE)

    model_load_time_ms = (time.perf_counter() - load_started) * 1000

    # Static size metric: count every parameter element.
    total_params = 0
    for param in torch_model.parameters():
        total_params += param.numel()

    # FP32 assumption: 4 bytes per parameter, reported in megabytes.
    bytes_per_param = 4
    model_size_mb = (total_params * bytes_per_param) / (1024 * 1024)

    return torch_model, model_load_time_ms, model_size_mb, total_params


# -------------------------------------------------------------
# BENCHMARKING FUNCTION
# -------------------------------------------------------------

def benchmark_model(model, input_tensor, num_warmup, num_benchmark):
    """
    Benchmarks the model inference time, calculating full latency statistics.

    CUDA inputs are timed with CUDA events on the input's device; any other
    input is copied to the CPU and timed with ``time.perf_counter``. If the
    model's first parameter lives on a different device than the one being
    benchmarked, the model is moved there for the run and moved back to its
    original device afterwards (even if inference raises).

    Args:
        model: A ``torch.nn.Module`` that accepts ``input_tensor``.
        input_tensor: The batch to feed the model; its first dimension is
            taken as the batch size for the throughput calculation.
        num_warmup: Number of untimed forward passes run before measuring.
        num_benchmark: Number of timed forward passes; must be at least 1.

    Returns:
        A dict with keys ``mean_time_ms``, ``std_time_ms``,
        ``median_latency``, ``p90_latency``, ``p99_latency`` (all in
        milliseconds per batch), ``throughput_fps`` (samples per second),
        ``device_type`` (device the timing actually ran on) and
        ``batch_size``.

    Raises:
        ValueError: If ``num_benchmark`` is less than 1, because no
            statistics can be computed from zero measurements.
    """
    if num_benchmark < 1:
        raise ValueError("num_benchmark must be at least 1")

    # Only CUDA gets event-based timing; every other device falls back to
    # a CPU run, and the report names the device that was really measured.
    if input_tensor.device.type == 'cuda':
        run_device = input_tensor.device
    else:
        run_device = torch.device('cpu')
    run_input = input_tensor.to(run_device)
    device_type = run_device.type

    # Derive the batch size from the tensor itself so throughput stays
    # correct for any input, not just one built from a module-level constant.
    batch_size = input_tensor.shape[0]

    # Module.to() moves the module in place, so remember where it started
    # (judged by its first parameter) and restore that placement afterwards.
    first_param = next(model.parameters(), None)
    original_device = first_param.device if first_param is not None else None
    needs_move = original_device is not None and original_device != run_device
    if needs_move:
        model.to(run_device)

    timings = []  # Individual run times in milliseconds
    try:
        with torch.no_grad():
            print(f"Warming up for {num_warmup} iterations on {device_type.upper()}...")

            # --- CUDA TIMING LOGIC ---
            if device_type == 'cuda':
                starter = torch.cuda.Event(enable_timing=True)
                ender = torch.cuda.Event(enable_timing=True)

                for _ in range(num_warmup):
                    model(run_input)
                # Drain queued warm-up kernels so they do not leak into
                # the first timed iteration.
                torch.cuda.synchronize()
                print("Warm-up complete. Starting benchmark...")

                for _ in range(num_benchmark):
                    starter.record()
                    model(run_input)
                    ender.record()
                    # elapsed_time() is only valid once both events completed.
                    torch.cuda.synchronize()
                    timings.append(starter.elapsed_time(ender))

            # --- CPU TIMING LOGIC ---
            else:
                for _ in range(num_warmup):
                    model(run_input)
                print("Warm-up complete. Starting benchmark...")

                for _ in range(num_benchmark):
                    start_time = time.perf_counter()
                    model(run_input)
                    end_time = time.perf_counter()
                    timings.append((end_time - start_time) * 1000)
    finally:
        if needs_move:
            model.to(original_device)

    # --- STATISTICAL CALCULATIONS ---
    timings_np = np.array(timings)

    mean_time_ms = timings_np.mean()
    std_time_ms = timings_np.std()
    median_latency, p90_latency, p99_latency = np.percentile(
        timings_np, [50, 90, 99]
    )

    # METRIC 4: THROUGHPUT (FPS) = samples per batch / seconds per batch
    throughput_fps = (batch_size / mean_time_ms) * 1000

    print(f"\n--- Benchmark Results ({device_type.upper()} @ BATCH={batch_size}) ---")
    print(f"Inference Time (Avg over {num_benchmark} runs): {mean_time_ms:.3f} ms")
    print(f"Throughput (FPS): {throughput_fps:.2f} FPS")
    print("--------------------------------------------------")

    return {
        'mean_time_ms': mean_time_ms,
        'std_time_ms': std_time_ms,
        'median_latency': median_latency,
        'p90_latency': p90_latency,
        'p99_latency': p99_latency,
        'throughput_fps': throughput_fps,
        'device_type': device_type,
        'batch_size': batch_size
    }


# -------------------------------------------------------------
# PANDAS TABLE GENERATION
# -------------------------------------------------------------

def generate_presentation_tables(static_metrics, dynamic_metrics):
    """Build and display the baseline-metrics table and the project-plan table.

    Args:
        static_metrics: Mapping providing ``model_size_mb``,
            ``total_params`` and ``model_load_time_ms``.
        dynamic_metrics: Mapping providing ``device_type``, ``batch_size``,
            ``median_latency``, ``mean_time_ms``, ``p99_latency`` and
            ``throughput_fps``.

    Returns:
        None. Each table is announced with ``print`` and rendered with
        ``display``.
    """

    def columns_from_rows(headers, rows):
        # Transpose row tuples into the {header: [values...]} layout that
        # the DataFrame constructor receives.
        return dict(zip(headers, (list(column) for column in zip(*rows))))

    # --- TABLE 1: DETAILED BASELINE METRICS (FINDINGS) ---
    # One tuple per row: (metric label, formatted value, unit).
    metric_rows = [
        ('Target Hardware', DEVICE_NAME, 'N/A'),
        ('Inference Device', dynamic_metrics['device_type'].upper(), 'N/A'),
        ('Batch Size', dynamic_metrics['batch_size'], 'Samples'),
        ('Model Size (FP32)', f"{static_metrics['model_size_mb']:.2f}", 'MB'),
        ('Total Parameters', f"{static_metrics['total_params']:,}", 'Params'),
        ('Model Load Time', f"{static_metrics['model_load_time_ms']:.2f}", 'ms'),
        ('Avg. Latency (P50)', f"{dynamic_metrics['median_latency']:.3f}", 'ms/batch'),
        ('Avg. Latency (Mean)', f"{dynamic_metrics['mean_time_ms']:.3f}", 'ms/batch'),
        ('Worst-Case Latency (P99)', f"{dynamic_metrics['p99_latency']:.3f}", 'ms/batch'),
        ('Throughput (FPS)', f"{dynamic_metrics['throughput_fps']:.2f}", 'FPS'),
    ]
    metrics_data = columns_from_rows(('Metric', 'Value', 'Unit'), metric_rows)

    df_metrics = pd.DataFrame(metrics_data)
    print("\n\n--- TABLE 1: DETAILED FP32 BASELINE METRICS ---")
    display(df_metrics)

    # --- TABLE 2: PROJECT PLAN (NEXT STEPS) ---
    # Only the native PyTorch baseline has measured numbers; every other
    # framework row carries a placeholder.
    not_measured = 'N/A (Planned)'
    baseline_latency = f'{dynamic_metrics["mean_time_ms"]:.2f}'
    baseline_throughput = f'{dynamic_metrics["throughput_fps"]:.2f}'

    # One tuple per row: (framework, precision, latency, throughput, status).
    plan_rows = [
        ('PyTorch (Native)', 'FP32 (Baseline)', baseline_latency, baseline_throughput, 'Complete'),
        ('TensorRT', 'FP16', not_measured, not_measured, 'In Progress'),
        ('TensorRT', 'INT8', not_measured, not_measured, 'Planned'),
        ('TFLite', 'INT8', not_measured, not_measured, 'Planned'),
        ('OpenVINO', 'FP32/FP16', not_measured, not_measured, 'Planned'),
        ('ExecuTorch', 'INT8', not_measured, not_measured, 'Planned'),
    ]
    next_steps_data = columns_from_rows(
        (
            'Framework',
            'Precision',
            'Measured Latency (ms)',
            'Measured Throughput (FPS)',
            'Status',
        ),
        plan_rows,
    )

    df_plan = pd.DataFrame(next_steps_data)
    print("\n\n--- TABLE 2: PROJECT PLAN AND NEXT STEPS ---")
    display(df_plan)

    # MAIN
if __name__ == '__main__':

    # 1. Setup: load the model and gather the load-time / size figures.
    (
        torch_model,
        model_load_time_ms,
        model_size_mb,
        total_params,
    ) = calculate_static_metrics()

    static_metrics = dict(
        model_load_time_ms=model_load_time_ms,
        model_size_mb=model_size_mb,
        total_params=total_params,
    )

    # Random stand-in batch of 3x224x224 inputs, placed on the target device.
    dummy_input = torch.randn(BATCH_SIZE, 3, 224, 224).to(TARGET_DEVICE)
    print(f"Model and dummy input moved to {TARGET_DEVICE} successfully.")

    # 2. Benchmarking: warm up, then time the forward passes.
    dynamic_metrics = benchmark_model(
        model=torch_model,
        input_tensor=dummy_input,
        num_warmup=NUM_WARMUP,
        num_benchmark=NUM_BENCHMARK,
    )

    # 3. Generate Tables: render the metrics and project-plan tables.
    generate_presentation_tables(
        static_metrics=static_metrics,
        dynamic_metrics=dynamic_metrics,
    )

    

Model and dummy input moved to cpu successfully.
Warming up for 10 iterations on CPU...
Warm-up complete. Starting benchmark...

--- Benchmark Results (CPU @ BATCH=64) ---
Inference Time (Avg over 100 runs): 499.195 ms
Throughput (FPS): 128.21 FPS
--------------------------------------------------


--- TABLE 1: DETAILED FP32 BASELINE METRICS ---


Unnamed: 0,Metric,Value,Unit
0,Target Hardware,CPU,
1,Inference Device,CPU,
2,Batch Size,64,Samples
3,Model Size (FP32),13.37,MB
4,Total Parameters,3504872,Params
5,Model Load Time,57.72,ms
6,Avg. Latency (P50),498.919,ms/batch
7,Avg. Latency (Mean),499.195,ms/batch
8,Worst-Case Latency (P99),518.864,ms/batch
9,Throughput (FPS),128.21,FPS




--- TABLE 2: PROJECT PLAN AND NEXT STEPS ---


Unnamed: 0,Framework,Precision,Measured Latency (ms),Measured Throughput (FPS),Status
0,PyTorch (Native),FP32 (Baseline),499.20,128.21,Complete
1,TensorRT,FP16,N/A (Planned),N/A (Planned),In Progress
2,TensorRT,INT8,N/A (Planned),N/A (Planned),Planned
3,TFLite,INT8,N/A (Planned),N/A (Planned),Planned
4,OpenVINO,FP32/FP16,N/A (Planned),N/A (Planned),Planned
5,ExecuTorch,INT8,N/A (Planned),N/A (Planned),Planned
