In [6]:
import time
import numpy as np
import pandas as pd
import torch
import torchvision
from IPython.display import display

# --- Global Configuration ---
# Label for the benchmarked network.
MODEL_NAME = "MobileNetV2"

# Workload shape: images per forward pass, untimed warm-up passes,
# and timed passes per device.
BATCH_SIZE = 64
NUM_WARMUP = 10
NUM_BENCHMARK = 100

# Hardware discovery, done once at import/cell-execution time.
DEVICE_NAME_CPU = "CPU"
HAS_CUDA = torch.cuda.is_available()
if HAS_CUDA:
    # Name of GPU index 0.
    DEVICE_NAME_CUDA = torch.cuda.get_device_name(0)
else:
    DEVICE_NAME_CUDA = "N/A"


# -------------------------------------------------------------
# METRIC 1 & 2: MODEL LOAD TIME & MODEL SIZE CALCULATION
# -------------------------------------------------------------

def calculate_static_metrics():
    """Load pretrained MobileNetV2 and derive its static (non-runtime) metrics.

    The timed region covers building the torchvision model with the
    IMAGENET1K_V1 weights and switching it to eval mode. No device is
    requested here; the model is returned exactly as torchvision built it.

    Returns:
        A 5-tuple ``(model, load_time_ms, total_params, size_mb_fp32,
        size_mb_fp16)``. Both sizes are estimates derived from the parameter
        count alone (4 and 2 bytes per parameter respectively), divided by
        1024 * 1024.
    """
    bytes_per_mb = 1024 * 1024

    # --- timed: model construction + eval() ---
    t_begin = time.perf_counter()
    net = torchvision.models.mobilenet_v2(
        weights=torchvision.models.MobileNet_V2_Weights.IMAGENET1K_V1
    )
    net.eval()
    t_end = time.perf_counter()
    load_ms = (t_end - t_begin) * 1000

    # Count every scalar held in the model's parameters.
    param_count = 0
    for tensor in net.parameters():
        param_count += tensor.numel()

    # FP32 is the size shown in the results table; FP16 is computed as well
    # and handed back to the caller, which decides whether to use it.
    size_fp32_mb = (param_count * 4) / bytes_per_mb
    size_fp16_mb = (param_count * 2) / bytes_per_mb

    return net, load_ms, param_count, size_fp32_mb, size_fp16_mb


# -------------------------------------------------------------
# CORE BENCHMARKING FUNCTION (Internal logic)
# -------------------------------------------------------------

def _run_single_benchmark(model, device_name, num_warmup, num_benchmark):
    """
    Executes the benchmark for a single device (CPU or CUDA).
    Returns a dictionary of metrics for that device.
    """
    device = torch.device(device_name)
    timings = [] 
    
    model.to(device)
    input_tensor = torch.randn(BATCH_SIZE, 3, 224, 224).to(device)
    
    print(f"\n--- Starting Benchmark on {device_name.upper()} ---")
    
    # --- CUDA TIMING LOGIC ---
    if device.type == 'cuda':
        starter = torch.cuda.Event(enable_timing=True)
        ender = torch.cuda.Event(enable_timing=True)
        
        print(f"Warming up for {num_warmup} iterations on CUDA...")
        with torch.no_grad():
            for _ in range(num_warmup):
                _ = model(input_tensor)
        torch.cuda.synchronize()
        print("Warm-up complete. Starting benchmark...")
        
        with torch.no_grad():
            for _ in range(num_benchmark):
                starter.record()
                _ = model(input_tensor)
                ender.record()
                torch.cuda.synchronize() 
                
                curr_time = starter.elapsed_time(ender)
                timings.append(curr_time) 

    # --- CPU TIMING LOGIC ---
    else: 
        print(f"Warming up for {num_warmup} iterations on CPU...")
        
        for _ in range(num_warmup):
            with torch.no_grad():
                _ = model(input_tensor) 
            
        print("Warm-up complete. Starting benchmark...")

        for _ in range(num_benchmark):
            start_time = time.perf_counter()
            with torch.no_grad():
                _ = model(input_tensor)
            end_time = time.perf_counter()
            timings.append((end_time - start_time) * 1000)

    # --- STATISTICAL CALCULATIONS ---
    timings_np = np.array(timings)
    
    mean_time_ms = timings_np.mean()
    median_latency = np.percentile(timings_np, 50)
    p99_latency = np.percentile(timings_np, 99)
    throughput_fps = (BATCH_SIZE / mean_time_ms) * 1000

    print(f"\nResults: Latency={mean_time_ms:.3f} ms, Throughput={throughput_fps:.2f} FPS")
    print("-----------------------------------------------------")

    return {
        'Framework': 'PyTorch (Native)',
        'Inf. Device': device.type.upper(),
        'Hardware Details': DEVICE_NAME_CUDA if device.type == 'cuda' else DEVICE_NAME_CPU,
        'Precision': 'FP32',
        'Avg. Latency (P50) (ms)': median_latency,
        'Avg. Latency (Mean) (ms)': mean_time_ms,
        'Worst-Case Latency (P99) (ms)': p99_latency,
        'Throughput (FPS)': throughput_fps,
        'Batch Size': BATCH_SIZE
    }


# -------------------------------------------------------------
# MASTER BENCHMARK & TABLE GENERATION
# -------------------------------------------------------------

def run_benchmarks_and_generate_table(model, static_metrics):
    """Benchmark the model on every available target and display one summary table.

    The CPU benchmark always runs; the 'cuda:0' benchmark runs only when
    HAS_CUDA is true, otherwise a skip notice is printed. Each per-device
    result is merged with the shared static metrics, formatted as strings,
    and shown via ``display``.

    Args:
        model: Model handed to ``_run_single_benchmark`` for each target.
        static_metrics: Mapping that provides 'model_size_mb_fp32',
            'total_params' and 'model_load_time_ms'.

    Returns:
        None. The table is displayed, not returned.
    """
    # Presentation order of the final table's columns.
    column_order = [
        'Target Hardware',
        'Inf. Device',
        'Framework/Precision',
        'Batch Size',
        'Model Size (FP32) (MB)',
        'Total Parameters',
        'Model Load Time (ms)',
        'Compile Time (ms)',
        'Avg. Latency (P50) (ms)',
        'Avg. Latency (Mean) (ms)',
        'Worst-Case Latency (P99) (ms)',
        'Throughput (FPS)',
    ]

    # CPU first, then CUDA when present.
    per_device = [_run_single_benchmark(model, 'cpu', NUM_WARMUP, NUM_BENCHMARK)]
    if not HAS_CUDA:
        print("\nCUDA not available. Skipping CUDA benchmark.")
    else:
        per_device.append(
            _run_single_benchmark(model, 'cuda:0', NUM_WARMUP, NUM_BENCHMARK)
        )

    # Static metrics are identical for every row, so format them once.
    size_fp32_text = f"{static_metrics['model_size_mb_fp32']:.2f}"
    params_text = f"{static_metrics['total_params']:,}"
    load_time_text = f"{static_metrics['model_load_time_ms']:.2f}"

    table_rows = []
    for metrics in per_device:
        formatted = {
            'Target Hardware': metrics['Hardware Details'],
            'Inf. Device': metrics['Inf. Device'],
            'Framework/Precision': f"{metrics['Framework']} ({metrics['Precision']})",
            'Batch Size': metrics['Batch Size'],
            # FP32 size, i.e. the precision the benchmarked model runs in.
            'Model Size (FP32) (MB)': size_fp32_text,
            'Total Parameters': params_text,
            'Model Load Time (ms)': load_time_text,
            'Compile Time (ms)': 'N/A (Native)',
        }
        # Latency columns share three-decimal formatting.
        for latency_key in (
            'Avg. Latency (P50) (ms)',
            'Avg. Latency (Mean) (ms)',
            'Worst-Case Latency (P99) (ms)',
        ):
            formatted[latency_key] = f"{metrics[latency_key]:.3f}"
        formatted['Throughput (FPS)'] = f"{metrics['Throughput (FPS)']:.2f}"
        table_rows.append(formatted)

    df_final = pd.DataFrame(table_rows)[column_order]

    print("\n\n--- DETAILED PYTORCH NATIVE BENCHMARK RESULTS (BASELINE) ---")
    display(df_final)

# MAIN EXECUTION
if __name__ == '__main__':

    # Step 1: load the model once and collect its static metrics.
    (
        torch_model,
        model_load_time_ms,
        total_params,
        model_size_mb_fp32,
        model_size_mb_fp16,
    ) = calculate_static_metrics()

    # Bundle the static values under the keys the table builder looks up;
    # the FP32 size is the one shown in the final table.
    static_metrics = dict(
        model_load_time_ms=model_load_time_ms,
        total_params=total_params,
        model_size_mb_fp32=model_size_mb_fp32,
        model_size_mb_fp16=model_size_mb_fp16,
    )

    # Step 2: benchmark on CPU (and CUDA when available), then show the table.
    run_benchmarks_and_generate_table(torch_model, static_metrics)


--- Starting Benchmark on CPU ---
Warming up for 10 iterations on CPU...
Warm-up complete. Starting benchmark...

Results: Latency=1110.828 ms, Throughput=57.61 FPS
-----------------------------------------------------

--- Starting Benchmark on CUDA:0 ---
Warming up for 10 iterations on CUDA...
Warm-up complete. Starting benchmark...

Results: Latency=38.719 ms, Throughput=1652.92 FPS
-----------------------------------------------------


--- DETAILED PYTORCH NATIVE BENCHMARK RESULTS (BASELINE) ---


Unnamed: 0,Target Hardware,Inf. Device,Framework/Precision,Batch Size,Model Size (FP32) (MB),Total Parameters,Model Load Time (ms),Compile Time (ms),Avg. Latency (P50) (ms),Avg. Latency (Mean) (ms),Worst-Case Latency (P99) (ms),Throughput (FPS)
0,CPU,CPU,PyTorch (Native) (FP32),64,13.37,3504872,46.36,N/A (Native),1110.619,1110.828,1176.451,57.61
1,NVIDIA GeForce RTX 3060 Laptop GPU,CUDA,PyTorch (Native) (FP32),64,13.37,3504872,46.36,N/A (Native),38.692,38.719,39.0,1652.92
