# 4. Exporting Models to ONNX and Running Inference

This notebook demonstrates how to export the trained/optimized PyTorch models to the ONNX (Open Neural Network Exchange) format. It also shows how to run inference using ONNX Runtime on the exported models.

We will export three specific model versions:
1.  **Baseline FP32 Model**: The original MobileNetV2 model adapted for CIFAR-10.
3.  **Baseline QAT INT8 Model**: The baseline model quantized to INT8 using Quantization-Aware Training.

In [1]:
import os
import torch
import numpy as np
import onnxruntime as ort

from nnopt.model.export import export_model_to_onnx
from nnopt.recipes.mobilenetv2_cifar10 import load_mobilenetv2_cifar10_model
from nnopt.model.prune import remove_pruning_reparameterization
from nnopt.model.const import BASE_MODEL_DIR, DEVICE

# Ensure the logger in export is configured (if not already by its import)
import logging
logger = logging.getLogger("nnopt.model.export")
if not logger.hasHandlers():
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
                        handlers=[logging.StreamHandler()])

print(f"PyTorch version: {torch.__version__}")
print(f"ONNX Runtime version: {ort.__version__}")
print(f"Using device: {DEVICE}")

2025-06-13 16:07:16,949 - nnopt.recipes.mobilenetv2_cifar10 - INFO - Using device: cuda, dtype: torch.bfloat16


PyTorch version: 2.7.1+cu126
ONNX Runtime version: 1.22.0
Using device: cuda


## Configuration and Dummy Input

In [2]:
# Define model versions to be exported
baseline_fp32_version = "mobilenetv2_cifar10/fp32/baseline"
struct_pruned_fp32_version = "mobilenetv2_cifar10/fp32/l1_struct_prune_0.3"
pqt_int8_version = "mobilenetv2_cifar10/int8/pqt_baseline"
qat_int8_version = "mobilenetv2_cifar10/int8/qat_baseline"
unstruct_pruned_fp32_version = "mobilenetv2_cifar10/fp32/l1_unstruct_prune_0.7"
pqt_int8_unstruct_pruned_version = "mobilenetv2_cifar10/int8/pqt_l1_unstruct_prune_0.7"
qat_int8_unstruct_pruned_version = "mobilenetv2_cifar10/int8/qat_l1_unstruct_prune_0.7"


# Directory to save ONNX models
ONNX_EXPORT_DIR = os.path.join(BASE_MODEL_DIR, "onnx_exports")
os.makedirs(ONNX_EXPORT_DIR, exist_ok=True)
print(f"ONNX models will be saved in: {ONNX_EXPORT_DIR}")

# Create a dummy input tensor (batch_size, channels, height, width)
# MobileNetV2 typically expects 224x224 images.
# CIFAR-10 images are 32x32, but the model adapts them or uses a standard input size.
# Using 224x224 as per common MobileNetV2 usage and example in pruning notebook.
dummy_input_shape = (1, 3, 224, 224)
dummy_input = torch.randn(dummy_input_shape, device='cpu') # ONNX export prefers CPU dummy input
print(f"Dummy input shape: {dummy_input.shape}")

# Define dynamic axes for batch size flexibility
dynamic_axes = {'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}

ONNX models will be saved in: /home/pbeuran/repos/nnopt/models/onnx_exports
Dummy input shape: torch.Size([1, 3, 224, 224])


## 1. Baseline FP32 Model Export & Inference

In [3]:
# Load the baseline FP32 model
print(f"Loading baseline FP32 model from version: {baseline_fp32_version}")
baseline_fp32_model, _ = load_mobilenetv2_cifar10_model(
    version=baseline_fp32_version,
    mode="jit_trace"
)
baseline_fp32_model.eval()
baseline_fp32_model.to('cpu') # Move model to CPU for export

# Define ONNX path
onnx_path_baseline_fp32 = os.path.join(ONNX_EXPORT_DIR, "mobilenetv2_cifar10_baseline_fp32.onnx")

# Export to ONNX
print(f"Exporting baseline FP32 model to {onnx_path_baseline_fp32}...")
success_fp32 = export_model_to_onnx(
    model=baseline_fp32_model,
    dummy_input=dummy_input,
    onnx_path=onnx_path_baseline_fp32,
    dynamic_axes=dynamic_axes,
    opset_version=13
)

if success_fp32:
    print("Baseline FP32 model exported successfully.")
    # Run inference with ONNX Runtime
    try:
        ort_session_fp32 = ort.InferenceSession(onnx_path_baseline_fp32, providers=['CPUExecutionProvider'])
        input_name_fp32 = ort_session_fp32.get_inputs()[0].name
        output_name_fp32 = ort_session_fp32.get_outputs()[0].name
        
        ort_inputs_fp32 = {input_name_fp32: dummy_input.cpu().numpy()}
        ort_outputs_fp32 = ort_session_fp32.run([output_name_fp32], ort_inputs_fp32)
        print(f"ONNX Runtime (FP32 Baseline) output shape: {ort_outputs_fp32[0].shape}")
        # print(f"ONNX Runtime (FP32 Baseline) output sample: {ort_outputs_fp32[0][0,:5]}") # Print first 5 logits
    except Exception as e:
        print(f"Error running ONNX Runtime for FP32 baseline model: {e}")
else:
    print("Baseline FP32 model export failed.")

2025-06-13 16:07:17,825 - nnopt.recipes.mobilenetv2_cifar10 - INFO - Loading MobileNetV2 model for CIFAR-10 from version: mobilenetv2_cifar10/fp32/baseline at /home/pbeuran/repos/nnopt/models
2025-06-13 16:07:17,826 - nnopt.recipes.mobilenetv2_cifar10 - INFO - Loaded metadata: {'metrics_values': {'val_metrics': {'accuracy': 0.9254, 'avg_loss': 0.21455251355171204, 'samples_per_second': 9594.238581494608, 'avg_time_per_batch': 0.006596786050597328, 'avg_time_per_sample': 0.00010422921959943779, 'params_stats': {'int_weight_params': 0, 'float_weight_params': 2202560, 'float_bias_params': 10, 'bn_param_params': 34112, 'other_float_params': 0, 'total_params': 2236682, 'approx_memory_mb_for_params': 8.532264709472656}}, 'test_metrics': {'accuracy': 0.9288, 'avg_loss': 0.20640371625423432, 'samples_per_second': 9116.575566348814, 'avg_time_per_batch': 0.006986643948966146, 'avg_time_per_sample': 0.00010969030999876849, 'params_stats': {'int_weight_params': 0, 'float_weight_params': 2202560, 

Loading baseline FP32 model from version: mobilenetv2_cifar10/fp32/baseline


2025-06-13 16:07:18,092 - nnopt.recipes.mobilenetv2_cifar10 - INFO - Successfully loaded JIT traced model from /home/pbeuran/repos/nnopt/models/mobilenetv2_cifar10/fp32/baseline/jit_trace.pt
2025-06-13 16:07:18,113 - nnopt.model.export - INFO - Starting ONNX export to /home/pbeuran/repos/nnopt/models/onnx_exports/mobilenetv2_cifar10_baseline_fp32.onnx with opset_version=13...


Exporting baseline FP32 model to /home/pbeuran/repos/nnopt/models/onnx_exports/mobilenetv2_cifar10_baseline_fp32.onnx...


2025-06-13 16:07:18,412 - nnopt.model.export - INFO - Model successfully exported to /home/pbeuran/repos/nnopt/models/onnx_exports/mobilenetv2_cifar10_baseline_fp32.onnx


Baseline FP32 model exported successfully.
ONNX Runtime (FP32 Baseline) output shape: (1, 10)


### Evaluate Baseline FP32 ONNX Model

In [4]:
# Import necessary functions and data
from nnopt.model.eval import eval_onnx_model
from nnopt.recipes.mobilenetv2_cifar10 import get_cifar10_datasets

# Load CIFAR-10 test dataset
# Assuming the data is already downloaded and preprocessed as in other notebooks
# Adjust data_dir if your CIFAR-10 data is located elsewhere
DATA_DIR = os.path.join(os.getcwd(), '..', 'data', 'image', 'cifar10') 
_, test_dataset, _ = get_cifar10_datasets() # We only need test_dataset

if success_fp32: # Only proceed if the ONNX model was exported successfully
    print("\n--- Evaluating Baseline FP32 ONNX Model on CPU ---")
    onnx_metrics_cpu = eval_onnx_model(
        onnx_model_path=onnx_path_baseline_fp32,
        test_dataset=test_dataset,
        batch_size=32, # Adjust as needed
        device="cpu",
        num_warmup_batches=2 # Smaller warmup for quicker testing
    )
    print(f"CPU ONNX Metrics: {onnx_metrics_cpu}")

    if torch.cuda.is_available() and ort.get_device() == 'GPU':
        print("\n--- Evaluating Baseline FP32 ONNX Model on GPU ---")
        onnx_metrics_gpu = eval_onnx_model(
            onnx_model_path=onnx_path_baseline_fp32,
            test_dataset=test_dataset,
            batch_size=32, # Adjust as needed
            device="cuda",
            num_warmup_batches=2
        )
        print(f"GPU ONNX Metrics: {onnx_metrics_gpu}")
    else:
        print("\nSkipping GPU ONNX evaluation as CUDA is not available or ONNX Runtime GPU provider is not set up.")
else:
    print("\nSkipping ONNX model evaluation as the export failed.")

2025-06-13 16:07:20,533 - nnopt.recipes.mobilenetv2_cifar10 - INFO - Loading existing training and validation datasets...
2025-06-13 16:07:22,122 - nnopt.recipes.mobilenetv2_cifar10 - INFO - Loading existing test dataset...
2025-06-13 16:07:22,277 - nnopt.model.eval - INFO - Starting ONNX model evaluation for: /home/pbeuran/repos/nnopt/models/onnx_exports/mobilenetv2_cifar10_baseline_fp32.onnx
2025-06-13 16:07:22,277 - nnopt.model.eval - INFO - Evaluation on PyTorch device: cpu, batch size: 32
2025-06-13 16:07:22,278 - nnopt.model.eval - INFO - Using ONNX Runtime providers: ['CPUExecutionProvider']
2025-06-13 16:07:22,300 - nnopt.model.eval - INFO - ONNX Model Input Name: input, Output Name: output
2025-06-13 16:07:22,300 - nnopt.model.eval - INFO - Starting warmup for 2 batches...



--- Evaluating Baseline FP32 ONNX Model on CPU ---


[ONNX Warmup]: 100%|██████████| 2/2 [00:00<00:00,  5.24it/s]
2025-06-13 16:07:22,750 - nnopt.model.eval - INFO - Warmup complete.
2025-06-13 16:07:22,751 - nnopt.model.eval - INFO - Starting ONNX model evaluation pass...
[ONNX Evaluation]: 100%|██████████| 157/157 [00:16<00:00,  9.80it/s]

ONNX Evaluation Complete: Avg Loss: 0.2173, Accuracy: 0.9248
Throughput: 421.06 samples/sec | Avg Batch Time: 75.64 ms | Avg Sample Time: 2.37 ms
System Stats (PyTorch side): CPU Usage: 64.40% | RAM Usage: 8.9/30.9GB (37.4%)
CPU ONNX Metrics: {'accuracy': 0.9248, 'avg_loss': 0.21728210688829422, 'samples_per_second': 421.0579300657744, 'avg_time_per_batch': 0.0756359908789436, 'avg_time_per_sample': 0.002374970113598829}

Skipping GPU ONNX evaluation as CUDA is not available or ONNX Runtime GPU provider is not set up.





## 2. Baseline QAT INT8 Model Export & Inference

In [5]:
# Load the Baseline QAT INT8 model
print(f"Loading Baseline QAT INT8 model from version: {qat_int8_version}")
# The QAT model is saved after torch.quantization.convert, so it's already an INT8 model.
# We use quantized=True in get_mobilenetv2_cifar10_model to load the correct model architecture
# (e.g., torchvision.models.quantization.mobilenet_v2)
qat_int8_model, _ = load_mobilenetv2_cifar10_model(
    version=qat_int8_version,
    device='cpu', # QAT models are typically exported on CPU
    mode="jit_trace"
)
qat_int8_model.eval()
qat_int8_model.to('cpu') # Quantized models run on CPU. ONNX export also expects CPU model.

# Define ONNX path
onnx_path_qat_int8 = os.path.join(ONNX_EXPORT_DIR, "mobilenetv2_cifar10_qat_int8.onnx")

# Export to ONNX
# Opset version 13+ is generally recommended for better support of quantized operators.
print(f"Exporting Baseline QAT INT8 model to {onnx_path_qat_int8}...")
success_qat_int8 = export_model_to_onnx(
    model=qat_int8_model,
    dummy_input=dummy_input, # Dummy input should be FP32 for QAT model export
    onnx_path=onnx_path_qat_int8,
    dynamic_axes=dynamic_axes,
    opset_version=13 # Use opset 13 or higher for QAT models
)

if success_qat_int8:
    print("Baseline QAT INT8 model exported successfully.")
    # Run inference with ONNX Runtime
    try:
        ort_session_qat_int8 = ort.InferenceSession(onnx_path_qat_int8, providers=['CPUExecutionProvider'])
        input_name_qat_int8 = ort_session_qat_int8.get_inputs()[0].name
        output_name_qat_int8 = ort_session_qat_int8.get_outputs()[0].name
        
        # Input to ONNX Runtime for QAT model is also FP32
        ort_inputs_qat_int8 = {input_name_qat_int8: dummy_input.cpu().numpy()}
        ort_outputs_qat_int8 = ort_session_qat_int8.run([output_name_qat_int8], ort_inputs_qat_int8)
        print(f"ONNX Runtime (QAT INT8) output shape: {ort_outputs_qat_int8[0].shape}")
    except Exception as e:
        print(f"Error running ONNX Runtime for QAT INT8 model: {e}")
else:
    print("Baseline QAT INT8 model export failed.")

2025-06-13 16:07:38,823 - nnopt.recipes.mobilenetv2_cifar10 - INFO - Loading MobileNetV2 model for CIFAR-10 from version: mobilenetv2_cifar10/int8/qat_baseline at /home/pbeuran/repos/nnopt/models
2025-06-13 16:07:38,824 - nnopt.recipes.mobilenetv2_cifar10 - INFO - Loaded metadata: {'metrics_values': {'accuracy': 0.863, 'avg_loss': 0.41772424149513243, 'samples_per_second': 255.84980960873506, 'avg_time_per_batch': 0.1244758939108254, 'avg_time_per_sample': 0.003908543068799918, 'params_stats': {'int_weight_params': 2202560, 'float_weight_params': 0, 'float_bias_params': 17066, 'bn_param_params': 0, 'other_float_params': 0, 'total_params': 2219626, 'approx_memory_mb_for_params': 2.1656265258789062}}}
2025-06-13 16:07:38,824 - nnopt.recipes.mobilenetv2_cifar10 - INFO - Loading JIT traced model from /home/pbeuran/repos/nnopt/models/mobilenetv2_cifar10/int8/qat_baseline/jit_trace.pt
2025-06-13 16:07:38,978 - nnopt.recipes.mobilenetv2_cifar10 - INFO - Successfully loaded JIT traced model fr

Loading Baseline QAT INT8 model from version: mobilenetv2_cifar10/int8/qat_baseline


2025-06-13 16:07:38,983 - nnopt.model.export - INFO - Starting ONNX export to /home/pbeuran/repos/nnopt/models/onnx_exports/mobilenetv2_cifar10_qat_int8.onnx with opset_version=13...


Exporting Baseline QAT INT8 model to /home/pbeuran/repos/nnopt/models/onnx_exports/mobilenetv2_cifar10_qat_int8.onnx...


2025-06-13 16:07:39,414 - nnopt.model.export - INFO - Model successfully exported to /home/pbeuran/repos/nnopt/models/onnx_exports/mobilenetv2_cifar10_qat_int8.onnx


Baseline QAT INT8 model exported successfully.
ONNX Runtime (QAT INT8) output shape: (1, 10)


### Evaluate Baseline QAT INT8 ONNX Model

In [6]:
# Ensure test_dataset is loaded

if success_qat_int8: # Only proceed if the ONNX model was exported successfully
    print("\n--- Evaluating Baseline QAT INT8 ONNX Model on CPU ---")
    onnx_metrics_qat_int8_cpu = eval_onnx_model(
        onnx_model_path=onnx_path_qat_int8,
        test_dataset=test_dataset,
        batch_size=32, 
        device="cpu", # QAT models are typically evaluated on CPU
        num_warmup_batches=2
    )
    print(f"CPU ONNX Metrics (QAT INT8): {onnx_metrics_qat_int8_cpu}")

    # Optional: Test QAT INT8 on GPU if supported and desired
    # Note: GPU support for INT8 ONNX models can be more complex and might require specific ONNX opset versions
    # or specific GPU capabilities and ONNX Runtime build options.
    if torch.cuda.is_available() and ort.get_device() == 'GPU':
        print("\n--- Evaluating Baseline QAT INT8 ONNX Model on GPU (Experimental) ---")
        try:
            onnx_metrics_qat_int8_gpu = eval_onnx_model(
                onnx_model_path=onnx_path_qat_int8,
                test_dataset=test_dataset,
                batch_size=32, 
                device="cuda",
                num_warmup_batches=2
            )
            print(f"GPU ONNX Metrics (QAT INT8): {onnx_metrics_qat_int8_gpu}")
        except Exception as e:
            print(f"Could not run QAT INT8 ONNX model on GPU: {e}")
            print("This might be due to operator support or other configuration issues.")
    else:
        print("\nSkipping GPU ONNX evaluation for QAT INT8 model as CUDA is not available or ONNX Runtime GPU provider is not set up.")
else:
    print("\nSkipping ONNX model evaluation for QAT INT8 model as the export failed.")

2025-06-13 16:07:39,462 - nnopt.model.eval - INFO - Starting ONNX model evaluation for: /home/pbeuran/repos/nnopt/models/onnx_exports/mobilenetv2_cifar10_qat_int8.onnx
2025-06-13 16:07:39,462 - nnopt.model.eval - INFO - Evaluation on PyTorch device: cpu, batch size: 32
2025-06-13 16:07:39,463 - nnopt.model.eval - INFO - Using ONNX Runtime providers: ['CPUExecutionProvider']
2025-06-13 16:07:39,493 - nnopt.model.eval - INFO - ONNX Model Input Name: input, Output Name: output
2025-06-13 16:07:39,494 - nnopt.model.eval - INFO - Starting warmup for 2 batches...



--- Evaluating Baseline QAT INT8 ONNX Model on CPU ---


[ONNX Warmup]: 100%|██████████| 2/2 [00:00<00:00,  7.45it/s]
2025-06-13 16:07:39,835 - nnopt.model.eval - INFO - Warmup complete.
2025-06-13 16:07:39,835 - nnopt.model.eval - INFO - Starting ONNX model evaluation pass...
[ONNX Evaluation]: 100%|██████████| 157/157 [00:09<00:00, 17.24it/s]

ONNX Evaluation Complete: Avg Loss: 0.4202, Accuracy: 0.8618
Throughput: 844.57 samples/sec | Avg Batch Time: 37.71 ms | Avg Sample Time: 1.18 ms
System Stats (PyTorch side): CPU Usage: 89.40% | RAM Usage: 8.6/30.9GB (36.6%)
CPU ONNX Metrics (QAT INT8): {'accuracy': 0.8618, 'avg_loss': 0.4202093836784363, 'samples_per_second': 844.5694715047395, 'avg_time_per_batch': 0.03770812802553812, 'avg_time_per_sample': 0.0011840352200018969}

Skipping GPU ONNX evaluation for QAT INT8 model as CUDA is not available or ONNX Runtime GPU provider is not set up.





## 3. L1-structured Pruning FP32 Model Export & Inference

In [7]:
# Load the l1 structured pruned FP32 model 
print(f"Loading l1 structured pruned FP32 model from version: {struct_pruned_fp32_version}")
struct_pruned_fp32_model, _ = load_mobilenetv2_cifar10_model(
    version=struct_pruned_fp32_version,
    mode="jit_trace"
)
struct_pruned_fp32_model.eval()
struct_pruned_fp32_model.to('cpu') # Move model to CPU for export

# Define ONNX path
onnx_path_pruned_fp32 = os.path.join(ONNX_EXPORT_DIR, "mobilenetv2_cifar10_pruned_fp32.onnx")

# Export to ONNX
print(f"Exporting pruned FP32 model to {onnx_path_pruned_fp32}...")
success_fp32 = export_model_to_onnx(
    model=struct_pruned_fp32_model,
    dummy_input=dummy_input,
    onnx_path=onnx_path_pruned_fp32,
    dynamic_axes=dynamic_axes,
    opset_version=13
)

if success_fp32:
    print("Pruned FP32 model exported successfully.")
    # Run inference with ONNX Runtime
    try:
        ort_session_fp32 = ort.InferenceSession(onnx_path_pruned_fp32, providers=['CPUExecutionProvider'])
        input_name_fp32 = ort_session_fp32.get_inputs()[0].name
        output_name_fp32 = ort_session_fp32.get_outputs()[0].name
        
        ort_inputs_fp32 = {input_name_fp32: dummy_input.cpu().numpy()}
        ort_outputs_fp32 = ort_session_fp32.run([output_name_fp32], ort_inputs_fp32)
        print(f"ONNX Runtime (FP32 Pruned) output shape: {ort_outputs_fp32[0].shape}")
        # print(f"ONNX Runtime (FP32 Pruned) output sample: {ort_outputs_fp32[0][0,:5]}") # Print first 5 logits
    except Exception as e:
        print(f"Error running ONNX Runtime for FP32 pruned model: {e}")
else:
    print("Pruned FP32 model export failed.")

2025-06-13 16:07:48,970 - nnopt.recipes.mobilenetv2_cifar10 - INFO - Loading MobileNetV2 model for CIFAR-10 from version: mobilenetv2_cifar10/fp32/l1_struct_prune_0.3 at /home/pbeuran/repos/nnopt/models
2025-06-13 16:07:48,972 - nnopt.recipes.mobilenetv2_cifar10 - INFO - Loaded metadata: {'metrics_values': {'val_metrics': {'accuracy': 0.7892, 'avg_loss': 0.6139473709106446, 'samples_per_second': 9049.953083963761, 'avg_time_per_batch': 0.006993532303792413, 'avg_time_per_sample': 0.00011049781039992013, 'params_stats': {'int_weight_params': 0, 'float_weight_params': 584051, 'float_bias_params': 10, 'bn_param_params': 17100, 'other_float_params': 0, 'total_params': 601161, 'approx_memory_mb_for_params': 2.2932472229003906}}, 'test_metrics': {'accuracy': 0.7872, 'avg_loss': 0.6139407681465149, 'samples_per_second': 10497.233072640314, 'avg_time_per_batch': 0.006067719662425565, 'avg_time_per_sample': 9.526319870008137e-05, 'params_stats': {'int_weight_params': 0, 'float_weight_params': 5

Loading l1 structured pruned FP32 model from version: mobilenetv2_cifar10/fp32/l1_struct_prune_0.3
Exporting pruned FP32 model to /home/pbeuran/repos/nnopt/models/onnx_exports/mobilenetv2_cifar10_pruned_fp32.onnx...


2025-06-13 16:07:49,311 - nnopt.model.export - INFO - Model successfully exported to /home/pbeuran/repos/nnopt/models/onnx_exports/mobilenetv2_cifar10_pruned_fp32.onnx


Pruned FP32 model exported successfully.
ONNX Runtime (FP32 Pruned) output shape: (1, 10)


### Evaluate L1 structured Pruned FP32 ONNX Model

In [8]:
if success_fp32: # Only proceed if the ONNX model was exported successfully
    print("\n--- Evaluating Pruned FP32 ONNX Model on CPU ---")
    onnx_metrics_cpu = eval_onnx_model(
        onnx_model_path=onnx_path_pruned_fp32,
        test_dataset=test_dataset,
        batch_size=32, # Adjust as needed
        device="cpu",
        num_warmup_batches=2 # Smaller warmup for quicker testing
    )
    print(f"CPU ONNX Metrics: {onnx_metrics_cpu}")

    if torch.cuda.is_available() and ort.get_device() == 'GPU':
        print("\n--- Evaluating Pruned FP32 ONNX Model on GPU ---")
        onnx_metrics_gpu = eval_onnx_model(
            onnx_model_path=onnx_path_baseline_fp32,
            test_dataset=test_dataset,
            batch_size=32, # Adjust as needed
            device="cuda",
            num_warmup_batches=2
        )
        print(f"GPU ONNX Metrics: {onnx_metrics_gpu}")
    else:
        print("\nSkipping GPU ONNX evaluation as CUDA is not available or ONNX Runtime GPU provider is not set up.")
else:
    print("\nSkipping ONNX model evaluation as the export failed.")

2025-06-13 16:07:49,348 - nnopt.model.eval - INFO - Starting ONNX model evaluation for: /home/pbeuran/repos/nnopt/models/onnx_exports/mobilenetv2_cifar10_pruned_fp32.onnx
2025-06-13 16:07:49,348 - nnopt.model.eval - INFO - Evaluation on PyTorch device: cpu, batch size: 32
2025-06-13 16:07:49,349 - nnopt.model.eval - INFO - Using ONNX Runtime providers: ['CPUExecutionProvider']
2025-06-13 16:07:49,363 - nnopt.model.eval - INFO - ONNX Model Input Name: input, Output Name: output
2025-06-13 16:07:49,363 - nnopt.model.eval - INFO - Starting warmup for 2 batches...



--- Evaluating Pruned FP32 ONNX Model on CPU ---


[ONNX Warmup]: 100%|██████████| 2/2 [00:00<00:00,  3.18it/s]
2025-06-13 16:07:50,080 - nnopt.model.eval - INFO - Warmup complete.
2025-06-13 16:07:50,081 - nnopt.model.eval - INFO - Starting ONNX model evaluation pass...
[ONNX Evaluation]: 100%|██████████| 157/157 [00:33<00:00,  4.64it/s]

ONNX Evaluation Complete: Avg Loss: 0.6140, Accuracy: 0.7890
Throughput: 167.63 samples/sec | Avg Batch Time: 189.98 ms | Avg Sample Time: 5.97 ms
System Stats (PyTorch side): CPU Usage: 95.30% | RAM Usage: 8.7/30.9GB (36.7%)
CPU ONNX Metrics: {'accuracy': 0.789, 'avg_loss': 0.6139658059120178, 'samples_per_second': 167.63468192712253, 'avg_time_per_batch': 0.18997938488532462, 'avg_time_per_sample': 0.005965352685399193}

Skipping GPU ONNX evaluation as CUDA is not available or ONNX Runtime GPU provider is not set up.





# 4. L1-unstructured Pruning FP32 Model Export & Inference

In [9]:
import os
import onnxruntime as ort

print(f"Loading unstructured L1-pruned FP32 model from version: {unstruct_pruned_fp32_version}")
unstruct_pruned_fp32_model, _ = load_mobilenetv2_cifar10_model(
    version=unstruct_pruned_fp32_version,
    mode="jit_trace"
)
unstruct_pruned_fp32_model.eval()
unstruct_pruned_fp32_model.to('cpu')

# Export to ONNX
onnx_path_unstruct_fp32 = os.path.join(
    ONNX_EXPORT_DIR,
    "mobilenetv2_cifar10_unstructured_pruned_fp32.onnx"
)
print(f"Exporting unstructured pruned FP32 model to {onnx_path_unstruct_fp32}...")
success_unstruct = export_model_to_onnx(
    model=unstruct_pruned_fp32_model,
    dummy_input=dummy_input,
    onnx_path=onnx_path_unstruct_fp32,
    dynamic_axes=dynamic_axes,
    opset_version=13
)

if success_unstruct:
    print("Unstructured FP32 model exported successfully.")
    # Create ONNX Runtime session on CPU
    ort_session_unstruct = ort.InferenceSession(
        onnx_path_unstruct_fp32,
        providers=['CPUExecutionProvider']
    )
    input_name = ort_session_unstruct.get_inputs()[0].name
    output_name = ort_session_unstruct.get_outputs()[0].name

    # Run a forward pass
    ort_inputs = {input_name: dummy_input.cpu().numpy()}
    ort_outputs = ort_session_unstruct.run([output_name], ort_inputs)
    print(f"ONNX Runtime (Unstructured FP32) output shape: {ort_outputs[0].shape}")
else:
    print("Unstructured FP32 model export failed.")


2025-06-13 16:08:23,947 - nnopt.recipes.mobilenetv2_cifar10 - INFO - Loading MobileNetV2 model for CIFAR-10 from version: mobilenetv2_cifar10/fp32/l1_unstruct_prune_0.7 at /home/pbeuran/repos/nnopt/models
2025-06-13 16:08:23,949 - nnopt.recipes.mobilenetv2_cifar10 - INFO - Loaded metadata: {'unstructured_sparse_config': {'pruning_amount': 0.7}, 'metrics_values': {'val_metrics': {'accuracy': 0.871, 'avg_loss': 0.3521343002319336, 'samples_per_second': 8535.882614956985, 'avg_time_per_batch': 0.007414715278488546, 'avg_time_per_sample': 0.00011715250140011903, 'params_stats': {'int_weight_params': 0, 'float_weight_params': 2202560, 'float_bias_params': 10, 'bn_param_params': 34112, 'other_float_params': 0, 'total_params': 2236682, 'approx_memory_mb_for_params': 8.532264709472656}}, 'test_metrics': {'accuracy': 0.8729, 'avg_loss': 0.3661128273010254, 'samples_per_second': 8581.675884406815, 'avg_time_per_batch': 0.007422124579612489, 'avg_time_per_sample': 0.00011652735589991607, 'params_

Loading unstructured L1-pruned FP32 model from version: mobilenetv2_cifar10/fp32/l1_unstruct_prune_0.7
Exporting unstructured pruned FP32 model to /home/pbeuran/repos/nnopt/models/onnx_exports/mobilenetv2_cifar10_unstructured_pruned_fp32.onnx...


2025-06-13 16:08:24,344 - nnopt.model.export - INFO - Model successfully exported to /home/pbeuran/repos/nnopt/models/onnx_exports/mobilenetv2_cifar10_unstructured_pruned_fp32.onnx


Unstructured FP32 model exported successfully.
ONNX Runtime (Unstructured FP32) output shape: (1, 10)


In [10]:
# --- Evaluating Unstructured L1‐Pruned FP32 ONNX Model ---

if success_unstruct:  # Only proceed if the ONNX export succeeded
    print("\n--- Evaluating Unstructured FP32 ONNX Model on CPU ---")
    onnx_metrics_cpu_unstruct = eval_onnx_model(
        onnx_model_path=onnx_path_unstruct_fp32,
        test_dataset=test_dataset,
        batch_size=32,          # Adjust as needed
        device="cpu",
        num_warmup_batches=2    # Smaller warmup for quicker testing
    )
    print(f"CPU ONNX (Unstructured) Metrics: {onnx_metrics_cpu_unstruct}")

    # GPU path (if available)
    if torch.cuda.is_available() and ort.get_device() == 'GPU':
        print("\n--- Evaluating Unstructured FP32 ONNX Model on GPU ---")
        onnx_metrics_gpu_unstruct = eval_onnx_model(
            onnx_model_path=onnx_path_unstruct_fp32,
            test_dataset=test_dataset,
            batch_size=32,
            device="cuda",
            num_warmup_batches=2
        )
        print(f"GPU ONNX (Unstructured) Metrics: {onnx_metrics_gpu_unstruct}")
    else:
        print("\nSkipping GPU ONNX (Unstructured) evaluation as CUDA is not available or ONNX Runtime GPU provider is not set up.")
else:
    print("\nSkipping Unstructured ONNX model evaluation as the export failed.")


2025-06-13 16:08:24,380 - nnopt.model.eval - INFO - Starting ONNX model evaluation for: /home/pbeuran/repos/nnopt/models/onnx_exports/mobilenetv2_cifar10_unstructured_pruned_fp32.onnx
2025-06-13 16:08:24,383 - nnopt.model.eval - INFO - Evaluation on PyTorch device: cpu, batch size: 32
2025-06-13 16:08:24,383 - nnopt.model.eval - INFO - Using ONNX Runtime providers: ['CPUExecutionProvider']
2025-06-13 16:08:24,405 - nnopt.model.eval - INFO - ONNX Model Input Name: input, Output Name: output
2025-06-13 16:08:24,406 - nnopt.model.eval - INFO - Starting warmup for 2 batches...



--- Evaluating Unstructured FP32 ONNX Model on CPU ---


[ONNX Warmup]: 100%|██████████| 2/2 [00:00<00:00,  6.29it/s]
2025-06-13 16:08:24,806 - nnopt.model.eval - INFO - Warmup complete.
2025-06-13 16:08:24,806 - nnopt.model.eval - INFO - Starting ONNX model evaluation pass...
[ONNX Evaluation]: 100%|██████████| 157/157 [00:15<00:00, 10.18it/s]

ONNX Evaluation Complete: Avg Loss: 0.3516, Accuracy: 0.8704
Throughput: 440.68 samples/sec | Avg Batch Time: 72.27 ms | Avg Sample Time: 2.27 ms
System Stats (PyTorch side): CPU Usage: 94.00% | RAM Usage: 8.8/30.9GB (37.3%)
CPU ONNX (Unstructured) Metrics: {'accuracy': 0.8704, 'avg_loss': 0.3515835962295532, 'samples_per_second': 440.67883370405343, 'avg_time_per_batch': 0.07226835355416537, 'avg_time_per_sample': 0.002269226301600793}

Skipping GPU ONNX (Unstructured) evaluation as CUDA is not available or ONNX Runtime GPU provider is not set up.





# 5. L1-unstructured Pruning QAT INT8 Model Export & Inference (with OpenVINO)

In [13]:
# --- OpenVINO inference with sparse acceleration for unstructured pruned model ---

from openvino.runtime import Core
import numpy as np
import os

# Path to the already-exported ONNX file:
onnx_unstruct_fp32 = os.path.join(
    ONNX_EXPORT_DIR,
    "mobilenetv2_cifar10_unstructured_pruned_fp32.onnx"
)

# Initialize OpenVINO runtime
ie = Core()

# Read the ONNX model
ov_model = ie.read_model(onnx_unstruct_fp32)

# Compile with sparse-weight decompression enabled when at least 50% weights are zero
compiled_model = ie.compile_model(
    model=ov_model,
    device_name="CPU",
    config={"CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE": "0.5"}
)

# Prepare input as NumPy
input_array = dummy_input.cpu().numpy()

# Inference
results = compiled_model([input_array])

print(f"OpenVINO Sparse Inference output shape: {results[0].shape}")


OpenVINO Sparse Inference output shape: (1, 10)


In [14]:
# --- Evaluating Unstructured L1‐Pruned FP32 Model with OpenVINO Sparse Acceleration (fixed) ---
from tqdm import tqdm
from nnopt.model.eval import eval_model_openvino

if success_unstruct:
    print("\n--- Evaluating Unstructured FP32 Model with OpenVINO Sparse Acceleration ---")
    ov_metrics = eval_model_openvino(
        onnx_model_path=onnx_path_unstruct_fp32,
        test_dataset=test_dataset,
        batch_size=32,             # match your earlier config
        criterion=torch.nn.CrossEntropyLoss(),
        sparse_rate=0.7,           # set to your actual sparsity threshold
        num_warmup_batches=2,
        num_workers=4,
        pin_memory=True
    )
    print(f"OpenVINO Sparse Metrics: {ov_metrics}")
else:
    print("Skipping OpenVINO evaluation as the ONNX export failed.")



--- Evaluating Unstructured FP32 Model with OpenVINO Sparse Acceleration ---


[OpenVINO Eval]: 100%|██████████| 157/157 [00:04<00:00, 37.27it/s]

OpenVINO Sparse Metrics: {'accuracy': 0.8726, 'avg_loss': 0.3507337382555008, 'samples_per_second': 1340.0696910670774, 'avg_time_per_batch': 0.023765281738894032, 'avg_time_per_sample': 0.0007462298466012725}





# Analysis

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
from nnopt.model.eval import eval_model # For PyTorch model evaluation
from nnopt.recipes.mobilenetv2_cifar10 import init_mobilenetv2_cifar10_model, get_cifar10_datasets # To load models and dataset

# Ensure test_dataset is loaded (it should be from earlier cells, e.g., cell d52e6d98)
# If not, uncomment and run:
# DATA_DIR = os.path.join(os.getcwd(), '..', 'data', 'image', 'cifar10')
# _, _, test_dataset = get_cifar10_datasets(data_dir=DATA_DIR) # Or however you load it

# --- 1. PyTorch FP32 Baseline Model Evaluation (CPU) ---
print("Evaluating PyTorch FP32 Baseline Model on CPU for analysis...")
pytorch_fp32_model, _ = init_mobilenetv2_cifar10_model(
    version=baseline_fp32_version, # Defined in cell f411f025
    device='cpu'
)
pytorch_fp32_model.eval()
pytorch_fp32_metrics_cpu = eval_model(
    model=pytorch_fp32_model,
    test_dataset=test_dataset,
    device="cpu",
    use_amp=False, # No AMP for CPU FP32
    dtype=torch.float32,
    batch_size=32, # Consistent batch size
    num_warmup_batches=2
)
print(f"PyTorch FP32 CPU Metrics: {pytorch_fp32_metrics_cpu}")

# --- 2. PyTorch QAT INT8 Baseline Model Evaluation (CPU) ---
print("\nEvaluating PyTorch QAT INT8 Baseline Model on CPU for analysis...")
# qat_int8_version is defined in cell f411f025
pytorch_qat_int8_model, _ = init_mobilenetv2_cifar10_model(
    version=qat_int8_version,
    device='cpu'
)
pytorch_qat_int8_model.eval()
pytorch_qat_int8_metrics_cpu = eval_model(
    model=pytorch_qat_int8_model,
    test_dataset=test_dataset,
    device="cpu",
    use_amp=False, # QAT models are typically run with FP32 interface, actual ops are INT8
    dtype=torch.float32,
    batch_size=32,
    num_warmup_batches=2
)
print(f"PyTorch QAT INT8 CPU Metrics: {pytorch_qat_int8_metrics_cpu}")

# --- 3. Retrieve ONNX Model Metrics (Assumed to be available from previous cells) ---
# Ensure 'onnx_metrics_cpu' and 'onnx_metrics_qat_int8_cpu' are populated from earlier cells
# These should contain keys like 'accuracy', 'avg_time_per_sample'
print(f"\nUsing pre-calculated ONNX FP32 CPU Metrics: {onnx_metrics_cpu}")
print(f"Using pre-calculated ONNX QAT INT8 CPU Metrics: {onnx_metrics_qat_int8_cpu}")


# --- 4. Model Sizes ---
# PyTorch model sizes (from parameters, does not include quantization overhead directly but reflects param precision)
# For a more direct comparison with ONNX file size, you could save the PyTorch models and get file size.
# Here, we use the parameter-based approximation for PyTorch models.
pytorch_fp32_size_mb = pytorch_fp32_metrics_cpu['params_stats']['total_params'] * pytorch_fp32_metrics_cpu['params_stats']['approx_memory_mb_for_params'] / pytorch_fp32_metrics_cpu['params_stats']['total_params'] if pytorch_fp32_metrics_cpu['params_stats']['total_params'] > 0 else 0
# For QAT INT8, the parameters are still stored in FP32 for training, but effective size is smaller.
# The 'params_stats' from eval_model for a quantized model might still reflect FP32 storage if not careful.
# A better measure for PyTorch quantized model size is to save it and check file size, or estimate based on INT8.
# For simplicity, we'll use the reported size from eval_model, but acknowledge it might be an overestimate for QAT.
# A more accurate way for PyTorch INT8 model size:
torch.save(pytorch_qat_int8_model.state_dict(), "temp_qat_int8_model.pth")
pytorch_qat_int8_size_mb = os.path.getsize("temp_qat_int8_model.pth") / (1024 * 1024)
os.remove("temp_qat_int8_model.pth")
print(f"PyTorch QAT INT8 Model Size (saved state_dict): {pytorch_qat_int8_size_mb:.2f} MB")


# ONNX model file sizes
# Ensure 'onnx_path_baseline_fp32' and 'onnx_path_qat_int8' are defined (cell f411f025 and 5b5fb4cd, 2170bb29)
onnx_fp32_size_mb = os.path.getsize(onnx_path_baseline_fp32) / (1024 * 1024) if os.path.exists(onnx_path_baseline_fp32) else 0
onnx_qat_int8_size_mb = os.path.getsize(onnx_path_qat_int8) / (1024 * 1024) if os.path.exists(onnx_path_qat_int8) else 0
print(f"ONNX FP32 Model Size: {onnx_fp32_size_mb:.2f} MB")
print(f"ONNX QAT INT8 Model Size: {onnx_qat_int8_size_mb:.2f} MB")


# --- 5. Prepare data for plotting ---
model_labels = [
    "PyTorch FP32",
    "PyTorch QAT INT8",
    "ONNX FP32",
    "ONNX QAT INT8"
]

# Using test accuracies
accuracies = [
    pytorch_fp32_metrics_cpu['accuracy'],
    pytorch_qat_int8_metrics_cpu['accuracy'],
    onnx_metrics_cpu['accuracy'] if 'onnx_metrics_cpu' in locals() and onnx_metrics_cpu else 0, # from cell d52e6d98
    onnx_metrics_qat_int8_cpu['accuracy'] if 'onnx_metrics_qat_int8_cpu' in locals() and onnx_metrics_qat_int8_cpu else 0 # from cell 8b57290a
]

# CPU inference time per sample (test set)
cpu_time_per_sample = [
    pytorch_fp32_metrics_cpu['avg_time_per_sample'],
    pytorch_qat_int8_metrics_cpu['avg_time_per_sample'],
    onnx_metrics_cpu['avg_time_per_sample'] if 'onnx_metrics_cpu' in locals() and onnx_metrics_cpu else float('inf'),
    onnx_metrics_qat_int8_cpu['avg_time_per_sample'] if 'onnx_metrics_qat_int8_cpu' in locals() and onnx_metrics_qat_int8_cpu else float('inf')
]

# Model sizes in MB
model_sizes_mb = [
    pytorch_fp32_size_mb,
    pytorch_qat_int8_size_mb, # Using saved state_dict size
    onnx_fp32_size_mb,
    onnx_qat_int8_size_mb
]

print("\nData for plotting:")
print(f"Labels: {model_labels}")
print(f"Accuracies: {accuracies}")
print(f"CPU Time/Sample (s): {cpu_time_per_sample}")
print(f"Model Sizes (MB): {model_sizes_mb}")

# Check if all ONNX metrics were loaded correctly
if not ('onnx_metrics_cpu' in locals() and onnx_metrics_cpu and \
        'onnx_metrics_qat_int8_cpu' in locals() and onnx_metrics_qat_int8_cpu):
    print("\nWARNING: ONNX metrics might not be fully loaded. Plots might be incomplete or show zero/infinity values.")
    print("Please ensure the cells evaluating ONNX models (d52e6d98, 8b57290a) have been run successfully.")


In [None]:
# Accuracy Bar Plot (Test Set on CPU)
x = np.arange(len(model_labels))
width = 0.5 # Single bar for test accuracy

fig, ax = plt.subplots(figsize=(10, 6))
rects = ax.bar(x, accuracies, width, label='Test Accuracy (CPU)')

ax.set_ylabel('Accuracy')
ax.set_title('Model Test Accuracy Comparison (CPU)')
ax.set_xticks(x)
ax.set_xticklabels(model_labels, rotation=45, ha="right")
ax.legend()
ax.grid(True, linestyle='--', alpha=0.7)
ax.set_ylim(min(accuracies) * 0.9 if min(accuracies) > 0 else 0, max(accuracies) * 1.1 if max(accuracies) > 0 else 1) # Adjust y-lim dynamically

def autolabel(rects_to_label, ax_to_use):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects_to_label:
        height = rect.get_height()
        ax_to_use.annotate(f'{height:.4f}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom', fontsize=9)

autolabel(rects, ax)

fig.tight_layout()
plt.show()

In [None]:
# CPU Inference Time Comparison Plot (Time per Sample)
fig, ax = plt.subplots(figsize=(10, 6))
# Convert times to milliseconds for better readability if they are very small
cpu_time_per_sample_ms = [t * 1000 for t in cpu_time_per_sample]
rects = ax.bar(x, cpu_time_per_sample_ms, width, label='CPU Time/Sample (ms)')

ax.set_ylabel('CPU Time/Sample (milliseconds)')
ax.set_title('Model Inference Time Comparison (CPU)')
ax.set_xticks(x)
ax.set_xticklabels(model_labels, rotation=45, ha="right")
ax.legend()
ax.grid(True, linestyle='--', alpha=0.7)
# ax.set_yscale('log') # Use log scale if times vary greatly

def autolabel_time(rects_to_label, ax_to_use):
    for rect in rects_to_label:
        height = rect.get_height()
        ax_to_use.annotate(f'{height:.3f}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom', fontsize=9)

autolabel_time(rects, ax)

fig.tight_layout()
plt.show()

In [None]:
# Model Size Comparison Plot
fig, ax = plt.subplots(figsize=(10, 6))
rects = ax.bar(x, model_sizes_mb, width, label='Model Size (MB)')

ax.set_ylabel('Model Size (MB)')
ax.set_title('Model Size Comparison')
ax.set_xticks(x)
ax.set_xticklabels(model_labels, rotation=45, ha="right")
ax.legend()
ax.grid(True, linestyle='--', alpha=0.7)

def autolabel_size(rects_to_label, ax_to_use):
    for rect in rects_to_label:
        height = rect.get_height()
        ax_to_use.annotate(f'{height:.2f}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom', fontsize=9)

autolabel_size(rects, ax)

fig.tight_layout()
plt.show()