### ONNX graph reading

In [1]:
import onnx

# Load the exported ONNX model from disk so its graph can be inspected.
model_path = "skin_cancer_resnet.onnx"
model = onnx.load(model_path)

# model metadata
print(f"Producer: {model.producer_name}")
# A model can import several operator sets (one entry per domain), so look up
# the default ONNX domain ("" or its alias "ai.onnx") explicitly instead of
# assuming it is the first entry in opset_import.
default_opset = next(
    (opset.version for opset in model.opset_import if opset.domain in ("", "ai.onnx")),
    "unknown",
)
print(f"Opset Version: {default_opset}") # Crucial for compatibility!

# graph inputs: name plus the raw dimension list of each declared input
for input_node in model.graph.input:
    print(f"Name: {input_node.name} | Shape: {input_node.type.tensor_type.shape.dim}")

# first few layers: operator type and the tensors each node reads and writes
for i, node in enumerate(model.graph.node[:5]):
    print(f"Layer {i}: {node.op_type} (Input: {node.input} -> Output: {node.output})")

Producer: tf2onnx
Opset Version: 13
Name: input_image | Shape: [dim_param: "unk__556"
, dim_value: 224
, dim_value: 224
, dim_value: 3
]
Layer 0: Slice (Input: ['input_image', 'const_starts__10', 'const_ends__11', 'const_axes__30'] -> Output: ['functional_1/strided_slice_2:0'])
Layer 1: Slice (Input: ['input_image', 'const_ends__11', 'const_starts__22', 'const_axes__30'] -> Output: ['functional_1/strided_slice_1:0'])
Layer 2: Slice (Input: ['input_image', 'const_starts__22', 'const_axes__30', 'const_axes__30'] -> Output: ['functional_1/strided_slice:0'])
Layer 3: Concat (Input: ['functional_1/strided_slice:0', 'functional_1/strided_slice_1:0', 'functional_1/strided_slice_2:0'] -> Output: ['functional_1/stack_Concat__34:0'])
Layer 4: Add (Input: ['functional_1/stack_Concat__34:0', 'functional_1/Squeeze:0'] -> Output: ['functional_1/BiasAdd:0'])


### Benchmark measurement

In [2]:
import onnxruntime as ort
import numpy as np
import time

In [4]:
# Model under test and one random float32 batch to feed it during benchmarking.
onnx_file = "skin_cancer_resnet.onnx"
batch_shape = (32, 224, 224, 3)
dummy_data = np.random.randn(*batch_shape).astype(np.float32)

# Show which execution providers this onnxruntime installation reports.
available_providers = ort.get_available_providers()
print(f"Available Providers: {available_providers}")

Available Providers: ['CoreMLExecutionProvider', 'AzureExecutionProvider', 'CPUExecutionProvider']


In [6]:
def benchmark(provider_name, warmup_runs=5, timed_runs=20):
    """Time repeated inference of ``onnx_file`` on one execution provider.

    Creates an ``ort.InferenceSession`` for the module-level ``onnx_file``
    restricted to ``provider_name``, feeds the module-level ``dummy_data`` to
    the model's first input, and prints the total elapsed time of the timed
    runs. Nothing is returned; the result or the failure reason is printed.

    Args:
        provider_name: Execution provider to request, e.g.
            ``'CPUExecutionProvider'``.
        warmup_runs: Number of untimed inference calls made first (default 5).
        timed_runs: Number of inference calls included in the measurement
            (default 20).
    """
    try:
        # Load the "Driver"
        session = ort.InferenceSession(onnx_file, providers=[provider_name])

        # NOTE(review): ONNX Runtime can fall back to another provider with
        # only a warning instead of raising -- confirm for the installed
        # version. Refuse to report a timing under the wrong label.
        if provider_name not in session.get_providers():
            print(f"Provider: {provider_name: <25} | Failed (Not installed or HW missing)")
            return

        input_name = session.get_inputs()[0].name
        feed = {input_name: dummy_data}

        # Warmup (Get the engine running)
        for _ in range(warmup_runs):
            session.run(None, feed)

        # Measure with a monotonic, high-resolution clock so system clock
        # adjustments cannot distort the interval.
        start = time.perf_counter()
        for _ in range(timed_runs):
            session.run(None, feed)
        elapsed = time.perf_counter() - start

        print(f"Provider: {provider_name: <25} | Time: {elapsed:.4f} seconds")

    except Exception as e:
        # Keep the notebook running, but surface the real cause instead of a
        # guessed one so a bad path or bad input is not mistaken for missing
        # hardware.
        print(f"Provider: {provider_name: <25} | Failed ({type(e).__name__}: {e})")

In [None]:
benchmark('CPUExecutionProvider')   # Standard CPU
# NVIDIA GPU run, left disabled: CUDA is not available on a Mac, and it needs the onnxruntime-gpu package installed.
#benchmark('CUDAExecutionProvider')



Provider: CPUExecutionProvider      | Time: 33.5380 seconds


In [9]:
# Benchmark the CoreML provider, which is listed among the available providers.
# In the recorded run ONNX Runtime warned that some nodes were not assigned to
# the preferred execution provider.
benchmark('CoreMLExecutionProvider')

[0;93m2025-11-29 11:25:05.130584 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-11-29 11:25:05.131003 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


Provider: CoreMLExecutionProvider   | Time: 0.9692 seconds


In [10]:
# NOTE(review): the recorded time for this provider is nearly identical to the
# CPUExecutionProvider run, so presumably the model still executed on the CPU --
# confirm which providers the session actually used before comparing numbers.
benchmark('AzureExecutionProvider')  

Provider: AzureExecutionProvider    | Time: 33.6219 seconds


### Preparing for edge devices: model quantization

In [11]:
import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType

# Source model and the destination file for its quantized copy.
input_model_path = "skin_cancer_resnet.onnx"
output_model_path = "skin_cancer_mobile_quant.onnx"

# Dynamic quantization with the weights converted to unsigned 8-bit integers.
# NOTE(review): the cells shown here do not check the quantized model's
# accuracy -- validate it before deployment.
quantize_dynamic(input_model_path, output_model_path, weight_type=QuantType.QUInt8)



In [12]:
import os
# Compare on-disk sizes of the original and quantized model files.
bytes_per_mb = 1024 * 1024
original_size = os.stat(input_model_path).st_size / bytes_per_mb
quant_size = os.stat(output_model_path).st_size / bytes_per_mb

# How many times smaller the quantized file is than the original.
shrink_factor = original_size / quant_size

print(f"Original Size: {original_size:.2f} MB")
print(f"Mobile Size:   {quant_size:.2f} MB")
print(f"Reduction:     {shrink_factor:.1f}x smaller!")

Original Size: 89.63 MB
Mobile Size:   22.59 MB
Reduction:     4.0x smaller!
