In [33]:
import torch
from torchvision.models import mobilenet_v2, MobileNet_V2_Weights
from torch.export import export as torch_export
from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e
from torch.ao.quantization.quantizer.xnnpack_quantizer import XNNPACKQuantizer
import ai_edge_torch

def build_fp32_cpu_model():
    m = mobilenet_v2(weights=MobileNet_V2_Weights.IMAGENET1K_V1).eval().cpu()
    m = m.to(memory_format=torch.channels_last)
    return m

def export_tflite(model_or_exported, out_path, sample_shape=(1, 3, 224, 224)):
    # ai_edge_torch needs CPU sample inputs as a TUPLE
    sample_inputs = (torch.randn(*sample_shape),)
    edge = ai_edge_torch.convert(model_or_exported, sample_inputs)
    edge.export(out_path)
    print(f"✓ Wrote {out_path}")

def calibrate_callable(callable_model, batches=128, bs=8):
    with torch.no_grad():
        for _ in range(batches):
            x = torch.randn(bs, 3, 224, 224).to(memory_format=torch.channels_last)
            callable_model(x)

def main():
    print("torch =", torch.__version__)
    fp32_cpu = build_fp32_cpu_model()

    # Build quantizer config
    quantizer = XNNPACKQuantizer()

    prepared = None
    used_variant = None

    # Variant A: prepare_pt2e expects an nn.Module
    try:
        print("Trying PT2E prepare on nn.Module …")
        prepared = prepare_pt2e(fp32_cpu, quantizer).eval()
        used_variant = "module"
    except Exception as e_mod:
        print("Module path failed:", repr(e_mod))
        # Variant B: prepare_pt2e expects an ExportedProgram
        try:
            print("Trying PT2E prepare on ExportedProgram …")
            ex_in = (torch.randn(1, 3, 224, 224).to(memory_format=torch.channels_last),)
            exported = torch_export(fp32_cpu, ex_in)
            prepared = prepare_pt2e(exported, quantizer).eval()
            used_variant = "exported"
        except Exception as e_exp:
            print("ExportedProgram path failed:", repr(e_exp))
            raise RuntimeError(
                "prepare_pt2e failed for both nn.Module and ExportedProgram "
                "variants. Consider upgrading PyTorch/torchvision."
            )

    # Calibrate
    calibrate_callable(prepared, batches=128, bs=8)

    # Convert to INT8
    int8_obj = convert_pt2e(prepared).eval()

    # Export TFLite
    if used_variant == "module":
        # For consistency, we can also export FP32 from module:
        export_tflite(fp32_cpu, "mobilenetv2_fp32.tflite")
        export_tflite(int8_obj, "mobilenetv2_int8.tflite")
    else:
        # used_variant == "exported": prepared/converted are ExportedPrograms
        # Also export FP32 as ExportedProgram so shapes/layouts match closely
        ex_in = (torch.randn(1, 3, 224, 224).to(memory_format=torch.channels_last),)
        exported_fp32 = torch_export(fp32_cpu, ex_in)
        export_tflite(exported_fp32, "mobilenetv2_fp32.tflite")
        export_tflite(int8_obj,    "mobilenetv2_int8.tflite")

if __name__ == "__main__":
    main()


torch = 2.8.0+cu128
Trying PT2E prepare on nn.Module …
Module path failed: AttributeError("'MobileNetV2' object has no attribute 'meta'")
Trying PT2E prepare on ExportedProgram …


  quantizer = XNNPACKQuantizer()
For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  prepared = prepare_pt2e(fp32_cpu, quantizer).eval()


ExportedProgram path failed: AttributeError("'ExportedProgram' object has no attribute 'meta'")


For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  prepared = prepare_pt2e(exported, quantizer).eval()


RuntimeError: prepare_pt2e failed for both nn.Module and ExportedProgram variants. Consider upgrading PyTorch/torchvision.

In [3]:
import torch
from torchvision.models import mobilenet_v2, MobileNet_V2_Weights

from torch.export import export as torch_export
from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e

# 👇 PyTorch 2.8 moved the quantizer here:
from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import XNNPACKQuantizer

import ai_edge_torch

def export_tflite(model_or_exported, out_path, sample_shape=(1, 3, 224, 224)):
    sample_inputs = (torch.randn(*sample_shape),)  # CPU tuple
    edge = ai_edge_torch.convert(model_or_exported, sample_inputs)
    edge.export(out_path)
    print(f"✓ Wrote {out_path}")

def build_fp32_cpu_model():
    m = mobilenet_v2(weights=MobileNet_V2_Weights.IMAGENET1K_V1).eval().cpu()
    return m.to(memory_format=torch.channels_last)

def calibrate_callable(callable_model, batches=128, bs=8):
    with torch.no_grad():
        for _ in range(batches):
            x = torch.randn(bs, 3, 224, 224).to(memory_format=torch.channels_last)
            callable_model(x)

def main():
    print("torch:", torch.__version__)
    m_fp32 = build_fp32_cpu_model()

    # Quantizer (ExecuTorch in 2.8)
    quantizer = XNNPACKQuantizer()

    prepared = None
    used_variant = None
    ex_in = (torch.randn(1, 3, 224, 224).to(memory_format=torch.channels_last),)

    # Try Module path with/without example_inputs first
    try:
        prepared = prepare_pt2e(m_fp32, quantizer, example_inputs=ex_in).eval()
        used_variant = ("module", "with_example_inputs")
    except TypeError:
        try:
            prepared = prepare_pt2e(m_fp32, quantizer).eval()
            used_variant = ("module", "no_example_inputs")
        except Exception as e_mod:
            # Fall back to ExportedProgram path
            exported = torch_export(m_fp32, ex_in)
            prepared = prepare_pt2e(exported, quantizer).eval()
            used_variant = ("exported_program", "no_example_inputs")

    # Calibrate & convert
    calibrate_callable(prepared, batches=128, bs=8)
    converted = convert_pt2e(prepared).eval()

    # Export FP32 + INT8 TFLite
    if used_variant[0] == "module":
        export_tflite(m_fp32, "mobilenetv2_fp32.tflite")
        export_tflite(converted, "mobilenetv2_int8.tflite")
    else:
        exported_fp32 = torch_export(m_fp32, ex_in)
        export_tflite(exported_fp32, "mobilenetv2_fp32.tflite")
        export_tflite(converted,      "mobilenetv2_int8.tflite")

if __name__ == "__main__":
    main()


I0000 00:00:1757083531.053137  107884 port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
I0000 00:00:1757083531.592349  107884 cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
I0000 00:00:1757083534.301964  107884 port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
  from .autonotebook import tqdm as notebook_tqdm
For migrations of users: 
1. Eager mode quantization (torch.ao.quantizati

torch: 2.9.0.dev20250811+cpu


For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  prepared = prepare_pt2e(exported, quantizer).eval()


AttributeError: 'ExportedProgram' object has no attribute 'meta'

In [1]:
import torch
import torchvision.models as models
from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.exir import to_edge_transform_and_lower

model = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval()
sample_inputs = (torch.randn(1, 3, 224, 224), )

et_program = to_edge_transform_and_lower(
    torch.export.export(model, sample_inputs),
    partitioner=[XnnpackPartitioner()]
).to_executorch()

with open("model.pte", "wb") as f:
    f.write(et_program.buffer)

Downloading: "https://download.pytorch.org/models/mobilenet_v2-7ebf99e0.pth" to /home/joonyoung/.cache/torch/hub/checkpoints/mobilenet_v2-7ebf99e0.pth


100%|██████████| 13.6M/13.6M [00:00<00:00, 18.9MB/s]


In [None]:
import torch
from executorch.runtime import Runtime
from typing import List

runtime = Runtime.get()

input_tensor: torch.Tensor = torch.randn(1, 3, 224, 224)
program = runtime.load_program("model.pte")
method = program.load_method("forward")
output: List[torch.Tensor] = method.execute([input_tensor])
print("Run succesfully via executorch")

from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
import torchvision.models as models

eager_reference_model = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval()
eager_reference_output = eager_reference_model(input_tensor)

print("Comparing against original PyTorch module")
print(torch.allclose(output[0], eager_reference_output, rtol=1e-3, atol=1e-5))

[program.cpp:134] InternalConsistency verification requested but not available


Run succesfully via executorch
Comparing against original PyTorch module
True


In [28]:
import torch
import torchvision.models as models
from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
import torch.fx as fx

# PT2E from torch.ao (not torchao)
from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e
from torch.ao.quantization import move_exported_model_to_eval

# AI Edge Torch
import ai_edge_torch as aet
from ai_edge_torch.quantize import pt2e_quantizer as aet_q
from ai_edge_torch.quantize import quant_config as aet_qc

# 1) Load eager model + sample
model = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval()
sample_inputs = (torch.randn(1, 3, 224, 224),)

# 2) Trace to FX for PT2E (we'll quantize THIS gm; do not use torch.export)
gm = fx.symbolic_trace(model).eval()

# 3) AET PT2E quantizer (TFLite-style: symmetric, per-channel weights on axis 0)
qspec = aet_q.get_symmetric_quantization_config(is_per_channel=True)
quantizer = aet_q.PT2EQuantizer().set_global(qspec)

# 4) Prepare + calibrate on the FX GraphModule
prepared = prepare_pt2e(gm, quantizer)
prepared = move_exported_model_to_eval(prepared)
with torch.no_grad():
    for _ in range(32):                      # use real representative data if possible
        prepared(torch.randn(1, 3, 224, 224))

print(prepared)

# 5) Convert to quantized graph (keep Q/DQ explicit for StableHLO/TFLite)
quantized = convert_pt2e(
    prepared,
    use_reference_representation=False,
    fold_quantize=False
).eval()

# (Optional) Inspect that quant ops are present
for n in quantized.graph.nodes:
    if "quantize" in str(n.target) or "dequantize" in str(n.target):
        print(n.op, n.target)

# 6) Export to TFLite via AI Edge Torch
edge_model = aet.convert(
    quantized,                      # NOTE: pass the quantized FX module (not EP)
    sample_inputs,
    quant_config=aet_qc.QuantConfig(pt2e_quantizer=quantizer),
)
edge_model.export("mobilenetv2_int8.tflite")
print("Wrote mobilenetv2_int8.tflite")


For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  prepared = prepare_pt2e(gm, quantizer)
  aten_pattern = torch.export.export_for_training(
  aten_pattern = torch.export.export_for_training(
  aten_pattern = torch.export.export_for_training(
  aten_pattern = torch.export.export_for_training(
  aten_pattern = torch.export.export_for_training(
  aten_pattern = torch.export.export_for_training(


GraphModule(
  (features): Module(
    (0): Module(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): Module(
      (conv): Module(
        (0): Module(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): Module(
      (conv): Module(
        (0): Module(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2):

For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  quantized = convert_pt2e(


AttributeError: 'Conv2d' object has no attribute 'qconfig'

In [None]:
import torch
from torchvision.models import mobilenet_v2, MobileNet_V2_Weights

# PT2E (torch.ao)
from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e

# AI Edge Torch
import ai_edge_torch as aet
from ai_edge_torch.quantize import pt2e_quantizer as aet_q
from ai_edge_torch.quantize import quant_config as aet_qc

m = mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval()
example_inputs = (torch.randn(1,3,224,224),)

# 1) Capture to ExportedProgram (ATen graph)
ep = torch.export.export(m, example_inputs).module()  # 2.6+ API

# 2) Configure an AET PT2E quantizer (symmetric, per-channel)
qspec = aet_q.get_symmetric_quantization_config(is_per_channel=True)
quantizer = aet_q.PT2EQuantizer().set_global(qspec)

# 3) Prepare + calibrate
prepared = prepare_pt2e(ep, quantizer)
with torch.no_grad():
    for _ in range(32): prepared(torch.randn(1,3,224,224))

# 4) Convert (keep Q/DQ explicit for TFLite lowering)
quantized = convert_pt2e(prepared, fold_quantize=False)

print(quantized)

# 5) Convert to TFLite
edge_model = aet.convert(
    quantized,
    example_inputs,
    quant_config=aet_qc.QuantConfig(pt2e_quantizer=quantizer),
)
edge_model.export("mobilenetv2_int8.tflite")


<class 'torch.fx.graph_module.GraphModule.__new__.<locals>.GraphModuleImpl'>


For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  prepared = prepare_pt2e(ep, quantizer)
For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (

GraphModule(
  (features): Module(
    (0): Module(
      (0): Module()
    )
    (1): Module(
      (conv): Module(
        (0): Module(
          (0): Module()
        )
        (1): Module()
      )
    )
    (2): Module(
      (conv): Module(
        (0): Module(
          (0): Module()
        )
        (1): Module(
          (0): Module()
        )
        (2): Module()
      )
    )
    (3): Module(
      (conv): Module(
        (0): Module(
          (0): Module()
        )
        (1): Module(
          (0): Module()
        )
        (2): Module()
      )
    )
    (4): Module(
      (conv): Module(
        (0): Module(
          (0): Module()
        )
        (1): Module(
          (0): Module()
        )
        (2): Module()
      )
    )
    (5): Module(
      (conv): Module(
        (0): Module(
          (0): Module()
        )
        (1): Module(
          (0): Module()
        )
        (2): Module()
      )
    )
    (6): Module(
      (conv): Module(
        (0): 

W0000 00:00:1757303413.778856    6423 tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
W0000 00:00:1757303413.778882    6423 tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
I0000 00:00:1757303413.779160    6423 reader.cc:83] Reading SavedModel from: /tmp/tmp6ohyr2g1
I0000 00:00:1757303413.783841    6423 reader.cc:52] Reading meta graph with tags { serve }
I0000 00:00:1757303413.783866    6423 reader.cc:147] Reading SavedModel debug info (if present) from: /tmp/tmp6ohyr2g1
I0000 00:00:1757303413.820058    6423 loader.cc:236] Restoring SavedModel bundle.
I0000 00:00:1757303414.158779    6423 loader.cc:220] Running initialization op on SavedModel bundle at path: /tmp/tmp6ohyr2g1
I0000 00:00:1757303414.245415    6423 loader.cc:471] SavedModel load for tags { serve }; Status: success: OK. Took 466282 microseconds.
I0000 00:00:1757303415.540812    6423 flatbuffer_export.cc:4150] Estimated count of arithmetic ops: 608.445 M  ops, equivalently 304.223 M  MACs


In [42]:
import numpy as np
import torch
import torch.nn.functional as F

# 1) Prepare one test input (use real, normalized data if possible)
x = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    y_fp = m.eval()(x).cpu().numpy()

# 2) Run TFLite
import tensorflow as tf  # or tflite_runtime.interpreter
interpreter = tf.lite.Interpreter(model_path="mobilenetv2_int8.tflite")
interpreter.allocate_tensors()

inp = interpreter.get_input_details()[0]
out = interpreter.get_output_details()[0]

x_np = x.cpu().numpy().astype(np.float32)

# Handle quantized or float I/O automatically
def set_input(interpreter, detail, x_float):
    if np.issubdtype(detail["dtype"], np.floating):
        interpreter.set_tensor(detail["index"], x_float)
    else:
        scale, zero = detail["quantization"]
        x_q = np.round(x_float / scale + zero)
        qmin = np.iinfo(detail["dtype"]).min
        qmax = np.iinfo(detail["dtype"]).max
        x_q = np.clip(x_q, qmin, qmax).astype(detail["dtype"])
        interpreter.set_tensor(detail["index"], x_q)

def get_output(interpreter, detail):
    y = interpreter.get_tensor(detail["index"])
    if not np.issubdtype(detail["dtype"], np.floating):
        scale, zero = detail["quantization"]
        y = (y.astype(np.float32) - zero) * scale
    return y

set_input(interpreter, inp, x_np)
interpreter.invoke()
y_tfl = get_output(interpreter, out)

# 3) Metrics (PyTorch vs TFLite)
mse = np.mean((y_fp - y_tfl) ** 2)
cos = np.mean(np.sum(y_fp * y_tfl, axis=1) /
              (np.linalg.norm(y_fp, axis=1) * np.linalg.norm(y_tfl, axis=1) + 1e-12))
top1_pt  = y_fp.argmax(axis=1)
top1_tfl = y_tfl.argmax(axis=1)
agree = float((top1_pt == top1_tfl).mean())

print({"mse": mse, "cosine": cos, "top1_agree": agree})


{'mse': np.float32(0.011216138), 'cosine': np.float32(0.9931354), 'top1_agree': 1.0}
