In [13]:
import os
import sys
from pathlib import Path

import torch

# Make the LAR_IQA project importable from this notebook.
# The parent directory is put on the module search path; adjust this to match
# where the notebook actually lives.
sys.path.append(os.pardir)

from python.packages.LAR_IQA.scripts.utils import load_model


def export_lar_iqa_onnx(
    checkpoint_path: str = "../python/packages/LAR_IQA/checkpoint_epoch_3.pt",
    out_dir: str = "./out",
    onnx_name: str = "lar_iqa.onnx",
    use_cuda: bool = True,
):
    """Export the LAR-IQA checkpoint to an ONNX file and return its path.

    Args:
        checkpoint_path: Checkpoint file handed to ``load_model``.
        out_dir: Directory for the ONNX file; created if it does not exist.
        onnx_name: File name of the exported model inside ``out_dir``.
        use_cuda: Run the export on CUDA when it is available; the CPU is used
            when this is False or CUDA is unavailable.

    Returns:
        ``pathlib.Path`` of the written ONNX file (``out_dir / onnx_name``).

    Raises:
        FileNotFoundError: If the resolved checkpoint path does not exist.
    """
    # 1. Pick the device.
    if use_cuda and torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"[INFO] Using device: {device}")

    # 2. Load the model, failing early with a clear message if the checkpoint
    #    is missing.
    ckpt_path = Path(checkpoint_path).resolve()
    if not ckpt_path.exists():
        raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")

    print(f"[INFO] Loading checkpoint from: {ckpt_path}")
    model = load_model(str(ckpt_path), False, device)
    model.eval()

    # 3. Dummy inputs shaped like the output of preprocess_image, which
    #    (per the project's preprocessing) resizes the "authentic" image to
    #    384x384 and center-crops the "synthetic" image to 1280x1280:
    #    [1, 3, 384, 384] and [1, 3, 1280, 1280].
    dummy_inputs = (
        torch.randn(1, 3, 384, 384, device=device),  # image_authentic
        torch.randn(1, 3, 1280, 1280, device=device),  # image_synthetic
    )

    # 4. Make sure the export directory exists.
    export_dir = Path(out_dir)
    export_dir.mkdir(parents=True, exist_ok=True)
    onnx_path = export_dir / onnx_name

    print(f"[INFO] Exporting ONNX to: {onnx_path}")

    # 5. Export. Only the batch dimension (axis 0) is dynamic; the spatial
    #    sizes stay fixed at the dummy-input shapes.
    input_names = ["image_authentic", "image_synthetic"]
    output_names = ["score"]
    batch_only_axes = {
        tensor_name: {0: "batch_size"} for tensor_name in input_names + output_names
    }
    torch.onnx.export(
        model,
        dummy_inputs,  # the model takes two positional inputs
        onnx_path.as_posix(),
        export_params=True,  # store the weights inside the ONNX file
        opset_version=18,
        do_constant_folding=True,  # constant-folding optimization
        input_names=input_names,
        output_names=output_names,
        dynamic_axes=batch_only_axes,
    )

    print("[INFO] ONNX export finished.")
    print(f"[INFO] ONNX model saved at: {onnx_path}")
    return onnx_path


# Running this cell in the notebook performs the export with the default arguments.
if __name__ == "__main__":
    export_lar_iqa_onnx()


[INFO] Using device: cuda
[INFO] Loading checkpoint from: F:\ML\PythonAIProject\SMARKMediaTools_web\electron-media-toolbox\python\packages\LAR_IQA\checkpoint_epoch_3.pt
[INFO] Exporting ONNX to: out\lar_iqa.onnx


  torch.onnx.export(


[torch.onnx] Obtain model graph for `MobileNetMerged([...]` with `torch.export.export(..., strict=False)`...
[torch.onnx] Obtain model graph for `MobileNetMerged([...]` with `torch.export.export(..., strict=False)`... ✅
[torch.onnx] Run decomposition...
[torch.onnx] Run decomposition... ✅
[torch.onnx] Translate the graph into ONNX...
[torch.onnx] Translate the graph into ONNX... ✅
Applied 184 of general pattern rewrite rules.
[INFO] ONNX export finished.
[INFO] ONNX model saved at: out\lar_iqa.onnx


In [15]:
import time
import numpy as np
import onnxruntime as ort

# Benchmark PyTorch vs ONNX Runtime inference latency: each backend is timed
# over `runs` iterations after `warmups` untimed warm-up iterations.

# Configuration
warmups, runs = 5, 200
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoint for the PyTorch side of the benchmark (the same file the export
# step uses by default), resolved to an absolute path.
ckpt_path = Path("../python/packages/LAR_IQA/checkpoint_epoch_3.pt")
ckpt_path = ckpt_path.resolve()

# Dummy inputs with the same shapes as the ones used for the ONNX export:
# [1, 3, 384, 384] and [1, 3, 1280, 1280].
input_torch = tuple(
    torch.randn(1, 3, side, side, device=device) for side in (384, 1280)
)

def bench_torch(model, inputs, runs=runs, warmups=warmups):
    """Time ``model(*inputs)`` and return latency statistics in milliseconds.

    Args:
        model: Module-like object; ``model.eval()`` is called before timing and
            the model is invoked as ``model(*inputs)`` under ``torch.no_grad()``.
        inputs: Iterable of positional arguments for the model.
        runs: Number of timed iterations. The default is the value the
            notebook-level ``runs`` had when this cell was executed.
        warmups: Number of untimed warm-up iterations run first. The default is
            bound the same way as ``runs``.

    Returns:
        ``(mean_ms, std_ms, times)`` where ``times`` is the list of per-run
        wall-clock durations in milliseconds.
    """

    def _sync():
        # CUDA kernels are launched asynchronously, so the clock may only be
        # read after the device has finished. The decision is based on the
        # actual CUDA state rather than a notebook-level `device` variable,
        # which can be stale or refer to a different device than the tensors
        # being timed.
        if torch.cuda.is_initialized():
            torch.cuda.synchronize()

    model.eval()
    times = []
    with torch.no_grad():
        # Warm-up: untimed, but synchronized so no pending work leaks into the
        # first timed iteration.
        for _ in range(warmups):
            model(*inputs)
            _sync()
        for _ in range(runs):
            t0 = time.perf_counter()
            model(*inputs)
            _sync()
            t1 = time.perf_counter()
            times.append((t1 - t0) * 1000.0)  # ms
    return np.mean(times), np.std(times), times

def bench_onnx(session, np_inputs, runs=runs, warmups=warmups):
    """Time ``session.run`` and return latency statistics in milliseconds.

    Args:
        session: Object exposing ``get_inputs()`` (items with a ``.name``) and
            ``run(output_names, feed)``, e.g. an ONNX Runtime inference session.
        np_inputs: Input arrays, in the same order as ``session.get_inputs()``.
        runs: Number of timed iterations. The default is the value the
            notebook-level ``runs`` had when this cell was executed.
        warmups: Number of untimed warm-up iterations run first. The default is
            bound the same way as ``runs``.

    Returns:
        ``(mean_ms, std_ms, times)`` where ``times`` is the list of per-run
        wall-clock durations in milliseconds.

    Raises:
        ValueError: If the number of arrays does not match the number of
            session inputs (previously extra arrays were silently dropped).
    """
    input_names = [i.name for i in session.get_inputs()]
    np_inputs = list(np_inputs)
    if len(input_names) != len(np_inputs):
        raise ValueError(
            f"Session expects {len(input_names)} inputs {input_names}, "
            f"got {len(np_inputs)} arrays"
        )

    # Build the feed once so dictionary construction is not part of the
    # measured time.
    feed = dict(zip(input_names, np_inputs))

    # NOTE(review): the arrays are passed as host NumPy arrays; with a GPU
    # execution provider each run presumably includes the host-to-device copy.
    # Confirm before comparing against timings taken with device-resident
    # tensors.

    # warm-up
    for _ in range(warmups):
        session.run(None, feed)

    times = []
    for _ in range(runs):
        t0 = time.perf_counter()
        session.run(None, feed)
        t1 = time.perf_counter()
        times.append((t1 - t0) * 1000.0)  # ms
    return np.mean(times), np.std(times), times

# ---------- PyTorch benchmark ----------
# `model` is explicitly None when the checkpoint is missing, so the comparison
# step below can test it directly instead of relying on an undefined or
# leftover variable.
model = None
if ckpt_path.exists():
    print(f"[INFO] Loading PyTorch model from {ckpt_path} onto {device}")
    model = load_model(str(ckpt_path), False, device)
    mean_torch, std_torch, times_torch = bench_torch(model, input_torch)
    print(f"[RESULT][PyTorch] mean {mean_torch:.2f} ms  std {std_torch:.2f} ms  (n={runs})")
else:
    print(f"[WARN] Checkpoint not found at {ckpt_path}. Skipping PyTorch benchmark.")

# ---------- ONNX benchmark ----------
onnx_path = "./out/lar_iqa.onnx"  # default output location of the export step
onnx_file = Path(onnx_path).resolve()
if not onnx_file.exists():
    raise FileNotFoundError(f"ONNX model not found: {onnx_file}")

# Request CUDA first when this onnxruntime build lists it, with CPU as fallback.
available_providers = ort.get_available_providers()
providers = (
    ["CUDAExecutionProvider", "CPUExecutionProvider"]
    if "CUDAExecutionProvider" in available_providers
    else ["CPUExecutionProvider"]
)
sess_opts = ort.SessionOptions()
sess_opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
print(f"[INFO] Creating ONNX Runtime session for {onnx_file} with providers={providers}")
sess = ort.InferenceSession(onnx_file.as_posix(), sess_opts, providers=providers)

# The requested list is only a preference: report what the session actually
# registered so a fallback to another provider is visible next to the timings.
active_providers = sess.get_providers()
print(f"[INFO] ONNX Runtime session is using providers={active_providers}")
if active_providers and active_providers[0] != providers[0]:
    print(
        f"[WARN] Requested {providers[0]} but the session is running on "
        f"{active_providers[0]}; ONNX timings are not comparable to device={device}."
    )

# Prepare NumPy inputs: tensors are moved to the CPU and converted to float32;
# anything else is converted with np.array.
np_inputs = [
    inp.detach().cpu().numpy().astype(np.float32)
    if isinstance(inp, torch.Tensor)
    else np.array(inp, dtype=np.float32)
    for inp in input_torch
]

mean_onnx, std_onnx, times_onnx = bench_onnx(sess, np_inputs)
print(f"[RESULT][ONNX] mean {mean_onnx:.2f} ms  std {std_onnx:.2f} ms  (n={runs})")

# Optional: compare the outputs once on the same inputs as a correctness check.
with torch.no_grad():
    pt_out = model(*input_torch) if model is not None else None
onnx_out = sess.run(None, dict(zip([i.name for i in sess.get_inputs()], np_inputs)))
if pt_out is not None:
    # Convert pt_out and onnx_out[0] to NumPy for comparison.
    pt_np = pt_out.detach().cpu().numpy() if isinstance(pt_out, torch.Tensor) else np.array(pt_out)
    onnx_np = np.array(onnx_out[0])
    diff = np.max(np.abs(pt_np - onnx_np))
    print(f"[INFO] max abs diff between PyTorch and ONNX outputs: {diff:.6f}")
else:
    print("[INFO] PyTorch output not available to compare with ONNX.")

[INFO] Loading PyTorch model from F:\ML\PythonAIProject\SMARKMediaTools_web\electron-media-toolbox\python\packages\LAR_IQA\checkpoint_epoch_3.pt onto cuda
[RESULT][PyTorch] mean 43.43 ms  std 24.45 ms  (n=200)
[INFO] Creating ONNX Runtime session for F:\ML\PythonAIProject\SMARKMediaTools_web\electron-media-toolbox\dev\out\lar_iqa.onnx with providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
[RESULT][ONNX] mean 38.61 ms  std 2.50 ms  (n=200)
[INFO] max abs diff between PyTorch and ONNX outputs: 0.000155
