In [1]:
# Code taken from:
# https://github.com/microsoft/onnxruntime-inference-examples/tree/main/quantization/image_classification/cpu

import torch
import onnxruntime
import os
from onnxruntime.quantization import (CalibrationDataReader, QuantFormat, 
                                      QuantType, quantize_static, quant_pre_process)
from PIL import Image
import numpy as np
import time
import cv2

from tqdm import tqdm


# Original (float) ONNX model to be quantized.
input_model_path = '/path/to/stored/onnx-model.onnx'
# Pre-processed model: a copy of the input model augmented with input/shape information.
# It is created from the input model so that quantization works correctly.
preprocess_model_path = '/path/to/save/onnx-preprocess-model.onnx'
# Destination file for the quantized model.
output_model_path = '/path/to/save/model-quan.onnx'
# Folder whose entries are read as calibration images.
calibration_dataset_path = '/path/to/folders/with/frames'


def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleup=True, stride=32):
    """
    Resize an image so it fits inside ``new_shape`` and pad the remainder with ``color``.

    parameter im: image array whose first two axes are (height, width)
    parameter new_shape: target (height, width); a single int means a square target
    parameter color: constant value used for the padded border
    parameter auto: when True the padding is reduced modulo ``stride`` (minimum rectangle)
    parameter scaleup: when False the image is only ever shrunk, never enlarged
    parameter stride: modulus applied to the padding when ``auto`` is True
    return: tuple (padded image, scale ratio, (dw, dh)) where dw/dh is the padding per side
    """
    src_h, src_w = im.shape[:2]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    target_h, target_w = new_shape[0], new_shape[1]

    # Scale ratio (new / old): the tighter of the two axes wins.
    r = min(target_h / src_h, target_w / src_w)
    if not scaleup:
        # Shrink only; an image that already fits keeps its size.
        r = min(r, 1.0)

    # Size of the image after scaling, before any padding is added.
    resized_w = int(round(src_w * r))
    resized_h = int(round(src_h * r))

    # Total padding needed along each axis.
    dw = target_w - resized_w
    dh = target_h - resized_h
    if auto:
        dw = np.mod(dw, stride)
        dh = np.mod(dh, stride)

    # Half of the padding goes to each side.
    dw = dw / 2
    dh = dh / 2

    if (src_w, src_h) != (resized_w, resized_h):
        im = cv2.resize(im, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR)

    # The +/-0.1 nudge sends the odd pixel of an odd total padding to bottom/right.
    top = int(round(dh - 0.1))
    bottom = int(round(dh + 0.1))
    left = int(round(dw - 0.1))
    right = int(round(dw + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return im, r, (dw, dh)


def preprocess_image(image, new_shape):
    """
    Turn one BGR image into a normalized float32 batch of size one.

    parameter image: image array passed to cv2.cvtColor with COLOR_BGR2RGB
    parameter new_shape: target shape forwarded to letterbox (called with auto=False)
    return: contiguous float32 array with a leading batch axis, values divided by 255
    """
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Only the padded image is needed; the ratio and padding are discarded.
    padded, _ratio, _dwdh = letterbox(rgb, new_shape=new_shape, auto=False)
    # Move the last axis to the front, then prepend the batch axis.
    channels_first = np.transpose(padded, (2, 0, 1)).astype(np.float32, copy=False)
    tensor = np.ascontiguousarray(channels_first[np.newaxis])
    tensor /= 255
    return tensor


def _preprocess_images(images_folder: str, batch_size: int, height: int, width: int, size_limit=0):
    """
    Loads a batch of images and preprocess them
    parameter images_folder: path to folder storing images
    parameter batch_size: batch size for the model
    parameter height: image height in pixels
    parameter width: image width in pixels
    parameter size_limit: number of images to load. Default is 0 which means all images are picked.
    return: list of matrices characterizing multiple images
    """
    print('Prepare data...')
    image_names = os.listdir(images_folder)
    if size_limit > 0 and len(image_names) >= size_limit:
        batch_filenames = [image_names[i] for i in range(size_limit)]
    else:
        batch_filenames = image_names
    data_list = []
    batch_data_list = []
    for image_name in tqdm(batch_filenames):
        image_filepath = images_folder + "/" + image_name
        img = cv2.imread(image_filepath)
        nchw_data = preprocess_image(img, new_shape=(height, width))
        batch_data_list.append(nchw_data)
        if len(batch_data_list) == batch_size:
            data_list.append(
                np.concatenate(batch_data_list, axis=0)
            )
            batch_data_list = []
    print('Data is ready!')
    return data_list


class DatasetDataReader(CalibrationDataReader):
    """
    Calibration data reader that serves pre-batched images from a folder.

    All images are loaded and preprocessed eagerly in the constructor.
    get_next then returns one ``{input_name: batch}`` dict per call and
    None once every batch has been served; rewind restarts the iteration.
    """

    def __init__(self, calibration_image_folder: str, model_path: str, size_limit=5_000):
        """
        parameter calibration_image_folder: folder holding the calibration images
        parameter model_path: ONNX model whose first input provides the input name and shape
        parameter size_limit: maximum number of images to load (forwarded to _preprocess_images)
        raises ValueError: if the first input is not 4-dimensional or its batch/height/width
                           is not a fixed positive integer
        """
        self.enum_data = None

        # Use inference session to get input shape.
        session = onnxruntime.InferenceSession(
            model_path, None,
            providers=['CPUExecutionProvider'],
            #['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
        )
        model_input = session.get_inputs()[0]
        input_shape = model_input.shape
        print(f'Input: {input_shape}')

        if len(input_shape) != 4:
            raise ValueError(f'Expected a 4-dimensional model input, got shape {input_shape}')
        batch_size, _, height, width = input_shape
        # A dynamic axis is not reported as an int; batching against it would never
        # complete a batch and the calibration set would silently end up empty.
        for dim_name, dim in (('batch', batch_size), ('height', height), ('width', width)):
            if isinstance(dim, bool) or not isinstance(dim, int) or dim <= 0:
                raise ValueError(
                    f'Model input dimension "{dim_name}" must be a fixed positive integer, '
                    f'got {dim!r} in shape {input_shape}'
                )

        # Convert image to input data
        self.batched_data_list = _preprocess_images(
            calibration_image_folder, batch_size, height, width, size_limit=size_limit
        )
        self.input_name = model_input.name
        self.datasize = len(self.batched_data_list)

    def get_next(self):
        """Return the next ``{input_name: batch}`` dict, or None when the data is exhausted."""
        if self.enum_data is None:
            self.enum_data = self.create_generator()
        return next(self.enum_data, None)

    def create_generator(self, print_each=10, divide_by=5):
        """
        Yield one feed dict per stored batch and print a text progress bar.

        parameter print_each: print the bar after every print_each served batches
        parameter divide_by: percent represented by a single bar character
        """
        max_counter = len(self.batched_data_list)
        for counter, batch in enumerate(self.batched_data_list, start=1):
            yield {
                self.input_name: batch,
            }

            # Runs when the consumer asks for the following item.
            if counter % print_each == 0:
                loaded_percent = int((counter / max_counter) * 100)
                not_loaded_percent_normed = (100 - loaded_percent) // divide_by
                loaded_percent_normed = loaded_percent // divide_by
                print('+' * loaded_percent_normed, '-' * not_loaded_percent_normed)

    def rewind(self):
        """Reset the iteration so that the next get_next call starts from the first batch."""
        self.enum_data = None

In [2]:
def benchmark(model_path, runs=10):
    """
    Time CPU inference of an ONNX model on an all-zero input.

    One untimed warm-up run is followed by ``runs`` timed runs; the latency of
    each run and the average are printed in milliseconds.

    parameter model_path: path to the ONNX model; its first input must have a fully static 4-D shape
    parameter runs: number of timed runs (default 10)
    raises ValueError: if runs is smaller than 1
    """
    if runs < 1:
        raise ValueError(f"runs must be at least 1, got {runs}")

    session = onnxruntime.InferenceSession(
        model_path, None,
        providers=['CPUExecutionProvider'],
        #['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
    )
    model_input = session.get_inputs()[0]
    input_name = model_input.name
    batch_size, channel, height, width = model_input.shape
    input_data = np.zeros((batch_size, channel, height, width), np.float32)

    # Warming up
    _ = session.run([], {input_name: input_data})

    total_ms = 0.0
    for _ in tqdm(range(runs)):
        start = time.perf_counter()
        session.run([], {input_name: input_data})
        elapsed_ms = (time.perf_counter() - start) * 1000
        total_ms += elapsed_ms
        # tqdm.write keeps the message from clobbering the progress bar.
        tqdm.write(f"{elapsed_ms:.2f}ms")
    print(f"Avg: {total_ms / runs:.2f}ms")

In [3]:
# Write a pre-processed copy of the input model to preprocess_model_path;
# that file is what quantize_static is given further down.
# Going by the flag names: symbolic shape inference is skipped, optimization is kept.
quant_pre_process(
    input_model_path, preprocess_model_path,
    skip_symbolic_shape=True, skip_optimization=False
)

In [4]:
# Build the calibration data reader. The constructor reads the input shape from
# input_model_path and preprocesses every calibration image up front, so this cell is slow.
dr = DatasetDataReader(
    calibration_dataset_path, input_model_path
)

Input: [16, 3, 640, 640]
Prepare data...


100%|██████████| 5000/5000 [03:30<00:00, 23.80it/s]

Data is ready!





In [5]:
# Calibrate the pre-processed model with the batches served by `dr`
# and write the quantized model to output_model_path.
# Model optimization during quantization is turned off (optimize_model=False).
quantize_static(
    preprocess_model_path,
    output_model_path,
    dr,
    quant_format=QuantFormat.QDQ,
    # Only the listed operator types are quantized; pass None to quantize all operations.
    op_types_to_quantize=['Conv', 'Relu', 'Add', 'MatMul', 'Mul'],
    per_channel=False,
    activation_type=QuantType.QUInt8,
    weight_type=QuantType.QUInt8, # alternative: QuantType.QInt8
    optimize_model=False,
)
print("Calibrated and quantized model saved.")

 -------------------
+ ------------------
+ ------------------
++ -----------------
+++ ----------------
+++ ----------------
++++ ---------------
+++++ ---------------
+++++ --------------
++++++ -------------
+++++++ -------------
+++++++ ------------
++++++++ -----------
++++++++ -----------
+++++++++ ----------
++++++++++ ---------
++++++++++ ---------
+++++++++++ --------
++++++++++++ --------
++++++++++++ -------
+++++++++++++ ------
++++++++++++++ ------
++++++++++++++ -----
+++++++++++++++ ----
++++++++++++++++ ----
++++++++++++++++ ---
+++++++++++++++++ --
+++++++++++++++++ --
++++++++++++++++++ -
+++++++++++++++++++ 
+++++++++++++++++++ 
Calibrated and quantized model saved.


In [3]:
# Compare CPU inference latency of the original model and the quantized model.
print("benchmarking fp32 model...")
benchmark(input_model_path)

print("benchmarking int8 model...")
benchmark(output_model_path)

benchmarking fp32 model...


 10%|█         | 1/10 [00:42<06:23, 42.60s/it]

42598.93ms


 20%|██        | 2/10 [01:28<05:49, 43.65s/it]

46098.53ms


 30%|███       | 3/10 [02:14<05:09, 44.24s/it]

45601.54ms


 40%|████      | 4/10 [02:56<04:22, 43.77s/it]

42694.53ms


 50%|█████     | 5/10 [03:38<03:35, 43.15s/it]

41696.89ms


 60%|██████    | 6/10 [04:21<02:52, 43.17s/it]

43195.51ms


 70%|███████   | 7/10 [05:02<02:06, 42.28s/it]

40207.47ms


 80%|████████  | 8/10 [05:42<01:23, 41.71s/it]

40396.84ms


 90%|█████████ | 9/10 [06:21<00:40, 40.99s/it]

39291.61ms


100%|██████████| 10/10 [07:04<00:00, 42.43s/it]

42501.71ms
Avg: 42428.36ms
benchmarking int8 model...



 10%|█         | 1/10 [00:26<04:00, 26.70s/it]

26696.29ms


 20%|██        | 2/10 [00:54<03:37, 27.15s/it]

28211.14ms


 30%|███       | 3/10 [01:23<03:13, 27.59s/it]

28515.55ms


 40%|████      | 4/10 [01:52<02:47, 27.89s/it]

28501.50ms


 50%|█████     | 5/10 [02:21<02:21, 28.28s/it]

29205.83ms


 60%|██████    | 6/10 [02:50<01:53, 28.50s/it]

28999.91ms


 70%|███████   | 7/10 [03:18<01:25, 28.35s/it]

27915.00ms


 80%|████████  | 8/10 [03:47<00:57, 28.63s/it]

29199.97ms


 90%|█████████ | 9/10 [04:16<00:28, 28.86s/it]

29397.93ms


100%|██████████| 10/10 [04:46<00:00, 28.61s/it]

29098.38ms
Avg: 28574.15ms





In [None]:
import argparse
import onnx
from onnxruntime.quantization.qdq_loss_debug import (
    collect_activations, compute_activation_error, compute_weight_error,
    create_activation_matching, create_weight_matching,
    modify_model_output_intermediate_tensors)

import resnet50_data_reader


def get_args(argv=None):
    """
    Parse the options of the float-vs-QDQ comparison.

    parameter argv: list of argument strings to parse. Default is None, which makes
                    argparse read sys.argv[1:]. Pass an explicit list when the process
                    arguments are not these options (e.g. when called from a notebook cell).
    return: namespace with float_model, qdq_model and calibrate_dataset
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--float_model", required=True, help="Path to original floating point model"
    )
    parser.add_argument("--qdq_model", required=True, help="Path to qdq model")
    parser.add_argument(
        "--calibrate_dataset", default="./test_images", help="calibration data set"
    )
    return parser.parse_args(argv)


def _generate_aug_model_path(model_path: str) -> str:
    aug_model_path = (
        model_path[: -len(".onnx")] if model_path.endswith(".onnx") else model_path
    )
    return aug_model_path + ".save_tensors.onnx"


def main():
    """
    Compare a float model with its QDQ counterpart.

    Prints the per-weight error first, then saves an augmented copy of each model,
    collects activations from both copies with the same data reader and prints the
    per-activation errors.
    """
    # Process input parameters
    cli = get_args()
    float_model_path = cli.float_model
    qdq_model_path = cli.qdq_model
    calibration_dataset_path = cli.calibrate_dataset

    rule = "------------------------------------------------\n"

    print(rule)
    print("Comparing weights of float model vs qdq model.....")

    weights_error = compute_weight_error(
        create_weight_matching(float_model_path, qdq_model_path)
    )
    for weight_name in weights_error:
        print(f"Cross model error of '{weight_name}': {weights_error[weight_name]}\n")

    print(rule)
    print("Augmenting models to save intermediate activations......")

    # Float model first, then the QDQ model; each augmented model is released
    # before the next one is built.
    augmented_paths = []
    for source_path in (float_model_path, qdq_model_path):
        augmented = modify_model_output_intermediate_tensors(source_path)
        target_path = _generate_aug_model_path(source_path)
        onnx.save(augmented, target_path, save_as_external_data=False)
        del augmented
        augmented_paths.append(target_path)
    aug_float_model_path, aug_qdq_model_path = augmented_paths

    print(rule)
    print("Running the augmented floating point model to collect activations......")
    input_data_reader = resnet50_data_reader.ResNet50DataReader(
        calibration_dataset_path, float_model_path
    )
    float_activations = collect_activations(aug_float_model_path, input_data_reader)

    print(rule)
    print("Running the augmented qdq model to collect activations......")
    # Reuse the same reader from its beginning for the second model.
    input_data_reader.rewind()
    qdq_activations = collect_activations(aug_qdq_model_path, input_data_reader)

    print(rule)
    print("Comparing activations of float model vs qdq model......")

    act_error = compute_activation_error(
        create_activation_matching(qdq_activations, float_activations)
    )
    for act_name, err in act_error.items():
        print(f"Cross model error of '{act_name}': {err['xmodel_err']} \n")
        print(f"QDQ error of '{act_name}': {err['qdq_err']} \n")


if __name__ == "__main__":
    main()

In [None]:
import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType

# Dynamic quantization: reads the float model at model_fp32 and is given model_quant
# as the output path.
model_fp32 = 'path/to/the/model.onnx'
model_quant = 'path/to/the/model.quant.onnx'
# NOTE(review): the result is presumably the file at model_quant and the value
# assigned here is likely None - confirm before using quantized_model.
quantized_model = quantize_dynamic(model_fp32, model_quant)

In [4]:
import torch
import onnxruntime
from onnxruntime.quantization.registry import QDQRegistry, QLinearOpsRegistry

In [5]:
# Display the registry used for the QDQ format: operator type -> quantization handler class.
QDQRegistry

{'Conv': onnxruntime.quantization.operators.conv.QDQConv,
 'Gemm': onnxruntime.quantization.operators.gemm.QDQGemm,
 'Clip': onnxruntime.quantization.operators.activation.QDQRemovableActivation,
 'Relu': onnxruntime.quantization.operators.activation.QDQRemovableActivation,
 'Reshape': onnxruntime.quantization.operators.direct_q8.QDQDirect8BitOp,
 'Transpose': onnxruntime.quantization.operators.direct_q8.QDQDirect8BitOp,
 'Squeeze': onnxruntime.quantization.operators.direct_q8.QDQDirect8BitOp,
 'Unsqueeze': onnxruntime.quantization.operators.direct_q8.QDQDirect8BitOp,
 'Resize': onnxruntime.quantization.operators.resize.QDQResize,
 'MaxPool': onnxruntime.quantization.operators.maxpool.QDQMaxPool,
 'AveragePool': onnxruntime.quantization.operators.direct_q8.QDQDirect8BitOp,
 'MatMul': onnxruntime.quantization.operators.matmul.QDQMatMul,
 'Split': onnxruntime.quantization.operators.split.QDQSplit,
 'Gather': onnxruntime.quantization.operators.gather.QDQGather,
 'Softmax': onnxruntime.quan

In [6]:
# Display the registry of QLinear operators: operator type -> quantization handler class.
QLinearOpsRegistry

{'ArgMax': onnxruntime.quantization.operators.argmax.QArgMax,
 'Conv': onnxruntime.quantization.operators.conv.QLinearConv,
 'Gemm': onnxruntime.quantization.operators.gemm.QLinearGemm,
 'MatMul': onnxruntime.quantization.operators.matmul.QLinearMatMul,
 'Add': onnxruntime.quantization.operators.binary_op.QLinearBinaryOp,
 'Mul': onnxruntime.quantization.operators.binary_op.QLinearBinaryOp,
 'Relu': onnxruntime.quantization.operators.activation.QLinearActivation,
 'Clip': onnxruntime.quantization.operators.activation.QLinearActivation,
 'LeakyRelu': onnxruntime.quantization.operators.activation.QLinearActivation,
 'Sigmoid': onnxruntime.quantization.operators.activation.QLinearActivation,
 'MaxPool': onnxruntime.quantization.operators.maxpool.QMaxPool,
 'GlobalAveragePool': onnxruntime.quantization.operators.gavgpool.QGlobalAveragePool,
 'Split': onnxruntime.quantization.operators.split.QSplit,
 'Pad': onnxruntime.quantization.operators.pad.QPad,
 'Reshape': onnxruntime.quantization.op