In [26]:
import numpy as np
import os
import onnxruntime
import time
import onnxruntime
from onnxruntime.quantization import CalibrationDataReader, create_calibrator, write_calibration_table

In [19]:
def benchmark(model_path, input_shape=(1,3,224,224), providers=['CPUExecutionProvider'], runs=10):
    """Time repeated inference of an ONNX model on an all-zero float32 input.

    One untimed warm-up run is executed first. Each timed run's latency is
    printed in milliseconds, followed by the average over all timed runs.

    Args:
        model_path: Path of the ONNX model handed to ``InferenceSession``.
        input_shape: Shape of the zero-filled float32 tensor fed to the
            model's first input.
        providers: Execution providers handed to ``InferenceSession``.
        runs: Number of timed runs; must be at least 1.

    Returns:
        The average latency of the timed runs, in milliseconds.

    Raises:
        ValueError: If ``runs`` is smaller than 1.
    """
    # Validate up front so a bad value fails before a session is built and
    # cannot cause a division by zero when averaging.
    if runs < 1:
        raise ValueError(f"runs must be at least 1, got {runs!r}")

    sess_options = onnxruntime.SessionOptions()
    session = onnxruntime.InferenceSession(model_path, sess_options, providers=providers)
    input_name = session.get_inputs()[0].name
    feed = {input_name: np.zeros(input_shape, np.float32)}

    # Warming up: the first call is excluded from the measurements.
    session.run([], feed)

    total_ms = 0.0
    for _ in range(runs):
        start = time.perf_counter()
        session.run([], feed)
        elapsed_ms = (time.perf_counter() - start) * 1000
        total_ms += elapsed_ms
        print(f"{elapsed_ms:.2f}ms")

    avg_ms = total_ms / runs
    print(f"Avg: {avg_ms:.2f}ms")
    return avg_ms

In [20]:
# Model files used for benchmarking. These are absolute paths on the author's
# machine and need to be adjusted when the notebook is run elsewhere.
fp32_file= '/home/PJLAB/maningsheng/workspace/mse/ppq/working/end2end.onnx'
# NOTE(review): int8_file is not referenced by any cell visible here —
# presumably it was meant to be benchmarked for comparison; confirm.
int8_file= '/home/PJLAB/maningsheng/workspace/mse/ppq/working/quantized-ort_oos_int8.onnx'
# Time the model at fp32_file with benchmark()'s default input shape and providers.
benchmark(fp32_file)

11.08ms
10.72ms
10.60ms
10.51ms
10.61ms
10.46ms
10.49ms
10.50ms
10.48ms
10.57ms
Avg: 10.60ms


In [35]:
# Build an INT8 calibration table for the model, then open a TensorRT session
# on it. Order matters: the ORT_TENSORRT_* variables and the calibration table
# are prepared first, and the session is created last so it can pick them up.
model_path = '/data/mse/ppq/working/end2end.onnx'
augmented_model_path = '/data/mse/ppq/working/end2end-ort-trt-int8.onnx'

# INT8 calibration setting
calibration_table_generation_enable = True  # Enable/Disable INT8 calibration

# TensorRT EP INT8 settings
# Set before the session is created; changing them afterwards does not affect
# a session that already exists.
os.environ["ORT_TENSORRT_FP16_ENABLE"] = "1"  # Enable FP16 precision
os.environ["ORT_TENSORRT_INT8_ENABLE"] = "1"  # Enable INT8 precision
os.environ["ORT_TENSORRT_INT8_CALIBRATION_TABLE_NAME"] = "calibration.flatbuffers"  # Calibration table name
os.environ["ORT_TENSORRT_ENGINE_CACHE_ENABLE"] = "1"  # Enable engine caching
execution_provider = ["TensorrtExecutionProvider"]


# Generate INT8 calibration table
# ImageNetDataReader is defined in a later cell of this notebook; that cell
# must have been executed before this one.
if calibration_table_generation_enable:
    calibrator = create_calibrator(model_path, [], augmented_model_path=augmented_model_path)
    calibrator.set_execution_providers(["CUDAExecutionProvider"])
    data_reader = ImageNetDataReader()
    calibrator.collect_data(data_reader)
    write_calibration_table(calibrator.compute_range())

# Create the inference session only after the settings above are in place.
sess_options = onnxruntime.SessionOptions()
providers=["TensorrtExecutionProvider"]
sess_options.log_severity_level = 0  # 0 = most verbose logging
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
session = onnxruntime.InferenceSession(model_path, sess_options=sess_options, providers=providers)

2022-11-15 16:13:48.842808783 [I:onnxruntime:, inference_session.cc:331 ConstructorCommon] Creating and using per session threadpools since use_per_session_threads_ is true
2022-11-15 16:13:48.842826819 [I:onnxruntime:, inference_session.cc:351 ConstructorCommon] Dynamic block base set to 0
2022-11-15 16:13:48.857497700 [I:onnxruntime:, inference_session.cc:1327 Initialize] Initializing session.
2022-11-15 16:13:48.857524073 [I:onnxruntime:, inference_session.cc:1364 Initialize] Adding default CPU execution provider.
2022-11-15 16:13:48.857676550 [V:onnxruntime:, inference_session.cc:150 VerifyEachNodeIsAssignedToAnEp] Node placements
2022-11-15 16:13:48.857681587 [V:onnxruntime:, inference_session.cc:152 VerifyEachNodeIsAssignedToAnEp] All nodes have been placed on [CPUExecutionProvider].
2022-11-15 16:13:48.857747724 [V:onnxruntime:, session_state.cc:68 CreateGraphInfo] SaveMLValueNameIndexMapping
2022-11-15 16:13:48.857771036 [V:onnxruntime:, session_state.cc:114 CreateGraphInfo] Do

In [37]:
# Run 16 inferences on random float32 inputs and keep every result.
# The feed key is taken from the session rather than assumed to be "input",
# so the loop keeps working if the model's input tensor has another name.
input_name = session.get_inputs()[0].name
inference_outputs_list = []
for i in range(16):
    inputs = np.random.randn(1, 3, 224, 224).astype(np.float32)
    output = session.run(None, {input_name: inputs})
    inference_outputs_list.append(output)

2022-11-16 19:53:50.360646059 [I:onnxruntime:, sequential_executor.cc:176 Execute] Begin execution
2022-11-16 19:53:50.385602139 [I:onnxruntime:, sequential_executor.cc:176 Execute] Begin execution
2022-11-16 19:53:50.405855643 [I:onnxruntime:, sequential_executor.cc:176 Execute] Begin execution
2022-11-16 19:53:50.425132687 [I:onnxruntime:, sequential_executor.cc:176 Execute] Begin execution
2022-11-16 19:53:50.442909269 [I:onnxruntime:, sequential_executor.cc:176 Execute] Begin execution
2022-11-16 19:53:50.460875825 [I:onnxruntime:, sequential_executor.cc:176 Execute] Begin execution
2022-11-16 19:53:50.479056812 [I:onnxruntime:, sequential_executor.cc:176 Execute] Begin execution
2022-11-16 19:53:50.493803559 [I:onnxruntime:, sequential_executor.cc:176 Execute] Begin execution
2022-11-16 19:53:50.509407053 [I:onnxruntime:, sequential_executor.cc:176 Execute] Begin execution
2022-11-16 19:53:50.525084677 [I:onnxruntime:, sequential_executor.cc:176 Execute] Begin execution
2022-11-16

In [39]:
# Report the shape of the first output tensor from each recorded run.
first_output_shapes = [run_outputs[0].shape for run_outputs in inference_outputs_list]
print(first_output_shapes)

[(1, 1000), (1, 1000), (1, 1000), (1, 1000), (1, 1000), (1, 1000), (1, 1000), (1, 1000), (1, 1000), (1, 1000), (1, 1000), (1, 1000), (1, 1000), (1, 1000), (1, 1000), (1, 1000)]


In [34]:
class ImageNetDataReader(CalibrationDataReader):
    """Calibration data reader that serves randomly generated float32 tensors.

    Despite the name, no ImageNet images are loaded: every sample is drawn
    from a standard normal distribution.

    Args:
        num_samples: Number of samples to serve before ``get_next`` returns
            ``None``.
        input_name: Key under which each sample is returned; it should match
            the model's input tensor name.
        input_shape: Shape of each generated sample.
        seed: Optional seed for reproducible samples. When ``None`` the
            global ``np.random`` state is used.
    """

    def __init__(self, num_samples=16, input_name="input", input_shape=(1, 3, 224, 224), seed=None):
        self.num = num_samples  # samples still to be served
        self.input_name = input_name
        self.input_shape = tuple(input_shape)
        # A dedicated RandomState keeps seeded runs independent of global state.
        self._rng = np.random if seed is None else np.random.RandomState(seed)

    def get_next(self):
        """Return the next ``{input_name: sample}`` feed, or ``None`` when exhausted."""
        if self.num <= 0:
            return None
        self.num -= 1
        img = self._rng.randn(*self.input_shape).astype(np.float32)
        return {self.input_name: img}