In [1]:
import os
os.environ['PJRT_DEVICE'] = 'CPU'

import numpy as np
import torch 
import tensorflow as tf
import ai_edge_torch
import torchvision

2025-03-04 20:35:59.583496: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-04 20:35:59.592810: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741109759.604060   34223 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741109759.607640   34223 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-04 20:35:59.620252: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import Parameter
import timm

class SeparableConv2d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False):
        super(SeparableConv2d, self).__init__()

        self.conv1 = nn.Sequential(nn.Conv2d(in_channels, in_channels, kernel_size, stride, padding, dilation, groups=in_channels,
                                    bias=bias),
                                    nn.BatchNorm2d(in_channels))
        self.pointwise = nn.Conv2d(in_channels, out_channels, 1, 1, 0, 1, 1, bias=bias)


    def forward(self, x):
        x = self.conv1(x)
        x = self.pointwise(x)
        return x


def normal_init(module, mean=0, std=1, bias=0):
        nn.init.normal_(module.weight, mean, std)
        if hasattr(module, 'bias'):
            nn.init.constant_(module.bias, bias)





class ComplexUpsample(nn.Module):
    def __init__(self, input_dim=128, outpt_dim=128):
        super().__init__()

        self.conv1 = nn.Sequential(SeparableConv2d(input_dim, outpt_dim, kernel_size=3, stride=1, padding=1, bias=False),
                                   nn.BatchNorm2d(outpt_dim),
                                   nn.ReLU(inplace=True)
                                   )

        self.conv2 = nn.Sequential(SeparableConv2d(input_dim, outpt_dim, kernel_size=5, stride=1, padding=2, bias=False),
                                   nn.BatchNorm2d(outpt_dim),
                                   nn.ReLU(inplace=True)
                                   )

    def forward(self, inputs):
        # do preprocess

        x = self.conv1(inputs)
        y = self.conv2(inputs)

        z = x + y

        z = nn.functional.interpolate(z, scale_factor=2,mode='bilinear' )

        return z

class Fpn(nn.Module):
    def __init__(self,input_dims=[24,32,96,320],head_dims=[128,128,128] ):
        super().__init__()





        self.latlayer2=nn.Sequential(SeparableConv2d(input_dims[0],head_dims[0]//2,kernel_size=5,padding=2),
                                      nn.BatchNorm2d(head_dims[0]//2),
                                      nn.ReLU(inplace=True))


        self.latlayer3=nn.Sequential(SeparableConv2d(input_dims[1],head_dims[1]//2,kernel_size=5,padding=2),
                                      nn.BatchNorm2d(head_dims[1]//2),
                                      nn.ReLU(inplace=True))

        self.latlayer4 = nn.Sequential(SeparableConv2d(input_dims[2], head_dims[2] // 2,kernel_size=5,padding=2),
                                       nn.BatchNorm2d(head_dims[2] // 2),
                                       nn.ReLU(inplace=True))



        self.upsample3=ComplexUpsample(head_dims[1],head_dims[0]//2)

        self.upsample4 =ComplexUpsample(head_dims[2],head_dims[1]//2)

        self.upsample5 = ComplexUpsample(input_dims[3],head_dims[2]//2)




    def forward(self, inputs):
        ##/24,32,96,320
        c2, c3, c4, c5 = inputs

        c4_lat = self.latlayer4(c4)
        c3_lat = self.latlayer3(c3)
        c2_lat = self.latlayer2(c2)


        upsample_c5=self.upsample5(c5)

        p4=torch.cat([c4_lat,upsample_c5],dim=1)


        upsample_p4=self.upsample4(p4)

        p3=torch.cat([c3_lat,upsample_p4],dim=1)

        upsample_p3 = self.upsample3(p3)

        p2 = torch.cat([c2_lat, upsample_p3],dim=1)


        return p2



class Net(nn.Module):
    def __init__(self, ):
        super().__init__()
        struct = 'Mobilenetv2'
        if 'Mobilenetv2' in struct:
            self.model = timm.create_model('mobilenetv2_100', pretrained=True, features_only=True,exportable=True)

    def forward(self, inputs):
        # do preprocess

        # Convolution layers
        fms = self.model(inputs)

        # for ff in fms:
        #     print(ff.size())

        return fms[-4:]

class CenterNetHead(nn.Module):
    def __init__(self,nc,head_dims=[128,128,128] ):
        super().__init__()



        self.cls =SeparableConv2d(head_dims[0], nc, kernel_size=3, stride=1, padding=1, bias=True)
        self.wh =SeparableConv2d(head_dims[0], 4, kernel_size=3, stride=1, padding=1, bias=True)
        # self.offset =SeparableConv2d(head_dims[0], 2, kernel_size=3, stride=1, padding=1, bias=True)
        # self.iou_head = nn.Conv2d(head_dims[0], 1, kernel_size=3, stride=1, padding=1, bias=True)

        normal_init(self.cls.pointwise, 0, 0.01,-2.19)
        normal_init(self.wh.pointwise, 0, 0.01, 0)



    def forward(self, inputs):


        cls = self.cls(inputs).sigmoid_()
        wh = self.wh(inputs)
        # offset = self.offset(inputs)
        # iou_aware_head = self.iou_head(inputs).sigmoid_().squeeze(1) #[B, H, W]


        return cls,wh

class CenterNet(nn.Module):
    def __init__(self,nc):
        super().__init__()

        self.nc = nc
        self.down_ratio= 4


        ###model structure
        self.backbone = Net()

        self.fpn=Fpn(head_dims=[128,192,256],input_dims=[24,32,96,320])

        self.head = CenterNetHead(self.nc,head_dims=[128,192,256])



        if self.down_ratio==8:
            self.extra_conv=nn.Sequential(SeparableConv2d([24,32,96,320][-2],[24,32,96,320][-1],
                                                    kernel_size=3,stride=2,padding=1),
                                          nn.BatchNorm2d([24,32,96,320][-1]),
                                          nn.ReLU(inplace=True))
        else:
            self.extra_conv=None



        self.device=torch.device("cuda" if torch.cuda.is_available() else 'cpu')
    def forward(self, inputs):

        fms = self.backbone(inputs)

        if self.extra_conv is not None:

            extra_fm=self.extra_conv(fms[-1])
            fms.append(extra_fm)
            fms=fms[1:]

        fpn_fm=self.fpn(fms)

        cls, wh = self.head(fpn_fm)



        return cls,wh*16




In [3]:
model = CenterNet(10)



In [4]:
model.eval()

CenterNet(
  (backbone): Net(
    (model): EfficientNetFeatures(
      (conv_stem): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): BatchNormAct2d(
        32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
        (drop): Identity()
        (act): ReLU6(inplace=True)
      )
      (blocks): Sequential(
        (0): Sequential(
          (0): DepthwiseSeparableConv(
            (conv_dw): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (bn1): BatchNormAct2d(
              32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
              (drop): Identity()
              (act): ReLU6(inplace=True)
            )
            (aa): Identity()
            (se): Identity()
            (conv_pw): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (bn2): BatchNormAct2d(
              16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=Tru

In [5]:
import time

In [6]:
batch_size = 1
input_height = 320
input_width = 320
device = torch.device('cpu')
model.to(device)
dummy_input = torch.randn(batch_size, 3, input_height, input_width).to(device)
print("Start Tracing")
model = torch.jit.trace(model, dummy_input)
print("End Tracing")

# Create dummy input data
# Quantize the model for faster CPU inference
model_quantized = torch.quantization.quantize_dynamic(
    model, {nn.Conv2d, nn.Linear}, dtype=torch.qint8
)
model_quantized.eval()
model_quantized.to(device)



# Warm-up runs (to exclude initialization overhead)
with torch.no_grad():
    for _ in range(10):
        _ = model_quantized(dummy_input)
        print(_[0].shape)

# Timing settings
num_runs = 100
start_time = time.time()

# Run the model multiple times and measure the total time
with torch.no_grad():
    for _ in range(num_runs):
        outputs = model_quantized(dummy_input)

end_time = time.time()
total_time = end_time - start_time
fps = num_runs / total_time

print(f"Total inference time for {num_runs} runs: {total_time:.2f} seconds")
print(f"Average FPS: {fps:.2f}")

Start Tracing
End Tracing
torch.Size([1, 10, 80, 80])
torch.Size([1, 10, 80, 80])


  param_grad = param.grad


torch.Size([1, 10, 80, 80])
torch.Size([1, 10, 80, 80])
torch.Size([1, 10, 80, 80])
torch.Size([1, 10, 80, 80])
torch.Size([1, 10, 80, 80])
torch.Size([1, 10, 80, 80])
torch.Size([1, 10, 80, 80])
torch.Size([1, 10, 80, 80])
Total inference time for 100 runs: 2.80 seconds
Average FPS: 35.68


In [58]:
model = CenterNet(10)
model.eval()
outputs = model(dummy_input)



In [8]:
#If model is traced/scripted, reload the base model
sample_inputs = (torch.randn(1, 3, 320, 320),)
edge_model = ai_edge_torch.convert(model, sample_inputs)

I0000 00:00:1741109787.755734   34223 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9961 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060, pci bus id: 0000:01:00.0, compute capability: 8.6


INFO:tensorflow:Assets written to: /tmp/tmpzg441gkv/assets


INFO:tensorflow:Assets written to: /tmp/tmpzg441gkv/assets
W0000 00:00:1741109792.719864   34223 tf_tfl_flatbuffer_helpers.cc:365] Ignored output_format.
W0000 00:00:1741109792.719881   34223 tf_tfl_flatbuffer_helpers.cc:368] Ignored drop_control_dependency.
2025-03-04 20:36:32.720315: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /tmp/tmpzg441gkv
2025-03-04 20:36:32.724959: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2025-03-04 20:36:32.724994: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /tmp/tmpzg441gkv
I0000 00:00:1741109792.765325   34223 mlir_graph_optimization_pass.cc:401] MLIR V1 optimization pass is not enabled
2025-03-04 20:36:32.771640: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2025-03-04 20:36:33.121487: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /tmp/tmpzg441gkv
2025-03-04 20:36:33.200

In [9]:
edge_output = edge_model(*sample_inputs)

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [10]:
edge_output[0].shape

(1, 10, 80, 80)

In [11]:
outputs[0].shape

torch.Size([1, 10, 80, 80])

In [12]:
if np.allclose(outputs[0].detach().numpy(), edge_output[0], atol=1e-5):
    print("Inference result with Pytorch and TfLite was within tolerance")
else:
    print("Something wrong with Pytorch --> TfLite")

Something wrong with Pytorch --> TfLite


In [13]:
edge_model.export('mbv2_centernet_ltrb.tflite')

In [16]:
avg_fps = 0.0
n = 100
for i in range(n):
  f1 = time.time()
  edge_output = edge_model(*sample_inputs)
  f2 = time.time()
  avg_fps += 1 / (f2-f1)

print(avg_fps/n)


49.951631328792566


In [24]:
from ai_edge_torch.quantize.quant_config import QuantConfig
from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e
from torch._export import capture_pre_autograd_graph

from ai_edge_torch.quantize.pt2e_quantizer import get_symmetric_quantization_config
from ai_edge_torch.quantize.pt2e_quantizer import PT2EQuantizer
from ai_edge_torch.quantize.quant_config import QuantConfig

In [25]:
pt2e_quantizer = PT2EQuantizer().set_global(
    get_symmetric_quantization_config(is_per_channel=True, is_dynamic=True)
)
sample_args = (torch.rand(1,3,320,320),)
pt2e_torch_model = capture_pre_autograd_graph(model, sample_args)
pt2e_torch_model = prepare_pt2e(pt2e_torch_model, pt2e_quantizer)
pt2e_torch_model(*sample_args)


W0304 20:50:29.573360 34223 site-packages/torch/_export/__init__.py:67] capture_pre_autograd_graph() is deprecated and doesn't provide any function guarantee moving forward.
W0304 20:50:29.573709 34223 site-packages/torch/_export/__init__.py:68] Please switch to use torch.export.export_for_training instead.


(tensor([[[[0.0934, 0.0983, 0.0992,  ..., 0.1025, 0.0973, 0.1050],
           [0.0980, 0.1011, 0.1018,  ..., 0.0944, 0.1026, 0.1046],
           [0.0989, 0.1003, 0.1002,  ..., 0.0959, 0.1013, 0.1005],
           ...,
           [0.1007, 0.1016, 0.0947,  ..., 0.0977, 0.0986, 0.1015],
           [0.0944, 0.0929, 0.1112,  ..., 0.1006, 0.1017, 0.1024],
           [0.1013, 0.0961, 0.1015,  ..., 0.1072, 0.1029, 0.1022]],
 
          [[0.0960, 0.0959, 0.1007,  ..., 0.0996, 0.0983, 0.0959],
           [0.0992, 0.0943, 0.1036,  ..., 0.0941, 0.0933, 0.0958],
           [0.1009, 0.0990, 0.1007,  ..., 0.1026, 0.0968, 0.0969],
           ...,
           [0.0997, 0.0986, 0.0955,  ..., 0.1026, 0.0998, 0.0975],
           [0.0984, 0.1013, 0.1022,  ..., 0.0976, 0.0965, 0.0984],
           [0.1005, 0.0934, 0.1056,  ..., 0.0992, 0.0969, 0.0990]],
 
          [[0.0958, 0.0954, 0.0977,  ..., 0.0926, 0.0939, 0.0937],
           [0.0926, 0.0890, 0.1007,  ..., 0.0993, 0.0937, 0.0968],
           [0.0960, 0.09

In [33]:
# Convert the prepared model to a quantized model
pt2e_torch_model = convert_pt2e(pt2e_torch_model, fold_quantize=False)
pt2e_torch_model = torch.ao.quantization.move_exported_model_to_eval(pt2e_torch_model)

In [34]:
pt2e_drq_model = ai_edge_torch.convert(pt2e_torch_model, sample_args, quant_config=QuantConfig(pt2e_quantizer=pt2e_quantizer))



INFO:tensorflow:Assets written to: /tmp/tmpocgk0xnt/assets


INFO:tensorflow:Assets written to: /tmp/tmpocgk0xnt/assets
W0000 00:00:1741110830.249532   34223 tf_tfl_flatbuffer_helpers.cc:365] Ignored output_format.
W0000 00:00:1741110830.249547   34223 tf_tfl_flatbuffer_helpers.cc:368] Ignored drop_control_dependency.
2025-03-04 20:53:50.249681: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /tmp/tmpocgk0xnt
2025-03-04 20:53:50.253350: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2025-03-04 20:53:50.253376: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /tmp/tmpocgk0xnt
2025-03-04 20:53:50.286851: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2025-03-04 20:53:50.545505: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /tmp/tmpocgk0xnt
2025-03-04 20:53:50.603771: I tensorflow/cc/saved_model/loader.cc:466] SavedModel load for tags { serve }; Status: success: OK. Took 354093

In [31]:
pt2e_drq_model.export('pq_mbv2_centernet_ltrb.tflite')

In [35]:
avg_fps = 0.0
n = 100
for i in range(n):
  f1 = time.time()
  pt2e_edge_output = pt2e_drq_model(*sample_args)
  f2 = time.time()
  avg_fps += 1 / (f2-f1)

print(avg_fps/n)

23.114237047021117


In [36]:
def representative_dataset_gen_from_array():
    # Option 2: Using a numpy array of samples
    # Create a small calibration dataset (replace with your actual data)
    calibration_data = np.random.rand(25, 3,320, 320)  # Example for image data with 100 samples
    
    for i in range(len(calibration_data)):
        sample = calibration_data[i:i+1]
        yield [sample.astype(np.float32)]

In [38]:
import tensorflow as tf

# Pass TfLite Converter quantization flags to _ai_edge_converter_flags parameter.
tfl_converter_flags = {'optimizations': [tf.lite.Optimize.DEFAULT],'target_spec.supported_ops' : [tf.lite.OpsSet.TFLITE_BUILTINS_INT8],'inference_input_type':tf.int8,'inference_output_type':tf.int8,
                       'representative_dataset' :representative_dataset_gen_from_array }

tfl_drq_model = ai_edge_torch.convert(
    model, sample_args, _ai_edge_converter_flags=tfl_converter_flags
)

INFO:tensorflow:Assets written to: /tmp/tmpqjrkkcyj/assets


INFO:tensorflow:Assets written to: /tmp/tmpqjrkkcyj/assets
W0000 00:00:1741111146.970063   34223 tf_tfl_flatbuffer_helpers.cc:365] Ignored output_format.
W0000 00:00:1741111146.970080   34223 tf_tfl_flatbuffer_helpers.cc:368] Ignored drop_control_dependency.
2025-03-04 20:59:06.970260: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /tmp/tmpqjrkkcyj
2025-03-04 20:59:06.975161: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2025-03-04 20:59:06.975219: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /tmp/tmpqjrkkcyj
2025-03-04 20:59:07.026725: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2025-03-04 20:59:07.395066: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /tmp/tmpqjrkkcyj
2025-03-04 20:59:07.485848: I tensorflow/cc/saved_model/loader.cc:466] SavedModel load for tags { serve }; Status: success: OK. Took 515591

In [39]:
tfl_drq_model.export('mbv2_tflite_int8.tflite')

In [49]:
torch.rand((1,3,320,320)).to(torch.int8)

tensor([[[[0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          ...,
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0]],

         [[0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          ...,
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0]],

         [[0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          ...,
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0]]]], dtype=torch.int8)

In [50]:
avg_fps = 0.0
n = 100
sample_int8 = torch.rand((1,3,320,320)).to(torch.int8)
for i in range(n):
  f1 = time.time()
  int8_edge_output = tfl_drq_model(sample_int8)
  f2 = time.time()
  avg_fps += 1 / (f2-f1)

print(avg_fps/n)

67.4330118359619


In [51]:
import openvino as ov 

In [None]:
dummy_input = torch.randn(1, 3, 320, 320)
ovmodel =  ov.compile_model(ov.convert_model(model, example_input=dummy_input))

In [None]:
avg_fps = 0.0
n = 100
dummy_input = torch.randn(1, 3, 320, 320)
for i in range(n):
  f1 = time.time()
  out = ovmodel(dummy_input)
  f2 = time.time()
  avg_fps += 1 / (f2-f1)

print(avg_fps/n)

158.9495103578814


In [55]:
import nncf
from torch.utils.data import Dataset, DataLoader

In [57]:
class RandomDataset(Dataset):
    def __init__(self, size=100, shape=(3, 320, 320)):
        self.size = size
        self.shape = shape
        
    def __len__(self):
        return self.size
        
    def __getitem__(self, idx):
        # Generate random tensor in range [0, 1]
        random_tensor = torch.rand(self.shape)
        return random_tensor

In [61]:
calibration_dataset = RandomDataset(size=100, shape=(3, 320, 320))
calibration_dataloader = DataLoader(calibration_dataset, batch_size=8)
core = ov.Core()

def transform_fn(data_item):
    output = data_item
    return output.float()

calibration_dataset = nncf.Dataset(calibration_dataloader, transform_fn)
# 5. Create quantized model
quantized_model = nncf.quantize(
    model=model,
    calibration_dataset=calibration_dataset
)

# 7. Convert to OpenVINO IR
dummy_input = torch.randn(1, 3, 320,320).float()
quantized_model_ir = ov.convert_model(quantized_model, example_input=dummy_input, input=[-1,3,320,320])
ov.save_model(quantized_model_ir, "./int8.xml")
int8_compiled_model = core.compile_model(quantized_model_ir, 'CPU')

print("Model successfully quantized to INT8 and saved as OpenVINO IR")

Output()



Output()

  return self._level_low.item()
  return self._level_high.item()
Tensor-likes are not close!

Mismatched elements: 62185 / 64000 (97.2%)
Greatest absolute difference: 0.0016182512044906616 at index (0, 8, 38, 20) (up to 1e-05 allowed)
Greatest relative difference: 0.01566274797813593 at index (0, 0, 4, 51) (up to 1e-05 allowed)
  _check_trace(
Tensor-likes are not close!

Mismatched elements: 25564 / 25600 (99.9%)
Greatest absolute difference: 0.2747565507888794 at index (0, 1, 42, 20) (up to 1e-05 allowed)
Greatest relative difference: 2036842.4069914538 at index (0, 1, 9, 54) (up to 1e-05 allowed)
  _check_trace(


Model successfully quantized to INT8 and saved as OpenVINO IR


In [62]:
avg_fps = 0.0
n = 100
dummy_input = torch.randn(1, 3, 320, 320)
for i in range(n):
  f1 = time.time()
  out = int8_compiled_model(dummy_input)
  f2 = time.time()
  avg_fps += 1 / (f2-f1)

print(avg_fps/n)

429.92943314090473
