In [1]:
import sys

sys.path.append("../misc")
from utils import DataShaper
import os
import torch
import math
import matplotlib.pyplot as plt
import numpy as np
from brevitas.nn import QuantConv2d, QuantIdentity, QuantReLU
from brevitas.quant.fixed_point import (
    Int8ActPerTensorFixedPoint,
    Int8WeightPerTensorFixedPoint,
    Uint8ActPerTensorFixedPoint,
)

ds = DataShaper()
torch.manual_seed(0)
design = "four_bottleneck"
# aie_teardown()
sys.path.append("../../../utils")
import xrtutils

xclbin_path = os.path.abspath("../bottleneck_block/" + design + "/build/final.xclbin")
insts_path = os.path.abspath("../bottleneck_block/" + design + "/build/insts.txt")

log_folder = "log/log_" + design
if not os.path.exists(log_folder):
    os.makedirs(log_folder)
enable_aie = True
aie_is_setup = False
enable_trace = False
trace_file = "traces/trace.txt"

app = None
in_buf = None
arg1_buf = None
out_buf = None
dtype_in = np.dtype("int8")
dtype_wts = np.dtype("int8")
dtype_out = np.dtype("uint8")

shape_in_act = (32, 8, 32, 8)  #'YCXC8' , 'CYX'
# shape_in_wts1  = (8,8,1,1,8,8) #out,in,ky,kx,in8,out8
# shape_in_wts2  = (8,8,3,3,8,8)  #out,in,ky,kx,in8,out8
# shape_in_wts3  = (32,8,1,1,8,8)   #out,in,ky,kx,in8,out8
# shape_in_wts_skip  = (8,32,1,1,8,8) #out,in,ky,kx,in8,out8
shape_total_wts = (282624, 1)
shape_out = (32, 32, 32, 8)

trace_size = 8192


def setup_aie(
    xclbin_path,
    insts_path,
    in_0_shape,
    in_0_dtype,
    in_1_shape,
    in_1_dtype,
    out_buf_shape,
    out_buf_dtype,
    enable_trace=False,
    kernel_name="MLIR_AIE",
):
    app = xrtutils.AIE_Application(xclbin_path, insts_path, kernel_name)
    app.register_buffer(2, shape=in_0_shape, dtype=in_0_dtype)
    app.register_buffer(3, shape=in_1_shape, dtype=in_1_dtype)
    if enable_trace:
        out_buf_len_bytes = np.prod(out_buf_shape) * np.dtype(out_buf_dtype).itemsize
        out_buf_shape = (out_buf_len_bytes + trace_size,)
        out_buf_dtype = np.uint8
    app.register_buffer(4, shape=out_buf_shape, dtype=out_buf_dtype)
    return app


def extract_trace(out_buf, out_buf_shape, out_buf_dtype):
    trace_size_words = trace_size // 4
    out_buf_flat = out_buf.reshape((-1,)).view(np.uint32)
    output_prefix = (
        out_buf_flat[:-trace_size_words].view(out_buf_dtype).reshape(out_buf_shape)
    )
    trace_suffix = out_buf_flat[-trace_size_words:]
    return output_prefix, trace_suffix


def write_out_trace(trace, file_name):
    out_str = "\n".join(f"{i:0{8}x}" for i in trace if i != 0)
    with open(file_name, "w") as f:
        f.write(out_str)


app = setup_aie(
    xclbin_path,
    insts_path,
    shape_in_act,
    dtype_in,
    shape_total_wts,
    dtype_wts,
    shape_out,
    dtype_out,
    enable_trace,
)

In [2]:
import matplotlib.pyplot as plt
import torchvision
import torch
import torch.nn as nn
import numpy as np
import os
from brevitas.nn import QuantConv2d, QuantIdentity, QuantReLU
from brevitas.quant.fixed_point import (
    Int8ActPerTensorFixedPoint,
    Int8WeightPerTensorFixedPoint,
    Uint8ActPerTensorFixedPoint,
)

torch.manual_seed(0)
torch.use_deterministic_algorithms(True)


num_classes = 10


def init_pad_input(x, input_channels, desired_channels=4):
    padding = torch.zeros(1, input_channels * (desired_channels - 1), 32, 32)
    return torch.cat((x, padding), 1)


# try:

In [4]:
for i in range(0, 4):

    class QuantBottleneck_projected(nn.Module):
        expansion = 4

        def __init__(self, in_planes=64, planes=64):
            super(QuantBottleneck_projected, self).__init__()
            # block 0
            self.quant_id_1 = QuantIdentity(
                act_quant=Int8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            self.quant_block0_conv1 = QuantConv2d(
                in_planes,
                planes,
                kernel_size=1,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block0_conv2 = QuantConv2d(
                planes,
                planes,
                kernel_size=3,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                padding=1,
                padding_mode="zeros",
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block0_conv3 = QuantConv2d(
                planes,
                self.expansion * planes,
                kernel_size=1,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block0_relu1 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            self.quant_block0_relu2 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            self.quant_block0_relu3 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )

            self.shortcut = QuantConv2d(
                in_planes,
                self.expansion * planes,
                kernel_size=1,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )

            # block 1
            self.quant_block1_conv1 = QuantConv2d(
                self.expansion * in_planes,
                planes,
                kernel_size=1,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block1_conv2 = QuantConv2d(
                planes,
                planes,
                kernel_size=3,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                padding=1,
                padding_mode="zeros",
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block1_conv3 = QuantConv2d(
                planes,
                self.expansion * planes,
                kernel_size=1,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block1_relu1 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            self.quant_block1_relu2 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            self.quant_block1_relu3 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )

            self.quant_add_1 = QuantIdentity(
                act_quant=Int8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            # Quant_add_1 shares the scale factors with block0_relu3, however one is signed and the other one is unsigned
            self.quant_add_1.act_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl = (
                self.quant_block0_relu3.act_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl
            )
            self.quant_add_1.act_quant.fused_activation_quant_proxy.tensor_quant.int_scaling_impl = (
                self.quant_block0_relu3.act_quant.fused_activation_quant_proxy.tensor_quant.int_scaling_impl
            )

            # block 2
            self.quant_block2_conv1 = QuantConv2d(
                self.expansion * in_planes,
                planes,
                kernel_size=1,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block2_conv2 = QuantConv2d(
                planes,
                planes,
                kernel_size=3,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                padding=1,
                padding_mode="zeros",
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block2_conv3 = QuantConv2d(
                planes,
                self.expansion * planes,
                kernel_size=1,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block2_relu1 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            self.quant_block2_relu2 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            self.quant_block2_relu3 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )

            self.quant_add_2 = QuantIdentity(
                act_quant=Int8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            # Quant_add_1 shares the scale factors with block0_relu3, however one is signed and the other one is unsigned
            self.quant_add_2.act_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl = (
                self.quant_block1_relu3.act_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl
            )
            self.quant_add_2.act_quant.fused_activation_quant_proxy.tensor_quant.int_scaling_impl = (
                self.quant_block1_relu3.act_quant.fused_activation_quant_proxy.tensor_quant.int_scaling_impl
            )

            # block 3
            self.quant_block3_conv1 = QuantConv2d(
                self.expansion * in_planes,
                planes,
                kernel_size=1,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block3_conv2 = QuantConv2d(
                planes,
                planes,
                kernel_size=3,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                padding=1,
                padding_mode="zeros",
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block3_conv3 = QuantConv2d(
                planes,
                self.expansion * planes,
                kernel_size=1,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block3_relu1 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            self.quant_block3_relu2 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            self.quant_block3_relu3 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )

            self.quant_add_3 = QuantIdentity(
                act_quant=Int8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            # Quant_add_1 shares the scale factors with block0_relu3, however one is signed and the other one is unsigned
            self.quant_add_3.act_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl = (
                self.quant_block2_relu3.act_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl
            )
            self.quant_add_3.act_quant.fused_activation_quant_proxy.tensor_quant.int_scaling_impl = (
                self.quant_block2_relu3.act_quant.fused_activation_quant_proxy.tensor_quant.int_scaling_impl
            )

        def forward(self, x):
            out_q = self.quant_id_1(x)
            out_rhs = self.quant_block0_conv1(out_q)
            out_rhs = self.quant_block0_relu1(out_rhs)
            out_rhs = self.quant_block0_conv2(out_rhs)
            out_rhs = self.quant_block0_relu2(out_rhs)
            out_rhs = self.quant_block0_conv3(out_rhs)
            out_rhs = self.quant_id_1(out_rhs)
            out_lhs = self.shortcut(out_q)
            out_lhs = self.quant_id_1(out_lhs)
            out_block0 = out_rhs + out_lhs
            out_block0 = self.quant_block0_relu3(out_block0)
            # block 1
            out_rhs1 = self.quant_block1_conv1(out_block0)
            out_rhs1 = self.quant_block1_relu1(out_rhs1)
            out_rhs1 = self.quant_block1_conv2(out_rhs1)
            out_rhs1 = self.quant_block1_relu2(out_rhs1)
            out_rhs1 = self.quant_block1_conv3(out_rhs1)
            out_rhs1 = self.quant_add_1(out_rhs1)
            out_block1 = out_block0 + out_rhs1
            # out_block1=out_block0
            out_block1 = self.quant_block1_relu3(out_block1)

            # block 2
            out_rhs2 = self.quant_block2_conv1(out_block1)
            out_rhs2 = self.quant_block2_relu1(out_rhs2)
            out_rhs2 = self.quant_block2_conv2(out_rhs2)
            out_rhs2 = self.quant_block2_relu2(out_rhs2)
            out_rhs2 = self.quant_block2_conv3(out_rhs2)
            out_rhs2 = self.quant_add_2(out_rhs2)
            out_block2 = out_block1 + out_rhs2
            # out_block1=out_block0
            out_block2 = self.quant_block2_relu3(out_block2)

            # block 3
            out_rhs3 = self.quant_block3_conv1(out_block2)
            out_rhs3 = self.quant_block3_relu1(out_rhs3)
            out_rhs3 = self.quant_block3_conv2(out_rhs3)
            out_rhs3 = self.quant_block3_relu2(out_rhs3)
            out_rhs3 = self.quant_block3_conv3(out_rhs3)
            out_rhs3 = self.quant_add_3(out_rhs3)
            out_block3 = out_block2 + out_rhs3
            # out_block1=out_block0
            out_block3 = self.quant_block3_relu3(out_block3)

            return out_block3

    input = torch.randn(1, 64, 32, 32)
    quant_bottleneck_model = QuantBottleneck_projected()

    quant_id_1 = QuantIdentity(
        act_quant=Int8ActPerTensorFixedPoint, bit_width=8, return_quant_tensor=True
    )
    quant_bottleneck_model.eval()
    quant_id_1.eval()
    # from brevitas_examples.imagenet_classification.ptq.ptq_common import calibrate
    # calibrate([(torch.rand(32,64,32,32), 1) for _ in range(5)], quant_bottleneck_model)
    # #
    # from brevitas.fx import brevitas_symbolic_trace
    # model = brevitas_symbolic_trace(quant_bottleneck_model)
    # print(model.graph)

    init_scale = quant_bottleneck_model.quant_id_1.quant_act_scale()
    block_0_relu_1 = quant_bottleneck_model.quant_block0_relu1.quant_act_scale()
    block_0_relu_2 = quant_bottleneck_model.quant_block0_relu2.quant_act_scale()
    block_0_relu_3 = quant_bottleneck_model.quant_block0_relu3.quant_act_scale()

    block_0_weight_scale1 = (
        quant_bottleneck_model.quant_block0_conv1.quant_weight_scale()
    )
    block_0_weight_scale2 = (
        quant_bottleneck_model.quant_block0_conv2.quant_weight_scale()
    )
    block_0_weight_scale3 = (
        quant_bottleneck_model.quant_block0_conv3.quant_weight_scale()
    )
    block_0_weight_scale_skip = quant_bottleneck_model.shortcut.quant_weight_scale()

    # Block 1
    block_1_relu_1 = quant_bottleneck_model.quant_block1_relu1.quant_act_scale()
    block_1_relu_2 = quant_bottleneck_model.quant_block1_relu2.quant_act_scale()
    block_1_relu_3 = quant_bottleneck_model.quant_block1_relu3.quant_act_scale()

    block_1_weight_scale1 = (
        quant_bottleneck_model.quant_block1_conv1.quant_weight_scale()
    )
    block_1_weight_scale2 = (
        quant_bottleneck_model.quant_block1_conv2.quant_weight_scale()
    )
    block_1_weight_scale3 = (
        quant_bottleneck_model.quant_block1_conv3.quant_weight_scale()
    )
    block_1_quant_add_1 = quant_bottleneck_model.quant_add_1.quant_act_scale()

    # Block 2
    block_2_relu_1 = quant_bottleneck_model.quant_block2_relu1.quant_act_scale()
    block_2_relu_2 = quant_bottleneck_model.quant_block2_relu2.quant_act_scale()
    block_2_relu_3 = quant_bottleneck_model.quant_block2_relu3.quant_act_scale()

    block_2_weight_scale1 = (
        quant_bottleneck_model.quant_block2_conv1.quant_weight_scale()
    )
    block_2_weight_scale2 = (
        quant_bottleneck_model.quant_block2_conv2.quant_weight_scale()
    )
    block_2_weight_scale3 = (
        quant_bottleneck_model.quant_block2_conv3.quant_weight_scale()
    )
    block_2_quant_add_1 = quant_bottleneck_model.quant_add_2.quant_act_scale()

    # Block 3
    block_3_relu_1 = quant_bottleneck_model.quant_block3_relu1.quant_act_scale()
    block_3_relu_2 = quant_bottleneck_model.quant_block3_relu2.quant_act_scale()
    block_3_relu_3 = quant_bottleneck_model.quant_block3_relu3.quant_act_scale()

    block_3_weight_scale1 = (
        quant_bottleneck_model.quant_block3_conv1.quant_weight_scale()
    )
    block_3_weight_scale2 = (
        quant_bottleneck_model.quant_block3_conv2.quant_weight_scale()
    )
    block_3_weight_scale3 = (
        quant_bottleneck_model.quant_block3_conv3.quant_weight_scale()
    )
    block_3_quant_add_1 = quant_bottleneck_model.quant_add_3.quant_act_scale()

    block_0_int_weight_1 = quant_bottleneck_model.quant_block0_conv1.quant_weight().int(
        float_datatype=True
    )
    block_0_int_weight_2 = quant_bottleneck_model.quant_block0_conv2.quant_weight().int(
        float_datatype=True
    )
    block_0_int_weight_3 = quant_bottleneck_model.quant_block0_conv3.quant_weight().int(
        float_datatype=True
    )
    block_0_int_weight_skip = quant_bottleneck_model.shortcut.quant_weight().int(
        float_datatype=True
    )

    block_1_int_weight_1 = quant_bottleneck_model.quant_block1_conv1.quant_weight().int(
        float_datatype=True
    )
    block_1_int_weight_2 = quant_bottleneck_model.quant_block1_conv2.quant_weight().int(
        float_datatype=True
    )
    block_1_int_weight_3 = quant_bottleneck_model.quant_block1_conv3.quant_weight().int(
        float_datatype=True
    )

    block_2_int_weight_1 = quant_bottleneck_model.quant_block2_conv1.quant_weight().int(
        float_datatype=True
    )
    block_2_int_weight_2 = quant_bottleneck_model.quant_block2_conv2.quant_weight().int(
        float_datatype=True
    )
    block_2_int_weight_3 = quant_bottleneck_model.quant_block2_conv3.quant_weight().int(
        float_datatype=True
    )

    block_3_int_weight_1 = quant_bottleneck_model.quant_block3_conv1.quant_weight().int(
        float_datatype=True
    )
    block_3_int_weight_2 = quant_bottleneck_model.quant_block3_conv2.quant_weight().int(
        float_datatype=True
    )
    block_3_int_weight_3 = quant_bottleneck_model.quant_block3_conv3.quant_weight().int(
        float_datatype=True
    )

    block_0_combined_scale1 = -torch.log2(
        init_scale * block_0_weight_scale1 / block_0_relu_1
    )  # RHS after first conv1x1 | clip 0-->255
    block_0_combined_scale2 = -torch.log2(
        block_0_relu_1 * block_0_weight_scale2 / block_0_relu_2
    )  # RHS after second conv3x3 | clip 0-->255
    block_0_combined_scale3 = -torch.log2(
        block_0_relu_2 * block_0_weight_scale3 / init_scale
    )  # RHS after third conv1x1 | clip -128-->+127
    block_0_combined_scale_skip = -torch.log2(
        init_scale * block_0_weight_scale_skip / init_scale
    )  # LHS after conv1x1 | clip -128-->+127
    block_0_combined_scale4 = -torch.log2(
        init_scale / block_0_relu_3
    )  # After addition | clip 0-->255

    block_1_combined_scale1 = -torch.log2(
        block_0_relu_3 * block_1_weight_scale1 / block_1_relu_1
    )  # RHS after first conv1x1 | clip 0-->255
    block_1_combined_scale2 = -torch.log2(
        block_1_relu_1 * block_1_weight_scale2 / block_1_relu_2
    )  # RHS after second conv3x3 | clip 0-->255
    block_1_combined_scale3 = -torch.log2(
        block_1_relu_2 * block_1_weight_scale3 / block_1_quant_add_1
    )  # RHS after third conv1x1 | clip -128-->+127
    block_1_combined_scale4 = -torch.log2(
        block_1_quant_add_1 / block_1_relu_3
    )  # After addition | clip 0-->255

    block_2_combined_scale1 = -torch.log2(
        block_1_relu_3 * block_2_weight_scale1 / block_2_relu_1
    )  # RHS after first conv1x1 | clip 0-->255
    block_2_combined_scale2 = -torch.log2(
        block_2_relu_1 * block_2_weight_scale2 / block_2_relu_2
    )  # RHS after second conv3x3 | clip 0-->255
    block_2_combined_scale3 = -torch.log2(
        block_2_relu_2 * block_2_weight_scale3 / block_2_quant_add_1
    )  # RHS after third conv1x1 | clip -128-->+127
    block_2_combined_scale4 = -torch.log2(
        block_2_quant_add_1 / block_2_relu_3
    )  # After addition | clip 0-->255

    block_3_combined_scale1 = -torch.log2(
        block_2_relu_3 * block_3_weight_scale1 / block_3_relu_1
    )  # RHS after first conv1x1 | clip 0-->255
    block_3_combined_scale2 = -torch.log2(
        block_3_relu_1 * block_3_weight_scale2 / block_3_relu_2
    )  # RHS after second conv3x3 | clip 0-->255
    block_3_combined_scale3 = -torch.log2(
        block_3_relu_2 * block_3_weight_scale3 / block_3_quant_add_1
    )  # RHS after third conv1x1 | clip -128-->+127
    block_3_combined_scale4 = -torch.log2(
        block_3_quant_add_1 / block_3_relu_3
    )  # After addition | clip 0-->255

    print("_________POST PTQ SCALES_________")
    print("init_scale:", init_scale)
    print("block_0_relu1:", block_0_relu_1)
    print("block_0_relu2:", block_0_relu_2)
    print("block_0_relu3:", block_0_relu_3)

    print("block_0_weight_scale1:", block_0_weight_scale1)
    print("block_0_weight_scale2:", block_0_weight_scale2)
    print("block_0_weight_scale3:", block_0_weight_scale3)
    print("block_0_weight_scale_skip:", block_0_weight_scale_skip)
    print("--------------------------------------------------------------")
    print("block_1_quant_add_1:", block_1_quant_add_1)
    print("block_1_relu1:", block_1_relu_1)
    print("block_1_relu2:", block_1_relu_2)
    print("block_1_relu3:", block_1_relu_3)
    print("block_1_weight_scale1:", block_1_weight_scale1)
    print("block_1_weight_scale2:", block_1_weight_scale2)
    print("block_1_weight_scale3:", block_1_weight_scale3)
    print("--------------------------------------------------------------")
    print("block_2_quant_add_1:", block_2_quant_add_1)
    print("block_2_relu1:", block_2_relu_1)
    print("block_2_relu2:", block_2_relu_2)
    print("block_2_relu3:", block_2_relu_3)
    print("block_2_weight_scale1:", block_2_weight_scale1)
    print("block_2_weight_scale2:", block_2_weight_scale2)
    print("block_2_weight_scale3:", block_2_weight_scale3)
    print("--------------------------------------------------------------")
    print("block_3_quant_add_1:", block_3_quant_add_1)
    print("block_3_relu1:", block_3_relu_1)
    print("block_3_relu2:", block_3_relu_2)
    print("block_3_relu3:", block_3_relu_3)
    print("block_3_weight_scale1:", block_3_weight_scale1)
    print("block_3_weight_scale2:", block_3_weight_scale2)
    print("block_3_weight_scale3:", block_3_weight_scale3)
    print("--------------------------------------------------------------")
    print("combined_scale block0 after first conv1x1:", block_0_combined_scale1.item())
    print("combined_scale block0 after second conv3x3:", block_0_combined_scale2.item())
    print("combined_scale block0 after third conv1x1:", block_0_combined_scale3.item())
    print(
        "combined_scale block0 after adding skip connection:",
        (block_0_combined_scale4).item(),
    )
    print(
        "combined_scale block0 after skip conv1x1:", block_0_combined_scale_skip.item()
    )
    print("--------------------------------------------------------------")
    print("combined_scale block1 after first conv1x1:", block_1_combined_scale1.item())
    print("combined_scale block1 after second conv3x3:", block_1_combined_scale2.item())
    print("combined_scale block1 after third conv1x1:", block_1_combined_scale3.item())
    print(
        "combined_scale block1 after adding skip connection:",
        (block_1_combined_scale4).item(),
    )
    print("--------------------------------------------------------------")
    print("combined_scale block2 after first conv1x1:", block_2_combined_scale1.item())
    print("combined_scale block2 after second conv3x3:", block_2_combined_scale2.item())
    print("combined_scale block2 after third conv1x1:", block_2_combined_scale3.item())
    print(
        "combined_scale block2 after adding skip connection:",
        (block_2_combined_scale4).item(),
    )
    print("--------------------------------------------------------------")
    print("combined_scale block3 after first conv1x1:", block_3_combined_scale1.item())
    print("combined_scale block3 after second conv3x3:", block_3_combined_scale2.item())
    print("combined_scale block3 after third conv1x1:", block_3_combined_scale3.item())
    print(
        "combined_scale block3 after adding skip connection:",
        (block_3_combined_scale4).item(),
    )

    q_bottleneck_out = quant_bottleneck_model(input)
    gold_out = q_bottleneck_out.int(float_datatype=True).data.numpy().astype(dtype_out)
    print("Golden::Brevitas::", gold_out)
    gold_out.tofile(log_folder + "/gold_out.txt", sep=",", format="%d")

    from brevitas.export import export_onnx_qcdq

    # ref_input = torch.ones(1, 3, 32, 32, device="cpu", dtype=dtype)
    export_onnx_qcdq(quant_bottleneck_model, input, log_folder + "/" + design + ".onnx")
    # # Brevitas convolution
    q_inp = quant_id_1(input)
    int_inp = q_inp.int(float_datatype=True)

    before_input = int_inp.squeeze().data.numpy().astype(dtype_in)

    before_input.tofile(
        log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d"
    )
    ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX")
    ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d")
    block0_wts1 = ds.reorder_mat(
        block_0_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )
    block0_wts2 = ds.reorder_mat(
        block_0_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )
    block0_wts3 = ds.reorder_mat(
        block_0_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )
    block0_wts_skip = ds.reorder_mat(
        block_0_int_weight_skip.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )

    total_wts = np.concatenate(
        (block0_wts1, block0_wts2, block0_wts3, block0_wts_skip), axis=None
    )

    block1_wts1 = ds.reorder_mat(
        block_1_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )
    block1_wts2 = ds.reorder_mat(
        block_1_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )
    block1_wts3 = ds.reorder_mat(
        block_1_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )

    total_wts2 = np.concatenate(
        (total_wts, block1_wts1, block1_wts2, block1_wts3), axis=None
    )

    block2_wts1 = ds.reorder_mat(
        block_2_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )
    block2_wts2 = ds.reorder_mat(
        block_2_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )
    block2_wts3 = ds.reorder_mat(
        block_2_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )

    total_wts3 = np.concatenate(
        (total_wts2, block2_wts1, block2_wts2, block2_wts3), axis=None
    )

    block3_wts1 = ds.reorder_mat(
        block_3_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )
    block3_wts2 = ds.reorder_mat(
        block_3_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )
    block3_wts3 = ds.reorder_mat(
        block_3_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )

    total_wts4 = np.concatenate(
        (total_wts3, block3_wts1, block3_wts2, block3_wts3), axis=None
    )

    total_wts4.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d")
    print("total_wts", total_wts2.shape)
    # for i in range (0,1):
    app.buffers[2].write(ifm_mem_fmt)  # input's standard format CYX | scalar YCX
    app.buffers[3].write(total_wts4)  # wts's standard format OIYX | scalar OIYX
    # app.buffers[3].write(int_weight2.data.numpy().astype(dtype_in),offset=2048) # wts's standard format OIYX | scalar OIYX
    app.run()
    output3 = app.buffers[4].read()
    if enable_trace:
        output3, trace = extract_trace(output3, shape_out, dtype_out)
        write_out_trace(trace, trace_file)
    # temp_out=output3.reshape(32,256, 32)
    # ofm_mem_fmt = temp_out.swapaxes(0,1)
    temp_out = output3.reshape(32, 32, 32, 8)
    temp2_out = ds.reorder_mat(temp_out, "CDYX", "YCXD")
    ofm_mem_fmt = temp2_out.reshape(256, 32, 32)
    ofm_mem_fmt.tofile(
        log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d"
    )

    ofm_mem_fmt = torch.from_numpy(ofm_mem_fmt).unsqueeze(0)
    print("AIE output:::", ofm_mem_fmt)
    print(type(ofm_mem_fmt))
    print(type(q_bottleneck_out))
    print(
        "difference::",
        torch.max(torch.abs(ofm_mem_fmt * block_3_relu_3 - q_bottleneck_out)),
    )
    diff = torch.abs(ofm_mem_fmt - gold_out)
    # print("diff::",diff)
    # for i, x1 in enumerate(diff):
    #     for j, x2 in enumerate(x1):
    #         for k, x3 in enumerate(x2):
    #             for l, x4 in enumerate(x3):
    #                 if x4 > 3:
    #                     print("i:",i,", j:",j,", k:", k, ", l:", l, ", val:",x4)
    #                     print("ofm_mem_fmt val:",ofm_mem_fmt[i,j,k,l])
    #                     print("gold_out val:",gold_out[i,j,k,l])
    sq_abs = torch.square(torch.abs(ofm_mem_fmt * block_3_relu_3 - q_bottleneck_out))
    print("rms::", torch.sqrt(torch.sum(sq_abs) / torch.numel(sq_abs)))
    assert np.allclose(ofm_mem_fmt, gold_out, rtol=0, atol=5)

_________POST PTQ SCALES_________
init_scale: tensor(0.0078)
block_0_relu1: tensor(0.0039)
block_0_relu2: tensor(0.0039)
block_0_relu3: tensor(0.0039)
block_0_weight_scale1: tensor(0.0010, grad_fn=<DivBackward0>)
block_0_weight_scale2: tensor(0.0005, grad_fn=<DivBackward0>)
block_0_weight_scale3: tensor(0.0010, grad_fn=<DivBackward0>)
block_0_weight_scale_skip: tensor(0.0010, grad_fn=<DivBackward0>)
--------------------------------------------------------------
block_1_quant_add_1: tensor(0.0039)
block_1_relu1: tensor(0.0039)
block_1_relu2: tensor(0.0039)
block_1_relu3: tensor(0.0039)
block_1_weight_scale1: tensor(0.0005, grad_fn=<DivBackward0>)
block_1_weight_scale2: tensor(0.0005, grad_fn=<DivBackward0>)
block_1_weight_scale3: tensor(0.0010, grad_fn=<DivBackward0>)
--------------------------------------------------------------
block_2_quant_add_1: tensor(0.0039)
block_2_relu1: tensor(0.0039)
block_2_relu2: tensor(0.0039)
block_2_relu3: tensor(0.0039)
block_2_weight_scale1: tensor(0.0



verbose: False, log level: Level.ERROR

total_wts (143360,)
AIE output::: tensor([[[[ 92,   3,  80,  ...,  64,   6,  82],
          [153, 106,  15,  ..., 165,  69,  14],
          [  0,   1,   3,  ...,   6, 242, 106],
          ...,
          [ 17, 106, 154,  ...,   0,  13, 102],
          [246,   9,  61,  ...,   9,  41,  13],
          [ 23,  10,  12,  ...,  19,   0,   3]],

         [[ 32,  26,   3,  ...,   3,  82,   1],
          [  2,  68,   0,  ...,   6,   0, 158],
          [  6,  25,   0,  ...,   0, 131,   1],
          ...,
          [ 13,   4,   1,  ...,   0,   0,   0],
          [ 95,   1,  30,  ...,  56, 125,  72],
          [158,   2,   0,  ...,   2,   0,   2]],

         [[250,   5,   8,  ..., 182,   4,  35],
          [  5,  10,   0,  ..., 188,  83, 116],
          [  8,  12,   9,  ...,  85, 104,   5],
          ...,
          [ 13,   1,   8,  ..., 199,  71,   8],
          [130, 197,   8,  ...,   3,  73,   7],
          [ 39,  10,   3,  ...,  11,  20,   2]],

         ..

  print("difference::",torch.max(torch.abs(ofm_mem_fmt*block_3_relu_3 - q_bottleneck_out)))
  sq_abs = torch.square(torch.abs(ofm_mem_fmt*block_3_relu_3 - q_bottleneck_out))


Golden::Brevitas:: [[[[ 95   5   0 ...   3   0  35]
   [  0   1  12 ...  27 185   0]
   [ 84  47 105 ...   0 138  61]
   ...
   [  0  81   6 ...  56   6   9]
   [  0   0   0 ...   0   0   0]
   [  0   0  48 ...   5  64   8]]

  [[  4  74 184 ... 106   6   4]
   [  4  49  56 ... 101   9  16]
   [ 18  41  12 ... 138  85  12]
   ...
   [137   7   6 ...  82  13  37]
   [ 15  68  68 ...  12   6  95]
   [ 85 104   9 ... 175  26 170]]

  [[  0   1  62 ...  19   0   4]
   [  6  17   0 ...  19   0  39]
   [  8   0  27 ...  21  23  14]
   ...
   [ 13  17  45 ...  13   0 164]
   [ 14   3 105 ...  29  13  69]
   [ 52 226  34 ... 147  94  99]]

  ...

  [[ 43  53  50 ...  15  85  64]
   [ 10 122  26 ...  60  23  89]
   [  6  85  23 ... 105  10  16]
   ...
   [  7  99  64 ...  17  64  10]
   [ 16  11 165 ...  25 188  11]
   [ 18  96 154 ...  13  25   7]]

  [[ 86  14  14 ...  19  10 249]
   [ 20  35  57 ...  21  21   7]
   [  8  12   3 ... 123   9  35]
   ...
   [ 51  70 108 ...  12  16  20]
   [  7



verbose: False, log level: Level.ERROR

total_wts (143360,)
AIE output::: tensor([[[[ 95,   5,   0,  ...,   3,   0,  35],
          [  0,   1,  12,  ...,  27, 185,   0],
          [ 83,  47, 105,  ...,   0, 138,  61],
          ...,
          [  0,  81,   6,  ...,  56,   7,   9],
          [  0,   0,   0,  ...,   0,   0,   0],
          [  0,   0,  48,  ...,   4,  64,   7]],

         [[  4,  74, 184,  ..., 106,   6,   4],
          [  4,  49,  56,  ..., 101,   9,  16],
          [ 18,  41,  12,  ..., 138,  84,  11],
          ...,
          [138,   6,   5,  ...,  82,  12,  37],
          [ 15,  68,  68,  ...,  12,   5,  95],
          [ 85, 104,   9,  ..., 175,  26, 170]],

         [[  0,   1,  62,  ...,  20,   0,   4],
          [  6,  17,   0,  ...,  19,   0,  39],
          [  8,   0,  27,  ...,  21,  23,  15],
          ...,
          [ 13,  17,  45,  ...,  13,   0, 164],
          [ 14,   3, 105,  ...,  29,  13,  69],
          [ 54, 225,  34,  ..., 149,  93,  99]],

         ..

  print("difference::",torch.max(torch.abs(ofm_mem_fmt*block_3_relu_3 - q_bottleneck_out)))
  sq_abs = torch.square(torch.abs(ofm_mem_fmt*block_3_relu_3 - q_bottleneck_out))


Golden::Brevitas:: [[[[ 76  94   2 ...  16   3   9]
   [ 34  96  73 ...  21  15 167]
   [ 19  20  99 ...   8  20  11]
   ...
   [ 22   5   7 ...   7  17 176]
   [ 98   1  13 ...   5 127   8]
   [ 16   6  19 ...  66  17   8]]

  [[ 85  90 121 ...  27  76   5]
   [  3 125   5 ...   0   9   0]
   [ 26  13 179 ...   3   0  24]
   ...
   [  8   9  16 ...   0  98   3]
   [170  37 108 ...  93  69   3]
   [ 22   7   0 ...   9  83 101]]

  [[  7  99   0 ... 110  12  77]
   [158   0  99 ...  71  23   0]
   [ 91   4  18 ...   8  87  48]
   ...
   [  3  52  49 ...  69   0   0]
   [  0   6 107 ...   0   0   8]
   [  0 125 248 ...   0  71 181]]

  ...

  [[ 20  23   6 ...   8   6   9]
   [218   3  96 ...   7 247  59]
   [ 45   5  11 ...  10   5 250]
   ...
   [ 62   8   2 ...  71   6  24]
   [ 13 197  43 ...   2  18   0]
   [  0   0  86 ...   3 122  74]]

  [[ 99  10 132 ...   5  16   2]
   [ 11  23   4 ...  10  26   4]
   [  6   8  18 ...   0   0   2]
   ...
   [ 10  87   8 ...   0  13  10]
   [  3



verbose: False, log level: Level.ERROR

total_wts (143360,)
AIE output::: tensor([[[[ 76,  94,   2,  ...,  16,   3,   9],
          [ 35,  96,  73,  ...,  21,  15, 167],
          [ 18,  19,  99,  ...,   8,  21,  11],
          ...,
          [ 22,   5,   7,  ...,   7,  16, 177],
          [ 98,   1,  13,  ...,   4, 127,   8],
          [ 15,   6,  18,  ...,  66,  16,   8]],

         [[ 85,  90, 121,  ...,  27,  76,   4],
          [  3, 125,   5,  ...,   0,   8,   0],
          [ 26,  13, 179,  ...,   3,   0,  23],
          ...,
          [  8,   9,  16,  ...,   0,  98,   2],
          [172,  37, 108,  ...,  92,  69,   4],
          [ 22,   6,   0,  ...,   9,  82, 101]],

         [[  7,  99,   0,  ..., 110,  12,  77],
          [158,   0,  99,  ...,  71,  24,   0],
          [ 91,   4,  18,  ...,   8,  87,  48],
          ...,
          [  4,  51,  49,  ...,  69,   0,   0],
          [  0,   5, 106,  ...,   0,   0,   8],
          [  0, 125, 248,  ...,   0,  70, 181]],

         ..

  print("difference::",torch.max(torch.abs(ofm_mem_fmt*block_3_relu_3 - q_bottleneck_out)))
  sq_abs = torch.square(torch.abs(ofm_mem_fmt*block_3_relu_3 - q_bottleneck_out))


_________POST PTQ SCALES_________
init_scale: tensor(0.0078)
block_0_relu1: tensor(0.0039)
block_0_relu2: tensor(0.0039)
block_0_relu3: tensor(0.0039)
block_0_weight_scale1: tensor(0.0010, grad_fn=<DivBackward0>)
block_0_weight_scale2: tensor(0.0005, grad_fn=<DivBackward0>)
block_0_weight_scale3: tensor(0.0010, grad_fn=<DivBackward0>)
block_0_weight_scale_skip: tensor(0.0010, grad_fn=<DivBackward0>)
--------------------------------------------------------------
block_1_quant_add_1: tensor(0.0039)
block_1_relu1: tensor(0.0039)
block_1_relu2: tensor(0.0039)
block_1_relu3: tensor(0.0039)
block_1_weight_scale1: tensor(0.0005, grad_fn=<DivBackward0>)
block_1_weight_scale2: tensor(0.0005, grad_fn=<DivBackward0>)
block_1_weight_scale3: tensor(0.0010, grad_fn=<DivBackward0>)
--------------------------------------------------------------
block_2_quant_add_1: tensor(0.0039)
block_2_relu1: tensor(0.0039)
block_2_relu2: tensor(0.0039)
block_2_relu3: tensor(0.0039)
block_2_weight_scale1: tensor(0.0



verbose: False, log level: Level.ERROR

total_wts (143360,)
AIE output::: tensor([[[[131,   0,   0,  ...,   0, 197,   0],
          [ 17,   0,  36,  ...,  15, 183,   0],
          [  1,  12, 191,  ..., 181,  43,   0],
          ...,
          [  0, 244, 143,  ...,   0,  57, 115],
          [247, 137,  46,  ...,   0,  25,  77],
          [  1,   4,   0,  ...,   0,   4,  80]],

         [[ 80,  98, 223,  ..., 129,   9,  55],
          [  6,   7,  30,  ..., 251,  15,  10],
          [ 10,  10, 255,  ...,  83,  50, 111],
          ...,
          [196,   8,  14,  ...,   9,  15,   2],
          [ 15,   6,  20,  ...,   4,   9,   0],
          [  0,  11,  68,  ...,   9,  52, 197]],

         [[  7,  93,  40,  ...,  99,   2, 156],
          [ 34,  13,  80,  ..., 169,   9,   4],
          [ 12, 107,   2,  ...,  14, 209,   4],
          ...,
          [ 14,  11,   9,  ..., 100,  13,   6],
          [115,  22,  44,  ...,  13,  51,   9],
          [ 55,  11,  96,  ..., 150,   7,   2]],

         ..

  print("difference::",torch.max(torch.abs(ofm_mem_fmt*block_3_relu_3 - q_bottleneck_out)))
  sq_abs = torch.square(torch.abs(ofm_mem_fmt*block_3_relu_3 - q_bottleneck_out))


In [None]:
if enable_trace:
    print(trace)
else:
    print("tracing not enabled")