In [1]:
import sys

sys.path.append("../misc")
from utils import DataShaper
import os
import torch
import math
import matplotlib.pyplot as plt
import numpy as np
from brevitas.nn import QuantConv2d, QuantIdentity, QuantReLU
from brevitas.quant.fixed_point import (
    Int8ActPerTensorFixedPoint,
    Int8WeightPerTensorFixedPoint,
    Uint8ActPerTensorFixedPoint,
)

ds = DataShaper()
torch.manual_seed(0)
design = "layer_1"
# aie_teardown()
sys.path.append("../../../utils")
import xrtutils

xclbin_path = os.path.abspath("../network/" + design + "/build/final.xclbin")
insts_path = os.path.abspath("../network/" + design + "/build/insts.txt")

log_folder = "log/log_" + design
if not os.path.exists(log_folder):
    os.makedirs(log_folder)
enable_aie = True
aie_is_setup = False
enable_trace = True
trace_file = "traces/trace.txt"

app = None
in_buf = None
arg1_buf = None
out_buf = None
dtype_in = np.dtype("int8")
dtype_wts = np.dtype("int8")
dtype_out = np.dtype("uint8")

shape_in_act = (32, 8, 32, 8)  #'YCXC8' , 'CYX'
# shape_in_wts1  = (8,8,1,1,8,8) #out,in,ky,kx,in8,out8
# shape_in_wts2  = (8,8,3,3,8,8)  #out,in,ky,kx,in8,out8
# shape_in_wts3  = (32,8,1,1,8,8)   #out,in,ky,kx,in8,out8
# shape_in_wts_skip  = (8,32,1,1,8,8) #out,in,ky,kx,in8,out8
shape_total_wts = (212992, 1)
shape_out = (32, 32, 32, 8)

trace_size = 16384


def setup_aie(
    xclbin_path,
    insts_path,
    in_0_shape,
    in_0_dtype,
    in_1_shape,
    in_1_dtype,
    out_buf_shape,
    out_buf_dtype,
    enable_trace=False,
    kernel_name="MLIR_AIE",
):
    app = xrtutils.AIE_Application(xclbin_path, insts_path, kernel_name)
    app.register_buffer(2, shape=in_0_shape, dtype=in_0_dtype)
    app.register_buffer(3, shape=in_1_shape, dtype=in_1_dtype)
    if enable_trace:
        out_buf_len_bytes = np.prod(out_buf_shape) * np.dtype(out_buf_dtype).itemsize
        out_buf_shape = (out_buf_len_bytes + trace_size,)
        out_buf_dtype = np.uint8
    app.register_buffer(4, shape=out_buf_shape, dtype=out_buf_dtype)
    return app


def extract_trace(out_buf, out_buf_shape, out_buf_dtype):
    trace_size_words = trace_size // 4
    out_buf_flat = out_buf.reshape((-1,)).view(np.uint32)
    output_prefix = (
        out_buf_flat[:-trace_size_words].view(out_buf_dtype).reshape(out_buf_shape)
    )
    trace_suffix = out_buf_flat[-trace_size_words:]
    return output_prefix, trace_suffix


def write_out_trace(trace, file_name):
    out_str = "\n".join(f"{i:0{8}x}" for i in trace if i != 0)
    with open(file_name, "w") as f:
        f.write(out_str)


app = setup_aie(
    xclbin_path,
    insts_path,
    shape_in_act,
    dtype_in,
    shape_total_wts,
    dtype_wts,
    shape_out,
    dtype_out,
    enable_trace,
)

In [2]:
import matplotlib.pyplot as plt
import torchvision
import torch
import torch.nn as nn
import numpy as np
import os
from brevitas.nn import QuantConv2d, QuantIdentity, QuantReLU
from brevitas.quant.fixed_point import (
    Int8ActPerTensorFixedPoint,
    Int8WeightPerTensorFixedPoint,
    Uint8ActPerTensorFixedPoint,
)

torch.manual_seed(0)
torch.use_deterministic_algorithms(True)


num_classes = 10


def init_pad_input(x, input_channels, desired_channels=4):
    padding = torch.zeros(1, input_channels * (desired_channels - 1), 32, 32)
    return torch.cat((x, padding), 1)


# try:

In [3]:
for i in range(0, 4):

    class QuantBottleneck_projected(nn.Module):
        expansion = 4

        def __init__(self, in_planes=64, planes=64):
            super(QuantBottleneck_projected, self).__init__()
            # block 0
            self.quant_id_1 = QuantIdentity(
                act_quant=Int8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            self.quant_block0_conv1 = QuantConv2d(
                in_planes,
                planes,
                kernel_size=1,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block0_conv2 = QuantConv2d(
                planes,
                planes,
                kernel_size=3,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                padding=1,
                padding_mode="zeros",
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block0_conv3 = QuantConv2d(
                planes,
                self.expansion * planes,
                kernel_size=1,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block0_relu1 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            self.quant_block0_relu2 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            self.quant_block0_relu3 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )

            self.shortcut = QuantConv2d(
                in_planes,
                self.expansion * planes,
                kernel_size=1,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )

            # block 1
            self.quant_block1_conv1 = QuantConv2d(
                self.expansion * in_planes,
                planes,
                kernel_size=1,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block1_conv2 = QuantConv2d(
                planes,
                planes,
                kernel_size=3,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                padding=1,
                padding_mode="zeros",
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block1_conv3 = QuantConv2d(
                planes,
                self.expansion * planes,
                kernel_size=1,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block1_relu1 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            self.quant_block1_relu2 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            self.quant_block1_relu3 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )

            self.quant_add_1 = QuantIdentity(
                act_quant=Int8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            # Quant_add_1 shares the scale factors with block0_relu3, however one is signed and the other one is unsigned
            self.quant_add_1.act_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl = (
                self.quant_block0_relu3.act_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl
            )
            self.quant_add_1.act_quant.fused_activation_quant_proxy.tensor_quant.int_scaling_impl = (
                self.quant_block0_relu3.act_quant.fused_activation_quant_proxy.tensor_quant.int_scaling_impl
            )

            # block 2
            self.quant_block2_conv1 = QuantConv2d(
                self.expansion * in_planes,
                planes,
                kernel_size=1,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block2_conv2 = QuantConv2d(
                planes,
                planes,
                kernel_size=3,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                padding=1,
                padding_mode="zeros",
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block2_conv3 = QuantConv2d(
                planes,
                self.expansion * planes,
                kernel_size=1,
                bit_width=8,
                weight_bit_width=8,
                bias=False,
                weight_quant=Int8WeightPerTensorFixedPoint,
                return_quant_tensor=True,
            )
            self.quant_block2_relu1 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            self.quant_block2_relu2 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            self.quant_block2_relu3 = QuantReLU(
                act_quant=Uint8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )

            self.quant_add_2 = QuantIdentity(
                act_quant=Int8ActPerTensorFixedPoint,
                bit_width=8,
                return_quant_tensor=True,
            )
            # Quant_add_1 shares the scale factors with block0_relu3, however one is signed and the other one is unsigned
            self.quant_add_2.act_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl = (
                self.quant_block1_relu3.act_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl
            )
            self.quant_add_2.act_quant.fused_activation_quant_proxy.tensor_quant.int_scaling_impl = (
                self.quant_block1_relu3.act_quant.fused_activation_quant_proxy.tensor_quant.int_scaling_impl
            )

        def forward(self, x):
            out_q = self.quant_id_1(x)
            out_rhs = self.quant_block0_conv1(out_q)
            out_rhs = self.quant_block0_relu1(out_rhs)
            out_rhs = self.quant_block0_conv2(out_rhs)
            out_rhs = self.quant_block0_relu2(out_rhs)
            out_rhs = self.quant_block0_conv3(out_rhs)
            out_rhs = self.quant_id_1(out_rhs)
            out_lhs = self.shortcut(out_q)
            out_lhs = self.quant_id_1(out_lhs)
            out_block0 = out_rhs + out_lhs
            out_block0 = self.quant_block0_relu3(out_block0)
            # block 1
            out_rhs1 = self.quant_block1_conv1(out_block0)
            out_rhs1 = self.quant_block1_relu1(out_rhs1)
            out_rhs1 = self.quant_block1_conv2(out_rhs1)
            out_rhs1 = self.quant_block1_relu2(out_rhs1)
            out_rhs1 = self.quant_block1_conv3(out_rhs1)
            out_rhs1 = self.quant_add_1(out_rhs1)
            out_block1 = out_block0 + out_rhs1
            # out_block1=out_block0
            out_block1 = self.quant_block1_relu3(out_block1)

            # block 1
            out_rhs2 = self.quant_block2_conv1(out_block1)
            out_rhs2 = self.quant_block2_relu1(out_rhs2)
            out_rhs2 = self.quant_block2_conv2(out_rhs2)
            out_rhs2 = self.quant_block2_relu2(out_rhs2)
            out_rhs2 = self.quant_block2_conv3(out_rhs2)
            out_rhs2 = self.quant_add_2(out_rhs2)
            out_block2 = out_block1 + out_rhs2
            # out_block1=out_block0
            out_block2 = self.quant_block2_relu3(out_block2)

            return out_block2

    input = torch.randn(1, 64, 32, 32)
    quant_bottleneck_model = QuantBottleneck_projected()

    quant_id_1 = QuantIdentity(
        act_quant=Int8ActPerTensorFixedPoint, bit_width=8, return_quant_tensor=True
    )
    quant_bottleneck_model.eval()
    quant_id_1.eval()
    # from brevitas_examples.imagenet_classification.ptq.ptq_common import calibrate
    # calibrate([(torch.rand(32,64,32,32), 1) for _ in range(5)], quant_bottleneck_model)
    # #
    # from brevitas.fx import brevitas_symbolic_trace
    # model = brevitas_symbolic_trace(quant_bottleneck_model)
    # print(model.graph)

    init_scale = quant_bottleneck_model.quant_id_1.quant_act_scale()
    block_0_relu_1 = quant_bottleneck_model.quant_block0_relu1.quant_act_scale()
    block_0_relu_2 = quant_bottleneck_model.quant_block0_relu2.quant_act_scale()
    block_0_relu_3 = quant_bottleneck_model.quant_block0_relu3.quant_act_scale()

    block_0_weight_scale1 = (
        quant_bottleneck_model.quant_block0_conv1.quant_weight_scale()
    )
    block_0_weight_scale2 = (
        quant_bottleneck_model.quant_block0_conv2.quant_weight_scale()
    )
    block_0_weight_scale3 = (
        quant_bottleneck_model.quant_block0_conv3.quant_weight_scale()
    )
    block_0_weight_scale_skip = quant_bottleneck_model.shortcut.quant_weight_scale()

    # Block 2
    block_1_relu_1 = quant_bottleneck_model.quant_block1_relu1.quant_act_scale()
    block_1_relu_2 = quant_bottleneck_model.quant_block1_relu2.quant_act_scale()
    block_1_relu_3 = quant_bottleneck_model.quant_block1_relu3.quant_act_scale()

    block_1_weight_scale1 = (
        quant_bottleneck_model.quant_block1_conv1.quant_weight_scale()
    )
    block_1_weight_scale2 = (
        quant_bottleneck_model.quant_block1_conv2.quant_weight_scale()
    )
    block_1_weight_scale3 = (
        quant_bottleneck_model.quant_block1_conv3.quant_weight_scale()
    )
    block_1_quant_add_1 = quant_bottleneck_model.quant_add_1.quant_act_scale()

    # Block 3
    block_2_relu_1 = quant_bottleneck_model.quant_block2_relu1.quant_act_scale()
    block_2_relu_2 = quant_bottleneck_model.quant_block2_relu2.quant_act_scale()
    block_2_relu_3 = quant_bottleneck_model.quant_block2_relu3.quant_act_scale()

    block_2_weight_scale1 = (
        quant_bottleneck_model.quant_block2_conv1.quant_weight_scale()
    )
    block_2_weight_scale2 = (
        quant_bottleneck_model.quant_block2_conv2.quant_weight_scale()
    )
    block_2_weight_scale3 = (
        quant_bottleneck_model.quant_block2_conv3.quant_weight_scale()
    )
    block_2_quant_add_1 = quant_bottleneck_model.quant_add_2.quant_act_scale()

    block_0_int_weight_1 = quant_bottleneck_model.quant_block0_conv1.quant_weight().int(
        float_datatype=True
    )
    block_0_int_weight_2 = quant_bottleneck_model.quant_block0_conv2.quant_weight().int(
        float_datatype=True
    )
    block_0_int_weight_3 = quant_bottleneck_model.quant_block0_conv3.quant_weight().int(
        float_datatype=True
    )
    block_0_int_weight_skip = quant_bottleneck_model.shortcut.quant_weight().int(
        float_datatype=True
    )

    block_1_int_weight_1 = quant_bottleneck_model.quant_block1_conv1.quant_weight().int(
        float_datatype=True
    )
    block_1_int_weight_2 = quant_bottleneck_model.quant_block1_conv2.quant_weight().int(
        float_datatype=True
    )
    block_1_int_weight_3 = quant_bottleneck_model.quant_block1_conv3.quant_weight().int(
        float_datatype=True
    )

    block_2_int_weight_1 = quant_bottleneck_model.quant_block2_conv1.quant_weight().int(
        float_datatype=True
    )
    block_2_int_weight_2 = quant_bottleneck_model.quant_block2_conv2.quant_weight().int(
        float_datatype=True
    )
    block_2_int_weight_3 = quant_bottleneck_model.quant_block2_conv3.quant_weight().int(
        float_datatype=True
    )

    block_0_combined_scale1 = -torch.log2(
        init_scale * block_0_weight_scale1 / block_0_relu_1
    )  # RHS after first conv1x1 | clip 0-->255
    block_0_combined_scale2 = -torch.log2(
        block_0_relu_1 * block_0_weight_scale2 / block_0_relu_2
    )  # RHS after second conv3x3 | clip 0-->255
    block_0_combined_scale3 = -torch.log2(
        block_0_relu_2 * block_0_weight_scale3 / init_scale
    )  # RHS after third conv1x1 | clip -128-->+127
    block_0_combined_scale_skip = -torch.log2(
        init_scale * block_0_weight_scale_skip / init_scale
    )  # LHS after conv1x1 | clip -128-->+127
    block_0_combined_scale4 = -torch.log2(
        init_scale / block_0_relu_3
    )  # After addition | clip 0-->255

    block_1_combined_scale1 = -torch.log2(
        block_0_relu_3 * block_1_weight_scale1 / block_1_relu_1
    )  # RHS after first conv1x1 | clip 0-->255
    block_1_combined_scale2 = -torch.log2(
        block_1_relu_1 * block_1_weight_scale2 / block_1_relu_2
    )  # RHS after second conv3x3 | clip 0-->255
    block_1_combined_scale3 = -torch.log2(
        block_1_relu_2 * block_1_weight_scale3 / block_1_quant_add_1
    )  # RHS after third conv1x1 | clip -128-->+127
    block_1_combined_scale4 = -torch.log2(
        block_1_quant_add_1 / block_1_relu_3
    )  # After addition | clip 0-->255

    block_2_combined_scale1 = -torch.log2(
        block_1_relu_3 * block_2_weight_scale1 / block_2_relu_1
    )  # RHS after first conv1x1 | clip 0-->255
    block_2_combined_scale2 = -torch.log2(
        block_2_relu_1 * block_2_weight_scale2 / block_2_relu_2
    )  # RHS after second conv3x3 | clip 0-->255
    block_2_combined_scale3 = -torch.log2(
        block_2_relu_2 * block_2_weight_scale3 / block_2_quant_add_1
    )  # RHS after third conv1x1 | clip -128-->+127
    block_2_combined_scale4 = -torch.log2(
        block_2_quant_add_1 / block_2_relu_3
    )  # After addition | clip 0-->255

    print("_________POST PTQ SCALES_________")
    print("init_scale:", init_scale)
    print("block_0_relu1:", block_0_relu_1)
    print("block_0_relu2:", block_0_relu_2)
    print("block_0_relu3:", block_0_relu_3)

    print("block_0_weight_scale1:", block_0_weight_scale1)
    print("block_0_weight_scale2:", block_0_weight_scale2)
    print("block_0_weight_scale3:", block_0_weight_scale3)
    print("block_0_weight_scale_skip:", block_0_weight_scale_skip)
    print("--------------------------------------------------------------")
    print("block_1_quant_add_1:", block_1_quant_add_1)
    print("block_1_relu1:", block_1_relu_1)
    print("block_1_relu2:", block_1_relu_2)
    print("block_1_relu3:", block_1_relu_3)
    print("block_1_weight_scale1:", block_1_weight_scale1)
    print("block_1_weight_scale2:", block_1_weight_scale2)
    print("block_1_weight_scale3:", block_1_weight_scale3)
    print("--------------------------------------------------------------")
    print("block_2_quant_add_1:", block_2_quant_add_1)
    print("block_2_relu1:", block_2_relu_1)
    print("block_2_relu2:", block_2_relu_2)
    print("block_2_relu3:", block_2_relu_3)
    print("block_2_weight_scale1:", block_2_weight_scale1)
    print("block_2_weight_scale2:", block_2_weight_scale2)
    print("block_2_weight_scale3:", block_2_weight_scale3)
    print("--------------------------------------------------------------")
    print("combined_scale block0 after first conv1x1:", block_0_combined_scale1.item())
    print("combined_scale block0 after second conv3x3:", block_0_combined_scale2.item())
    print("combined_scale block0 after third conv1x1:", block_0_combined_scale3.item())
    print(
        "combined_scale block0 after adding skip connection:",
        (block_0_combined_scale4).item(),
    )
    print(
        "combined_scale block0 after skip conv1x1:", block_0_combined_scale_skip.item()
    )
    print("--------------------------------------------------------------")
    print("combined_scale block1 after first conv1x1:", block_1_combined_scale1.item())
    print("combined_scale block1 after second conv3x3:", block_1_combined_scale2.item())
    print("combined_scale block1 after third conv1x1:", block_1_combined_scale3.item())
    print(
        "combined_scale block1 after adding skip connection:",
        (block_1_combined_scale4).item(),
    )
    print("--------------------------------------------------------------")
    print("combined_scale block2 after first conv1x1:", block_2_combined_scale1.item())
    print("combined_scale block2 after second conv3x3:", block_2_combined_scale2.item())
    print("combined_scale block2 after third conv1x1:", block_2_combined_scale3.item())
    print(
        "combined_scale block2 after adding skip connection:",
        (block_2_combined_scale4).item(),
    )

    q_bottleneck_out = quant_bottleneck_model(input)
    gold_out = q_bottleneck_out.int(float_datatype=True).data.numpy().astype(dtype_out)
    print("Golden::Brevitas::", gold_out)
    gold_out.tofile(log_folder + "/gold_out.txt", sep=",", format="%d")

    from brevitas.export import export_onnx_qcdq

    # ref_input = torch.ones(1, 3, 32, 32, device="cpu", dtype=dtype)
    export_onnx_qcdq(quant_bottleneck_model, input, log_folder + "/" + design + ".onnx")
    # # Brevitas convolution
    q_inp = quant_id_1(input)
    int_inp = q_inp.int(float_datatype=True)

    before_input = int_inp.squeeze().data.numpy().astype(dtype_in)

    before_input.tofile(
        log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d"
    )
    ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX")
    ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d")
    block0_wts1 = ds.reorder_mat(
        block_0_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )
    block0_wts2 = ds.reorder_mat(
        block_0_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )
    block0_wts3 = ds.reorder_mat(
        block_0_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )
    block0_wts_skip = ds.reorder_mat(
        block_0_int_weight_skip.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )

    total_wts = np.concatenate(
        (block0_wts1, block0_wts2, block0_wts3, block0_wts_skip), axis=None
    )

    block1_wts1 = ds.reorder_mat(
        block_1_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )
    block1_wts2 = ds.reorder_mat(
        block_1_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )
    block1_wts3 = ds.reorder_mat(
        block_1_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )

    total_wts2 = np.concatenate(
        (total_wts, block1_wts1, block1_wts2, block1_wts3), axis=None
    )

    block2_wts1 = ds.reorder_mat(
        block_2_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )
    block2_wts2 = ds.reorder_mat(
        block_2_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )
    block2_wts3 = ds.reorder_mat(
        block_2_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
    )

    total_wts3 = np.concatenate(
        (total_wts2, block2_wts1, block2_wts2, block2_wts3), axis=None
    )

    total_wts3.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d")
    print("total_wts", total_wts2.shape)
    # for i in range (0,1):
    app.buffers[2].write(ifm_mem_fmt)  # input's standard format CYX | scalar YCX
    app.buffers[3].write(total_wts3)  # wts's standard format OIYX | scalar OIYX
    # app.buffers[3].write(int_weight2.data.numpy().astype(dtype_in),offset=2048) # wts's standard format OIYX | scalar OIYX
    app.run()
    output3 = app.buffers[4].read()
    if enable_trace:
        output3, trace = extract_trace(output3, shape_out, dtype_out)
        write_out_trace(trace, trace_file)
    # temp_out=output3.reshape(32,256, 32)
    # ofm_mem_fmt = temp_out.swapaxes(0,1)
    temp_out = output3.reshape(32, 32, 32, 8)
    temp2_out = ds.reorder_mat(temp_out, "CDYX", "YCXD")
    ofm_mem_fmt = temp2_out.reshape(256, 32, 32)
    ofm_mem_fmt.tofile(
        log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d"
    )

    ofm_mem_fmt = torch.from_numpy(ofm_mem_fmt).unsqueeze(0)
    print("AIE output:::", ofm_mem_fmt)
    print(type(ofm_mem_fmt))
    print(type(q_bottleneck_out))
    print(
        "difference::",
        torch.max(torch.abs(ofm_mem_fmt * block_2_relu_3 - q_bottleneck_out)),
    )
    diff = torch.abs(ofm_mem_fmt - gold_out)
    # print("diff::",diff)
    # for i, x1 in enumerate(diff):
    #     for j, x2 in enumerate(x1):
    #         for k, x3 in enumerate(x2):
    #             for l, x4 in enumerate(x3):
    #                 if x4 > 3:
    #                     print("i:",i,", j:",j,", k:", k, ", l:", l, ", val:",x4)
    #                     print("ofm_mem_fmt val:",ofm_mem_fmt[i,j,k,l])
    #                     print("gold_out val:",gold_out[i,j,k,l])
    sq_abs = torch.square(torch.abs(ofm_mem_fmt * block_2_relu_3 - q_bottleneck_out))
    print("rms::", torch.sqrt(torch.sum(sq_abs) / torch.numel(sq_abs)))
    # assert(np.allclose(ofm_mem_fmt ,gold_out , rtol=0, atol=3))

_________POST PTQ SCALES_________
init_scale: tensor(0.0078)
block_0_relu1: tensor(0.0039)
block_0_relu2: tensor(0.0039)
block_0_relu3: tensor(0.0039)
block_0_weight_scale1: tensor(0.0010, grad_fn=<DivBackward0>)
block_0_weight_scale2: tensor(0.0005, grad_fn=<DivBackward0>)
block_0_weight_scale3: tensor(0.0010, grad_fn=<DivBackward0>)
block_0_weight_scale_skip: tensor(0.0010, grad_fn=<DivBackward0>)
--------------------------------------------------------------
block_1_quant_add_1: tensor(0.0039)
block_1_relu1: tensor(0.0039)
block_1_relu2: tensor(0.0039)
block_1_relu3: tensor(0.0039)
block_1_weight_scale1: tensor(0.0005, grad_fn=<DivBackward0>)
block_1_weight_scale2: tensor(0.0005, grad_fn=<DivBackward0>)
block_1_weight_scale3: tensor(0.0010, grad_fn=<DivBackward0>)
--------------------------------------------------------------
block_2_quant_add_1: tensor(0.0039)
block_2_relu1: tensor(0.0039)
block_2_relu2: tensor(0.0039)
block_2_relu3: tensor(0.0039)
block_2_weight_scale1: tensor(0.0

  return super().rename(names)


verbose: False, log level: Level.ERROR

total_wts (143360,)
AIE output::: tensor([[[[  9,  10, 139,  ...,  12,   5,   2],
          [ 12,  16,   1,  ...,   5,   8,   6],
          [253,   7,   0,  ...,   0, 111,  65],
          ...,
          [ 17,  67,   2,  ...,   3,   2, 117],
          [  5,  25,  95,  ..., 205,  73, 117],
          [  7, 106,  23,  ..., 115,   4,   0]],

         [[  0,  10, 175,  ...,  83,   6,  25],
          [ 74, 118,   9,  ...,  67,  12,   4],
          [121,   8,   6,  ...,   7,   0,   8],
          ...,
          [ 79,  32,  31,  ...,   5,  76, 199],
          [  5,   3,  49,  ...,  11,   3,   8],
          [  6,  40,  48,  ...,   3,   5,  55]],

         [[  0, 120,   0,  ...,   0, 219, 115],
          [ 21,   0,   0,  ...,   9, 126,  55],
          [  0,   0, 195,  ...,  22, 243,  26],
          ...,
          [  0,   0,   0,  ...,  14,   0,   9],
          [ 29,   0,   0,  ...,   0,   0,   0],
          [ 91,   0, 108,  ...,   4,  41,   0]],

         ..

  print("difference::",torch.max(torch.abs(ofm_mem_fmt*block_2_relu_3 - q_bottleneck_out)))
  sq_abs = torch.square(torch.abs(ofm_mem_fmt*block_2_relu_3 - q_bottleneck_out))


Golden::Brevitas:: [[[[  0   0 110 ... 133   0   0]
   [  0 121   0 ...   0   0   0]
   [116  44 136 ...   0   0   0]
   ...
   [  0  30   0 ...  93   0   0]
   [ 75   0   0 ...   0  60   0]
   [244  24   0 ...  38 106  33]]

  [[ 17   1   9 ...   5  72  91]
   [130  87  15 ...   5 114   7]
   [  0 118 170 ...   2   7  79]
   ...
   [  1 113  19 ...  12  16  77]
   [ 13   8  28 ...   8  14  41]
   [124  73  26 ...  15  50   7]]

  [[  0 115   2 ...  23   7   0]
   [  0   0   0 ...   0   0   0]
   [ 60   0   6 ... 101   0   1]
   ...
   [  0  73   0 ...   0   4  59]
   [  0 115  52 ...   0   2   0]
   [  0  49  19 ...  33 118   4]]

  ...

  [[ 10  18   0 ...   0 228   0]
   [  0   0   0 ...   0   0   3]
   [100  29   3 ...   0 111  38]
   ...
   [  8   0   0 ...   0   1   0]
   [ 15   0  25 ...   1   0   0]
   [228  49   0 ...   5  31  95]]

  [[ 45  13   6 ...  92   7   9]
   [  0 223 255 ...  72   4   6]
   [  7  63   1 ... 152  29  97]
   ...
   [ 37   0   5 ...  64  58  11]
   [  0



verbose: False, log level: Level.ERROR

total_wts (143360,)
AIE output::: tensor([[[[  0,   0, 110,  ..., 133,   0,   0],
          [  0, 121,   0,  ...,   0,   0,   0],
          [116,  44, 136,  ...,   0,   0,   0],
          ...,
          [  0,  30,   0,  ...,  93,   0,   0],
          [ 75,   0,   0,  ...,   0,  60,   0],
          [244,  24,   0,  ...,  38, 108,  33]],

         [[ 17,   1,   9,  ...,   5,  72,  91],
          [129,  87,  16,  ...,   4, 114,   7],
          [  0, 118, 170,  ...,   2,   6,  79],
          ...,
          [  1, 113,  19,  ...,  12,  16,  77],
          [ 13,   8,  28,  ...,   8,  14,  41],
          [124,  73,  26,  ...,  14,  48,   7]],

         [[  0, 115,   2,  ...,  23,   7,   0],
          [  0,   0,   0,  ...,   0,   0,   0],
          [ 60,   0,   5,  ..., 101,   0,   1],
          ...,
          [  0,  73,   0,  ...,   0,   4,  59],
          [  0, 115,  52,  ...,   1,   2,   0],
          [  0,  49,  19,  ...,  33, 118,   4]],

         ..

  print("difference::",torch.max(torch.abs(ofm_mem_fmt*block_2_relu_3 - q_bottleneck_out)))
  sq_abs = torch.square(torch.abs(ofm_mem_fmt*block_2_relu_3 - q_bottleneck_out))


Golden::Brevitas:: [[[[  7  15  68 ...  24  36  69]
   [124  31   0 ...   0  84   1]
   [ 71 170  91 ...   0 121   0]
   ...
   [  1 106 131 ...   0  94   0]
   [  3   0   1 ...   4  13   5]
   [  0   0 116 ... 119  95  49]]

  [[252 251   3 ... 255  84   1]
   [  0   0 169 ...  11 174 250]
   [171   3  99 ...  41   9  22]
   ...
   [108   7   7 ... 126   6  58]
   [ 28  24  13 ...   6   3  75]
   [202   4   3 ... 105   2  84]]

  [[  0  83 249 ... 192   9  53]
   [  7  27   0 ... 171   3   1]
   [ 48 123   9 ... 104 140   6]
   ...
   [  6  30   0 ... 148  16   0]
   [107  34 104 ...   1   6  44]
   [  3 114   0 ...   0 108  75]]

  ...

  [[128  64   5 ...   5   8  46]
   [203  30 218 ...   0  13  91]
   [ 22   0 113 ...   5  12   1]
   ...
   [ 81   0   1 ...   7  72   6]
   [ 66  44   7 ...  26   7   3]
   [ 35   0   0 ... 114   3  19]]

  [[  0   0 169 ...   0   0  29]
   [ 23   0 186 ...   8 120 127]
   [  0   9   0 ...   0   7 196]
   ...
   [  0   5  32 ...  27  85  56]
   [  0



verbose: False, log level: Level.ERROR

total_wts (143360,)
AIE output::: tensor([[[[  7,  15,  68,  ...,  24,  36,  69],
          [125,  31,   0,  ...,   0,  84,   1],
          [ 71, 170,  91,  ...,   0, 121,   0],
          ...,
          [  1, 106, 131,  ...,   0,  93,   0],
          [  3,   0,   1,  ...,   4,  13,   5],
          [  0,   0, 116,  ..., 119,  95,  48]],

         [[252, 251,   3,  ..., 255,  84,   1],
          [  0,   0, 169,  ...,  11, 174, 250],
          [171,   3,  99,  ...,  41,   9,  22],
          ...,
          [108,   6,   7,  ..., 126,   6,  58],
          [ 28,  24,  13,  ...,   6,   3,  75],
          [203,   4,   3,  ..., 105,   2,  84]],

         [[  0,  83, 249,  ..., 192,   9,  53],
          [  7,  27,   0,  ..., 171,   3,   1],
          [ 48, 124,   9,  ..., 104, 140,   6],
          ...,
          [  6,  30,   0,  ..., 148,  16,   0],
          [105,  34, 105,  ...,   1,   5,  44],
          [  3, 114,   0,  ...,   0, 108,  75]],

         ..

  print("difference::",torch.max(torch.abs(ofm_mem_fmt*block_2_relu_3 - q_bottleneck_out)))
  sq_abs = torch.square(torch.abs(ofm_mem_fmt*block_2_relu_3 - q_bottleneck_out))


Golden::Brevitas:: [[[[  4   4   0 ... 109 185   1]
   [  0  24   0 ...   0   0   0]
   [  0   0   0 ...   0 138   2]
   ...
   [ 79   0   0 ...  29   1  89]
   [  2 137  73 ...  27 130   0]
   [  2   0   0 ...  50   0   0]]

  [[219  91  95 ...   0   6  86]
   [ 13  22  68 ...   5   0   0]
   [  0  90   0 ...   4   0   2]
   ...
   [  0   0 149 ...   0   0 145]
   [ 34  79 130 ...   0  27   0]
   [113  40   0 ...   0   0  42]]

  [[  3  56   8 ...  75   8 111]
   [ 49   7   3 ...   4  15  12]
   [116   9  85 ...   6   4 235]
   ...
   [116  70 218 ...   7   3 146]
   [ 63  13  32 ...   4   9  58]
   [127  95   7 ...  13  17 255]]

  ...

  [[147   0  41 ...  81 216   0]
   [  0 164   3 ... 242   0 126]
   [  0   0 116 ...  35   0   0]
   ...
   [ 40   0   0 ...   0 198   3]
   [  0  83   0 ... 151   0   0]
   [  0   0   0 ... 173 237   0]]

  [[ 93  41  52 ...  53 202  13]
   [ 16  30 190 ...  22  23 206]
   [ 18  19  28 ...  17  98  12]
   ...
   [ 14 102  36 ...  71  57  96]
   [ 19



verbose: False, log level: Level.ERROR

total_wts (143360,)
AIE output::: tensor([[[[  4,   4,   0,  ..., 109, 185,   1],
          [  0,  24,   0,  ...,   0,   0,   0],
          [  0,   0,   0,  ...,   0, 138,   2],
          ...,
          [ 79,   0,   0,  ...,  28,   1,  89],
          [  2, 138,  73,  ...,  27, 130,   0],
          [  2,   0,   0,  ...,  50,   0,   0]],

         [[218,  91,  95,  ...,   0,   6,  86],
          [ 13,  22,  68,  ...,   4,   0,   0],
          [  0,  90,   0,  ...,   4,   0,   2],
          ...,
          [  0,   0, 149,  ...,   0,   0, 146],
          [ 34,  79, 130,  ...,   0,  27,   0],
          [113,  40,   0,  ...,   0,   0,  42]],

         [[  4,  56,   8,  ...,  75,   7, 111],
          [ 49,   7,   3,  ...,   4,  14,  12],
          [116,   9,  85,  ...,   6,   4, 235],
          ...,
          [116,  70, 218,  ...,   7,   3, 146],
          [ 63,  13,  31,  ...,   4,   9,  58],
          [127,  95,   7,  ...,  13,  17, 255]],

         ..

  print("difference::",torch.max(torch.abs(ofm_mem_fmt*block_2_relu_3 - q_bottleneck_out)))
  sq_abs = torch.square(torch.abs(ofm_mem_fmt*block_2_relu_3 - q_bottleneck_out))


In [4]:
if enable_trace:
    print(trace)
else:
    print("tracing not enabled")

[2156134400 3690978303 3690978303 ... 3690978303 3690978303 3690978303]
