In [243]:
import numpy as np
from typing import List, Optional, Tuple

# ==========================================================
# 1. Scratchpad Memory (SPM) - handles tiling and padding
# ==========================================================
class ScratchpadMemory:
    """Software model of a scratchpad memory (SPM) holding one H x W x C input.

    The SPM enumerates square ``tile_size`` x ``tile_size`` windows over the
    input and serves them one row at a time. Any part of a tile that lies
    outside the input reads back as zeros (implicit zero padding).
    """

    def __init__(self, input_tensor, tile_size, kernel_size, stride=1, verbose=True):
        """
        input_tensor: array of shape [H, W, C]
        tile_size:    side length T of a square tile
        kernel_size:  side length K of the convolution kernel
        stride:       convolution stride; stored and printed, but not used when
                      tile start addresses are computed
        verbose:      when True, generate_tile_addresses() prints the tiling plan
        """
        self.input = input_tensor
        self.H, self.W, self.C = input_tensor.shape
        self.tile_size = tile_size
        self.kernel_size = kernel_size
        self.stride = stride
        self.verbose = verbose

    def generate_tile_addresses(self):
        """Return the (row, col) start address of every tile in row-major order.

        Tile starts advance by ``T - K + 1`` (never less than 1) in both
        directions, and there are ``ceil((dim - K + 1) / step)`` tiles per
        dimension. An input smaller than the kernel therefore yields ``[]``.
        """
        T, K, s = self.tile_size, self.kernel_size, self.stride
        addrs = []
        # Tiles are square, so rows and columns share the same step.
        row_step = col_step = max(T - K + 1, 1)

        num_tiles_row = int(np.ceil((self.H - K + 1) / row_step))
        num_tiles_col = int(np.ceil((self.W - K + 1) / col_step))

        if self.verbose:
            print("\n[SPM] Generating tile addresses...")
            print(f"  Input shape = ({self.H},{self.W},{self.C}), Tile size = {T}, Kernel = {K}, Stride = {s}")
            print(f"  Step sizes -> row_step = {row_step}, col_step = {col_step}")
            print(f"  Expected total tiles = {num_tiles_row} x {num_tiles_col} = {num_tiles_row*num_tiles_col}\n")

        for t_i in range(num_tiles_row):
            for t_j in range(num_tiles_col):
                row = t_i * row_step
                col = t_j * col_step
                addrs.append((row, col))
                if self.verbose:
                    # Rows/columns of this tile that hang over the input edge.
                    pad_r = max(0, row + T - self.H)
                    pad_c = max(0, col + T - self.W)
                    print(f"  -> Tile ({t_i},{t_j}): start=({row},{col}), "
                          f"cover=({row}:{row+T}, {col}:{col+T}), "
                          f"pad_rows={pad_r}, pad_cols={pad_c}")

        return addrs

    def read_row(self, tile_start, row_idx):
        """Read one row of a tile, zero-padded to the full tile width.

        tile_start: (row, col) address of the tile's top-left element
        row_idx:    row offset inside the tile
        Returns:    float array of shape [T, C]; positions outside the input
                    (on any side, including negative coordinates) are zero.
        """
        r0, c0 = tile_start
        T = self.tile_size
        row_data = np.zeros((T, self.C))

        rr = r0 + row_idx
        if not 0 <= rr < self.H:
            # Whole row is outside the input. The explicit lower bound stops a
            # negative row from wrapping around through Python indexing.
            return row_data

        # Clip the requested column window [c0, c0 + T) to the input width and
        # copy the overlapping span with a single slice assignment.
        lo = max(c0, 0)
        hi = min(c0 + T, self.W)
        if lo < hi:
            row_data[lo - c0:hi - c0, :] = self.input[rr, lo:hi, :]
        return row_data  # shape [T, C]

In [244]:
class ToeplitzBuffer:
    """Row buffer that turns tile rows into flattened K x K x C patches.

    Rows of shape [tile_size, num_channels] are appended one at a time. Once at
    least ``kernel_size`` rows are buffered, the most recent ``kernel_size``
    rows are cut into sliding windows along the column axis and each window is
    flattened into a vector of length ``K * K * C``.
    """

    def __init__(self, tile_size, kernel_size, num_channels, stride=1):
        """
        tile_size:    number of columns T in each buffered row
        kernel_size:  window height/width K
        num_channels: channel count C of each row (stored for reference)
        stride:       column step between consecutive windows
        """
        self.tile_size = tile_size
        self.kernel_size = kernel_size
        self.num_channels = num_channels
        self.stride = stride
        self.buffer = []  # rows appended so far, oldest first

    def fill_buffer(self, row_data):
        """Append one row of shape [tile_size, num_channels] without streaming."""
        self.buffer.append(row_data)

    def _window_patches(self):
        """Flatten every K-wide window of the last K buffered rows."""
        K = self.kernel_size
        block = np.stack(self.buffer[-K:], axis=0)  # [K, T, C]
        return [
            block[:, j:j + K, :].reshape(-1)
            for j in range(0, self.tile_size - K + 1, self.stride)
        ]

    def stream_columns(self):
        """Return list of column-major Toeplitz vectors for this tile.

        Window starts advance by ``self.stride``, matching ``stream_row``.
        Raises ValueError if fewer than ``kernel_size`` rows are buffered.
        """
        if len(self.buffer) < self.kernel_size:
            raise ValueError("Buffer not full yet")
        return self._window_patches()

    def stream_row(self, row_data):
        """
        row_data: shape [tile_size, num_channels]
        Returns: list of Toeplitz rows (flattened) ready for SA; an empty list
                 until ``kernel_size`` rows have been buffered
        """
        self.buffer.append(row_data)
        if len(self.buffer) < self.kernel_size:
            return []
        return self._window_patches()

    # def stream_columns(self):
    #     """
    #     Converts the buffered rows into column-major flattened vectors
    #     ready for column-staggered SA streaming.
    #     Returns: list of arrays of length K*num_channels (for SA row input)
    #     """
    #     K = self.kernel_size
    #     C = self.num_channels
    #     T = self.tile_size

    #     if len(self.buffer) < K:
    #         return []

    #     # Stack last K rows
    #     block = np.stack(self.buffer[-K:], axis=0)  # [K, T, C]

    #     cols = []
    #     for j in range(0, T - K + 1, self.stride):
    #         col_vec = block[:, j:j+K, :].reshape(-1)
    #         # padded_col = np.zeros(self.tile_size)  # pad to array_size
    #         # padded_col[:len(col_vec)] = col_vec
    #         cols.append(col_vec)
    #     return cols



In [245]:
class KernelLoader:
    """Holds an R x S x C x K kernel tensor and lays single kernels out for the array."""

    def __init__(self, kernels: np.ndarray, array_size: int):
        """
        kernels:    weight tensor of shape [R, S, C, K]
        array_size: side length of the square matrix produced per kernel
        """
        self.R, self.S, self.C, self.K = kernels.shape
        self.array_size = array_size
        self.kernels = kernels

    def get_kernel_matrix(self, k: int):
        """
        Build a zero-filled [array_size, array_size] matrix whose first column
        holds kernel ``k`` flattened in (R, S, C) order; all other entries are 0.
        """
        side = self.array_size
        flat_weights = self.kernels[..., k].ravel()  # length R*S*C
        matrix = np.zeros((side, side))
        # Only column 0 carries weights; the rest of the matrix is padding.
        matrix[:flat_weights.size, 0] = flat_weights
        return matrix


In [246]:
# class PE:
#     def __init__(self, links: Optional[List['PE']] = None):
#         if links is None:
#             links = [None, None, None, None]
#         self.links = links
#         self.activation = 0.0
#         self.weight = 0.0
#         self.accumulation = 0.0

# class SystolicArray:
#     def __init__(self, size: int):
#         self.size = size
#         self.array = [[PE() for _ in range(size)] for _ in range(size)]

#     def load_weights(self, weight_matrix):
#         """Load a stationary weight matrix into the SA"""
#         for i in range(self.size):
#             for j in range(self.size):
#                 self.array[i][j].weight = weight_matrix[i, j]

#     def feed_activation_col(self, col_vec):
#         """
#         Feed one Toeplitz column (vector) into the SA, staggered.
#         col_vec: list/np.array of activations (length ≤ size).
#         - Values injected into column 0, one per row, with delay.
#         - Requires calling cycle() repeatedly to flush.
#         """
#         depth = len(col_vec)
#         # Stream with stagger (pipeline fill)
#         for t in range(depth + self.size - 1):
#             for r in range(self.size):
#                 idx = t - r
#                 if 0 <= idx < depth:
#                     self.array[r][0].activation = col_vec[idx]
#                 else:
#                     self.array[r][0].activation = 0.0
#             self.cycle()

#     def cycle(self):
#         """One systolic cycle: MAC + shift activations rightward"""
#         # 1. Compute MAC
#         for i in range(self.size):
#             for j in range(self.size):
#                 pe = self.array[i][j]
#                 pe.accumulation += pe.activation * pe.weight

#         # 2. Shift activations right
#         for i in range(self.size):
#             for j in reversed(range(self.size - 1)):
#                 self.array[i][j + 1].activation = self.array[i][j].activation

#     def collect_output(self):
#         out = np.zeros((self.size, self.size))
#         for i in range(self.size):
#             for j in range(self.size):
#                 out[i, j] = self.array[i][j].accumulation
#         return out

#     def reset_accumulation(self):
#         for i in range(self.size):
#             for j in range(self.size):
#                 self.array[i][j].accumulation = 0.0

In [247]:
# class PE:
#     def __init__(self, links: Optional[List['PE']] = None):
#         if links is None:
#             links = [None, None, None, None]
#         self.links = links
#         self.activation = 0.0
#         self.weight = 0.0
#         self.accumulation = 0.0

# class SystolicArray:
#     def __init__(self, size: int):
#         self.size = size
#         self.array = [[PE() for _ in range(size)] for _ in range(size)]

#     def load_weights(self, weight_matrix):
#         for i in range(self.size):
#             for j in range(self.size):
#                 self.array[i][j].weight = weight_matrix[i,j]

#     def feed_activation_row(self, row):
#         for j in range(len(row)):
#             self.array[0][j].activation = row[j]

#     def cycle(self):
#         # compute accumulation
#         for i in range(self.size):
#             for j in range(self.size):
#                 pe = self.array[i][j]
#                 pe.accumulation += pe.activation * pe.weight
#         # shift activations down
#         for i in reversed(range(self.size-1)):
#             for j in range(self.size):
#                 self.array[i+1][j].activation = self.array[i][j].activation

#     def collect_output(self):
#         out = np.zeros((self.size, self.size))
#         for i in range(self.size):
#             for j in range(self.size):
#                 out[i,j] = self.array[i][j].accumulation
#         return out

#     def reset_accumulation(self):
#         for i in range(self.size):
#             for j in range(self.size):
#                 self.array[i][j].accumulation = 0.0

In [248]:
import numpy as np
# import matplotlib.pyplot as plt
    
class SystolicArray():
    """Cycle-level model of a ``size`` x ``size`` weight-stationary systolic array.

    Activations enter from the left through per-row input FIFOs and move one
    column to the right per cycle; partial sums enter from the top through
    per-column FIFOs and move one row down per cycle, picking up
    ``weight * activation`` in every processing element. Values leaving the
    bottom row are pushed into the output FIFOs.

    With weights ``W``, inputs ``X`` and partial sums ``PS`` loaded (all
    ``size`` x ``size``), calling ``cycle()`` ``3 * size`` times makes
    ``read_output_FIFOs()`` return ``W @ X + PS``.
    """

    def __init__(self, size : int, PE_latency : int, dtype : np.dtype):
        """
        size:       number of rows and columns of processing elements
        PE_latency: stored on the instance; not used by cycle()
        dtype:      element type of all internal state; either a NumPy scalar
                    type (e.g. np.float16) or an np.dtype instance
        """
        self.size = size
        self.PE_latency = PE_latency
        self.dtype = dtype
        self.weight            = np.zeros(shape=(size, size), dtype=dtype)
        self.PE_total          = np.zeros(shape=(size, size), dtype=dtype)
        self.input_from_left   = np.zeros(shape=(size, size), dtype=dtype)
        self.total_from_above  = np.zeros(shape=(size, size), dtype=dtype)
        self.output_right      = np.zeros(shape=(size, size), dtype=dtype)
        self.output_down       = np.zeros(shape=(size, size), dtype=dtype)
        self.partial_sum_FIFOs = np.zeros(shape=(2 * size, size), dtype=dtype)
        self.input_FIFOs       = np.zeros(shape=(size, 2 * size), dtype=dtype)
        self.output_FIFOs      = np.zeros(shape=(2 * size, size), dtype=dtype)

    def load_weights(self, weight):
        """Store the transpose of ``weight`` (a view, not a copy) as the PE weights."""
        self.weight = weight.T

    def load_input_FIFOs(self, input):
        """Write row ``i`` of ``input`` into input FIFO ``i``, skewed by ``i`` slots.

        The skew makes each row start streaming one cycle after the row above.
        """
        for i in range(self.size):
            self.input_FIFOs[i, self.size - i: 2 * self.size - i] = input[i, :]

    def load_partial_sum_FIFOs(self, PS):
        """Write row ``i`` of ``PS`` into partial-sum FIFO column ``i``, skewed by ``i``."""
        for i in range(self.size):
            self.partial_sum_FIFOs[self.size - i : 2 * self.size - i, i] = PS[i, :]

    def read_output_FIFOs(self):
        """Undo the per-column skew and return the output FIFOs as a square matrix."""
        out = np.zeros(shape = (self.size, self.size), dtype = self.dtype)
        for i in range(self.size):
            out[i, :] = self.output_FIFOs[self.size - i : 2 * self.size - i, i]
        return out


    def cycle(self):
        """Advance the array by one clock cycle."""
        # Write to output FIFOs: shift by one and capture last cycle's bottom row
        self.output_FIFOs = np.roll(self.output_FIFOs, 1, axis = 0)
        self.output_FIFOs[0, :] = self.output_down[-1, :]

        # Move outputs down (the wrapped-around top row is overwritten below)
        self.total_from_above = np.roll(self.output_down, 1, axis = 0)

        # Read partial sum into top row, then shift the FIFO and refill with zero
        self.total_from_above[0,:] = self.partial_sum_FIFOs[-1, :]
        self.partial_sum_FIFOs = np.roll(self.partial_sum_FIFOs, 1, axis = 0)
        # A plain 0 is cast by NumPy to the array dtype; calling self.dtype(0)
        # would fail when dtype is an np.dtype instance rather than a scalar type.
        self.partial_sum_FIFOs[0, :] = 0

        # Move inputs right (the wrapped-around left column is overwritten below)
        self.input_from_left = np.roll(self.output_right, 1, axis = 1)

        # Write new input into left column, then shift the FIFO and refill with zero
        self.input_from_left[:, 0] = self.input_FIFOs[:, -1]
        self.input_FIFOs = np.roll(self.input_FIFOs, 1, axis = 1)
        self.input_FIFOs[:, 0] = 0

        #Perform MAC computations
        self.PE_total = (self.weight * self.input_from_left).astype(self.dtype) + self.total_from_above
        self.output_down = self.PE_total
        self.output_right = self.input_from_left

In [249]:
# ==========================================================
# 6. Accumulator
# ==========================================================
class PsumBuffer:
    """Collects per-tile partial-sum outputs and reduces them on demand."""

    def __init__(self):
        # Every accumulated output is kept; nothing is summed until get_final().
        self.outputs = []

    def accumulate(self, tile_output):
        """Record one more partial-sum array."""
        self.outputs.append(tile_output)

    def get_final(self):
        """Return the element-wise sum of everything accumulated so far."""
        stacked = np.asarray(self.outputs)
        return stacked.sum(axis=0)

In [250]:
class ConvEngine:
    """Tiled convolution driver that streams Toeplitz columns into a systolic array.

    NOTE(review): this class is written against a PE-object systolic array
    (``sa.array[r][c].activation``, ``sa.reset_accumulation()``,
    ``sa.collect_output()``, one-argument constructor). In this notebook that
    API only appears in the commented-out ``PE`` / ``SystolicArray`` cells; the
    live ``SystolicArray`` class requires ``(size, PE_latency, dtype)`` and
    exposes FIFO loaders plus ``cycle()``. As written, constructing a
    ``ConvEngine`` against the live class raises ``TypeError`` - confirm which
    array implementation this engine is meant to target.
    """

    def __init__(self, input_tensor, kernels, array_size=32, stride=1, verbose=True):
        """
        input_tensor: H x W x C
        kernels: R x S x C x K
        array_size: systolic array side length; also used as the tile size in run()
        stride: convolution stride
        verbose: print progress information when True
        """
        self.input_tensor = input_tensor
        self.kernels = kernels
        self.array_size = array_size
        self.stride = stride
        self.verbose = verbose

        self.H, self.W, self.C = input_tensor.shape
        self.R, self.S, self.Ck, self.K = kernels.shape
        assert self.C == self.Ck, "Channel mismatch!"

        # NOTE(review): the live SystolicArray.__init__ has three required
        # parameters (size, PE_latency, dtype); this one-argument call matches
        # only the commented-out PE-based implementation.
        self.sa = SystolicArray(array_size)

    # # --- Diagonal wavefront streaming into SA ---
    # def stream_to_sa(sa, toeplitz_cols, array_size):
    #     """
    #     sa: SystolicArray object
    #     toeplitz_cols: list of flattened activation vectors (length <= array_size)
    #     """
    #     num_cycles = len(toeplitz_cols) + array_size - 1
    #     for t in range(num_cycles):
    #         for col_idx in range(array_size):
    #             if 0 <= t - col_idx < len(toeplitz_cols):
    #                 act_vec = toeplitz_cols[t - col_idx]
    #             else:
    #                 act_vec = np.zeros(array_size)

    #             # pad vector to array_size
    #             if len(act_vec) < array_size:
    #                 padded = np.zeros(array_size)
    #                 padded[:len(act_vec)] = act_vec
    #                 act_vec = padded

    #             # feed activations into SA
    #             for r in range(array_size):
    #                 sa.array[r][col_idx].activation = act_vec[r]

    #         sa.cycle()
        
        # --- Main SA simulation for a tile ---
    # def process_tile(sa, spm, tile_start, kernels, array_size):
    #     """
    #     sa: SystolicArray object
    #     spm: ScratchpadMemory object
    #     tile_start: (row, col)
    #     kernels: R x S x C x K
    #     """
    #     R, S, C, K = kernels.shape
    #     toeplitz = ToeplitzBuffer(array_size, R, C)
    #     # Fill buffer with tile rows
    #     for r in range(array_size):
    #         row_data = spm.read_row(tile_start, r)
    #         toeplitz.fill_buffer(row_data)

    #     toeplitz_cols = toeplitz.stream_columns()

    #     # For each kernel
    #     tile_psum = np.zeros((len(toeplitz_cols), K))
    #     for k_idx in range(K):
    #         weight_flat = kernels[:, :, :, k_idx].reshape(-1)
    #         # pad weights to array_size
    #         weight_chunk = np.zeros(array_size)
    #         weight_chunk[:len(weight_flat)] = weight_flat
    #         weight_matrix = np.zeros((array_size, array_size))
    #         weight_matrix[:, 0] = weight_chunk  # weight-stationary
    #         sa.load_weights(weight_matrix)

    #         # Stream activations
    #         stream_to_sa(sa, toeplitz_cols, array_size)

    #         # Collect SA output for this kernel
    #         sa_out = sa.collect_output()
    #         tile_psum[:, k_idx] = sa_out[:len(toeplitz_cols), 0]

    #         sa.reset_accumulation()

    #     return tile_psum
        
    def run(self):
        """Run the convolution and return an array of shape [out_H, out_W, K].

        Kernels are processed in groups of ``array_size``; each kernel is
        flattened and split into chunks of ``array_size`` weights; for every
        chunk, every tile is streamed through ``self.sa`` and the collected
        result is written into the output.

        NOTE(review): relies on ``self.sa.array``, ``reset_accumulation()`` and
        ``collect_output()``, which the live SystolicArray does not define.
        """
        spm = ScratchpadMemory(self.input_tensor, self.array_size, self.R,
                            stride=self.stride, verbose=self.verbose)

        # Output dimensions
        out_H = (self.H - self.R) // self.stride + 1
        out_W = (self.W - self.S) // self.stride + 1
        output = np.zeros((out_H, out_W, self.K))

        # 1. Get all tile start addresses
        tile_addrs = spm.generate_tile_addresses()
        if self.verbose:
            print(f"\n[Engine] === Tiling complete: {len(tile_addrs)} tiles ===")

        # 2. Process kernels in groups of SA width
        group_size = self.array_size
        num_groups = int(np.ceil(self.K / group_size))
        for g in range(num_groups):
            k_start = g * group_size
            k_end = min((g+1) * group_size, self.K)

            if self.verbose:
                print(f"\n[Engine] === Processing kernel group {g} ({k_start}..{k_end-1}) ===")
                print()

            # Reset SA accumulators
            self.sa.reset_accumulation()

            # Figure out how many chunks are needed (same across all kernels in this group)
            max_chunks = max(
                int(np.ceil(self.kernels[:, :, :, k].size / self.array_size))
                for k in range(k_start, k_end)
            )

            # Loop over weight chunks; each chunk is applied to every tile
            for chunk_id in range(max_chunks):
                # 1. Load this chunk for ALL kernels in the group
                #    (one kernel per weight-matrix column, zero-padded)
                weight_matrix = np.zeros((self.array_size, self.array_size))

                for col, k in enumerate(range(k_start, k_end)):
                    k_flat = self.kernels[:, :, :, k].reshape(-1)
                    start = chunk_id * self.array_size
                    end = min((chunk_id + 1) * self.array_size, len(k_flat))

                    chunk = np.zeros(self.array_size)
                    if start < len(k_flat):
                        chunk[:end - start] = k_flat[start:end]

                    weight_matrix[:, col] = chunk

                    if self.verbose:
                        print(f"[Kernel {k}] Chunk {chunk_id+1}/{max_chunks} loaded "
                            f"indices {start}:{end} into column {col}")
                        
                # if self.verbose:
                #     print(f"\nWeight matrix for chunk {chunk_id+1}:\n{weight_matrix}\n")

                # for c in range(weight_matrix.shape[1]):
                #     print(f"Column {c}: {weight_matrix[:, c]}")

                self.sa.load_weights(weight_matrix)
                self.sa.cycle()  # settle weights

                # 2. Process all tiles with current chunk weights
                for (row, col_start) in tile_addrs:
                    if self.verbose:
                        print(f"\n[Engine] Processing tile at start=({row},{col_start}) "
                            f"for chunk {chunk_id+1}")

                    # toeplitz = ToeplitzBuffer(self.array_size, self.R, self.C,
                    #                         stride=self.stride)

                    # for r in range(self.array_size):
                    #     row_data = spm.read_row((row, col_start), r)
                    #     toeplitz_rows = toeplitz.stream_row(row_data)

                    #     for act_row in toeplitz_rows:
                    #         num_act_chunks = int(np.ceil(len(act_row) / self.array_size))
                    #         for act_chunk_id in range(num_act_chunks):
                    #             a_start = act_chunk_id * self.array_size
                    #             a_end = min((act_chunk_id + 1) * self.array_size, len(act_row))
                    #             act_chunk = np.zeros(self.array_size)
                    #             act_chunk[:a_end - a_start] = act_row[a_start:a_end]

                    #             for r in range(self.array_size):
                    #                 if r < len(act_chunk):
                    #                     self.sa.array[r][0].activation = act_chunk[r]
                    #                 else:
                    #                     self.sa.array[r][0].activation = 0.0
                    #             self.sa.cycle()
                    toeplitz = ToeplitzBuffer(self.array_size, self.R, self.C, stride=self.stride)

                    # All array_size rows of the tile are buffered here.
                    # NOTE(review): ToeplitzBuffer.stream_columns() only stacks the
                    # last kernel_size rows of the buffer, so the earlier rows of
                    # the tile do not contribute - confirm this is intended.
                    for r_idx in range(self.array_size):
                        row_data = spm.read_row((row, col_start), r_idx)
                        toeplitz.buffer.append(row_data)  # Fill the buffer for K rows

                    # Convert buffered rows into column-major vectors
                    toeplitz_cols = toeplitz.stream_columns()

                    # Column-staggered streaming (diagonal wavefront):
                    # at step t, SA column c receives Toeplitz vector t - c (zeros if out of range)
                    for t in range(len(toeplitz_cols) + self.array_size - 1):
                        for col_idx in range(self.array_size):
                            if 0 <= t - col_idx < len(toeplitz_cols):
                                act_vec = toeplitz_cols[t - col_idx]
                            else:
                                act_vec = np.zeros(self.array_size)

                            # Pad act_vec if it's shorter than array_size
                            # (a longer vector is not truncated; only its first
                            # array_size entries are read below)
                            if len(act_vec) < self.array_size:
                                padded_vec = np.zeros(self.array_size)
                                padded_vec[:len(act_vec)] = act_vec
                                act_vec = padded_vec
                            for r in range(self.array_size):
                                self.sa.array[r][col_idx].activation = act_vec[r]

                        self.sa.cycle()
                        # NOTE(review): this per-cycle dump is printed even when
                        # self.verbose is False.
                        print(f"\nCycle {t+1}:")
                        for col_idx in range(self.array_size):
                            col_acts = [self.sa.array[r][col_idx].activation for r in range(self.array_size)]
                            print(f"Column {col_idx}: {col_acts}")
                    # for r in range(self.array_size):
                    #     row_data = spm.read_row((row, col_start), r)
                    #     toeplitz_cols = toeplitz.stream_row(row_data)

                    #     # Each column is a Toeplitz vector → feed directly
                    #     for col_vec in toeplitz_cols:
                    #         self.sa.feed_activation_col(col_vec)

                    sa_out = self.sa.collect_output()  # final psums for tile-region

                    # Write into output, clipped to the part of the tile inside the output
                    # NOTE(review): the slice is assigned, not added, so with more
                    # than one weight chunk a later chunk overwrites what an earlier
                    # chunk wrote for the same tile - confirm against intended
                    # partial-sum accumulation.
                    block_H = min(self.array_size, out_H - row)
                    block_W = min(self.array_size, out_W - col_start)
                    for col_idx, k in enumerate(range(k_start, k_end)):
                        valid_out = sa_out[:block_H, col_idx:col_idx + block_W]
                        target_slice = output[row:row + block_H, col_start:col_start + block_W, k]
                        min_H = min(valid_out.shape[0], target_slice.shape[0])
                        min_W = min(valid_out.shape[1], target_slice.shape[1])
                        target_slice[:min_H, :min_W] = valid_out[:min_H, :min_W]

                    # Reset SA accumulations before next tile
                    self.sa.reset_accumulation()
                    toeplitz.buffer = []

        return output
    


In [251]:
class ConvEngineMatmul:
    """Reference convolution engine: per-tile patch matrix times kernel matrix.

    For every tile, each output position's R x S x C input patch is read from
    the scratchpad and flattened into one row of a patch matrix, which is then
    multiplied with the flattened kernels using a blocked matrix multiply.
    """

    def __init__(self, input_tensor, kernels, tile_size=32, stride=1, verbose=True):
        """
        input_tensor: H x W x C
        kernels: R x S x C x K
        tile_size: for tiling (matches ScratchpadMemory.tile_size)
        stride: convolution stride
        verbose: print progress information when True
        """
        self.input_tensor = input_tensor
        self.kernels = kernels
        self.tile_size = tile_size
        self.stride = stride
        self.verbose = verbose

        self.H, self.W, self.C = input_tensor.shape
        self.R, self.S, self.Ck, self.K = kernels.shape
        assert self.C == self.Ck, "Channel mismatch!"

        self.spm = ScratchpadMemory(input_tensor, tile_size, self.R,
                                    stride=stride, verbose=verbose)

    def _build_patch_matrix(self, row, col, tile_H, tile_W):
        """Return the [tile_H*tile_W, R*S*C] matrix of flattened input patches."""
        Tmat = np.zeros((tile_H * tile_W, self.R * self.S * self.C))
        patch_idx = 0
        for i_out in range(row, row + tile_H):           # output-space row
            i_in = i_out * self.stride                   # map to input-space row
            for j_out in range(col, col + tile_W):       # output-space col
                j_in = j_out * self.stride               # map to input-space col
                # read_row returns tile_size columns; only the first S belong
                # to this patch.
                patch_rows = [
                    self.spm.read_row((i_in, j_in), r)[:self.S, :]
                    for r in range(self.R)
                ]
                Tmat[patch_idx, :] = np.stack(patch_rows, axis=0).reshape(-1)
                patch_idx += 1
        return Tmat

    @staticmethod
    def _blocked_matmul(lhs, rhs, block=32):
        """Multiply lhs @ rhs by accumulating block x block sub-products."""
        m, n = lhs.shape
        k = rhs.shape[1]
        out = np.zeros((m, k))
        for i in range(0, m, block):          # patches dimension
            for j in range(0, k, block):      # kernel dimension
                for p in range(0, n, block):  # inner dimension (reduction)
                    out[i:i+block, j:j+block] += lhs[i:i+block, p:p+block] @ rhs[p:p+block, j:j+block]
        return out

    def run(self):
        """Compute the convolution and return an array of shape [out_H, out_W, K].

        Raises ValueError if ``tile_size`` is smaller than the kernel width S,
        because a scratchpad row would then be too short to hold one patch row.
        """
        if self.tile_size < self.S:
            # Without this check tile_size == 1 would silently broadcast one
            # column across the whole patch row and give wrong results.
            raise ValueError(
                f"tile_size ({self.tile_size}) must be at least the kernel width ({self.S})"
            )

        # Output dimensions
        out_H = (self.H - self.R) // self.stride + 1
        out_W = (self.W - self.S) // self.stride + 1
        output = np.zeros((out_H, out_W, self.K))

        # 1. Generate tile addresses
        tile_addrs = self.spm.generate_tile_addresses()
        if self.verbose:
            print(f"\n[Engine] Total tiles: {len(tile_addrs)}")

        # Flattened kernels, one per column: shape (R*S*C, K). This does not
        # depend on the tile, so it is built once.
        kernel_matrix = self.kernels.reshape(self.R * self.S * self.C, self.K)

        # 2. Process each tile (tile starts are used as output-space coordinates)
        for tile_idx, (row, col) in enumerate(tile_addrs):
            if self.verbose:
                print(f"\n[Tile {tile_idx}] Start at ({row},{col})")

            # Clip the tile to the output; skip tiles that start outside it
            tile_H = max(0, min(self.tile_size, out_H - row))
            tile_W = max(0, min(self.tile_size, out_W - col))
            if tile_H == 0 or tile_W == 0:
                continue

            Tmat = self._build_patch_matrix(row, col, tile_H, tile_W)
            if self.verbose:
                print(f"  Tmat shape: {Tmat.shape}")

            # 3. Multiply with kernels (chunked systolic style)
            Y = self._blocked_matmul(Tmat, kernel_matrix)

            # 4. Write back to output; rows of Y are ordered row-major over the tile
            output[row:row + tile_H, col:col + tile_W, :] = Y.reshape(tile_H, tile_W, self.K)

        return output

In [252]:
class ConvEngineSA:
    """Convolution engine that streams flattened input patches through a systolic array.

    Every kernel is flattened into one column of a weight matrix that is loaded
    into a ``SystolicArray`` once. Then, for each tile address reported by the
    ``ScratchpadMemory``, one flattened R x S x C patch is built per output
    position of the tile, the patches are streamed into the array, and the
    array output is copied into the result tensor.
    """

    def __init__(self, input_tensor, kernels, tile_size=32, stride=1, verbose=True, dtype=np.float16):
        """
        input_tensor: H x W x C
        kernels: R x S x C x K
        tile_size: tile edge handed to the scratchpad memory; also used as the
                   systolic array size
        stride: convolution stride
        verbose: print progress information while running
        dtype: element type handed to the systolic array (also used for the output)

        Raises AssertionError when input and kernels disagree on the channel count.
        """
        self.input_tensor = input_tensor
        self.kernels = kernels
        self.tile_size = tile_size
        self.stride = stride
        self.verbose = verbose

        self.H, self.W, self.C = input_tensor.shape
        self.R, self.S, self.Ck, self.K = kernels.shape
        assert self.C == self.Ck, "Channel mismatch!"

        # The scratchpad is configured with the kernel height R as its kernel size.
        self.spm = ScratchpadMemory(input_tensor, tile_size, self.R,
                                    stride=stride, verbose=verbose)

        # systolic array of size = tile_size (like before)
        self.sa = SystolicArray(size=tile_size, PE_latency=1, dtype=dtype)

    def run(self):
        """Run the convolution tile by tile and return the output tensor.

        Returns:
            Array of shape (out_H, out_W, K) with dtype ``self.sa.dtype``, where
            out_H = (H - R) // stride + 1 and out_W = (W - S) // stride + 1.

        Tiles whose extent, after clamping to the output size, is empty are
        skipped. Without that guard ``np.stack`` is called on an empty patch
        list and raises ValueError (this happens for stride > 1, where tile
        start addresses can exceed the output dimensions).
        """
        # Output dimensions
        out_H = (self.H - self.R) // self.stride + 1
        out_W = (self.W - self.S) // self.stride + 1
        output = np.zeros((out_H, out_W, self.K), dtype=self.sa.dtype)

        # Generate tiles
        tile_addrs = self.spm.generate_tile_addresses()
        if self.verbose:
            print(f"\n[EngineSA] Total tiles: {len(tile_addrs)}")

        # --- Flatten all kernels into weight matrix (one kernel per column)
        kernel_flat_list = [self.kernels[:, :, :, k].reshape(-1) for k in range(self.K)]
        kernel_matrix = np.stack(kernel_flat_list, axis=1)  # (R*S*C, K)

        # Zero-pad the rows up to tile_size**2 so the slice below always finds
        # at least sa.size rows.
        pad_len = self.tile_size * self.tile_size - kernel_matrix.shape[0]
        if pad_len > 0:
            kernel_matrix = np.vstack([kernel_matrix, np.zeros((pad_len, self.K), dtype=self.sa.dtype)])

        # Load weights into systolic array (each kernel = one column)
        # NOTE(review): only the first sa.size rows of each flattened kernel are
        # loaded, so kernel taps beyond sa.size (when R*S*C > sa.size) are
        # dropped - confirm whether the reduction should be chunked instead.
        # NOTE(review): K > sa.size would index past the last column of W.
        W = np.zeros((self.sa.size, self.sa.size), dtype=self.sa.dtype)
        for k in range(self.K):
            W[:, k] = kernel_matrix[:self.sa.size, k]
        self.sa.load_weights(W)

        # Process each tile
        for tile_idx, (row, col) in enumerate(tile_addrs):
            if self.verbose:
                print(f"\n[Tile {tile_idx}] Start at ({row},{col})")

            # Clamp the tile extent to what is left of the output; a tile that
            # starts past the output edge gets an extent of 0.
            tile_H = max(0, min(self.tile_size, out_H - row))
            tile_W = max(0, min(self.tile_size, out_W - col))

            if tile_H == 0 or tile_W == 0:
                # No output position belongs to this tile; there is nothing to
                # stack or stream, so move on instead of crashing in np.stack.
                if self.verbose:
                    print("  -> Tile lies outside the output, skipping")
                continue

            # --- Build Toeplitz-like patches
            # NOTE(review): row/col are used here as input-space start
            # coordinates but as output-space indices in the clamp above and in
            # the write-back below - confirm the intended coordinate space.
            patches = []
            for i in range(row, row + tile_H * self.stride, self.stride):
                for j in range(col, col + tile_W * self.stride, self.stride):
                    patch = np.zeros((self.R, self.S, self.C))
                    for r in range(self.R):
                        # presumably one scratchpad row of shape [T, C] - verify against read_row
                        row_data = self.spm.read_row((i, j), r)
                        patch[r, :, :] = row_data[:self.S, :]
                    patches.append(patch.reshape(-1))
            Tmat = np.stack(patches, axis=0)  # (num_patches, R*S*C)

            # Pad to SA size
            num_patches = Tmat.shape[0]
            if Tmat.shape[1] < self.sa.size:
                pad = np.zeros((num_patches, self.sa.size - Tmat.shape[1]))
                Tmat = np.hstack([Tmat, pad])

            if self.verbose:
                print(f"  -> Streaming {num_patches} patches into SA")

            # --- Stream into systolic array: one patch per cycle, placed in
            # column 0 of an otherwise-zero activation matrix.
            # NOTE(review): patch entries beyond sa.size are truncated here too.
            for patch in Tmat:
                act_matrix = np.zeros((self.sa.size, self.sa.size), dtype=self.sa.dtype)
                act_matrix[:, 0] = patch[:self.sa.size]
                self.sa.load_input_FIFOs(act_matrix)
                self.sa.cycle()

            # Flush pipeline (extra cycles to propagate)
            for _ in range(2*self.sa.size):
                self.sa.cycle()

            # --- Collect outputs
            sa_out = self.sa.read_output_FIFOs()  # (size, size)
            if self.verbose:
                print(f"  SA output shape: {sa_out.shape}")

            # Write back (just top-left block relevant)
            # NOTE(review): the value written does not depend on j, so every
            # column of an output row receives sa_out[i, k] - confirm the
            # mapping from SA output rows to patch indices.
            for i in range(tile_H):
                for j in range(tile_W):
                    for k in range(self.K):
                        output[row+i, col+j, k] = sa_out[i, k]

        return output

In [253]:
if __name__ == "__main__":
    def conv2d_stride(input_tensor, kernel, stride=1):
        """Golden reference: direct strided convolution without padding.

        input_tensor is H x W x C and kernel is R x S x C x K; the result has
        shape (out_H, out_W, K) with out_H = (H - R)//stride + 1 and
        out_W = (W - S)//stride + 1.
        """
        in_h, in_w, _ = input_tensor.shape
        k_h, k_w, _, n_kernels = kernel.shape
        n_rows = (in_h - k_h)//stride + 1
        n_cols = (in_w - k_w)//stride + 1
        out = np.zeros((n_rows, n_cols, n_kernels))
        for i in range(n_rows):
            top = i*stride
            for j in range(n_cols):
                left = j*stride
                # The input window is shared by all kernels at this position.
                window = input_tensor[top:top+k_h, left:left+k_w, :]
                for k in range(n_kernels):
                    out[i, j, k] = np.sum(window * kernel[:, :, :, k])
        return out

    # Problem setup: 64x64 input with 4 channels, two 5x5 kernels, stride 3.
    H = W = 64
    C = 4
    R, S = 5, 5
    stride = 3
    input_tensor = np.random.randint(0, 5, size=(H, W, C)).astype(float)
    kernels = np.random.randint(0, 5, size=(R, S, C, 2)).astype(float)

    # Only the matmul-based engine is exercised here; the other two engines are
    # kept as disabled alternatives.
    # engine1 = ConvEngine(input_tensor, kernels, stride=stride, array_size=32, verbose=True)
    # sa_output1 = engine1.run()
    engine2 = ConvEngineMatmul(input_tensor, kernels, stride=stride, tile_size=32, verbose=True)
    sa_output2 = engine2.run()
    # engine3 = ConvEngineSA(input_tensor, kernels, stride=stride, tile_size=32, verbose=True)
    # sa_output3 = engine3.run()

    golden = conv2d_stride(input_tensor, kernels, stride=stride)

    # Compare the simulator output against the golden result, kernel by kernel.
    # (The same report can be produced for sa_output1 / sa_output3 once the
    # corresponding engine above is enabled.)
    print("Matmul Simulator Output Shape:", sa_output2.shape)
    print("Golden Conv Shape:", golden.shape)
    n_out_kernels = sa_output2.shape[2]
    for k in range(n_out_kernels):
        diff = np.abs(sa_output2[:, :, k] - golden[:, :, k]).max()
        print(f"Kernel {k} max absolute difference: {diff}")

    # Top-left 3x3 corner of kernel 0 for a quick visual check.
    sa_corner = sa_output2[:3, :3, 0]
    golden_corner = golden[:3, :3, 0]
    print(f"\nSample outputs (kernel 0, first few positions):")
    print(f"SA output: {sa_corner}")
    print(f"Golden:    {golden_corner}")

# Status: tiling and kernel loading are correct and the output shapes match,
# but the values differ from the golden result by around 200.
#
# Next steps:
#   - Look into how psums (partial sums) are handled, and into the rotation
#     (transpose) of the SA at the end of the operation.
#   - Check whether the Toeplitz matrix is streamed column-wise or row-wise
#     into the SA; it needs to be streamed using column-staggered Toeplitz
#     streaming (with initial zeros).
#   - Check where the psums need to be collected - after a chunk? after a tile?
#       * After a chunk -> accumulate psums internally (still incomplete).
#       * After a kernel (all chunks done) -> psums are final for that kernel
#         and tile region.
#       * After a tile -> flush psums out, move to the next tile.
#
# Next step: sanity check - try an actual matmul and see if it works.


[SPM] Generating tile addresses...
  Input shape = (64,64,4), Tile size = 32, Kernel = 5, Stride = 3
  Step sizes -> row_step = 28, col_step = 28
  Expected total tiles = 3 x 3 = 9

  -> Tile (0,0): start=(0,0), cover=(0:32, 0:32), pad_rows=0, pad_cols=0
  -> Tile (0,1): start=(0,28), cover=(0:32, 28:60), pad_rows=0, pad_cols=0
  -> Tile (0,2): start=(0,56), cover=(0:32, 56:88), pad_rows=0, pad_cols=24
  -> Tile (1,0): start=(28,0), cover=(28:60, 0:32), pad_rows=0, pad_cols=0
  -> Tile (1,1): start=(28,28), cover=(28:60, 28:60), pad_rows=0, pad_cols=0
  -> Tile (1,2): start=(28,56), cover=(28:60, 56:88), pad_rows=0, pad_cols=24
  -> Tile (2,0): start=(56,0), cover=(56:88, 0:32), pad_rows=24, pad_cols=0
  -> Tile (2,1): start=(56,28), cover=(56:88, 28:60), pad_rows=24, pad_cols=0
  -> Tile (2,2): start=(56,56), cover=(56:88, 56:88), pad_rows=24, pad_cols=24

[Engine] Total tiles: 9

[Tile 0] Start at (0,0)
  Tmat shape: (400, 100)

[Tile 1] Start at (0,28)

[Tile 2] Start at (0,56)

[T

In [254]:
import numpy as np

def im2col_2d(input_tensor, kernel_shape, stride=1):
    """
    Unfold a 2D input into its Toeplitz / im2col matrix (no padding).

    input_tensor: 2D numpy array (H x W)
    kernel_shape: tuple (kH, kW) giving the sliding-window size
    stride: step between neighbouring windows, in both directions
    Returns:
        2D array (num_patches x kH*kW) holding one flattened window per row,
        ordered row-major over the output positions. When no window fits, the
        result is an empty 1D array.
    """
    H, W = input_tensor.shape
    kH, kW = kernel_shape

    # Top-left corners of every window along each axis.
    row_starts = range(0, ((H - kH) // stride + 1) * stride, stride)
    col_starts = range(0, ((W - kW) // stride + 1) * stride, stride)

    return np.array([
        input_tensor[top:top + kH, left:left + kW].reshape(-1)
        for top in row_starts
        for left in col_starts
    ])

def conv2d_matmul(input_tensor, kernel, stride=1):
    """
    2D convolution (no padding) expressed as a single matrix-vector product.

    input_tensor: 2D numpy array (H x W)
    kernel: 2D numpy array (kH x kW)
    stride: stride for convolution
    Returns:
        2D array (out_H x out_W) with out_H = (H - kH) // stride + 1 and
        out_W = (W - kW) // stride + 1.
    """
    kH, kW = kernel.shape

    # One flattened window per row times the flattened kernel gives one
    # output value per window.
    patches = im2col_2d(input_tensor, (kH, kW), stride)  # (num_patches, kH*kW)
    flat_out = patches @ kernel.reshape(-1)              # (num_patches,)

    # Fold the flat result back into the 2D output grid.
    in_H, in_W = input_tensor.shape[0], input_tensor.shape[1]
    out_shape = ((in_H - kH) // stride + 1, (in_W - kW) // stride + 1)
    return flat_out.reshape(out_shape)

# ----------------------------------------------------------
# Worked example: 5x5 input, 3x3 kernel, stride 1 -> 3x3 output
# ----------------------------------------------------------
input_tensor = np.array(
    [[1, 0, 3, 1, 1],
     [2, 2, 0, 3, 1],
     [4, 4, 4, 3, 3],
     [0, 0, 3, 1, 4],
     [2, 1, 2, 1, 4]]
)

kernel = np.array(
    [[0, 1, 2],
     [0, 2, 3],
     [2, 1, 2]]
)

# stride is left at its default of 1
output = conv2d_matmul(input_tensor, kernel)
print(output)

[[30 32 29]
 [28 28 35]
 [30 25 36]]
