In [248]:
import numpy as np
from typing import List, Optional, Tuple

# ==========================================================
# 1. Scratchpad Memory (SPM) - handles tiling and padding
# ==========================================================
class ScratchpadMemory:
    """Scratchpad memory (SPM) model: tiles an input tensor and serves tile rows.

    The input is a 3-D array unpacked as ``(H, W, C)``. Tiles are square
    (``tile_size x tile_size``) and are read one row at a time; any position
    that falls outside the input reads back as zero.
    """

    def __init__(self, input_tensor, tile_size, kernel_size, stride=1, verbose=True):
        """Store the tensor and tiling parameters.

        Args:
            input_tensor: 3-D array; its shape is unpacked into ``H, W, C``.
            tile_size: Side length ``T`` of a tile.
            kernel_size: Kernel side length ``K`` used to derive the tile step.
            stride: Stored and reported in verbose output; the tile step
                computed by ``generate_tile_addresses`` does not depend on it.
            verbose: When true, tiling and row reads are logged with ``print``.
        """
        self.input = input_tensor
        self.H, self.W, self.C = input_tensor.shape
        self.tile_size = tile_size
        self.kernel_size = kernel_size
        self.stride = stride
        self.verbose = verbose

    def generate_tile_addresses(self):
        """Return the ``(row, col)`` start address of every tile, row-major.

        Consecutive tiles advance by ``T - K + 1`` in each dimension (at
        least 1), so neighbouring tiles overlap by ``K - 1`` rows/columns
        whenever ``T >= K``.
        """
        T, K, s = self.tile_size, self.kernel_size, self.stride
        addrs = []
        # Same step in both dimensions; clamp to 1 so a tile smaller than the
        # kernel still makes progress.
        row_step = col_step = max(T - K + 1, 1)

        num_tiles_row = int(np.ceil((self.H - K + 1) / row_step))
        num_tiles_col = int(np.ceil((self.W - K + 1) / col_step))

        if self.verbose:
            print("\n[SPM] Generating tile addresses...")
            print(f"  Input shape = ({self.H},{self.W},{self.C}), Tile size = {T}, Kernel = {K}, Stride = {s}")
            print(f"  Step sizes -> row_step = {row_step}, col_step = {col_step}")
            print(f"  Expected total tiles = {num_tiles_row} x {num_tiles_col} = {num_tiles_row*num_tiles_col}\n")

        for t_i in range(num_tiles_row):
            for t_j in range(num_tiles_col):
                row = t_i * row_step
                col = t_j * col_step
                addrs.append((row, col))
                if self.verbose:
                    # Rows/columns of the tile that hang past the input edge.
                    pad_r = max(0, row + T - self.H)
                    pad_c = max(0, col + T - self.W)
                    print(f"  -> Tile ({t_i},{t_j}): start=({row},{col}), "
                          f"cover=({row}:{row+T}, {col}:{col+T}), "
                          f"pad_rows={pad_r}, pad_cols={pad_c}")

        return addrs

    def read_row(self, tile_start, row_idx):
        """Read one zero-padded row of a tile.

        Args:
            tile_start: ``(row, col)`` address of the tile's top-left corner.
            row_idx: Row offset inside the tile.

        Returns:
            Float array of shape ``[tile_size, C]``. Entries whose absolute
            row or column lies outside the input (on either side) are zero.
        """
        r0, c0 = tile_start
        T = self.tile_size
        rr = r0 + row_idx
        row_data = np.zeros((T, self.C))

        # Clip the requested column span to the valid input range. Checking the
        # lower bound as well keeps negative indices from wrapping around to
        # the far edge of the input; they are treated as padding instead.
        if 0 <= rr < self.H:
            lo = max(c0, 0)
            hi = min(c0 + T, self.W)
            if lo < hi:
                row_data[lo - c0:hi - c0, :] = self.input[rr, lo:hi, :]

        if self.verbose:
            print(f"[SPM] Read row {row_idx} from tile starting at {tile_start}, "
                  f"valid range=({r0+row_idx},{c0}:{c0+T})")
        return row_data  # shape [T, C]

In [249]:
class ToeplitzBuffer:
    """Row buffer that turns streamed tile rows into flattened sliding-window patches.

    Rows are appended to ``self.buffer``; once at least ``kernel_size`` rows
    are present, the most recent ``kernel_size`` rows form a block from which
    ``K``-wide windows are cut along the column axis.
    """

    def __init__(self, tile_size, kernel_size, num_channels, stride=1):
        """Store the window geometry and start with an empty row buffer.

        Args:
            tile_size: Number of columns ``T`` in each buffered row.
            kernel_size: Window side length ``K`` (rows and columns).
            num_channels: Channel count; stored but not used by the methods
                of this class.
            stride: Column step between consecutive windows.
        """
        self.tile_size = tile_size
        self.kernel_size = kernel_size
        self.num_channels = num_channels
        self.stride = stride
        self.buffer = []

    def _latest_patches(self):
        """Cut flattened windows from the last ``K`` buffered rows.

        Returns an empty list while fewer than ``K`` rows are buffered.
        Otherwise returns one 1-D array per column offset
        ``0, stride, 2*stride, ... <= T - K``; with rows of shape ``[T, C]``
        each array has ``K * K * C`` elements in (row, column, channel) order.
        """
        K = self.kernel_size
        if len(self.buffer) < K:
            return []

        block = np.stack(self.buffer[-K:], axis=0)  # shape [K, T, C]
        return [
            block[:, j:j + K, :].reshape(-1)
            for j in range(0, self.tile_size - K + 1, self.stride)
        ]

    def stream_row(self, row_data):
        """
        Append one row and return the patches of the newest K-row window.

        row_data: shape [tile_size, num_channels]
        Returns: list of flattened patches (empty until K rows are buffered)
        """
        self.buffer.append(row_data)
        return self._latest_patches()

    def stream_columns(self):
        """
        Return the patches of the last K buffered rows without adding a row.

        Only the most recent K rows are used, so earlier rows in the buffer
        do not contribute to the result.
        Returns: list of flattened patches, one per column offset
        (empty if fewer than K rows are buffered)
        """
        return self._latest_patches()



In [250]:
class KernelLoader:
    """Maps flattened kernels into ``array_size x array_size`` weight matrices."""

    def __init__(self, kernels: np.ndarray, array_size: int):
        """Store the kernel bank and the target array size.

        Args:
            kernels: 4-D array whose shape is unpacked as ``(R, S, C, K)``;
                kernel ``k`` is ``kernels[:, :, :, k]``.
            array_size: Side length of the weight matrix that is produced.
        """
        self.R, self.S, self.C, self.K = kernels.shape
        self.kernels = kernels
        self.array_size = array_size

    def num_chunks(self) -> int:
        """Return how many ``array_size``-long chunks one flattened kernel needs."""
        return -(-(self.R * self.S * self.C) // self.array_size)

    def get_kernel_matrix(self, k: int, chunk_id: Optional[int] = None):
        """
        Flatten one kernel and return padded [array_size, array_size] matrix.

        The weights occupy column 0; every other entry is zero.

        Args:
            k: Index of the kernel along the last axis of ``kernels``.
            chunk_id: ``None`` (default) places the whole flattened kernel and
                requires it to fit in ``array_size`` rows. An integer selects
                elements ``chunk_id*array_size : (chunk_id+1)*array_size`` of
                the flattened kernel, zero-padded; a chunk past the end of the
                kernel yields an all-zero matrix.

        Raises:
            ValueError: If ``chunk_id`` is ``None`` and the flattened kernel is
                longer than ``array_size``, or if ``chunk_id`` is negative.
        """
        k_flat = self.kernels[:, :, :, k].reshape(-1)  # shape [R*S*C]
        n = self.array_size
        padded = np.zeros((n, n))

        if chunk_id is None:
            if len(k_flat) > n:
                # Previously this surfaced as an opaque numpy broadcast error.
                raise ValueError(
                    f"flattened kernel has {len(k_flat)} weights but array_size is {n}; "
                    f"pass chunk_id to load it in {self.num_chunks()} chunks"
                )
            segment = k_flat
        else:
            if chunk_id < 0:
                raise ValueError(f"chunk_id must be non-negative, got {chunk_id}")
            segment = k_flat[chunk_id * n:(chunk_id + 1) * n]

        padded[:len(segment), 0] = segment  # map into first column (common for weight-stationary)
        return padded


In [251]:
# class PE:
#     def __init__(self, links: Optional[List['PE']] = None):
#         if links is None:
#             links = [None, None, None, None]
#         self.links = links
#         self.activation = 0.0
#         self.weight = 0.0
#         self.accumulation = 0.0

# class SystolicArray:
#     def __init__(self, size: int):
#         self.size = size
#         self.array = [[PE() for _ in range(size)] for _ in range(size)]

#     def load_weights(self, weight_matrix):
#         """Load a stationary weight matrix into the SA"""
#         for i in range(self.size):
#             for j in range(self.size):
#                 self.array[i][j].weight = weight_matrix[i, j]

#     def feed_activation_col(self, col_vec):
#         """
#         Feed one Toeplitz column (vector) into the SA, staggered.
#         col_vec: list/np.array of activations (length ≤ size).
#         - Values injected into column 0, one per row, with delay.
#         - Requires calling cycle() repeatedly to flush.
#         """
#         depth = len(col_vec)
#         # Stream with stagger (pipeline fill)
#         for t in range(depth + self.size - 1):
#             for r in range(self.size):
#                 idx = t - r
#                 if 0 <= idx < depth:
#                     self.array[r][0].activation = col_vec[idx]
#                 else:
#                     self.array[r][0].activation = 0.0
#             self.cycle()

#     def cycle(self):
#         """One systolic cycle: MAC + shift activations rightward"""
#         # 1. Compute MAC
#         for i in range(self.size):
#             for j in range(self.size):
#                 pe = self.array[i][j]
#                 pe.accumulation += pe.activation * pe.weight

#         # 2. Shift activations right
#         for i in range(self.size):
#             for j in reversed(range(self.size - 1)):
#                 self.array[i][j + 1].activation = self.array[i][j].activation

#     def collect_output(self):
#         out = np.zeros((self.size, self.size))
#         for i in range(self.size):
#             for j in range(self.size):
#                 out[i, j] = self.array[i][j].accumulation
#         return out

#     def reset_accumulation(self):
#         for i in range(self.size):
#             for j in range(self.size):
#                 self.array[i][j].accumulation = 0.0

In [252]:
class PE:
    """One processing element: holds an activation, a weight and an accumulator."""

    def __init__(self, links: Optional[List['PE']] = None):
        # Without an explicit list, the element gets four empty link slots;
        # a list supplied by the caller is stored as-is (not copied).
        # NOTE(review): what each slot refers to is not shown in this file.
        self.links = [None, None, None, None] if links is None else links
        # Both operand registers and the running sum start at zero.
        self.activation, self.weight, self.accumulation = 0.0, 0.0, 0.0

class SystolicArray:
    """Square ``size x size`` grid of PEs with stationary weights and accumulators."""

    def __init__(self, size: int):
        self.size = size
        grid = []
        for _ in range(size):
            grid.append([PE() for _ in range(size)])
        self.array = grid

    def _cells(self):
        """Yield ``(i, j, pe)`` for every grid position in row-major order."""
        for i in range(self.size):
            for j in range(self.size):
                yield i, j, self.array[i][j]

    def load_weights(self, weight_matrix):
        """Copy ``weight_matrix[i, j]`` into the weight register of PE ``(i, j)``."""
        for i, j, pe in self._cells():
            pe.weight = weight_matrix[i, j]

    def feed_activation_row(self, row):
        """Write ``row[j]`` into the activation register of PE ``(0, j)``."""
        for j, value in enumerate(row):
            self.array[0][j].activation = value

    def cycle(self):
        """Run one step: every PE does a MAC, then activations move down one row."""
        # Multiply-accumulate in place using the activations currently held.
        for _, _, pe in self._cells():
            pe.accumulation += pe.activation * pe.weight
        # Walk from the second-to-last row upwards so each value is copied
        # before it is overwritten; row 0 keeps its current activation.
        for i in range(self.size - 2, -1, -1):
            upper, lower = self.array[i], self.array[i + 1]
            for j in range(self.size):
                lower[j].activation = upper[j].activation

    def collect_output(self):
        """Return the accumulators as a ``[size, size]`` float array."""
        out = np.zeros((self.size, self.size))
        for i, j, pe in self._cells():
            out[i, j] = pe.accumulation
        return out

    def reset_accumulation(self):
        """Zero every accumulator; weights and activations are left alone."""
        for _, _, pe in self._cells():
            pe.accumulation = 0.0

In [253]:
# ==========================================================
# 6. Accumulator
# ==========================================================
class PsumBuffer:
    """Collects per-tile partial results and sums them on request."""

    def __init__(self):
        # Partial outputs in the order they were accumulated.
        self.outputs = []

    def accumulate(self, tile_output):
        """Record one partial output; nothing is summed until ``get_final``."""
        self.outputs.append(tile_output)

    def get_final(self):
        """Return the element-wise sum of all recorded outputs (reduction over axis 0)."""
        total = np.sum(self.outputs, axis=0)
        return total

In [254]:
class ConvEngine:
    """Tiled 2-D convolution (no padding) evaluated on the simulated PE grid.

    Dataflow implemented by ``run``:

    * Kernels are processed in groups of up to ``array_size``; kernel ``k`` of a
      group owns one PE column.
    * Each flattened kernel (``R*S*C`` weights) is split into chunks of
      ``array_size`` weights; one chunk at a time is held stationary in the
      rows of the kernel's column.
    * For every tile, the sliding-window patches are cut with
      ``ToeplitzBuffer`` and the matching chunk of each patch is streamed with
      a one-cycle stagger per column (diagonal wavefront).
    * After each cycle the per-PE products of a column are summed and added to
      the output element of the patch that column just processed, so partial
      sums accumulate across chunks.

    Only square kernels are supported, because the tiling and patch classes
    take a single kernel size.
    """

    def __init__(self, input_tensor, kernels, array_size=32, stride=1, verbose=True):
        """Store operands and build the PE grid.

        Args:
            input_tensor: 3-D array unpacked as ``(H, W, C)``.
            kernels: 4-D array unpacked as ``(R, S, C, K)``.
            array_size: Side length of the PE grid and of a tile.
            stride: Convolution stride in both dimensions.
            verbose: When true, progress is logged with ``print``.
        """
        self.input_tensor = input_tensor
        self.kernels = kernels
        self.array_size = array_size
        self.stride = stride
        self.verbose = verbose

        self.H, self.W, self.C = input_tensor.shape
        self.R, self.S, self.Ck, self.K = kernels.shape
        assert self.C == self.Ck, "Channel mismatch!"

        self.sa = SystolicArray(array_size)

    def _check_supported(self):
        """Raise ``ValueError`` for configurations this dataflow cannot compute."""
        if self.R != self.S:
            raise ValueError(
                f"only square kernels are supported, got {self.R}x{self.S}")
        if self.stride < 1:
            raise ValueError(f"stride must be >= 1, got {self.stride}")
        if self.array_size < self.R:
            # A tile narrower than the kernel cannot contain a single window.
            raise ValueError(
                f"array_size ({self.array_size}) must be >= kernel size ({self.R})")

    def _clear_activations(self):
        """Zero every activation register so idle columns contribute nothing."""
        for pe_row in self.sa.array:
            for pe in pe_row:
                pe.activation = 0.0

    def _chunk_weights(self, k_start, k_end, chunk_id, num_chunks):
        """Build the weight matrix holding chunk ``chunk_id`` of kernels ``k_start..k_end-1``.

        Column ``col`` receives weights ``chunk_id*array_size ...`` of the
        flattened kernel ``k_start + col``; missing entries stay zero.
        """
        T = self.array_size
        weight_matrix = np.zeros((T, T))
        start = chunk_id * T

        for col, k in enumerate(range(k_start, k_end)):
            k_flat = self.kernels[:, :, :, k].reshape(-1)
            end = min(start + T, len(k_flat))
            if start < end:
                weight_matrix[:end - start, col] = k_flat[start:end]

            if self.verbose:
                print(f"[Kernel {k}] Chunk {chunk_id+1}/{num_chunks} loaded "
                      f"indices {start}:{end} into column {col}")

        return weight_matrix

    def _tile_patches(self, spm, tile_start, out_H, out_W):
        """Collect the patches of one tile that correspond to real output elements.

        Returns ``(coords, patches)``: ``coords[i]`` is the ``(out_row, out_col)``
        of ``patches[i]``, a flattened ``R*S*C`` window in the same element
        order as a flattened kernel.
        """
        r0, c0 = tile_start
        T, K, s = self.array_size, self.R, self.stride
        # Stride 1 here: windows are filtered against the absolute stride grid
        # below, because a tile start need not be a multiple of the stride.
        toeplitz = ToeplitzBuffer(T, K, self.C, stride=1)

        coords, patches = [], []
        for r_idx in range(T):
            windows = toeplitz.stream_row(spm.read_row(tile_start, r_idx))
            if not windows:
                continue  # fewer than K rows buffered so far
            top = r0 + r_idx - (K - 1)  # absolute input row of the window's first row
            if top % s != 0:
                continue
            out_row = top // s
            if out_row >= out_H:
                break  # this and all later windows reach into the zero padding
            for j, patch in enumerate(windows):
                left = c0 + j  # absolute input column of the window's first column
                if left % s != 0:
                    continue
                out_col = left // s
                if out_col >= out_W:
                    break
                coords.append((out_row, out_col))
                patches.append(patch)
        return coords, patches

    def _stream_patches(self, coords, patches, chunk_id, k_start, k_end, output):
        """Stream one chunk of every patch through the grid and accumulate psums.

        Column ``c`` processes patch ``t - c`` at cycle ``t`` (zeros outside the
        valid range). The accumulators are cleared before each cycle, so after
        the cycle the sum over a column is exactly the dot product of the
        stationary weight chunk with the activation chunk that column holds.
        """
        if not patches:
            return

        T = self.array_size
        n_cols = k_end - k_start
        lo = chunk_id * T

        # Slice the chunk of each patch that lines up with the loaded weights.
        act_chunks = []
        for patch in patches:
            vec = np.zeros(T)
            segment = patch[lo:lo + T]
            vec[:len(segment)] = segment
            act_chunks.append(vec)
        idle = np.zeros(T)
        num_patches = len(act_chunks)

        for t in range(num_patches + n_cols - 1):
            self.sa.reset_accumulation()
            for col in range(n_cols):
                idx = t - col
                vec = act_chunks[idx] if 0 <= idx < num_patches else idle
                # Every row of the column is rewritten each cycle, which
                # overrides the downward activation shift done by cycle().
                for r in range(T):
                    self.sa.array[r][col].activation = vec[r]

            self.sa.cycle()

            col_sums = self.sa.collect_output().sum(axis=0)
            for col in range(n_cols):
                idx = t - col
                if 0 <= idx < num_patches:
                    out_row, out_col = coords[idx]
                    output[out_row, out_col, k_start + col] += col_sums[col]

    def run(self):
        """Compute the convolution and return it as an ``[out_H, out_W, K]`` array.

        Raises:
            ValueError: For non-square kernels, a stride below 1, or an
                ``array_size`` smaller than the kernel.

        Note: one simulated cycle is executed per patch per weight chunk, so
        the run time grows with ``out_H * out_W * ceil(R*S*C / array_size)``.
        """
        self._check_supported()
        T = self.array_size
        spm = ScratchpadMemory(self.input_tensor, T, self.R,
                               stride=self.stride, verbose=self.verbose)

        # Output dimensions
        out_H = (self.H - self.R) // self.stride + 1
        out_W = (self.W - self.S) // self.stride + 1
        output = np.zeros((out_H, out_W, self.K))

        # 1. Get all tile start addresses
        tile_addrs = spm.generate_tile_addresses()
        if self.verbose:
            print(f"\n[Engine] === Tiling complete: {len(tile_addrs)} tiles ===")

        # Every kernel has the same flattened length, hence the same chunk count.
        num_chunks = int(np.ceil(self.R * self.S * self.C / T))

        # 2. Process kernels in groups of SA width
        num_groups = int(np.ceil(self.K / T))
        for g in range(num_groups):
            k_start = g * T
            k_end = min(k_start + T, self.K)

            if self.verbose:
                print(f"\n[Engine] === Processing kernel group {g} ({k_start}..{k_end-1}) ===")

            # Columns not used by this group must not hold stale activations.
            self._clear_activations()

            for chunk_id in range(num_chunks):
                # Weights stay stationary while all tiles are streamed past them.
                self.sa.load_weights(
                    self._chunk_weights(k_start, k_end, chunk_id, num_chunks))

                for tile_start in tile_addrs:
                    if self.verbose:
                        row, col_start = tile_start
                        print(f"\n[Engine] Processing tile at start=({row},{col_start}) "
                              f"for chunk {chunk_id+1}")

                    coords, patches = self._tile_patches(spm, tile_start, out_H, out_W)
                    self._stream_patches(coords, patches, chunk_id, k_start, k_end, output)

        # Leave the grid with clean accumulators for any later use.
        self.sa.reset_accumulation()
        return output
    


In [255]:
if __name__ == "__main__":
    # Demo: run the simulated engine on random data and compare it with a
    # straightforward reference convolution.
    H, W, C = 64, 64, 3
    R = S = 4
    num_kernels = 4
    stride = 1

    # Fixed seed so that re-running the cell reproduces the same numbers.
    rng = np.random.default_rng(0)
    input_tensor = rng.integers(0, 5, (H, W, C)).astype(float)
    kernels = rng.integers(0, 5, (R, S, C, num_kernels)).astype(float)

    engine = ConvEngine(input_tensor, kernels, stride=stride, array_size=32, verbose=True)
    sa_output = engine.run()

    # Golden convolution
    def conv2d_stride(input_tensor, kernel, stride=1):
        """Reference convolution without padding: one dot product per output element."""
        H, W, C = input_tensor.shape
        R, S, _, K = kernel.shape
        out_H = (H - R)//stride + 1
        out_W = (W - S)//stride + 1
        out = np.zeros((out_H, out_W, K))
        for k in range(K):
            for i in range(out_H):
                for j in range(out_W):
                    patch = input_tensor[i*stride:i*stride+R, j*stride:j*stride+S, :]
                    out[i,j,k] = np.sum(patch * kernel[:,:,:,k])
        return out

    golden = conv2d_stride(input_tensor, kernels, stride=stride)
    print("SA Simulator Output Shape:", sa_output.shape)
    print("Golden Conv Shape:", golden.shape)
    for k in range(sa_output.shape[2]):
        diff = np.max(np.abs(sa_output[:,:,k] - golden[:,:,k]))
        print(f"Kernel {k} max absolute difference: {diff}")

    # Report (rather than assert) the overall verdict so the cell always
    # finishes and the per-kernel details above stay visible.
    matches = sa_output.shape == golden.shape and np.allclose(sa_output, golden)
    print(f"All kernels match golden: {matches}")

    # Print some sample values for debugging
    for k in range(sa_output.shape[2]):
        print(f"\nSample outputs (kernel {k}, first few positions):")
        print(f"SA output: {sa_output[0:3, 0:3, k]}")
        print(f"Golden:    {golden[0:3, 0:3, k]}")

# Status (last recorded run): tiling and kernel loading are correct and the output
# shapes match, but the values differed from the golden convolution by around 200.
# Next steps:
#   - Look into how psums are handled, and into the rotation (transpose) of the SA
#     output at the end of the operation.
#   - Check whether the Toeplitz data is streamed column-wise or row-wise into the SA;
#     it needs to use column-staggered Toeplitz streaming (with initial zeros).
#   - Decide where psums are collected:
#       * After a chunk: psums are accumulated internally (still incomplete).
#       * After a kernel (all chunks done): psums are final for that kernel and tile region.
#       * After a tile: flush the psums out and move on to the next tile.


[SPM] Generating tile addresses...
  Input shape = (64,64,3), Tile size = 32, Kernel = 4, Stride = 1
  Step sizes -> row_step = 29, col_step = 29
  Expected total tiles = 3 x 3 = 9

  -> Tile (0,0): start=(0,0), cover=(0:32, 0:32), pad_rows=0, pad_cols=0
  -> Tile (0,1): start=(0,29), cover=(0:32, 29:61), pad_rows=0, pad_cols=0
  -> Tile (0,2): start=(0,58), cover=(0:32, 58:90), pad_rows=0, pad_cols=26
  -> Tile (1,0): start=(29,0), cover=(29:61, 0:32), pad_rows=0, pad_cols=0
  -> Tile (1,1): start=(29,29), cover=(29:61, 29:61), pad_rows=0, pad_cols=0
  -> Tile (1,2): start=(29,58), cover=(29:61, 58:90), pad_rows=0, pad_cols=26
  -> Tile (2,0): start=(58,0), cover=(58:90, 0:32), pad_rows=26, pad_cols=0
  -> Tile (2,1): start=(58,29), cover=(58:90, 29:61), pad_rows=26, pad_cols=0
  -> Tile (2,2): start=(58,58), cover=(58:90, 58:90), pad_rows=26, pad_cols=26

[Engine] === Tiling complete: 9 tiles ===

[Engine] === Processing kernel group 0 (0..3) ===
[Kernel 0] Chunk 1/2 loaded indices 