In [1]:
import numpy as np
import pandas as pd
from typing import List, Optional

In [2]:
# Show every row when a DataFrame is displayed (no truncation).
pd.options.display.max_rows = None

In [3]:
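# Observed timings, one configuration per line:
# activation size, kernel size, padding, stride -> time step at which the last partial sum is emitted
# 4x4, 4x4, 0, 1 -> 10
# 5x5, 4x4, 0, 1 -> 22
# 6x6, 4x4, 0, 1 -> 42
# 7x7, 4x4, 0, 1 -> 70
# 5x5, 3x3, 0, 1 -> 31
# 5x5, 3x3, 0, 2 -> 16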

In [18]:
def _output_size(I, K, P, S): 
    return int((I - K + 2*P)/(S)) + 1

# Convolution hyper-parameters used by every cell below.
kernel_size, padding, stride = 4, 0, 1

# Side length of the square activation map.
activations_size = 7

# The output is square: `output_size` positions per axis, one patch per position.
output_size = _output_size(activations_size, kernel_size, padding, stride)
num_patches = output_size * output_size

In [19]:
# Display the output feature-map shape as "<rows>x<cols>".
"{0}x{0}".format(output_size)

'4x4'

In [20]:
# Symbolic activation map: cell (row, col) holds the label "A<row><col>".
I, J = np.indices((activations_size,) * 2)
activations = np.char.add('A', np.char.add(I.astype(str), J.astype(str)))

In [21]:
# Symbolic kernel: cell (row, col) holds the label "K<row><col>".
I, J = np.indices((kernel_size,) * 2)
kernel = np.char.add('K', np.char.add(I.astype(str), J.astype(str)))

In [22]:
equations = []
patches = []
inv_patches = []

# Number of kernel positions along each axis (padding is not applied here).
num_rows = (activations_size - kernel_size) // stride + 1
num_cols = (activations_size - kernel_size) // stride + 1

# equations_matrix[r][c] holds the symbolic dot product for output cell (r, c).
equations_matrix = [[None] * num_cols for _ in range(num_rows)]

last_start = activations_size - kernel_size
for row_idx, i in enumerate(range(0, last_start + 1, stride)):
    for col_idx, j in enumerate(range(0, last_start + 1, stride)):
        # Window of activations covered by the kernel at (i, j), plus its transpose.
        patch = activations[i:i+kernel_size, j:j+kernel_size]
        inv_patch = patch.T
        patches.append(patch)
        inv_patches.append(inv_patch)

        # One "activation*weight" term per kernel cell, in row-major order.
        terms = [
            f"{patch[a, b]}*{kernel[a, b]}"
            for a in range(kernel_size)
            for b in range(kernel_size)
        ]
        equation = " + ".join(terms)
        equations_matrix[row_idx][col_idx] = equation

In [23]:
# Maps a partial-sum expression to its label "C<row><col>_<group>";
# the all-zero partial sum "0" maps to the placeholder '000_0'.
split_equations_dict = {'0': '000_0'}

for row_id, equation_row in enumerate(equations_matrix):
    for col_id, equation in enumerate(equation_row):
        terms = equation.split(" + ")

        # Cut the full equation into kernel_size groups of kernel_size terms,
        # i.e. one group per kernel row.
        split_eq = [
            " + ".join(terms[group*kernel_size : (group+1)*kernel_size])
            for group in range(kernel_size)
        ]

        for i, eq in enumerate(split_eq):
            split_equations_dict[eq] = f"C{row_id}{col_id}_{i}"

In [24]:
# Convert the collected patch lists into arrays (one entry per patch).
patches, inv_patches = np.array(patches), np.array(inv_patches)

In [25]:
class PE:
    """One cell of the grid (PE presumably stands for "processing element").

    A cell stores three symbolic strings: the activation it currently sees,
    its weight, and its accumulated expression. It also stores references to
    its four neighbours.
    """

    # links: [0: left, 1: up, 2: right, 3: down]
    def __init__(self, links: Optional[List['PE']] = None):
        """Create a cell with no activation ("0"), no weight ('--') and an empty sum ("0").

        Args:
            links: neighbour cells in the order [left, up, right, down];
                defaults to four missing neighbours.
        """
        self.links = [None, None, None, None] if links is None else links
        self.activation = "0"
        self.weight = '--'
        self.accumulation: str = "0"

    def _input(self, activation: str):
        """Overwrite the activation held by this cell."""
        self.activation = activation

    def _weight(self, weight: str):
        """Overwrite the weight held by this cell."""
        self.weight = weight

    def shift(self, shift_direction: int, activation_flag: bool):
        """Copy this cell's activation or weight into one neighbour.

        Args:
            shift_direction: index into `links` (0 left, 1 up, 2 right, 3 down).
            activation_flag: True copies the activation, False copies the weight.

        Does nothing when there is no neighbour in that direction.
        """
        target = self.links[shift_direction]
        if target is None:
            return
        if activation_flag:
            target._input(self.activation)
        else:
            target._weight(self.weight)

class SystolicArray:
    """Square grid of PE cells that simulates a systolic array symbolically.

    Weights stay in their cells. On every cycle the activations move one
    column to the right and each cell's accumulated expression moves one row
    down, picking up that cell's "activation*weight" product on the way.

    Attributes:
        size: number of rows (and columns) of the grid.
        buffer: one list per cycle, holding the bottom-row accumulations as
            they were just before that cycle's update.
        array: the size x size grid of cells, indexed as array[row][col].
    """

    # Column widths used by print_array. Both are independent of the grid
    # size so the printout does not rely on notebook globals.
    _LABEL_WIDTH = 4
    _TERM_WIDTH = 8

    def __init__(self, size: int):
        self.size = size
        self.buffer = []
        self.array = self._setup_array()

    def _setup_array(self):
        """Create the size x size grid and link every cell to its neighbours."""
        array = [[PE() for _ in range(self.size)] for _ in range(self.size)]
        for i in range(self.size):
            for j in range(self.size):
                left = array[i][j-1] if j > 0 else None
                up = array[i-1][j] if i > 0 else None
                right = array[i][j+1] if j < self.size - 1 else None
                down = array[i+1][j] if i < self.size - 1 else None
                array[i][j].links = [left, up, right, down]
        return array

    @staticmethod
    def _product(cell) -> str:
        """Return "activation*weight" for a cell, or "0" when it holds no activation."""
        if cell.activation in ("0", "--"):
            return "0"
        return f"{cell.activation}*{cell.weight}"

    def print_array(self):
        """Print one line per grid row showing activation, weight and accumulation.

        Every accumulation is shown as exactly `size` terms: missing terms
        are padded with "0" and extra terms are cut off.
        """
        for row in self.array:
            row_parts = []
            for pe in row:
                act_str = pe.activation.ljust(self._LABEL_WIDTH)
                weight_str = pe.weight.ljust(self._LABEL_WIDTH)

                if pe.accumulation.strip() == "0":
                    parts = []
                else:
                    parts = [p.strip() for p in pe.accumulation.split('+')]
                # Normalise to `size` terms so the columns line up.
                parts = (parts + ["0"] * self.size)[:self.size]
                acc_str = " + ".join(p.ljust(self._TERM_WIDTH) for p in parts)

                row_parts.append(f"[A:{act_str} | W:{weight_str} | Acc:{acc_str}]")
            print(" | ".join(row_parts))

    def cycle(self, new_activations: Optional[List[str]] = None):
        """Advance the array by one clock cycle.

        Args:
            new_activations: one activation per row to feed into column 0.
                When None or empty the activations are not shifted.

        Raises:
            ValueError: if fewer than `size` activations are supplied. This
                is checked before any cell is modified.
        """
        if new_activations:
            if len(new_activations) < self.size:
                raise ValueError(
                    f"expected {self.size} activations, got {len(new_activations)}"
                )
            for i in range(self.size):
                row = self.array[i]
                # Shift right-to-left so each value moves by exactly one column.
                for j in range(self.size - 1, 0, -1):
                    row[j].activation = row[j-1].activation
                row[0].activation = new_activations[i]

        # Record what the bottom row held before this cycle's update.
        bottom_row = [self.array[self.size - 1][j].accumulation for j in range(self.size)]
        self.buffer.append(bottom_row)

        for j in range(self.size):
            # Snapshot the column so each row inherits last cycle's value
            # from the row above, not the one just written.
            prev_acc = [self.array[i][j].accumulation for i in range(self.size)]

            # The top row has nothing above it: it restarts from its own product.
            top = self.array[0][j]
            top.accumulation = self._product(top)

            for i in range(1, self.size):
                cell = self.array[i][j]
                prod = self._product(cell)
                inherited = prev_acc[i-1]
                if inherited == "0":
                    new_acc = prod
                elif prod == "0":
                    new_acc = inherited
                else:
                    new_acc = f"{inherited} + {prod}"
                # An empty string can only come from outside this class; treat it as zero.
                cell.accumulation = new_acc or "0"

In [26]:
class Conv2dSimulator:
    """Feeds flattened convolution patches into a SystolicArray and decodes its outputs.

    Attributes:
        input_buffers: one FIFO list per array row. Row r starts with r
            leading "0" entries, so its first real activation arrives r
            cycles after row 0's.
        grid_size: side length of the systolic array.
        systolic_array: the simulated SystolicArray.
        outputs: one list of decoded labels per simulated cycle.

    Note:
        `cycle` looks up labels in the module-level `split_equations_dict`.
    """

    def __init__(self, grid_size: int):
        self.input_buffers = self._skewed_buffers(grid_size)

        self.grid_size = grid_size
        self.systolic_array = SystolicArray(grid_size)
        self.outputs = []

    @staticmethod
    def _skewed_buffers(grid_size: int) -> List[List[str]]:
        """Return `grid_size` buffers where buffer r is pre-filled with r "0" entries."""
        return [["0"] * row for row in range(grid_size)]

    def add_patches(self, patches: List[np.ndarray]):
        """Queue patches for streaming into the array.

        Each patch is flattened and cut into chunks of `grid_size` values.
        Element j of every chunk is appended to input buffer j.

        Raises:
            ValueError: if a flattened patch is not a whole number of chunks.
                This is checked before that patch touches any buffer.
        """
        for patch in patches:
            flat_patch = patch.flatten().tolist()
            if self.grid_size <= 0 or len(flat_patch) % self.grid_size != 0:
                raise ValueError(
                    f"patch with {len(flat_patch)} values cannot be split into "
                    f"chunks of {self.grid_size}"
                )

            for start in range(0, len(flat_patch), self.grid_size):
                chunk = flat_patch[start:start + self.grid_size]
                # Stride the flattened sub-patch across the input buffers.
                for row, value in enumerate(chunk):
                    self.input_buffers[row].append(value)

    def cycle(self):
        """Run one cycle: feed the next activations, step the array, decode the output.

        Returns:
            The decoded labels for the bottom-row partial sums recorded this
            cycle. Expressions missing from `split_equations_dict` are
            reported as 'Cxx_x'.
        """
        new_activations = []
        for i in range(self.systolic_array.size):
            pending = self.input_buffers[i]
            # A drained buffer keeps feeding zeros so the pipeline can flush.
            new_activations.append(pending.pop(0) if pending else "0")

        print("New activations:", new_activations)
        self.systolic_array.cycle(new_activations=new_activations)
        self.systolic_array.print_array()
        output = self.systolic_array.buffer[-1]
        # Only a missing key means "unknown partial sum"; any other error
        # (for example the lookup table not being defined) should surface.
        translated_equations = [split_equations_dict.get(_out, 'Cxx_x') for _out in output]
        self.outputs.append(translated_equations)
        print("Output: ", translated_equations)
        return translated_equations

In [28]:
# Build a kernel_size x kernel_size simulator and queue every patch.
conv_sim = Conv2dSimulator(kernel_size)
conv_sim.add_patches(patches)
print(conv_sim.input_buffers[0])

# Load the transposed kernel, so the cell at (i, j) holds kernel[j, i].
kernel_test = kernel.T
for i, pe_row in enumerate(conv_sim.systolic_array.array):
    for j, pe in enumerate(pe_row):
        pe.weight = kernel_test[i, j]

# Step the array 100 times, with a blank line between cycle printouts.
for cycle_num in range(100):
    out = conv_sim.cycle()
    print()
    print()

['A00', 'A10', 'A20', 'A30', 'A01', 'A11', 'A21', 'A31', 'A02', 'A12', 'A22', 'A32', 'A03', 'A13', 'A23', 'A33', 'A10', 'A20', 'A30', 'A40', 'A11', 'A21', 'A31', 'A41', 'A12', 'A22', 'A32', 'A42', 'A13', 'A23', 'A33', 'A43', 'A20', 'A30', 'A40', 'A50', 'A21', 'A31', 'A41', 'A51', 'A22', 'A32', 'A42', 'A52', 'A23', 'A33', 'A43', 'A53', 'A30', 'A40', 'A50', 'A60', 'A31', 'A41', 'A51', 'A61', 'A32', 'A42', 'A52', 'A62', 'A33', 'A43', 'A53', 'A63']
New activations: ['A00', '0', '0', '0']
[A:A00  | W:K00  | Acc:A00*K00  + 0        + 0        + 0       ] | [A:0    | W:K10  | Acc:0        + 0        + 0        + 0       ] | [A:0    | W:K20  | Acc:0        + 0        + 0        + 0       ] | [A:0    | W:K30  | Acc:0        + 0        + 0        + 0       ]
[A:0    | W:K01  | Acc:0        + 0        + 0        + 0       ] | [A:0    | W:K11  | Acc:0        + 0        + 0        + 0       ] | [A:0    | W:K21  | Acc:0        + 0        + 0        + 0       ] | [A:0    | W:K31  | Acc:0        + 0  

In [14]:
# Print the full symbolic equation of every output cell, in row-major order.
for equation_row in equations_matrix:
    for full_equation in equation_row:
        print(full_equation)

A00*K00 + A01*K01 + A02*K02 + A03*K03 + A10*K10 + A11*K11 + A12*K12 + A13*K13 + A20*K20 + A21*K21 + A22*K22 + A23*K23 + A30*K30 + A31*K31 + A32*K32 + A33*K33
A02*K00 + A03*K01 + A04*K02 + A05*K03 + A12*K10 + A13*K11 + A14*K12 + A15*K13 + A22*K20 + A23*K21 + A24*K22 + A25*K23 + A32*K30 + A33*K31 + A34*K32 + A35*K33
A20*K00 + A21*K01 + A22*K02 + A23*K03 + A30*K10 + A31*K11 + A32*K12 + A33*K13 + A40*K20 + A41*K21 + A42*K22 + A43*K23 + A50*K30 + A51*K31 + A52*K32 + A53*K33
A22*K00 + A23*K01 + A24*K02 + A25*K03 + A32*K10 + A33*K11 + A34*K12 + A35*K13 + A42*K20 + A43*K21 + A44*K22 + A45*K23 + A52*K30 + A53*K31 + A54*K32 + A55*K33


In [15]:
# Show the lookup table from partial-sum expression to its "C<row><col>_<group>" label.
split_equations_dict

{'0': '000_0',
 'A00*K00 + A01*K01 + A02*K02 + A03*K03': 'C00_0',
 'A10*K10 + A11*K11 + A12*K12 + A13*K13': 'C00_1',
 'A20*K20 + A21*K21 + A22*K22 + A23*K23': 'C00_2',
 'A30*K30 + A31*K31 + A32*K32 + A33*K33': 'C00_3',
 'A02*K00 + A03*K01 + A04*K02 + A05*K03': 'C01_0',
 'A12*K10 + A13*K11 + A14*K12 + A15*K13': 'C01_1',
 'A22*K20 + A23*K21 + A24*K22 + A25*K23': 'C01_2',
 'A32*K30 + A33*K31 + A34*K32 + A35*K33': 'C01_3',
 'A20*K00 + A21*K01 + A22*K02 + A23*K03': 'C10_0',
 'A30*K10 + A31*K11 + A32*K12 + A33*K13': 'C10_1',
 'A40*K20 + A41*K21 + A42*K22 + A43*K23': 'C10_2',
 'A50*K30 + A51*K31 + A52*K32 + A53*K33': 'C10_3',
 'A22*K00 + A23*K01 + A24*K02 + A25*K03': 'C11_0',
 'A32*K10 + A33*K11 + A34*K12 + A35*K13': 'C11_1',
 'A42*K20 + A43*K21 + A44*K22 + A45*K23': 'C11_2',
 'A52*K30 + A53*K31 + A54*K32 + A55*K33': 'C11_3'}

In [16]:
# Show the stacked patches: one kernel_size x kernel_size window of activation labels per entry.
patches

array([[['A00', 'A01', 'A02', 'A03'],
        ['A10', 'A11', 'A12', 'A13'],
        ['A20', 'A21', 'A22', 'A23'],
        ['A30', 'A31', 'A32', 'A33']],

       [['A02', 'A03', 'A04', 'A05'],
        ['A12', 'A13', 'A14', 'A15'],
        ['A22', 'A23', 'A24', 'A25'],
        ['A32', 'A33', 'A34', 'A35']],

       [['A20', 'A21', 'A22', 'A23'],
        ['A30', 'A31', 'A32', 'A33'],
        ['A40', 'A41', 'A42', 'A43'],
        ['A50', 'A51', 'A52', 'A53']],

       [['A22', 'A23', 'A24', 'A25'],
        ['A32', 'A33', 'A34', 'A35'],
        ['A42', 'A43', 'A44', 'A45'],
        ['A52', 'A53', 'A54', 'A55']]], dtype='<U43')

In [29]:
# Tabulate the decoded outputs: one row per simulated cycle, one column per systolic-array column.
pd.DataFrame(conv_sim.outputs)

Unnamed: 0,0,1,2,3
0,000_0,000_0,000_0,000_0
1,000_0,000_0,000_0,000_0
2,000_0,000_0,000_0,000_0
3,000_0,000_0,000_0,000_0
4,C00_0,000_0,000_0,000_0
5,C10_0,Cxx_x,000_0,000_0
6,C20_0,C00_1,Cxx_x,000_0
7,C30_0,C10_1,Cxx_x,Cxx_x
8,C01_0,C20_1,C00_2,Cxx_x
9,C11_0,Cxx_x,C10_2,Cxx_x


In [None]:
# Cxx = sum of kernel_size partial sums, each written as "TimeStep,BufferIndex"
# Each "TimeStep,BufferIndex" pair is the PSUM that leaves output column BufferIndex at cycle TimeStep

In [None]:
# KernelSize = 3x3
# ActivationSize = 5x5
# OutputSize = 3x3
# Stride = 1


# C00 = 3,0 + 5,1 + 7,2
# C01 = 6,0 + 8,1 + 10,2 
# C02 = 9,0 + 11,1 + 13,2 

# C10 = 12,0 + 14,1 + 16,2 
# C11 = 15,0 + 17,1 + 19,2 
# C12 = 18,0 + 20,1 + 22,2 

# C20 = 21,0 + 23,1 + 25,2 
# C21 = 24,0 + 26,1 + 28,2 
# C22 = 27,0 + 29,1 + 31,2 

In [None]:
# KernelSize = 4x4 
# ActivationSize = 6x6
# OutputSize = 3x3
# Stride = 1


# C00 = 4,0 + 6,1 + 8,2 + 10,3
# C01 = 8,0 + 10,1 + 12,2 + 14,3
# C02 = 12,0 + 14,1 + 16,2 + 18,3

# C10 = 16,0 + 18,1 + 20,2 + 22,3
# C11 = 20,0 + 22,1 + 24,2 + 26,3
# C12 = 24,0 + 26,1 + 28,2 + 30,3

# C20 = 28,0 + 30,1 + 32,2 + 34,3
# C21 = 32,0 + 34,1 + 36,2 + 38,3
# C22 = 36,0 + 38,1 + 40,2 + 42,3

In [None]:
# KernelSize = 4x4 
# ActivationSize = 7x7
# OutputSize = 4x4
# Stride = 1


# C00 = 4,0 + 6,1 + 8,2 + 10,3
# C01 = 8,0 + 10,1 + 12,2 + 14,3
# C02 = 12,0 + 14,1 + 16,2 + 18,3
# C03 = 16,0 + 18,1 + 20,2 + 22,3

# C10 = 20,0 + 22,1 + 24,2 + 26,3
# C11 = 24,0 + 26,1 + 28,2 + 30,3
# C12 = 28,0 + 30,1 + 32,2 + 34,3
# C13 = 32,0 + 34,1 + 36,2 + 38,3

# C20 = 36,0 + 38,1 + 40,2 + 42,3
# C21 = 40,0 + 42,1 + 44,2 + 46,3
# C22 = 44,0 + 46,1 + 48,2 + 50,3
# C23 = 48,0 + 50,1 + 52,2 + 54,3

# C30 = 52,0 + 54,1 + 56,2 + 58,3
# C31 = 56,0 + 58,1 + 60,2 + 62,3
# C32 = 60,0 + 62,1 + 64,2 + 66,3
# C33 = 64,0 + 66,1 + 68,2 + 70,3

In [None]:
# KernelSize = 4x4 
# ActivationSize = 7x7
# OutputSize = 2x2
# Stride = 2


# C00 = 4,0 + 6,1 + 8,2 + 10,3
# C01 = 8,0 + 10,1 + 12,2 + 14,3

# C10 = 12,0 + 14,1 + 16,2 + 18,3
# C11 = 16,0 + 18,1 + 20,2 + 22,3

In [54]:
def route_time(T, activation_size, output_size, kernel_size, stride, delay):
    """Tell which output cell each array column's partial sum belongs to at time T.

    Column k emits a useful partial sum at T = kernel_size + delay*k +
    kernel_size*Q, where Q is the row-major index of an output cell. The
    result can be used as an enable mask (None means "ignore this column")
    together with the (i, j) destination for writing the value back to a
    scratchpad.

    Args:
        T: time step (cycle index) being routed.
        activation_size: unused; kept for call compatibility.
        output_size: side length of the square output.
        kernel_size: number of columns to route (keys of the result).
        stride: unused; kept for call compatibility.
        delay: extra cycles between consecutive columns' partial sums.

    Returns:
        dict mapping each column index k to an (i, j) output coordinate, or
        None when that column carries nothing useful at time T.
    """

    def _destination(column):
        # Cycles elapsed since this column's first useful partial sum.
        offset = T - kernel_size - (delay * column)
        if offset < 0:
            return None
        patch_index, leftover = divmod(offset, kernel_size)
        if leftover != 0:
            return None
        out_row, out_col = divmod(patch_index, output_size)
        if out_row < output_size and out_col < output_size:
            return (out_row, out_col)
        return None

    return {column: _destination(column) for column in range(kernel_size)}

In [55]:
delay = 2
# Open question: why is the delay always 2? Every other cycle looks useless, yet is needed to propagate.
# NOTE(review): possibly one cycle because patch row k is fed k cycles after row 0, plus one
# cycle per column for the activations to travel right to column k; confirm against the simulator.
for time_step in range(10):
    print(route_time(time_step, activations_size, output_size, kernel_size, stride, delay))

{0: None, 1: None, 2: None}
{0: None, 1: None, 2: None}
{0: None, 1: None, 2: None}
{0: (0, 0), 1: None, 2: None}
{0: None, 1: None, 2: None}
{0: None, 1: (0, 0), 2: None}
{0: (0, 1), 1: None, 2: None}
{0: None, 1: None, 2: (0, 0)}
{0: None, 1: (0, 1), 2: None}
{0: (0, 2), 1: None, 2: None}
