In [1]:
import numpy as np
import pandas as pd
from typing import List, Optional

In [2]:
pd.set_option('display.max_rows', None)

In [20]:
# Input tensor shape: [N, H, W, C]
# Weight tensor shape: [R, S, C, K] 
# Output tensor shape: [N, P, Q, K]

def _output_size(I, K, P, S): 
    return int((I - K + 2*P)/(S)) + 1
    
N = 1 # batch size

H = 5 # activation height, activation width
W = 7
C = 1   # input channels <= strictly, grid_size (max PE size)
 
R = S = kernel_size = 4 # kernel height, kernel width
K = 1   # kernel channels, <= strictly, grid_size (max PE size)

# C and K don't need to be equal 

padding = 0
stride = 1

P = _output_size(H, R, padding, stride)
Q = _output_size(W, S, padding, stride)
num_patches = P * Q 

input_shape = (N, H, W, C)
weight_shape = (R, S, C, K)
output_shape = (N, P, Q, K)

In [21]:
output_shape

(1, 2, 4, 1)

In [22]:
N_idx, H_idx, W_idx, C_idx = np.indices((N, H, W, C))
activations = np.char.add(
    np.char.add(
        np.char.add(
            np.char.add('A', N_idx.astype(str)),
            H_idx.astype(str)
        ),
        W_idx.astype(str)
    ),
    C_idx.astype(str)
)

In [23]:
I_ker, J_ker, C_ker, K_ker = np.indices((R, S, C, K))
kernel = np.char.add(
    np.char.add(
        np.char.add(
            np.char.add('K', I_ker.astype(str)),
            J_ker.astype(str)
        ),
        C_ker.astype(str)
    ),
    K_ker.astype(str)
)

In [24]:
kernel.flatten()

array(['K0000', 'K0100', 'K0200', 'K0300', 'K1000', 'K1100', 'K1200',
       'K1300', 'K2000', 'K2100', 'K2200', 'K2300', 'K3000', 'K3100',
       'K3200', 'K3300'], dtype='<U85')

In [25]:
equations_matrix = [[[[None for _ in range(K)] 
                      for _ in range(Q)] 
                     for _ in range(P)] 
                    for _ in range(N)]
patches = []
inv_patches = []

for n in range(N):
    for i in range(0, H - R + 1, stride):
        for j in range(0, W - S + 1, stride):
            # Extract a patch from image n
            patch = activations[n, i:i+R, j:j+S, :]   # shape (R, S, C)
            print(i)
            print(j)
            print(patch)
            inv_patch = patch.transpose(1, 0, 2) # inverted: shape (S, R, C)
            patches.append(patch)
            inv_patches.append(inv_patch)
            
            for k_out in range(K):
                terms = []
                for a in range(R):
                    for b in range(S):
                        for c_idx in range(C):
                            # Each term: A{n}{i+a}{j+b}{c_idx} * K{a}{b}{c_idx}{k_out}
                            terms.append(f"{patch[a, b, c_idx]}*{kernel[a, b, c_idx, k_out]}")
                equation = " + ".join(terms)
                row_idx = i // stride
                col_idx = j // stride
                equations_matrix[n][row_idx][col_idx][k_out] = equation

0
0
[[['A0000']
  ['A0010']
  ['A0020']
  ['A0030']]

 [['A0100']
  ['A0110']
  ['A0120']
  ['A0130']]

 [['A0200']
  ['A0210']
  ['A0220']
  ['A0230']]

 [['A0300']
  ['A0310']
  ['A0320']
  ['A0330']]]
0
1
[[['A0010']
  ['A0020']
  ['A0030']
  ['A0040']]

 [['A0110']
  ['A0120']
  ['A0130']
  ['A0140']]

 [['A0210']
  ['A0220']
  ['A0230']
  ['A0240']]

 [['A0310']
  ['A0320']
  ['A0330']
  ['A0340']]]
0
2
[[['A0020']
  ['A0030']
  ['A0040']
  ['A0050']]

 [['A0120']
  ['A0130']
  ['A0140']
  ['A0150']]

 [['A0220']
  ['A0230']
  ['A0240']
  ['A0250']]

 [['A0320']
  ['A0330']
  ['A0340']
  ['A0350']]]
0
3
[[['A0030']
  ['A0040']
  ['A0050']
  ['A0060']]

 [['A0130']
  ['A0140']
  ['A0150']
  ['A0160']]

 [['A0230']
  ['A0240']
  ['A0250']
  ['A0260']]

 [['A0330']
  ['A0340']
  ['A0350']
  ['A0360']]]
1
0
[[['A0100']
  ['A0110']
  ['A0120']
  ['A0130']]

 [['A0200']
  ['A0210']
  ['A0220']
  ['A0230']]

 [['A0300']
  ['A0310']
  ['A0320']
  ['A0330']]

 [['A0400']
  ['A0410']
  ['A0

In [26]:
split_equations_dict = {}

for n in range(N):
    for row_id in range(len(equations_matrix[n])):
        for col_id in range(len(equations_matrix[n][row_id])):
            for k_out in range(K):
                equation = equations_matrix[n][row_id][col_id][k_out]
                # each part corresponding to a kernel row
                terms = equation.split(" + ")
                split_eq = []
                num_terms_per_split = S * C  # originally, each kernel row contributed S * C terms.
                for r_idx in range(R):
                    start = r_idx * num_terms_per_split
                    end = (r_idx + 1) * num_terms_per_split
                    partial_eq = " + ".join(terms[start:end])
                    
                    subterms = partial_eq.split(" + ")
                    if len(subterms) > 4:
                        # even more break the partial equation into groups of 4 terms.
                        num_groups = (len(subterms) + 3) // 4  
                        for group in range(num_groups):
                            sub_start = group * 4
                            sub_end = (group + 1) * 4
                            sub_eq = " + ".join(subterms[sub_start:sub_end])
                            key = f"C{n}{row_id}{col_id}{k_out}_{r_idx}_{group}"
                            split_equations_dict[sub_eq] = key
                    else:
                        key = f"C{n}{row_id}{col_id}{k_out}_{r_idx}"
                        split_equations_dict[partial_eq] = key

In [27]:
equations_matrix

[[[['A0000*K0000 + A0010*K0100 + A0020*K0200 + A0030*K0300 + A0100*K1000 + A0110*K1100 + A0120*K1200 + A0130*K1300 + A0200*K2000 + A0210*K2100 + A0220*K2200 + A0230*K2300 + A0300*K3000 + A0310*K3100 + A0320*K3200 + A0330*K3300'],
   ['A0010*K0000 + A0020*K0100 + A0030*K0200 + A0040*K0300 + A0110*K1000 + A0120*K1100 + A0130*K1200 + A0140*K1300 + A0210*K2000 + A0220*K2100 + A0230*K2200 + A0240*K2300 + A0310*K3000 + A0320*K3100 + A0330*K3200 + A0340*K3300'],
   ['A0020*K0000 + A0030*K0100 + A0040*K0200 + A0050*K0300 + A0120*K1000 + A0130*K1100 + A0140*K1200 + A0150*K1300 + A0220*K2000 + A0230*K2100 + A0240*K2200 + A0250*K2300 + A0320*K3000 + A0330*K3100 + A0340*K3200 + A0350*K3300'],
   ['A0030*K0000 + A0040*K0100 + A0050*K0200 + A0060*K0300 + A0130*K1000 + A0140*K1100 + A0150*K1200 + A0160*K1300 + A0230*K2000 + A0240*K2100 + A0250*K2200 + A0260*K2300 + A0330*K3000 + A0340*K3100 + A0350*K3200 + A0360*K3300']],
  [['A0100*K0000 + A0110*K0100 + A0120*K0200 + A0130*K0300 + A0200*K1000 + A021

In [28]:
split_equations_dict

{'A0000*K0000 + A0010*K0100 + A0020*K0200 + A0030*K0300': 'C0000_0',
 'A0100*K1000 + A0110*K1100 + A0120*K1200 + A0130*K1300': 'C0000_1',
 'A0200*K2000 + A0210*K2100 + A0220*K2200 + A0230*K2300': 'C0000_2',
 'A0300*K3000 + A0310*K3100 + A0320*K3200 + A0330*K3300': 'C0000_3',
 'A0010*K0000 + A0020*K0100 + A0030*K0200 + A0040*K0300': 'C0010_0',
 'A0110*K1000 + A0120*K1100 + A0130*K1200 + A0140*K1300': 'C0010_1',
 'A0210*K2000 + A0220*K2100 + A0230*K2200 + A0240*K2300': 'C0010_2',
 'A0310*K3000 + A0320*K3100 + A0330*K3200 + A0340*K3300': 'C0010_3',
 'A0020*K0000 + A0030*K0100 + A0040*K0200 + A0050*K0300': 'C0020_0',
 'A0120*K1000 + A0130*K1100 + A0140*K1200 + A0150*K1300': 'C0020_1',
 'A0220*K2000 + A0230*K2100 + A0240*K2200 + A0250*K2300': 'C0020_2',
 'A0320*K3000 + A0330*K3100 + A0340*K3200 + A0350*K3300': 'C0020_3',
 'A0030*K0000 + A0040*K0100 + A0050*K0200 + A0060*K0300': 'C0030_0',
 'A0130*K1000 + A0140*K1100 + A0150*K1200 + A0160*K1300': 'C0030_1',
 'A0230*K2000 + A0240*K2100 + A025

In [29]:
patches = np.array(patches)
inv_patches = np.array(inv_patches)

In [30]:
patches.shape

(8, 4, 4, 1)

In [31]:
kernel.flatten()

array(['K0000', 'K0100', 'K0200', 'K0300', 'K1000', 'K1100', 'K1200',
       'K1300', 'K2000', 'K2100', 'K2200', 'K2300', 'K3000', 'K3100',
       'K3200', 'K3300'], dtype='<U85')

In [32]:
class PE:
    # links: [0: left, 1: up, 2: right, 3: down]
    def __init__(self, links: Optional[List['PE']] = None):
        if links is None:
            links = [None, None, None, None]
        self.links = links
        self.activation = "0"
        self.weight = '--'
        self.accumulation: str = "0"
    
    def _input(self, activation: str):
        self.activation = activation
    
    def _weight(self, weight: str):
        self.weight = weight
    
    def shift(self, shift_direction: int, activation_flag: bool):
        neighbor = self.links[shift_direction]
        if neighbor is not None:
            if activation_flag:
                neighbor._input(self.activation)
            else:
                neighbor._weight(self.weight)

class SystolicArray:
    def __init__(self, size: int):
        self.size = size
        self.buffer = []  
        self.array = self._setup_array()
    
    def _setup_array(self):
        array = [[PE() for _ in range(self.size)] for _ in range(self.size)]
        for i in range(self.size):
            for j in range(self.size):
                left = array[i][j-1] if j > 0 else None
                up = array[i-1][j] if i > 0 else None
                right = array[i][j+1] if j < self.size - 1 else None
                down = array[i+1][j] if i < self.size - 1 else None
                array[i][j].links = [left, up, right, down]
        return array

    def print_array(self):
        for i in range(self.size):
            row_parts = []
            for pe in self.array[i]:
                act_str = pe.activation.ljust(12)
                weight_str = pe.weight.ljust(12)
                
                if pe.accumulation.strip() == "0":
                    parts = self.size * ["0"]
                else:
                    parts = [p.strip() for p in pe.accumulation.split('+')]
                    while len(parts) < kernel_size:
                        parts.append("0")
                    parts = parts[:4]
                parts = [p.ljust(12) for p in parts]
                acc_str = " + ".join(parts)
                
                row_parts.append(f"[A:{act_str} | W:{weight_str} | Acc:{acc_str}]")
            print(" | ".join(row_parts))

    def cycle(self, new_activations: Optional[List[str]] = None):
        if new_activations:
            for i in range(self.size):
                #  shift activations rightward
                for j in range(self.size - 1, 0, -1):
                    self.array[i][j].activation = self.array[i][j-1].activation
                # shift the new activation into column 0.
                self.array[i][0].activation = new_activations[i]
        
        bottom_row = [self.array[self.size - 1][j].accumulation for j in range(self.size)]
        self.buffer.append(bottom_row)
        # Look to get the equations
        
        for j in range(self.size):
            # save previous cycle's accumulations for column j
            prev_acc = [self.array[i][j].accumulation for i in range(self.size)]
            
            # Update the accumulation in row 0 for column j.
            cell0 = self.array[0][j]
            prod0 = f"{cell0.activation}*{cell0.weight}" if cell0.activation not in ["0", "--"] else "0"
            cell0.accumulation = prod0
            
            for i in range(1, self.size):
                cell = self.array[i][j]
                prod = f"{cell.activation}*{cell.weight}" if cell.activation not in ["0", "--"] else "0"
                inherited = prev_acc[i-1]
                if inherited == "0":
                    new_acc = prod
                else:
                    new_acc = inherited + (f" + {prod}" if prod != "0" else "")
                cell.accumulation = new_acc if new_acc != "" else "0"

In [33]:
class Conv2dSimulator:
    def __init__(self, grid_size: int):
        self.input_buffers = [
            [],              
            ["0"],           
            ["0", "0"],      
            ["0", "0", "0"] 
        ]
        
        self.grid_size = grid_size
        self.systolic_array = SystolicArray(grid_size)
        self.outputs = [] 
        self.weight_update_flag = False
        self.kernel_update_iter = 0
        self.total_cycles = 1
        
        self.cycles_per_weight_frame = P * Q + 6
        self.current_cycle_count = 0
        
        self.current_kernel = None 
        self.current_kernel_index = 0  
        self.flat_patch = None
        self.complete = 0 

    def add_patches(self, patches: List[np.ndarray]):     
        flat_patch = []
        if (C + K) > 2: 
            for h in range(patches.shape[1]):       # patches.shape[1] = H
                for w in range(patches.shape[2]):     # patches.shape[2] = W
                    for n in range(patches.shape[0]):  # patches.shape[0] = N
                        for c in range(patches.shape[3]): # patches.shape[3] = C
                            flat_patch.append(patches[n, h, w, c])
        else: 
            for n in range(patches.shape[0]):  # patches.shape[0] = N
                for h in range(patches.shape[1]):       # patches.shape[1] = H
                    for w in range(patches.shape[2]):     # patches.shape[2] = W
                        for c in range(patches.shape[3]): # patches.shape[3] = C
                            flat_patch.append(patches[n, h, w, c])
        self.flat_patch = flat_patch

        counter = 0
        for i in range(0, len(flat_patch), self.grid_size):
            chunk = flat_patch[i:i + self.grid_size]
            if len(chunk) < self.grid_size:
                chunk.extend(["0"] * (self.grid_size - len(chunk)))
            for j in range(self.grid_size):
                self.input_buffers[j].append(chunk[j])

            if (counter == (P * Q - 1)): 
                counter = 0
                for j in range(self.grid_size):
                    self.input_buffers[j].extend(["0"]*6)
            else: counter += 1

    def add_kernel(self, kernel: np.ndarray):
        self.current_kernel = kernel  
        self.current_kernel_index = 0  # start at the beginning of this kernel frame
        self.switch_weights()

    def switch_weights(self):
        grid_size = self.grid_size
        total_needed = grid_size * grid_size

        to_return = 0
        if (C + K) > 2: 
            flat_weights = self.current_kernel.flatten().tolist()
        else: 
            flat_weights = self.current_kernel.T.flatten().tolist()

        total_weights = len(flat_weights)

        start = self.current_kernel_index
        end = start + total_needed

        # If there aren't enough weights left, take what remains and pad with zeros.
        if end > total_weights:
            selected = flat_weights[start:] + [0] * (end - total_weights)
            self.complete = 1
            self.current_kernel_index = 0
        else:
            selected = flat_weights[start:end]
            self.current_kernel_index += total_needed
            if self.current_kernel_index >= total_weights:
                self.complete = 1
                self.current_kernel_index = 0

        for i in range(grid_size):
            for j in range(grid_size):
                idx = i * grid_size + j
                self.systolic_array.array[i][j].weight = selected[idx]

        self.kernel_update_iter += 1
        self.weight_update_flag = True
        self.current_cycle_count = 0
        print(f"Switched weights. Kernel update iteration: {self.kernel_update_iter}")
        return to_return

    def cycle(self):
        new_activations = []
        status = 0
        for i in range(self.systolic_array.size):
            if self.input_buffers[i]:
                new_act = self.input_buffers[i].pop(0)
            else:
                new_act = "0"
            new_activations.append(new_act)

        print("Total Cycles: ", self.total_cycles)
        
        print("New activations:", new_activations)
        self.systolic_array.cycle(new_activations=new_activations)
        self.systolic_array.print_array()

        output = self.systolic_array.buffer[-1]
        translated_equations = [] 
        for _out in output: 
            try: 
                translated_equations.append(split_equations_dict[_out])
            except:
                translated_equations.append('Cxx_x')
        self.outputs.append(translated_equations)
        print("Output: ", translated_equations)

        self.current_cycle_count += 1
        if not self.complete and (self.current_cycle_count >= self.cycles_per_weight_frame):
            print(f"Completed {self.current_cycle_count} cycles for current weight frame. Switching weights.")
            status = self.switch_weights()

        if self.complete: 
            print("Convolution Completed!")

        self.total_cycles += 1

        return translated_equations, status

In [34]:
kernel.shape

(4, 4, 1, 1)

In [35]:
conv_sim = Conv2dSimulator(kernel_size)
conv_sim.add_patches(patches)
conv_sim.add_kernel(kernel)
print(conv_sim.input_buffers[0])

for cycle_num in range(200):
    out, status = conv_sim.cycle()
    print()

    # Keep the same flow of activations, we need to check if we can move the weights in (flat similar to the log file), in a upper-left triangle way

Switched weights. Kernel update iteration: 1
[np.str_('A0000'), np.str_('A0100'), np.str_('A0200'), np.str_('A0300'), np.str_('A0010'), np.str_('A0110'), np.str_('A0210'), np.str_('A0310'), '0', '0', '0', '0', '0', '0', np.str_('A0020'), np.str_('A0120'), np.str_('A0220'), np.str_('A0320'), np.str_('A0030'), np.str_('A0130'), np.str_('A0230'), np.str_('A0330'), '0', '0', '0', '0', '0', '0', np.str_('A0100'), np.str_('A0200'), np.str_('A0300'), np.str_('A0400'), np.str_('A0110'), np.str_('A0210'), np.str_('A0310'), np.str_('A0410'), '0', '0', '0', '0', '0', '0', np.str_('A0120'), np.str_('A0220'), np.str_('A0320'), np.str_('A0420'), np.str_('A0130'), np.str_('A0230'), np.str_('A0330'), np.str_('A0430'), '0', '0', '0', '0', '0', '0']
Total Cycles:  1
New activations: [np.str_('A0000'), '0', '0', '0']
[A:A0000        | W:K0000        | Acc:A0000*K0000  + 0            + 0            + 0           ] | [A:0            | W:K1000        | Acc:0            + 0            + 0            + 0     

In [48]:
patches[4]

array([[['A0100'],
        ['A0110'],
        ['A0120'],
        ['A0130']],

       [['A0200'],
        ['A0210'],
        ['A0220'],
        ['A0230']],

       [['A0300'],
        ['A0310'],
        ['A0320'],
        ['A0330']],

       [['A0400'],
        ['A0410'],
        ['A0420'],
        ['A0430']]], dtype='<U85')

In [36]:
pd.DataFrame(conv_sim.outputs)

Unnamed: 0,0,1,2,3
0,Cxx_x,Cxx_x,Cxx_x,Cxx_x
1,Cxx_x,Cxx_x,Cxx_x,Cxx_x
2,Cxx_x,Cxx_x,Cxx_x,Cxx_x
3,Cxx_x,Cxx_x,Cxx_x,Cxx_x
4,C0000_0,Cxx_x,Cxx_x,Cxx_x
5,C0100_0,Cxx_x,Cxx_x,Cxx_x
6,Cxx_x,C0000_1,Cxx_x,Cxx_x
7,Cxx_x,C0100_1,Cxx_x,Cxx_x
8,C0010_0,Cxx_x,C0000_2,Cxx_x
9,C0110_0,Cxx_x,C0100_2,Cxx_x


In [29]:
# INCOMPLETE!!  

def route_time(T, activation_size, output_size, kernel_size, stride, delay):
    routing = {}
    for k in range(kernel_size):
        remainder = T - kernel_size - (delay * k)
        if remainder < 0 or remainder % kernel_size != 0:
            routing[k] = None
            continue
        
        Q = remainder // kernel_size
        i = Q // output_size
        j = Q % output_size
        
        if i < output_size and j < output_size:
            routing[k] = (i, j)
        else:
            routing[k] = None
    return routing

for i in range(20): 
    print(i, route_time(0, H, P, R, stride, 2))

0 {0: None, 1: None, 2: None, 3: None}
1 {0: None, 1: None, 2: None, 3: None}
2 {0: None, 1: None, 2: None, 3: None}
3 {0: None, 1: None, 2: None, 3: None}
4 {0: None, 1: None, 2: None, 3: None}
5 {0: None, 1: None, 2: None, 3: None}
6 {0: None, 1: None, 2: None, 3: None}
7 {0: None, 1: None, 2: None, 3: None}
8 {0: None, 1: None, 2: None, 3: None}
9 {0: None, 1: None, 2: None, 3: None}
10 {0: None, 1: None, 2: None, 3: None}
11 {0: None, 1: None, 2: None, 3: None}
12 {0: None, 1: None, 2: None, 3: None}
13 {0: None, 1: None, 2: None, 3: None}
14 {0: None, 1: None, 2: None, 3: None}
15 {0: None, 1: None, 2: None, 3: None}
16 {0: None, 1: None, 2: None, 3: None}
17 {0: None, 1: None, 2: None, 3: None}
18 {0: None, 1: None, 2: None, 3: None}
19 {0: None, 1: None, 2: None, 3: None}


In [23]:
patches[0]

array([[['A0000'],
        ['A0010'],
        ['A0020'],
        ['A0030']],

       [['A0100'],
        ['A0110'],
        ['A0120'],
        ['A0130']],

       [['A0200'],
        ['A0210'],
        ['A0220'],
        ['A0230']],

       [['A0300'],
        ['A0310'],
        ['A0320'],
        ['A0330']]], dtype='<U85')

In [24]:
patches[1]

array([[['A0010'],
        ['A0020'],
        ['A0030'],
        ['A0040']],

       [['A0110'],
        ['A0120'],
        ['A0130'],
        ['A0140']],

       [['A0210'],
        ['A0220'],
        ['A0230'],
        ['A0240']],

       [['A0310'],
        ['A0320'],
        ['A0330'],
        ['A0340']]], dtype='<U85')