In [11]:
import numpy as np
import math
from collections import defaultdict
import pandas as pd

# ===================================================================
# 1. Configuration & Initial Data Generation (Same as your setup)
# ===================================================================

# Input tensor shape: [N, H, W, C]
# Weight tensor shape: [R, S, C, K]
N, H, W, C = 1, 5, 7, 1
R, S, K = 4, 4, 1
padding, stride = 0, 1
P = int((H - R + 2*padding)/stride) + 1
Q = int((W - S + 2*padding)/stride) + 1
output_shape = (N, P, Q, K)

# Generate symbolic activations and kernel weights
N_idx, H_idx, W_idx, C_idx = np.indices((N, H, W, C))
activations = np.char.add(np.char.add(np.char.add(np.char.add('A', N_idx.astype(str)), H_idx.astype(str)), W_idx.astype(str)), C_idx.astype(str))
I_ker, J_ker, C_ker, K_ker = np.indices((R, S, C, K))
kernel = np.char.add(np.char.add(np.char.add(np.char.add('K', I_ker.astype(str)), J_ker.astype(str)), C_ker.astype(str)), K_ker.astype(str))

# ===================================================================
# 2. Hardware Component Classes (PE and Systolic Array)
# ===================================================================

class PE:
    """A single Processing Element."""
    def __init__(self):
        self.activation = "0"
        self.weight = "--"
        self.psum_in = "0"
        self.psum_out = "0"

    def compute(self):
        # Skip computation if activation or weight is zero/empty
        if self.activation == "0" or self.weight == "--" or self.weight == "0":
            product = "0"
        else:
            product = f"{self.activation}*{self.weight}"

        # Accumulate with the partial sum from the PE above
        if self.psum_in == "0" and product == "0":
            self.psum_out = "0"
        elif self.psum_in == "0":
            self.psum_out = product
        elif product == "0":
            self.psum_out = self.psum_in
        else:
            self.psum_out = f"{self.psum_in} + {product}"

class SystolicArray:
    """A grid of PEs implementing a weight-stationary dataflow."""
    def __init__(self, size: int):
        self.size = size
        self.array = [[PE() for _ in range(size)] for _ in range(size)]

    def load_weights(self, weight_tile: np.ndarray):
        """Load a tile of weights, which will remain stationary."""
        for i in range(self.size):
            for j in range(self.size):
                self.array[i][j].weight = weight_tile[i, j]

    def cycle(self, input_row: list):
        """Execute one cycle of data movement and computation."""
        # Shift activations down and load new input row at the top
        for i in range(self.size - 1, 0, -1):
            for j in range(self.size):
                self.array[i][j].activation = self.array[i-1][j].activation
        for j in range(self.size):
            self.array[0][j].activation = input_row[j]

        # Shift partial sums down and compute
        for i in range(self.size):
            for j in range(self.size):
                # Get psum from PE above (or 0 for the top row)
                psum_from_above = self.array[i-1][j].psum_out if i > 0 else "0"
                self.array[i][j].psum_in = psum_from_above
                self.array[i][j].compute()

    def get_output_row(self):
        """Return the partial sums emerging from the bottom row."""
        return [self.array[self.size - 1][j].psum_out for j in range(self.size)]

    def print_state(self):
        print("-" * (self.size * 40))
        for i in range(self.size):
            row_str = []
            for j in range(self.size):
                pe = self.array[i][j]
                row_str.append(f"[A:{pe.activation:<5}|W:{pe.weight:<5}|Pout:{pe.psum_out[:15]:<15}]")
            print(" | ".join(row_str))
        print("-" * (self.size * 40))

# ===================================================================
# 3. Main Simulator Class (Controller for Tiled GEMM)
# ===================================================================

class TiledGemmSimulator:
    def __init__(self, grid_size):
        self.grid_size = grid_size
        self.systolic_array = SystolicArray(grid_size)
        self.total_cycles = 0

    def _im2col(self, activation_tensor):
        """Symbolic im2col transformation."""
        patches = []
        for n in range(N):
            for i in range(P):
                for j in range(Q):
                    patch = activation_tensor[n, i*stride:i*stride+R, j*stride:j*stride+S, :]
                    patches.append(patch.flatten())
        # Result shape: [num_patches, C*R*S]
        return np.array(patches).T

    def run(self, activations, kernel):
        print("--- Step 1: Transforming Convolution to GEMM ---")
        # logical_A shape: [C*R*S, P*Q]
        logical_A = self._im2col(activations)
        # logical_B shape: [K, C*R*S]
        logical_B = kernel.reshape(R*S*C, K).T
        
        M, K_dim = logical_B.shape
        K_dim_A, N_dim = logical_A.shape
        assert K_dim == K_dim_A

        print(f"Logical Matrix A (Inputs): {logical_A.shape}")
        print(f"Logical Matrix B (Weights): {logical_B.shape}")
        
        # This will store the final results for all output pixels
        final_equations = defaultdict(str)

        print("\n--- Step 2: Tiling and Executing on Systolic Array ---")
        num_k_tiles = math.ceil(K_dim / self.grid_size)
        num_n_tiles = math.ceil(N_dim / self.grid_size)
        num_m_tiles = math.ceil(M / self.grid_size) # For this problem M=1

        # Iterate through tiles of the output matrix C = B @ A
        for m_tile_idx in range(num_m_tiles):
            for n_tile_idx in range(num_n_tiles):
                output_pixel_start_idx = n_tile_idx * self.grid_size
                
                print(f"\n>>> Computing output pixels C[{m_tile_idx*self.grid_size}:, {output_pixel_start_idx}:]...")

                # Accumulator for this output tile
                output_tile_acc = ["0"] * self.grid_size

                # Loop over the shared dimension K
                for k_tile_idx in range(num_k_tiles):
                    print(f"  - Using K-dimension slice #{k_tile_idx}")
                    # --- Slice the tiles ---
                    k_start = k_tile_idx * self.grid_size
                    b_tile = logical_B[:, k_start : k_start + self.grid_size]
                    a_tile = logical_A[k_start : k_start + self.grid_size, output_pixel_start_idx : output_pixel_start_idx + self.grid_size]

                    # --- Pad tiles to fit the grid_size x grid_size array ---
                    padded_B = np.full((self.grid_size, self.grid_size), "--", dtype=object)
                    padded_B[:b_tile.shape[0], :b_tile.shape[1]] = b_tile

                    padded_A = np.full((self.grid_size, self.grid_size), "0", dtype=object)
                    padded_A[:a_tile.shape[0], :a_tile.shape[1]] = a_tile
                    
                    # --- Load weights and stream activations ---
                    self.systolic_array.load_weights(padded_B.T) # Use B.T to match dataflow
                    
                    # It takes 2*size-1 cycles to clear the array
                    for i in range(2 * self.grid_size -1):
                        self.total_cycles += 1
                        input_row = padded_A[i, :] if i < self.grid_size else ["0"] * self.grid_size
                        print(f"\n[Cycle: {self.total_cycles}] Loading input row: {input_row}")
                        
                        self.systolic_array.cycle(input_row)
                        self.systolic_array.print_state()
                        
                        if i >= self.grid_size - 1:
                            output_row = self.systolic_array.get_output_row()
                            print(f"Output from bottom row: {output_row}")
                            # Accumulate results
                            for j in range(self.grid_size):
                                if output_row[j] != "0" and output_row[j] != "":
                                    if output_tile_acc[j] == "0":
                                        output_tile_acc[j] = output_row[j]
                                    else:
                                        output_tile_acc[j] += f" + {output_row[j]}"
                
                # Store the final computed tile
                for i in range(self.grid_size):
                    pixel_idx = output_pixel_start_idx + i
                    if pixel_idx < N_dim:
                         final_equations[pixel_idx] = output_tile_acc[i]

        return final_equations

# ===================================================================
# 4. Run the Simulation
# ===================================================================

# For this problem, the grid size should match the kernel size
GRID_SIZE = R 
sim = TiledGemmSimulator(grid_size=GRID_SIZE)
final_results = sim.run(activations, kernel)

print("\n--- Step 3: Final Computed Equations ---")
# Reshape the results into the output image format
# Create an empty numpy array to hold the final equations in their proper 2D shape
output_equations = np.full(output_shape, "N/A", dtype=object)
for i in range(P):
    for j in range(Q):
        pixel_idx = i * Q + j
        if pixel_idx in final_results:
            output_equations[0, i, j, 0] = final_results[pixel_idx]

print(output_equations)

--- Step 1: Transforming Convolution to GEMM ---
Logical Matrix A (Inputs): (16, 8)
Logical Matrix B (Weights): (1, 16)

--- Step 2: Tiling and Executing on Systolic Array ---

>>> Computing output pixels C[0:, 0:]...
  - Using K-dimension slice #0

[Cycle: 1] Loading input row: ['A0000' 'A0010' 'A0020' 'A0030']
----------------------------------------------------------------------------------------------------------------------------------------------------------------
[A:A0000|W:K0000|Pout:A0000*K0000    ] | [A:A0010|W:--   |Pout:0              ] | [A:A0020|W:--   |Pout:0              ] | [A:A0030|W:--   |Pout:0              ]
[A:0    |W:K0100|Pout:A0000*K0000    ] | [A:0    |W:--   |Pout:0              ] | [A:0    |W:--   |Pout:0              ] | [A:0    |W:--   |Pout:0              ]
[A:0    |W:K0200|Pout:A0000*K0000    ] | [A:0    |W:--   |Pout:0              ] | [A:0    |W:--   |Pout:0              ] | [A:0    |W:--   |Pout:0              ]
[A:0    |W:K0300|Pout:A0000*K0000    ] 

In [12]:
# ===================================================================
# 5. Run Simulation and Display Output in a DataFrame
# ===================================================================

# --- NEW: Use pandas for clear, tabular output ---
# Squeeze the array to remove the batch (N) and kernel (K) dimensions of size 1
output_map_data = np.squeeze(output_equations)

# Create the DataFrame
df = pd.DataFrame(output_map_data)
df.index.name = 'Output Row (p)'
df.columns.name = 'Output Col (q)'

print(f"Displaying output for Batch n=0, Output Channel k=0:")
# Use to_string() to prevent pandas from truncating the long equation strings
print(df.to_string())

Displaying output for Batch n=0, Output Channel k=0:
Output Col (q)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              0  1  2  3
Output Row (p)                                                                                                                                                                                                                                                                                                                                                              