# Tiled (not vectorized) Matrix addition

Requires (at least) the `VITIS_DIR`, `XRT_DIR`, and `XILINXD_LICENSE_FILE` environment variables, which are read in [util.py](util.py).

## Boilerplate

In [1]:
import random

from aie.extras.dialects.ext import arith, func, linalg
from filelock import FileLock
import numpy as np

from aie.dialects import aie, aiex
from aie.dialects.aie import AIEDevice, DMAChannelDir, LockAction, WireBundle
from aie.dialects.linalg.opdsl.ops.core_named_ops import fill as linalg_fill
from aie.dialects.scf import for_ as range_, yield_
import aie.extras.types as T
from aie.extras.context import RAIIMLIRContext, ExplicitlyManagedModule
from aie.util import tiling_calculator_n_tiles
from aie.xrt import XCLBin
from util import (
    compile_without_vectorization,
    construct_and_print_module,
    make_xclbin,
)

# Short aliases for the enum members used by the cells below.
DMA = WireBundle.DMA
S2MM, MM2S = DMAChannelDir.S2MM, DMAChannelDir.MM2S
Acquire, AcquireGreaterEqual, Release = (
    LockAction.Acquire,
    LockAction.AcquireGreaterEqual,
    LockAction.Release,
)

# Explicit context and module management

In [2]:
# NOTE(review): presumably the context has to exist before the module is created
# so that the module is built inside it — confirm against aie.extras.context.
ctx = RAIIMLIRContext()
# Explicitly managed module; it is closed later with `module.finish()`.
module = ExplicitlyManagedModule()

# Test params

In [3]:
# Scalar that the kernel adds on top of a + b; drawn once per notebook run.
RANDOM_NUMBER = random.randint(0, 100)

# Problem size: M x N matrices, split into an n_tile_rows x n_tile_cols grid of tiles.
M, N = 16, 16
n_tile_rows, n_tile_cols = 2, 2
tile_rows = M // n_tile_rows
tile_cols = N // n_tile_cols

# Only the last two (size, stride) pairs returned by the tiling helper are used here.
_, _, (d1_size, d1_stride), (d0_size, d0_stride) = tiling_calculator_n_tiles(
    M, N, n_tile_rows=n_tile_rows, n_tile_cols=n_tile_cols
)

# Instruction list that the shim DMA configuration is appended to.
npu_insts = aiex.npu.get_prolog()

# Configure data movement and specify kernel (all together for now)

In [4]:
@aie.device(AIEDevice.npu)
def npu():  # function name isn't load-bearing
    # Describes the whole design in column 0:
    #   * flows shim (0, 0) <-> mem tile (0, 1) <-> compute tile (0, 2),
    #   * shim DMA buffer descriptors (appended to the module-level `npu_insts`),
    #   * mem tile and compute tile DMA programs,
    #   * the core kernel computing c = a + b + RANDOM_NUMBER tile by tile.

    # tiles that will participate
    shim_tile_0_0 = aie.tile(0, 0)
    mem_tile_0_1 = aie.tile(0, 1)
    compute_tile_0_2 = aie.tile(0, 2)

    ###
    # "automatic" switchbox configuration (you specify start and endpoint and the compiler does the routing/config)
    ###

    # input flow (ie host -> compute tile direction)
    input_a_tile_0_0_to_tile_0_1 = aie.flow(
        source=shim_tile_0_0,
        source_bundle=DMA,
        source_channel=0,
        dest=mem_tile_0_1,
        dest_bundle=DMA,
        dest_channel=0,
    )
    input_a_tile_0_1_to_tile_0_2 = aie.flow(
        source=mem_tile_0_1,
        source_bundle=DMA,
        source_channel=0,
        dest=compute_tile_0_2,
        dest_bundle=DMA,
        dest_channel=0,
    )
    input_b_tile_0_0_to_tile_0_1 = aie.flow(
        source=shim_tile_0_0,
        source_bundle=DMA,
        source_channel=1,
        dest=mem_tile_0_1,
        dest_bundle=DMA,
        dest_channel=1,
    )
    input_b_tile_0_1_to_tile_0_2 = aie.flow(
        source=mem_tile_0_1,
        source_bundle=DMA,
        source_channel=1,
        dest=compute_tile_0_2,
        dest_bundle=DMA,
        dest_channel=1,
    )
    # output flow (ie compute tile -> host direction)
    output_c_tile_0_2_to_tile_0_1 = aie.flow(
        source=compute_tile_0_2,
        source_bundle=DMA,
        source_channel=0,
        dest=mem_tile_0_1,
        dest_bundle=DMA,
        dest_channel=2,
    )
    output_c_tile_0_1_to_tile_0_0 = aie.flow(
        source=mem_tile_0_1,
        source_bundle=DMA,
        source_channel=2,
        dest=shim_tile_0_0,
        dest_bundle=DMA,
        dest_channel=0,
    )

    # see https://andreroesti.com/data-layout-viz/data_layout.html for
    # a conceptual explanation of size (called wrap there) and stride (called step there).
    # note: di are in reverse order relative to what you expect -> strides[3] = {d2_stride, d1_stride, d0_stride}
    # note: upper-left coordinates (0, 0), (0, 8), (128, 0), (128, 8) of tiles we're sending (and receiving)

    # start offset of each of the 4 tiles an operand is chopped up into;
    # the same offsets are used for A, B and C
    offsets = [
        0,
        0 + d0_size * d0_stride,
        d1_size * d1_stride,
        d1_size * d1_stride + d0_size * d0_stride,
    ]
    # one shim buffer descriptor per tile and per operand; A, B and C get
    # consecutive, non-overlapping bd_id ranges (0-3, 4-7, 8-11)
    n_bds = len(offsets)
    # all tiles are in column 0
    col = 0

    # ddr_id is used to indicate which (positional) arg on the host side this shim dma config corresponds to.
    # (yes this is a weird naming/assignment but it's due to a hack in implementation...)

    # in A
    ddr_id = 0
    for i, bd_id in enumerate(range(n_bds)):
        npu_insts.extend(
            aiex.npu.writebd_shimtile(
                col,
                bd_id,
                tile_rows * tile_cols,
                offsets[i],
                ddr_id,
                d1_size=d1_size,
                d1_stride=d1_stride,
                d0_size=d0_size,
                d0_stride=d0_stride,
            )
        )
        npu_insts.extend(
            aiex.npu.shimtile_push_queue(
                MM2S, input_a_tile_0_0_to_tile_0_1.source_channel, col, bd_id
            )
        )

    # in B
    ddr_id = 1
    for i, bd_id in enumerate(range(n_bds, 2 * n_bds)):
        npu_insts.extend(
            aiex.npu.writebd_shimtile(
                col,
                bd_id,
                tile_rows * tile_cols,
                offsets[i],
                ddr_id,
                d1_size=d1_size,
                d1_stride=d1_stride,
                d0_size=d0_size,
                d0_stride=d0_stride,
            )
        )
        npu_insts.extend(
            aiex.npu.shimtile_push_queue(
                MM2S, input_b_tile_0_0_to_tile_0_1.source_channel, col, bd_id
            )
        )

    # out C
    ddr_id = 2
    for i, bd_id in enumerate(range(2 * n_bds, 3 * n_bds)):
        npu_insts.extend(
            aiex.npu.writebd_shimtile(
                # `col` must come first, exactly as for A and B; without it every
                # positional argument is shifted by one slot
                col,
                bd_id,
                tile_rows * tile_cols,
                offsets[i],
                ddr_id,
                d1_size=d1_size,
                d1_stride=d1_stride,
                d0_size=d0_size,
                d0_stride=d0_stride,
            )
        )
        npu_insts.extend(
            aiex.npu.shimtile_push_queue(
                S2MM, output_c_tile_0_1_to_tile_0_0.dest_channel, col, bd_id
            )
        )
        # a sync is issued after every output tile
        # NOTE(review): presumably this waits for the transfer on shim channel 0
        # to complete before the next BD is queued — confirm
        npu_insts.extend(
            aiex.npu.sync(
                channel=0, column=0, column_num=1, direction=0, row=0, row_num=1
            )
        )

    # memtile_0_1 is just passthrough so buffers can be "locally scoped" and don't need
    # explicit lock management
    @aie.memtile_dma(mem_tile_0_1)
    def memtile_dma_0_1():
        # input flow
        buffer_0_1_a = aie.buffer(mem_tile_0_1, (tile_rows, tile_cols), T.i32())
        buffer_0_1_b = aie.buffer(mem_tile_0_1, (tile_rows, tile_cols), T.i32())
        # output flow
        buffer_0_1_c = aie.buffer(mem_tile_0_1, (tile_rows, tile_cols), T.i32())

        aiex.forward_bd(
            mem_tile_0_1,
            buffer_0_1_a,
            input_a_tile_0_0_to_tile_0_1.dest_channel,
        )
        aiex.forward_bd(
            mem_tile_0_1,
            buffer_0_1_b,
            input_b_tile_0_0_to_tile_0_1.dest_channel,
        )
        aiex.forward_bd(
            mem_tile_0_1,
            buffer_0_1_c,
            output_c_tile_0_1_to_tile_0_0.source_channel,
        )

        aie.end()

    # the compute tile's buffers are touched both by the DMA engine
    # and by the kernel code and thus need to be "globally" scoped
    # with explicitly managed locks
    # in
    buffer_0_2_a = aie.buffer(compute_tile_0_2, (tile_rows, tile_cols), T.i32())
    buffer_0_2_b = aie.buffer(compute_tile_0_2, (tile_rows, tile_cols), T.i32())
    # out
    buffer_0_2_c = aie.buffer(compute_tile_0_2, (tile_rows, tile_cols), T.i32())

    # one lock pair per buffer; the lock on the side that goes first starts at 1
    # (DMA fills a and b first, the core fills c first)
    lock_0_2_read_in_a = aie.lock(compute_tile_0_2, lock_id=0, init=1)
    lock_0_2_use_a = aie.lock(compute_tile_0_2, lock_id=1, init=0)
    lock_0_2_read_in_b = aie.lock(compute_tile_0_2, lock_id=2, init=1)
    lock_0_2_use_b = aie.lock(compute_tile_0_2, lock_id=3, init=0)
    lock_0_2_use_c = aie.lock(compute_tile_0_2, lock_id=4, init=1)
    lock_0_2_write_out_c = aie.lock(compute_tile_0_2, lock_id=5, init=0)

    @aie.mem(compute_tile_0_2)
    def mem_0_2():
        ###
        # input buffer descriptor processing
        ###
        @aie.dma(S2MM, input_a_tile_0_1_to_tile_0_2.dest_channel)
        def dma1():
            # acquire lock_0_2_read_in_a, "process" buffer_0_2_a, release lock_0_2_use_a
            # and loop (ie implicitly this "bd processing" happens repeatedly until the program is finished)
            aiex.process_bd(lock_0_2_read_in_a, buffer_0_2_a, lock_0_2_use_a)

        @aie.dma(S2MM, input_b_tile_0_1_to_tile_0_2.dest_channel)
        def dma2():
            # acquire lock_0_2_read_in_b, "process" buffer_0_2_b, release lock_0_2_use_b
            aiex.process_bd(lock_0_2_read_in_b, buffer_0_2_b, lock_0_2_use_b)

        # output
        @aie.dma(MM2S, output_c_tile_0_2_to_tile_0_1.source_channel)
        def dma3():
            # acquire lock_0_2_write_out_c, "process" buffer_0_2_c, release lock_0_2_use_c
            aiex.process_bd(lock_0_2_write_out_c, buffer_0_2_c, lock_0_2_use_c)

        aie.end()

    # actual kernel: a + b + random_number (a,b  are tiles in A, B)
    @aie.core(compute_tile_0_2)
    def core():
        # one iteration per tile: n_tile_rows * n_tile_cols in total
        for _ in range_(0, n_tile_rows):
            for _ in range_(0, n_tile_cols):
                with (
                    aiex.hold_lock(lock_0_2_use_a, lock_0_2_read_in_a),
                    aiex.hold_lock(lock_0_2_use_b, lock_0_2_read_in_b),
                    aiex.hold_lock(lock_0_2_use_c, lock_0_2_write_out_c),
                ):
                    # c = RANDOM_NUMBER; c = a + c; c = b + c
                    linalg_fill(arith.constant(RANDOM_NUMBER), outs=[buffer_0_2_c])
                    linalg.add(buffer_0_2_a, buffer_0_2_c, buffer_0_2_c)
                    linalg.add(buffer_0_2_b, buffer_0_2_c, buffer_0_2_c)

                yield_([])
            yield_([])

# "Finish" module and print/show IR

In [5]:
# Close the explicitly managed module now that the device body has been emitted.
module.finish()
# Rebind `module` from the manager object to its `.module` attribute.
module = module.module
# NOTE(review): this prints `ctx.module`, not the `module` rebound on the previous
# line — confirm that `RAIIMLIRContext` actually exposes a `.module` attribute.
print(ctx.module)

module {
  aie.device(npu) {
    %tile_0_0 = aie.tile(0, 0)
    %tile_0_1 = aie.tile(0, 1)
    %tile_0_2 = aie.tile(0, 2)
    aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0)
    aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0)
    aie.flow(%tile_0_0, DMA : 1, %tile_0_1, DMA : 1)
    aie.flow(%tile_0_1, DMA : 1, %tile_0_2, DMA : 1)
    aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 2)
    aie.flow(%tile_0_1, DMA : 2, %tile_0_0, DMA : 0)
    func.func @bobsyouruncle() {
      aiex.npu.writebd_shimtile {bd_id = 0 : i32, buffer_length = 64 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 

# Compile (this is the part where env variables come into play)

In [6]:
# shim DMA as npu instructions
# NOTE(review): `workdir` is not defined in any cell of this notebook — confirm
# where it is expected to come from (or whether util.py provides a default).
compile_without_vectorization(ctx.module, workdir)
# path of the generated xclbin; it is loaded by the run cell
xclbin_path = make_xclbin(ctx.module, workdir)

# Run

In [7]:
# Serialize access to the device: this also runs in CI, where multiple jobs might
# be attempting to run at once (and the device isn't multi-tenant yet).
with FileLock("/tmp/npu.lock"):
    # XRT manager, then configure the shim dmas with the instructions built earlier
    xclbin = XCLBin(xclbin_path, "MLIR_AIE")
    xclbin.load_npu_instructions(npu_insts)

    # three (M, N) int32 buffers, in host-argument order: A, B, C
    views = xclbin.mmap_buffers([(M, N)] * 3, np.int32)
    wrap_A, wrap_B, wrap_C = (np.asarray(views[k]) for k in range(3))

    # random input operands and an all-zero output
    A = np.random.randint(0, 10, (M, N), dtype=np.int32)
    B = np.random.randint(0, 10, (M, N), dtype=np.int32)
    C = np.zeros((M, N), dtype=np.int32)

    # casting="no" refuses any dtype conversion while filling the mapped buffers
    np.copyto(wrap_A, A, casting="no")
    np.copyto(wrap_B, B, casting="no")
    np.copyto(wrap_C, C, casting="no")

    # host -> device, run, wait (argument 30, presumably a timeout), device -> host
    xclbin.sync_buffers_to_device()
    xclbin.run()
    xclbin.wait(30)
    xclbin.sync_buffers_from_device()

    # the output must match the reference and must no longer be all zeros
    assert np.array_equal(A + B + RANDOM_NUMBER, wrap_C)
    assert not np.array_equal(C, wrap_C)