<a href="https://colab.research.google.com/github/XueyanZhang/MachineLearningCompilation/blob/master/8_Hardware_Conceptual_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!python3 -m  pip install mlc-ai-nightly -f https://mlc.ai/wheels

Looking in links: https://mlc.ai/wheels


In [2]:
import numpy as np
import tvm
from tvm import relax
from tvm.ir.module import IRModule
from tvm.script import relax as R
from tvm.script import tir as T

# Evolution
1. Scalar computing
2. Vector computing (more complex program)
3. Tensor computing (matrix computations)

## Example of hardware backend

We mimic the operations/instructions that a hardware backend might offer:
1. fill a region with a value (broadcast a value for initialization)
2. matmul + add (madd) (size 16x16)
3. data movement (dma) from host (eg, cpu) to device (eg, tpu)

In [3]:
def accel_fill_zero(C, value):
    """Broadcast `value` into every element of the region `C`, in place."""
    C[:] = value

def accel_tmm_add(C, A, B):
    """Accumulate `A @ B.T` into `C`, in place (the accelerator's "madd" op)."""
    C[:] += A @ B.T

def accel_dma_copy(reg, dram):
    """Copy the contents of `dram` into `reg`, in place (models a DMA transfer)."""
    reg[:] = dram[:]

def lnumpy_tmm(A: np.ndarray, B: np.ndarray, C: np.ndarray):
    """Compute ``C = A @ B.T`` tile by tile using the mimicked accelerator ops.

    The matrices are processed in 16x16 tiles: each output tile is zeroed in an
    accumulator, every (A tile, B tile) pair along the reduction axis is copied
    into the A/B staging buffers and multiply-accumulated, and the finished
    accumulator is copied back into ``C``.

    Parameters
    ----------
    A : np.ndarray
        Left operand of shape ``(M, K)``.
    B : np.ndarray
        Right operand of shape ``(N, K)`` (it is used transposed).
    C : np.ndarray
        Output of shape ``(M, N)``; overwritten in place.

    Raises
    ------
    ValueError
        If the shapes are inconsistent or not multiples of the 16x16 tile.
    """
    tile = 16  # edge length of the accelerator's matmul tile

    m, k_a = A.shape
    n, k_b = B.shape
    if k_a != k_b or C.shape != (m, n):
        raise ValueError(
            f"incompatible shapes: A{A.shape}, B{B.shape}, C{C.shape}; "
            "expected A (M, K), B (N, K), C (M, N)"
        )
    if m % tile or n % tile or k_a % tile:
        raise ValueError(
            f"all dimensions must be multiples of {tile}, got M={m}, N={n}, K={k_a}"
        )

    # a special accumulator memory
    C_accumulator = np.empty((tile, tile), dtype="float32")
    A_reg = np.empty((tile, tile), dtype="float32")
    B_reg = np.empty((tile, tile), dtype="float32")

    # Tile counts follow the input shapes instead of assuming 1024x1024.
    for i in range(m // tile):
        rows = slice(i * tile, i * tile + tile)
        for j in range(n // tile):
            cols = slice(j * tile, j * tile + tile)
            accel_fill_zero(C_accumulator[:,:], 0)
            for k in range(k_a // tile):
                red = slice(k * tile, k * tile + tile)
                accel_dma_copy(A_reg[:], A[rows, red])
                accel_dma_copy(B_reg[:], B[cols, red])
                accel_tmm_add(C_accumulator[:,:], A_reg, B_reg)
            accel_dma_copy(C[rows, cols], C_accumulator[:,:])

In [4]:
# verify result
dtype = "float32"
# Seed the global NumPy RNG so the inputs, and therefore the tolerance check
# below, are identical on every Restart & Run All.
np.random.seed(0)
a_np = np.random.rand(1024, 1024).astype(dtype)
b_np = np.random.rand(1024, 1024).astype(dtype)
# NumPy reference result; later cells also compare against it.
c_tmm = a_np @ b_np.T

# build & run
c_np = np.empty((1024, 1024), dtype=dtype)
lnumpy_tmm(a_np, b_np, c_np)
np.testing.assert_allclose(c_np, c_tmm, rtol=1e-5)

# Tensorized Block

Most efficient accelerators offer tensor computations, e.g., performing a matmul on a 16x16 sub-matrix. Such a unit of computation is called a tensorized block.

In [5]:
# Shorthand dtype string reused by the TVMScript definitions below.
f32 = "float32"
@tvm.script.ir_module
class MatmulBlockModule:
    # 1024x1024 matmul C[i, j] = sum_k A[i, k] * B[j, k] (i.e. A @ B.T), written
    # by hand as a 64x64x64 grid of outer blocks that each cover a 16x16x16 tile.
    @T.prim_func
    def main(
        A: T.Buffer((1024, 1024), f32),
        B: T.Buffer((1024, 1024), f32),
        C: T.Buffer((1024, 1024), f32),
    ) -> None:
        T.func_attr({"global_symbol": "main", "tir.noalias": True})
        # Outer loops iterate over tiles: 64 tiles of 16 along each axis.
        for i0, j0, k0 in T.grid(64, 64, 64):
            with T.block("tmm-16x16"):
                # "SSR": i0/j0 are spatial axes, k0 is the reduction axis.
                vi0, vj0, vk0 = T.axis.remap("SSR", [i0, j0, k0])
                # Init part of the outer block: zero the 16x16 output tile.
                with T.init():
                    for i1, j1 in T.grid(16, 16):
                        with T.block("tmm_init"):
                            vi1, vj1 = T.axis.remap("SS", [i1, j1])
                            C[vi0 * 16 + vi1, vj0 * 16 + vj1] = T.float32(0)
                # Body: scalar multiply-accumulate over one 16x16x16 tile.
                for i1, j1, k1 in T.grid(16, 16, 16):
                    with T.block("tmm"):
                        vi1, vj1, vk1 = T.axis.remap("SSR", [i1, j1, k1])
                        # B is indexed [j, k], so this accumulates A @ B.T.
                        C[vi0 *16 + vi1, vj0 * 16 + vj1] += \
                                A[vi0 * 16 + vi1, vk0 * 16 + vk1] * B[vj0 * 16 + vj1, vk0 * 16 + vk1]

In [6]:
# Print the TVMScript of the hand-written tiled module.
MatmulBlockModule.show()

To print formatted TVM script, please install the formatter 'Black':
/usr/bin/python3 -m pip install "black==22.3.0" --upgrade --user


In [7]:
# Wrap the NumPy inputs as TVM NDArrays and allocate the output array.
a_nd = tvm.nd.array(a_np)
b_nd = tvm.nd.array(b_np)
c_nd = tvm.nd.empty((1024, 1024), dtype=f32)

# Build for the LLVM target, run `main`, and compare with the NumPy
# reference `c_tmm` computed in the verification cell.
lib = tvm.build(MatmulBlockModule, target="llvm")
lib["main"](a_nd, b_nd, c_nd)
np.testing.assert_allclose(c_nd.numpy(), c_tmm, rtol=1e-5)

## Transform loops

In [8]:
# Create a schedule over the hand-tiled module. Note that `sch`, `i`, `j`, `k`
# are module-level names that later cells rebind for a different module.
sch = tvm.tir.Schedule(MatmulBlockModule)

# The outer block "tmm-16x16" is treated as one unit; fetch its three loops.
block_mm = sch.get_block("tmm-16x16")
i, j, k = sch.get_loops(block_mm)

# Split the outer i loop into (i0, i1) with an inner extent of 4; the outer
# extent is given as None.
i0, i1 = sch.split(i, [None, 4])

# Reorder the tile loops; the 16x16 block body itself is left untouched.
sch.reorder(i0, j, i1, k)
sch.mod.show()

To print formatted TVM script, please install the formatter 'Black':
/usr/bin/python3 -m pip install "black==22.3.0" --upgrade --user


# Blockization
What we usually get is a scalar program (or a high-level program).
To leverage the benefit of a hardware accelerator, we need to transform the program to match the underlying patterns, for example a 16x16 matmul.

1. Start from scalar program,
2. group subregions of a loop,
3. form a tensorized block

In [9]:
# This is the given high-level (scalar) program, awaiting blockization.
# It computes C[i, j] = sum_k A[i, k] * B[j, k], i.e. A @ B.T, with a single
# block over the full 1024x1024x1024 iteration space.
@tvm.script.ir_module
class MatmulModule:
    @T.prim_func
    def main(
        A: T.Buffer((1024, 1024), f32),
        B: T.Buffer((1024, 1024), f32),
        C: T.Buffer((1024, 1024), f32),
    ) -> None:
        T.func_attr({"global_symbol": "main", "tir.noalias": True})
        for i, j, k in T.grid(1024, 1024, 1024):
            with T.block("matmul"):
                # "SSR": vi/vj are spatial axes, vk is the reduction axis.
                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
                # Zero the output element before accumulating over vk.
                with T.init():
                    C[vi, vj] = T.float32(0)
                C[vi, vj] += A[vi, vk] * B[vj, vk]

In [10]:
# start blockization
# A fresh schedule over the scalar module; this rebinds `sch` from the
# earlier loop-transform cell.
sch = tvm.tir.Schedule(MatmulModule)
block_matmul = sch.get_block("matmul")

# Split each of the three loops with an inner extent of 16. `i`, `j`, `k` are
# rebound to the outer loops; `ii`, `ji`, `ki` are the inner (per-tile) loops.
i, j, k = sch.get_loops(block_matmul)
i, ii = sch.split(i, factors=[None, 16])
j, ji = sch.split(j, factors=[None, 16])
k, ki = sch.split(k, factors=[None, 16])

# Put the three outer (tile) loops outside the three inner 16-extent loops.
sch.reorder(i, j, k, ii, ji, ki)
sch.mod.show()

To print formatted TVM script, please install the formatter 'Black':
/usr/bin/python3 -m pip install "black==22.3.0" --upgrade --user


In [11]:
# TVM offers a schedule primitive to blockize: the loop nest rooted at `ii`
# (the inner 16-extent loops after the reorder above) is wrapped into a new
# block, which is returned here. This rebinds `block_mm`.
block_mm = sch.blockize(ii)
sch.mod.show()

To print formatted TVM script, please install the formatter 'Black':
/usr/bin/python3 -m pip install "black==22.3.0" --upgrade --user


# Memory Scope

Hardware often has special memory scopes for acceleration.  
Let's map these intermediate memory stages with `cache_read` and `cache_write`.  
Later, when compiling towards hardware, A_reg/B_reg can be directly mapped to such memory regions.

In [12]:
# continuing with MatmulModule above
# Stage the block's first two read buffers in dedicated storage scopes.
# Presumably index 0 is A and index 1 is B, as the names suggest; confirm
# against the printed module.
A_reg = sch.cache_read(block_mm, read_buffer_index=0, storage_scope="global.A_reg")
B_reg = sch.cache_read(block_mm, read_buffer_index=1, storage_scope="global.B_reg")
# Move both copy stages under the outer reduction loop `k`.
sch.compute_at(A_reg, k)
sch.compute_at(B_reg, k)

# Stage the block's output in an accumulator scope, then place the block
# returned by cache_write under loop `j`.
write_back_block = sch.cache_write(block_mm, write_buffer_index=0, storage_scope="global.accumulator")
sch.reverse_compute_at(write_back_block, j)
sch.mod.show()

To print formatted TVM script, please install the formatter 'Black':
/usr/bin/python3 -m pip install "black==22.3.0" --upgrade --user


# Tensorization
After the previous steps, we now want to map the tensorized block to a hardware intrinsic.

Let's say the hardware offers a 16x16 matmul intrinsic, whose computation is described by `tmm16_desc`,
and there is a higher-level wrapper method (interface), `tmm16_impl`, that invokes it.

In [13]:
# Description half of the intrinsic: a scalar loop nest stating what a single
# call computes, namely C += A @ B.T on one 16x16 tile.
# Though it is written as scalar code, let's pretend it is a black-box,
# magical hardware intrinsic: whenever it is invoked, a 16x16 sub-region
# matmul is computed.
@T.prim_func
def tmm16_desc(a: T.handle, b: T.handle, c: T.handle):
    # The operands use the same storage scopes that the schedule's
    # cache_read / cache_write calls specify.
    A = T.match_buffer(a, (16, 16), f32, offset_factor=16, scope="global.A_reg")
    B = T.match_buffer(b, (16, 16), "float32", offset_factor=16, scope="global.B_reg")
    C = T.match_buffer(c, (16, 16), "float32", offset_factor=16,  scope="global.accumulator")

    with T.block("root"):
        # C is both read and written because the body accumulates into it.
        T.reads(C[0:16, 0:16], A[0:16, 0:16], B[0:16, 0:16])
        T.writes(C[0:16, 0:16])
        for i, j, k in T.grid(16, 16, 16):
            with T.block(""):
                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
                # No init here: only the accumulation step is described.
                C[vi, vj] += A[vi, vk] * B[vj, vk]

# high-level wrapper
# Implementation half of the intrinsic: instead of a loop nest it emits one
# call to an external function named "tmm16", passing the three tile pointers
# and the row stride of each buffer.
@T.prim_func
def tmm16_impl(a: T.handle, b: T.handle, c: T.handle):
    # Symbolic row strides, bound by the match_buffer calls below.
    sa = T.int32()
    sb = T.int32()
    sc = T.int32()
    A = T.match_buffer(a, (16, 16), f32, offset_factor=16, strides=[sa, 1], scope="global.A_reg")
    B = T.match_buffer(b, (16, 16), "float32", offset_factor=16, strides=[sb, 1], scope="global.B_reg")
    C = T.match_buffer(c, (16, 16), "float32", offset_factor=16, strides=[sc, 1], scope="global.accumulator")

    with T.block("root"):
        # Same read/write regions as declared in tmm16_desc.
        T.reads(C[0:16, 0:16], A[0:16, 0:16], B[0:16, 0:16])
        T.writes(C[0:16, 0:16])
        T.evaluate(
            T.call_extern(
                "tmm16",
                C.access_ptr("w"), # write access
                A.access_ptr("r"), # read access
                B.access_ptr("r"), # read access
                # Argument order is (C, A, B, stride_a, stride_b, stride_c).
                sa,
                sb,
                sc,
                dtype="int32",
            )
        )

# Register the (description, implementation) pair under the name "tmm16" so
# that sch.tensorize can refer to it by that name.
tvm.tir.TensorIntrin.register("tmm16", tmm16_desc, tmm16_impl)

In [14]:
# Separate the reduction's init from its update at loop `k`, so that the
# remaining update block has the accumulate-only form written in tmm16_desc.
# The return value is not used.
sch.decompose_reduction(block_mm, k)
sch.mod.show()

To print formatted TVM script, please install the formatter 'Black':
/usr/bin/python3 -m pip install "black==22.3.0" --upgrade --user


In [15]:
# Map the block onto the intrinsic registered under the name "tmm16"; the
# printed module shows the result.
sch.tensorize(block_mm, "tmm16")
sch.mod.show()

To print formatted TVM script, please install the formatter 'Black':
/usr/bin/python3 -m pip install "black==22.3.0" --upgrade --user


In [20]:
# NOTE(review): tmm16_impl calls the extern symbol "tmm16", whose C
# implementation is only attached (via `pragma_import_llvm`) in the
# "map to kernel" cell further down. The execution counts (In [19] for that
# cell, In [20] here) indicate that cell was run first. On a fresh
# top-to-bottom run this build presumably cannot resolve "tmm16"; confirm
# and move this cell below the kernel cell.
a_nd = tvm.nd.array(a_np)
b_nd = tvm.nd.array(b_np)

c_nd = tvm.nd.empty((1024, 1024), dtype="float32")

# Build the scheduled module, run it, and compare with the NumPy reference.
lib = tvm.build(sch.mod, target="llvm")
lib["main"](a_nd, b_nd, c_nd)
np.testing.assert_allclose(c_nd.numpy(), c_tmm, rtol=1e-5)

## map to kernel

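Instead of only mapping to a call to a method `tmm16` as above,
we can also map directly to lower-level code (offered by the hardware) that implements it.
Note that this cell (In [19]) was executed before the build cell above (In [20]): the `tmm16` symbol defined here is the one that `tmm16_impl` calls.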

In [19]:
def tmm_kernel():
    """Compile the C source of the `tmm16` micro-kernel with clang.

    Returns whatever `clang.create_llvm` returns for that source, with the
    emitted file written to `temp.ll` inside a TVM temporary directory.
    """
    from tvm.contrib import clang, utils

    # C implementation of the 16x16 tile update C += A @ B.T, with an explicit
    # row stride for each of the three buffers.
    kernel_src = """
      extern "C" int tmm16(float *cc, float *aa, float *bb, int stride_a, int stride_b, int stride_c) {
        for (int i = 0; i < 16; ++i) {
            for (int j = 0; j < 16; ++j) {
                for (int k = 0; k < 16; ++k) {
                    cc[i * stride_c + j] += aa[i * stride_a + k] * bb[j * stride_b + k];
                }
            }
        }
        return 0;
      }
    """

    # Scratch directory that receives the generated LLVM IR file.
    scratch_dir = utils.tempdir()
    # Create LLVM IR from the C source code and hand it straight back.
    return clang.create_llvm(kernel_src, output=scratch_dir.relpath("temp.ll"))

# Attach the value returned by tmm_kernel() to loop `i` (the outermost loop
# from the blockization cell) under the "pragma_import_llvm" key. Presumably
# this is what lets the build resolve the extern "tmm16" call; confirm against
# the TVM documentation.
sch.annotate(i, "pragma_import_llvm", tmm_kernel())