<a href="https://colab.research.google.com/github/XueyanZhang/MachineLearningCompilation/blob/master/7_GPU_Hardware_Acceleration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
!python3 -m  pip install mlc-ai-nightly-cu121 -f https://mlc.ai/wheels

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://mlc.ai/wheels
Collecting mlc-ai-nightly-cu121
  Downloading https://github.com/mlc-ai/package/releases/download/v0.9.dev0/mlc_ai_nightly_cu121-0.12.dev1141-cp310-cp310-manylinux_2_28_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: mlc-ai-nightly-cu121
Successfully installed mlc-ai-nightly-cu121-0.12.dev1141


In [26]:
# !python3 -m  pip install mlc-ai-nightly -f https://mlc.ai/wheels

In [27]:
!nvidia-smi

Fri Jun 23 01:44:14 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P0    27W /  70W |    105MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [28]:
# %%shell
# # Installs the latest dev build of TVM from PyPI, with CUDA enabled. To use this,
# # you must request a Google Colab instance with a GPU by going to Runtime ->
# # Change runtime type -> Hardware accelerator -> GPU. If you wish to build from
# # source, see see https://tvm.apache.org/docs/install/from_source.html
# pip install tlcpack-nightly-cu113 --pre -f https://tlcpack.ai/wheels

In [29]:
import numpy as np
import tvm
# from tvm import relax
from tvm.ir.module import IRModule
# from tvm.script import relax as R
from tvm.script import tir as T

In [30]:
tvm.__version__

'0.13.dev273+g6b20caee2'

# Element-wise Add (aka, vector add)

An example of GPU programming

In [31]:
f32 = "float32"
@tvm.script.ir_module
class MyModuleVecAdd:
    @T.prim_func
    def main(A: T.Buffer((1024,), f32),
             B: T.Buffer((1024,), f32),
             C: T.Buffer((1024,), f32)) -> None:
        T.func_attr({"global_symbol": "main", "tir.noalias": True})
        for i in T.grid(1024):
            with T.block("C"):
                vi = T.axis.remap("S", [i])
                C[vi] = A[vi] + B[vi]

In [32]:
# split loop i
sch = tvm.tir.Schedule(MyModuleVecAdd)
block_C = sch.get_block("C")
i, = sch.get_loops(block_C)
i0, i1 = sch.split(i, [None, 128])
sch.mod.show()

To print formatted TVM script, please install the formatter 'Black':
/usr/bin/python3 -m pip install "black==22.3.0" --upgrade --user


## Thread blocks
a thread == a core  
multiple threads == a thread block  
multiple thread blocks == a grid

### Identify a thread:
- threadIdx.x
- blockIdx.x

In [33]:
sch.bind(i0, "blockIdx.x")
sch.bind(i1, "threadIdx.x")
sch.mod.show()

To print formatted TVM script, please install the formatter 'Black':
/usr/bin/python3 -m pip install "black==22.3.0" --upgrade --user


In [34]:
# build and run on gpu
target = tvm.target.cuda()
rt_mod = tvm.build(sch.mod, target=target)

A_np = np.random.uniform(size=(1024,)).astype(f32)
B_np = np.random.uniform(size=(1024,)).astype(f32)
A_nd = tvm.nd.array(A_np, tvm.cuda(0))
B_nd = tvm.nd.array(B_np, tvm.cuda(0))
C_nd = tvm.nd.array(np.zeros((1024,), dtype="float32"), tvm.cuda(0))

rt_mod["main"](A_nd, B_nd, C_nd)
print(C_nd)



[1.2500075  0.6357553  1.2580568  ... 1.3098538  1.2019022  0.68830264]


# Window Sum

A basic vervion of "convolution"

Sliding window sums 3 neighbors.

In [35]:
@tvm.script.ir_module
class MyModuleWindowSum:
    @T.prim_func
    def main(A: T.Buffer((1024,), f32),
             B: T.Buffer((1024,), f32)) -> None:
        T.func_attr({"global_symbol": "main", "tir.noalias": True})
        for i in T.grid(1024):
            with T.block("C"):
                vi = T.axis.remap("S", [i])
                B[vi] = A[vi] + A[vi + 1] + A[vi + 2]

In [36]:
# bind GPU threads
sch = tvm.tir.Schedule(MyModuleWindowSum)
nthread = 128
block_C = sch.get_block("C")
i, = sch.get_loops(block_C)
i0, i1 = sch.split(i, [None, nthread])
sch.bind(i0, "blockIdx.x")
sch.bind(i1, "threadIdx.x")
sch.mod.show()

To print formatted TVM script, please install the formatter 'Black':
/usr/bin/python3 -m pip install "black==22.3.0" --upgrade --user


## Shared memory

Each thread block has a shared memory that all threads (within this blk) can access.

In [37]:
# add an intermediate stage (representing the shared memory)
A_shared = sch.cache_read(block_C, read_buffer_index=0, storage_scope="shared")
# move a block under i1 loop
sch.compute_at(A_shared, i1)
sch.mod.show()

To print formatted TVM script, please install the formatter 'Black':
/usr/bin/python3 -m pip install "black==22.3.0" --upgrade --user


### Cooperative fetching

threads work together to bring in data

In [42]:
ax = sch.get_loops(A_shared)[-1]
ax0, ax1 = sch.split(ax, [None, nthread])
sch.bind(ax1, "threadIdx.x")
sch.mod.show()

To print formatted TVM script, please install the formatter 'Black':
/usr/bin/python3 -m pip install "black==22.3.0" --upgrade --user


# Inspect Lower level Code

the code has two parts:
1. host code: which calls gpu driver
2. kernel code: which runs computations



In [43]:
# print out cuda kernel
rt_mod = tvm.build(sch.mod, target="cuda")
print(rt_mod.imported_modules[0].get_source())


#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \
     (__CUDACC_VER_MAJOR__ > 11))
#define TVM_ENABLE_L2_PREFETCH 1
#else
#define TVM_ENABLE_L2_PREFETCH 0
#endif

#ifdef _WIN32
  using uint = unsigned int;
  using uchar = unsigned char;
  using ushort = unsigned short;
  using int64_t = long long;
  using uint64_t = unsigned long long;
#else
  #define uint unsigned int
  #define uchar unsigned char
  #define ushort unsigned short
  #define int64_t long long
  #define uint64_t unsigned long long
#endif
extern "C" __global__ void __launch_bounds__(128) main_kernel(float* __restrict__ A, float* __restrict__ B) {
  __shared__ float A_shared[1024];
  for (int ax0_0 = 0; ax0_0 < 2; ++ax0_0) {
    if (((((int)blockIdx.x) + ax0_0) < 8) && (((ax0_0 * 64) + (((int)threadIdx.x) >> 1)) < 65)) {
      A_shared[((ax0_0 * 128) + ((int)threadIdx.x))] = A[(((((int)blockIdx.x) * 128) + (ax0_0 * 128)) + ((int)threadIdx.x))];
    }
  }
  __syncthreads();
  B[((((int)blockIdx.x) * 1

In [45]:
# print metal / opencl kernel
rt_mod = tvm.build(sch.mod, target="metal")
print(rt_mod.imported_modules[0].get_source())

rt_mod = tvm.build(sch.mod, target="opencl")
print(rt_mod.imported_modules[0].get_source())

// Function: main_kernel
#include <metal_stdlib>
using namespace metal;

union __TVMArgUnion {
 int v_int[2];
};

kernel void main_kernel(  device float* A [[ buffer(0) ]],
  device float* B [[ buffer(1) ]],
  uint blockIdx [[threadgroup_position_in_grid]],
  uint threadIdx [[thread_position_in_threadgroup]]
) {
  threadgroup float A_shared[1024];
  for (int ax0_0 = 0; ax0_0 < 2; ++ax0_0) {
    if (((((int)blockIdx) + ax0_0) < 8) && (((ax0_0 * 64) + (((int)threadIdx) >> 1)) < 65)) {
      A_shared[((ax0_0 * 128) + ((int)threadIdx))] = A[(((((int)blockIdx) * 128) + (ax0_0 * 128)) + ((int)threadIdx))];
    }
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  B[((((int)blockIdx) * 128) + ((int)threadIdx))] = ((A_shared[((int)threadIdx)] + A_shared[(((int)threadIdx) + 1)]) + A_shared[(((int)threadIdx) + 2)]);
}



// Function: main_kernel
__kernel void main_kernel(__global float* restrict A, __global float* restrict B) {
  __local float A_shared[1024];
  for (int ax0_0 = 0; ax0_0 < 2