<a href="https://colab.research.google.com/github/XueyanZhang/MachineLearningCompilation/blob/master/MLC_Automatic_Program_Opt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Automatic Program Optimization

MLC process can be viewed as transformation among tensor functions.

There are many ways to transform.

Which transformation is better?

In [1]:
!python3 -m  pip install mlc-ai-nightly -f https://mlc.ai/wheels

import numpy as np
import tvm
from tvm import relax
from tvm.ir.module import IRModule
from tvm.script import relax as R
from tvm.script import tir as T

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://mlc.ai/wheels
Collecting mlc-ai-nightly
  Downloading https://github.com/mlc-ai/utils/releases/download/v0.9.dev0/mlc_ai_nightly-0.12.dev827%2Bgbc6282b3c-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (52.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.1/52.1 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: mlc-ai-nightly
Successfully installed mlc-ai-nightly-0.12.dev827+gbc6282b3c


# Recap: Transform a Primitive Tensor Func

In [3]:
f32 = "float32"
@tvm.script.ir_module
class MyModule:
    @T.prim_func
    def main(
        A: T.Buffer((128, 128), f32),
        B: T.Buffer((128, 128), f32),
        C: T.Buffer((128, 128), f32),
    ):
        T.func_attr({"global_symbol": "main", "tir.noalias": True})
        for i, j, k in T.grid(128, 128, 128):
            with T.block("C"):
                vi, vj, vk = T.axis.remap("SSR", [i, j ,k])
                with T.init():
                    C[vi, vj] = T.float32(0)
                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]

In [9]:
# define input values and baseline
a_np = np.random.rand(128, 128).astype(f32)
b_np = np.random.rand(128, 128).astype(f32)
c_np = a_np @ b_np

In [23]:
# run MyModule
a_tvm = tvm.nd.array(a_np)
b_tvm = tvm.nd.array(b_np)
c_tvm = tvm.nd.empty((128, 128), dtype=f32)

lib_rt = tvm.build(MyModule, target="llvm")
f_timer_before = lib_rt.time_evaluator("main", tvm.cpu())
print("Time of MyModule: ", f_timer_before(a_tvm, b_tvm, c_tvm).mean * 1000, " ms")
np.testing.assert_allclose(c_tvm.numpy(), c_np, rtol=1e-5)

Time of MyModule:  3.8970793999999995  ms


# Transformation: loop reordering

Let's add some simple transformations (split j as input arg `jfactor`)

(we did this before)

In [18]:
def schdule_mm(sch: tvm.tir.Schedule, jfactor=4):
    block_C = sch.get_block("C", "main")
    i, j, k = sch.get_loops(block_C)
    j0, j1 = sch.split(j, factors=[None, jfactor])
    sch.reorder(i, j0, k, j1)
    sch.decompose_reduction(block_C, k)
    return sch

In [19]:
# apply transformation
sch = tvm.tir.Schedule(MyModule)
sch = schdule_mm(sch)
sch.mod.show()

To print formatted TVM script, please install the formatter 'Black':
/usr/bin/python3 -m pip install "black==22.3.0" --upgrade --user


In [26]:
# run the mod
lib_rt_mod = tvm.build(sch.mod, target="llvm")
f_timer_after = lib_rt_mod.time_evaluator("main", tvm.cpu())
print("Time of MyModule.mod: ", f_timer_before(a_tvm, b_tvm, c_tvm).mean * 1000, " ms")
np.testing.assert_allclose(c_tvm.numpy(), c_np, rtol=1e-5)

Time of MyModule.mod:  3.8819906000000004  ms


`sch.mod` should take less time theoretically. It may be subjected to noises.

# Transformation Trace

`tir.Schedule` offers a trace field, showing the steps to get a transformed module.

In [27]:
print(sch.trace) # exact transformation in schdule_mm

# from tvm import tir
def apply_trace(sch: tir.Schedule) -> None:
  b0 = sch.get_block(name="C", func_name="main")
  l1, l2, l3 = sch.get_loops(block=b0)
  l4, l5 = sch.split(loop=l2, factors=[None, 4], preserve_unit_iters=True)
  sch.reorder(l1, l4, l3, l5)
  b6 = sch.decompose_reduction(block=b0, loop=l3)


# Stochastic Transformation

use stochastic elements in tranformation function to see which achieves better performance.

In [31]:
def stochastic_schdule_mm(sch: tvm.tir.Schedule): # no longer specify jfactors
    block_C = sch.get_block("C", "main")
    i, j, k = sch.get_loops(block_C)
    j_factors = sch.sample_perfect_tile(loop=j, n=2) # stochastic
    j0, j1 = sch.split(j, factors=j_factors)
    sch.reorder(i, j0, k, j1)
    sch.decompose_reduction(block_C, k)
    return sch

possible j_factors: [8, 16], [32, 4], [2, 64], [1,128]

In [33]:
sch = tvm.tir.Schedule(MyModule)
sch = stochastic_schdule_mm(sch)
sch.mod.show()

To print formatted TVM script, please install the formatter 'Black':
/usr/bin/python3 -m pip install "black==22.3.0" --upgrade --user


In [34]:
# check the trace to see the jfactors
print(sch.trace)

# from tvm import tir
def apply_trace(sch: tir.Schedule) -> None:
  b0 = sch.get_block(name="C", func_name="main")
  l1, l2, l3 = sch.get_loops(block=b0)
  v4, v5 = sch.sample_perfect_tile(loop=l2, n=2, max_innermost_factor=16, decision=[64, 2])
  l6, l7 = sch.split(loop=l2, factors=[v4, v5], preserve_unit_iters=True)
  sch.reorder(l1, l6, l3, l7)
  b8 = sch.decompose_reduction(block=b0, loop=l3)
