**Optimizing Operators with Schedule Templates and AutoTVM**

In [3]:
%%shell
pip install apache-tvm --pre

Collecting apache-tvm
  Downloading apache_tvm-0.14.dev273-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.3 kB)
Downloading apache_tvm-0.14.dev273-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (69.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.2/69.2 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: apache-tvm
Successfully installed apache-tvm-0.14.dev273




In [2]:
!pip3 install --user psutil xgboost cloudpickle



In [4]:
!pip3 install --user cython
!sudo make cython3

make: *** No rule to make target 'cython3'.  Stop.


In [5]:
import logging
import sys

import numpy as np
import tvm
from tvm import te
import tvm.testing
from tvm import autotvm

Basic Matrix Multiplication with TE

In [6]:
def matmul_basic(N, L, M, dtype):

    A = te.placeholder((N, L), name="A", dtype=dtype)
    B = te.placeholder((L, M), name="B", dtype=dtype)

    k = te.reduce_axis((0, L), name="k")
    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
    s = te.create_schedule(C.op)

    # schedule
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    yo, yi = s[C].split(y, 8)
    xo, xi = s[C].split(x, 8)

    s[C].reorder(yo, xo, k, yi, xi)

    return s, [A, B, C]

Matrix Multiplication with AutoTVM:
A Basic Matrix Multiplication Template

In [7]:
@autotvm.template("tutorial/matmul_v1")  # a decorator
def matmul_v1(N, L, M, dtype):
    A = te.placeholder((N, L), name="A", dtype=dtype)
    B = te.placeholder((L, M), name="B", dtype=dtype)

    k = te.reduce_axis((0, L), name="k")
    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
    s = te.create_schedule(C.op)

    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    # 2. get the config object
    cfg = autotvm.get_config()

    # 3. define search space
    cfg.define_knob("tile_y", [1, 2, 4, 8, 16])
    cfg.define_knob("tile_x", [1, 2, 4, 8, 16])

    # 4. schedule according to config
    yo, yi = s[C].split(y, cfg["tile_y"].val)
    xo, xi = s[C].split(x, cfg["tile_x"].val)

    s[C].reorder(yo, xo, k, yi, xi)

    return s, [A, B, C]

A Matrix Multiplication Template with the Advanced Parameter API

In [8]:
@autotvm.template("tutorial/matmul")
def matmul(N, L, M, dtype):
    A = te.placeholder((N, L), name="A", dtype=dtype)
    B = te.placeholder((L, M), name="B", dtype=dtype)

    k = te.reduce_axis((0, L), name="k")
    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
    s = te.create_schedule(C.op)

    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    cfg = autotvm.get_config()
    cfg.define_split("tile_y", y, num_outputs=2)
    cfg.define_split("tile_x", x, num_outputs=2)

    yo, yi = cfg["tile_y"].apply(s, C, y)
    xo, xi = cfg["tile_x"].apply(s, C, x)

    s[C].reorder(yo, xo, k, yi, xi)

    return s, [A, B, C]

Step 2: Use AutoTVM to Optimize the Matrix Multiplication

Begin tuning

In [9]:
N, L, M = 512, 512, 512
task = autotvm.task.create("tutorial/matmul", args=(N, L, M, "float32"), target="llvm")
print(task.config_space)

ConfigSpace (len=100, range_length=100, space_map=
   0 tile_y: Split(policy=factors, product=512, num_outputs=2) len=10
   1 tile_x: Split(policy=factors, product=512, num_outputs=2) len=10
)


In [10]:
logging.getLogger("autotvm").setLevel(logging.DEBUG)
logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout))

In [11]:
measure_option = autotvm.measure_option(builder="local", runner=autotvm.LocalRunner(number=5))
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(
    n_trial=10,
    measure_option=measure_option,
    callbacks=[autotvm.callback.log_to_file("matmul.log")],
)

waiting for device...


DEBUG:autotvm:waiting for device...


device available


DEBUG:autotvm:device available


Get devices for measurement successfully!


INFO:autotvm:Get devices for measurement successfully!


No: 1	GFLOPS: 5.05/5.05	result: MeasureResult(costs=(0.053161619199999996,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.3552758693695068, timestamp=1726489101.506798)	[('tile_y', [-1, 4]), ('tile_x', [-1, 512])],None,92


DEBUG:autotvm:No: 1	GFLOPS: 5.05/5.05	result: MeasureResult(costs=(0.053161619199999996,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.3552758693695068, timestamp=1726489101.506798)	[('tile_y', [-1, 4]), ('tile_x', [-1, 512])],None,92


No: 2	GFLOPS: 1.55/5.05	result: MeasureResult(costs=(0.17298517600000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.881613254547119, timestamp=1726489105.346971)	[('tile_y', [-1, 4]), ('tile_x', [-1, 1])],None,2


DEBUG:autotvm:No: 2	GFLOPS: 1.55/5.05	result: MeasureResult(costs=(0.17298517600000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.881613254547119, timestamp=1726489105.346971)	[('tile_y', [-1, 4]), ('tile_x', [-1, 1])],None,2


No: 3	GFLOPS: 0.95/5.05	result: MeasureResult(costs=(0.2823497202,), error_no=MeasureErrorNo.NO_ERROR, all_cost=5.149085760116577, timestamp=1726489110.6967266)	[('tile_y', [-1, 1]), ('tile_x', [-1, 1])],None,0


DEBUG:autotvm:No: 3	GFLOPS: 0.95/5.05	result: MeasureResult(costs=(0.2823497202,), error_no=MeasureErrorNo.NO_ERROR, all_cost=5.149085760116577, timestamp=1726489110.6967266)	[('tile_y', [-1, 1]), ('tile_x', [-1, 1])],None,0


No: 4	GFLOPS: 1.09/5.05	result: MeasureResult(costs=(0.2460393088,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.932849884033203, timestamp=1726489115.5497136)	[('tile_y', [-1, 32]), ('tile_x', [-1, 16])],None,45


DEBUG:autotvm:No: 4	GFLOPS: 1.09/5.05	result: MeasureResult(costs=(0.2460393088,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.932849884033203, timestamp=1726489115.5497136)	[('tile_y', [-1, 32]), ('tile_x', [-1, 16])],None,45


No: 5	GFLOPS: 2.60/5.05	result: MeasureResult(costs=(0.10342303959999999,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.96867036819458, timestamp=1726489118.6718123)	[('tile_y', [-1, 64]), ('tile_x', [-1, 8])],None,36


DEBUG:autotvm:No: 5	GFLOPS: 2.60/5.05	result: MeasureResult(costs=(0.10342303959999999,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.96867036819458, timestamp=1726489118.6718123)	[('tile_y', [-1, 64]), ('tile_x', [-1, 8])],None,36


No: 6	GFLOPS: 8.31/8.31	result: MeasureResult(costs=(0.032290437,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.9415507316589355, timestamp=1726489119.4568455)	[('tile_y', [-1, 1]), ('tile_x', [-1, 512])],None,90


DEBUG:autotvm:No: 6	GFLOPS: 8.31/8.31	result: MeasureResult(costs=(0.032290437,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.9415507316589355, timestamp=1726489119.4568455)	[('tile_y', [-1, 1]), ('tile_x', [-1, 512])],None,90


No: 7	GFLOPS: 7.23/8.31	result: MeasureResult(costs=(0.037104190999999995,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.14677095413208, timestamp=1726489120.727095)	[('tile_y', [-1, 16]), ('tile_x', [-1, 64])],None,64


DEBUG:autotvm:No: 7	GFLOPS: 7.23/8.31	result: MeasureResult(costs=(0.037104190999999995,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.14677095413208, timestamp=1726489120.727095)	[('tile_y', [-1, 16]), ('tile_x', [-1, 64])],None,64


No: 8	GFLOPS: 1.10/8.31	result: MeasureResult(costs=(0.244889486,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.438783645629883, timestamp=1726489125.1540842)	[('tile_y', [-1, 1]), ('tile_x', [-1, 2])],None,10


DEBUG:autotvm:No: 8	GFLOPS: 1.10/8.31	result: MeasureResult(costs=(0.244889486,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.438783645629883, timestamp=1726489125.1540842)	[('tile_y', [-1, 1]), ('tile_x', [-1, 2])],None,10


No: 9	GFLOPS: 4.19/8.31	result: MeasureResult(costs=(0.0640084958,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.5014536380767822, timestamp=1726489127.6846879)	[('tile_y', [-1, 2]), ('tile_x', [-1, 32])],None,51


DEBUG:autotvm:No: 9	GFLOPS: 4.19/8.31	result: MeasureResult(costs=(0.0640084958,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.5014536380767822, timestamp=1726489127.6846879)	[('tile_y', [-1, 2]), ('tile_x', [-1, 32])],None,51


No: 10	GFLOPS: 5.12/8.31	result: MeasureResult(costs=(0.05243227100000001,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.2266881465911865, timestamp=1726489128.8429985)	[('tile_y', [-1, 16]), ('tile_x', [-1, 128])],None,74


DEBUG:autotvm:No: 10	GFLOPS: 5.12/8.31	result: MeasureResult(costs=(0.05243227100000001,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.2266881465911865, timestamp=1726489128.8429985)	[('tile_y', [-1, 16]), ('tile_x', [-1, 128])],None,74


In [12]:
with autotvm.apply_history_best("matmul.log"):
    with tvm.target.Target("llvm"):
        s, arg_bufs = matmul(N, L, M, "float32")
        func = tvm.build(s, arg_bufs)

a_np = np.random.uniform(size=(N, L)).astype(np.float32)
b_np = np.random.uniform(size=(L, M)).astype(np.float32)
c_np = a_np.dot(b_np)

c_tvm = tvm.nd.empty(c_np.shape)
func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)

tvm.testing.assert_allclose(c_np, c_tvm.numpy(), rtol=1e-4)

Finish loading 10 records


DEBUG:autotvm:Finish loading 10 records
