In [1]:
from __future__ import absolute_import, print_function

import tvm
from tvm import te
import tvm.testing
import numpy as np
N, M, L = 1024, 512, 64
A = te.placeholder((N, L), name="A")
B = te.placeholder((M, L), name="B")
k = te.reduce_axis((0, L), name="k")
C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[j, k], axis=k), name="C")
s = te.create_schedule(C.op)
print(tvm.lower(s, [A, B, C], simple_mode=True))

primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"global_symbol": "main", "tir.noalias": True}
  buffers = {B: Buffer(B_2: Pointer(float32), float32, [512, 64], []),
             A: Buffer(A_2: Pointer(float32), float32, [1024, 64], []),
             C: Buffer(C_2: Pointer(float32), float32, [1024, 512], [])}
  buffer_map = {A_1: A, B_1: B, C_1: C} {
  for (i: int32, 0, 1024) {
    for (j: int32, 0, 512) {
      C_2[((i*512) + j)] = 0f32
      for (k: int32, 0, 64) {
        C_2[((i*512) + j)] = ((float32*)C_2[((i*512) + j)] + ((float32*)A_2[((i*64) + k)]*(float32*)B_2[((j*64) + k)]))
      }
    }
  }
}




In [2]:
factor = 16
x, y = C.op.axis
(z,) = C.op.reduce_axis
yo, yi = s[C].split(y, factor=factor)
s[C].reorder(x, yo, yi, z)
print(tvm.lower(s, [A, B, C], simple_mode=True))

primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"global_symbol": "main", "tir.noalias": True}
  buffers = {C: Buffer(C_2: Pointer(float32), float32, [1024, 512], []),
             A: Buffer(A_2: Pointer(float32), float32, [1024, 64], []),
             B: Buffer(B_2: Pointer(float32), float32, [512, 64], [])}
  buffer_map = {A_1: A, B_1: B, C_1: C} {
  for (i: int32, 0, 1024) {
    for (j.outer: int32, 0, 32) {
      for (j.inner: int32, 0, 16) {
        C_2[(((i*512) + (j.outer*16)) + j.inner)] = 0f32
        for (k: int32, 0, 64) {
          C_2[(((i*512) + (j.outer*16)) + j.inner)] = ((float32*)C_2[(((i*512) + (j.outer*16)) + j.inner)] + ((float32*)A_2[((i*64) + k)]*(float32*)B_2[(((j.outer*1024) + (j.inner*64)) + k)]))
        }
      }
    }
  }
}




In [3]:
def intrin_gemv(m, l):
    a = te.placeholder((l,), name="a")
    b = te.placeholder((m, l), name="b")
    k = te.reduce_axis((0, l), name="k")
    c = te.compute((m,), lambda i: te.sum(a[k] * b[i, k], axis=k), name="c")
    Ab = tvm.tir.decl_buffer(a.shape, a.dtype, name="A", offset_factor=1, strides=[1])
    Bb = tvm.tir.decl_buffer(b.shape, b.dtype, name="B", offset_factor=1, strides=[te.var("s1"), 1])
    Cb = tvm.tir.decl_buffer(c.shape, c.dtype, name="C", offset_factor=1, strides=[1])

    def intrin_func(ins, outs):
        ib = tvm.tir.ir_builder.create()
        aa, bb = ins
        cc = outs[0]
        ib.emit(
            tvm.tir.call_extern(
                "int32",
                "gemv_update",
                cc.access_ptr("w"),
                aa.access_ptr("r"),
                bb.access_ptr("r"),
                m,
                l,
                bb.strides[0],
            )
        )
        return ib.get()

    return te.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})
def gemv_impl():
    cc_code = """
      extern "C" int gemv_update(float *cc, float *aa, float *bb, int m, int l, int stride) {
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < l; ++j) {
                cc[i] += aa[j] * bb[i * stride + j];
            }
        }
        return 0;
      }
    """
    from tvm.contrib import utils, clang

    temp = utils.tempdir()
    ll_path = temp.relpath("temp.ll")
    # Create LLVM ir from c source code
    ll_code = clang.create_llvm(cc_code, output=ll_path)
    return ll_code

gemv = intrin_gemv(factor, L)
s[C].tensorize(yi, gemv)
#print(tvm.lower(s, [A, B, C], simple_mode=True))


In [4]:
s[C].pragma(x, "import_llvm", gemv_impl())
#print(tvm.lower(s, [A, B, C], simple_mode=True))

In [5]:
func = tvm.build(s, [A, B, C], target="llvm", name="gemv")

from tvm.topi.utils import get_const_tuple

dtype = A.dtype
ctx = tvm.context("cpu", 0)
a = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype)
b = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype)
c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), ctx)
func(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c)
tvm.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3)

In [9]:
import tvm
from tvm import te

def compute_conv2d(A, W, stride, padding):
    batch_size, in_channel, height, width = A.shape
    out_channel, _ = W.shape

    kh = 1
    kw = 1

    out_height = (height + 2 * padding - kh) // stride + 1
    out_width = (width + 2 * padding - kw) // stride + 1

    A = te.compute((batch_size, height, width, in_channel), lambda n, h, w, c: A[n, c, h, w])

    # convolution
    oshape = (batch_size, out_channel, out_height, out_width)

    ic = te.reduce_axis((0, in_channel), name='ic')

    conv = te.compute(oshape, lambda n, oc, oh, ow:
                       te.sum(A[n, oh*stride+kh, ow*stride+kw, ic] * W[oc, ic],
                               axis=[ic]),
                       name='conv2d', tag="conv2d")
    return conv


def matmul():
    wgt = te.placeholder((1, 128))
    inp = te.placeholder((16, 128))

    k = te.reduce_axis((0, 128), name="k")

    out = te.compute((16, 1),
                      lambda i, j: te.sum(inp(i, k) * wgt(j, k), axis=[k]))

    def intrin_func(inputs, outputs):
        def body():
            irb = tvm.tir.ir_builder.create()
            irb.emit(tvm.tir.call_extern(
                "float32", "Matmul"))
            return irb.get()

        def reset():
            return body()

        def update():
            return body()

        return body(), reset(), update()

    return te.decl_tensor_intrin(out.op, intrin_func, name="Matmul")


def schedule_conv2d(out):
    s = te.create_schedule(out.op)
    conv = out.op.output(0)
    data, kernel = conv.op.input_tensors

    batch, oc, oh, ow = s[conv].op.axis
    ic, = s[conv].op.reduce_axis

    s[conv].tensorize(ow, matmul())

    return s


def verify_conv2d_nchw(batch, in_channel, in_height, in_width, num_filter, kernel, stride, padding):
    A = te.placeholder((batch, in_channel, in_height, in_width), name='A')
    W = te.placeholder((num_filter, in_channel), name='W')
    B = compute_conv2d(A, W, stride, padding)
    s = schedule_conv2d(B)

    s = s.normalize()
    print(tvm.lower(s, [A, W, B], simple_mode=True))


def test_conv2d_nchw():
    verify_conv2d_nchw(batch=1, in_channel=128, in_height=16, in_width=16, num_filter=64, kernel=1, stride=1, padding=0)


if __name__ == "__main__":
    test_conv2d_nchw()

TVMError: Traceback (most recent call last):
  File "D:\work\llvmsrc\tvm\src\te\operation\tensorize.cc", line 336
TVMError: 
---------------------------------------------------------------
An internal invariant was violated during the execution of TVM.
Please read TVM's error reporting guidelines.
More details can be found here: https://discuss.tvm.ai/t/error-reporting/7793.
---------------------------------------------------------------
  Check failed: expr_equal(lhs, rhs) == false: Failed to match the compute with TensorIntrin Matmul's declaration  provided= reduce(combiner=comm_reducer(result=[(x + y)], lhs=[x], rhs=[y], identity_element=[0f]), source=[(placeholder[0, k]*placeholder[0, k])], init=[], axis=[iter_var(k, range(min=0, ext=128))], where=(bool)1, value_index=0), intrin=  reduce(combiner=comm_reducer(result=[(x + y)], lhs=[x], rhs=[y], identity_element=[0f]), source=[(placeholder[i, k]*placeholder[0, k])], init=[], axis=[iter_var(k, range(min=0, ext=128))], where=(bool)1, value_index=0)