In [84]:
import tvm
from tvm import te, topi
from tvm.topi import nn
from numbers import Integral
import numpy as np

In [85]:
def get_const_tuple(in_tuple):
    """Verifies input tuple is IntImm or Var, returns tuple of int or Var.

    Parameters
    ----------
    in_tuple : tuple of Expr
        The input.

    Returns
    -------
    out_tuple : tuple of int
        The output.
    """
    ret = []
    ana = None
    for elem in in_tuple:
        if isinstance(elem, (tvm.tir.Var, tvm.tir.expr.Any)):
            ret.append(elem)
        elif not isinstance(elem, (tvm.tir.IntImm, int)):
            ana = tvm.arith.Analyzer() if ana is None else ana
            elem = ana.simplify(elem)
            if not isinstance(elem, tvm.tir.IntImm):
                ret.append(elem)
            else:
                ret.append(get_const_int(elem))
        else:
            ret.append(get_const_int(elem))
    return tuple(ret)
def get_const_int(expr):
    """Verifies expr is integer and get the constant value.

    Parameters
    ----------
    expr : tvm.Expr or int
        The input expression.

    Returns
    -------
    out_value : int
        The output.
    """
    if isinstance(expr, Integral):
        return expr
    if not isinstance(expr, tvm.tir.IntImm):
        ana = tvm.arith.Analyzer()
        expr = ana.simplify(expr)
    if not isinstance(expr, tvm.tir.IntImm):
        raise ValueError("Expect value to be constant int")
    return int(expr.value)


In [86]:
H=40
W = 40
H_P, W_P = H, W
KW=KH=3

CI=16
CO=16

in_channel = CI
out_channel = CO
in_size=H
open_image = 0
ddtype = 'float32'

data_pl = te.placeholder((1, CI, H, W),
                         name='data', dtype=ddtype)
kernel_pl = te.placeholder((CO, CI, KW, KH),
                           name='filter', dtype=ddtype)



data, kernel=data_pl,kernel_pl
N, CI, IH, IW = get_const_tuple(data.shape)
dilation=1
strides=1
padding=1
tile_size=2
out_dtype='float32'
if isinstance(dilation, int):
    dilation_h = dilation_w = dilation
else:
    dilation_h, dilation_w = dilation

if len(kernel.shape) == 4:
    if dilation_h != 1 or dilation_w != 1:
        kernel = nn.dilate(kernel, (1, 1, dilation_h, dilation_w))
    pre_computed = False
    CO, _, KH, KW = get_const_tuple(kernel.shape)
else:
    assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
    pre_computed = True
    H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
    CO *= VC
    KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
pt, pl, pb, pr = nn.get_pad_tuple(padding, (KH, KW))

assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1
data_pad = nn.pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad")

r = KW
m = tile_size
alpha = m + r - 1

A, B, G = nn.winograd_util.winograd_transform_matrices(m, r, out_dtype)

H = (IH + pt + pb - 3) // HSTR + 1
W = (IW + pl + pr - 3) // WSTR + 1
nH, nW = (H + m - 1) // m, (W + m - 1) // m
P = N * nH * nW
bna=nH
bnb=nW
P_round = (P + bnb - 1) // bnb * bnb    
print(m)
input_tile = te.compute(
    (CI, P_round // bnb, alpha, alpha, bnb),
    lambda ci, b, eps, nu, bb: tvm.tir.if_then_else(
        b * bnb + bb < P,
        data_pad[(b * bnb + bb) // (nH * nW)][ci][(b * bnb + bb) // nW % nH * m + eps][
            (b * bnb + bb) % nW * m + nu],
        tvm.tir.const(0, data_pad.dtype),
    ),
    name="d",
)


2


In [91]:
print(data.shape)
print(data_pad.shape)
pt, pl
pb, pr

[1, 16, 40, 40]
[1, 16, 42, 42]


(1, 1)

In [9]:
s = te.create_schedule(input_tile.op)
print(tvm.lower(s,[data,kernel,input_tile], simple_mode=True))
func=tvm.build(s,[data,kernel,input_tile])
ctx=tvm.context("llvm", 0)

primfn(data_1: handle, filter_1: handle, d_1: handle) -> ()
  attr = {"global_symbol": "main", "tir.noalias": True}
  buffers = {d: Buffer(d_2: Pointer(float32), float32, [16, 20, 4, 4, 20], []),
             data: Buffer(data_2: Pointer(float32), float32, [1, 16, 40, 40], []),
             filter: Buffer(filter_2: Pointer(float32), float32, [16, 16, 3, 3], [])}
  buffer_map = {data_1: data, filter_1: filter, d_1: d} {
  attr [data_pad: Pointer(float32)] "storage_scope" = "global";
  allocate(data_pad, float32, [28224]) {
    for (i1: int32, 0, 16) {
      for (i2: int32, 0, 42) {
        for (i3: int32, 0, 42) {
          data_pad[(((i1*1764) + (i2*42)) + i3)] = @tir.if_then_else(((((1 <= i2) && (i2 < 41)) && (1 <= i3)) && (i3 < 41)), (float32*)data_2[((((i1*1600) + (i2*40)) + i3) - 41)], 0f32, dtype=float32)
        }
      }
    }
    for (ci: int32, 0, 16) {
      for (b: int32, 0, 20) {
        for (eps: int32, 0, 4) {
          for (nu: int32, 0, 4) {
            for (bb: int32, 

In [10]:
osp=input_tile.shape
a_np_tvm=np.arange(in_channel*H*W).reshape(1,in_channel,H,W)
w_np_tvm=np.arange(KW*KH*in_channel*out_channel).reshape(out_channel,in_channel,KW,KH)
a_tvm = tvm.nd.array(a_np_tvm, ctx=ctx, dtype=data_pl.dtype)
w_tvm = tvm.nd.array(w_np_tvm, ctx=ctx, dtype=kernel_pl.dtype)
c_tvm = tvm.nd.empty(osp, ctx=ctx, dtype=input_tile.dtype)
func(a_tvm, w_tvm, c_tvm)
print(c_tvm.shape)
c_tvm.asnumpy().astype('int32')[1,1:3,:,:,:]

(16, 20, 4, 4, 20)


array([[[[   0, 1641, 1643, 1645, 1647, 1649, 1651, 1653, 1655, 1657,
          1659, 1661, 1663, 1665, 1667, 1669, 1671, 1673, 1675, 1677],
         [1640, 1642, 1644, 1646, 1648, 1650, 1652, 1654, 1656, 1658,
          1660, 1662, 1664, 1666, 1668, 1670, 1672, 1674, 1676, 1678],
         [1641, 1643, 1645, 1647, 1649, 1651, 1653, 1655, 1657, 1659,
          1661, 1663, 1665, 1667, 1669, 1671, 1673, 1675, 1677, 1679],
         [1642, 1644, 1646, 1648, 1650, 1652, 1654, 1656, 1658, 1660,
          1662, 1664, 1666, 1668, 1670, 1672, 1674, 1676, 1678,    0]],

        [[   0, 1681, 1683, 1685, 1687, 1689, 1691, 1693, 1695, 1697,
          1699, 1701, 1703, 1705, 1707, 1709, 1711, 1713, 1715, 1717],
         [1680, 1682, 1684, 1686, 1688, 1690, 1692, 1694, 1696, 1698,
          1700, 1702, 1704, 1706, 1708, 1710, 1712, 1714, 1716, 1718],
         [1681, 1683, 1685, 1687, 1689, 1691, 1693, 1695, 1697, 1699,
          1701, 1703, 1705, 1707, 1709, 1711, 1713, 1715, 1717, 1719],
         [1

In [11]:
A_data, B_data, G_data = nn.winograd_util._cook_toom_convolution(nn.winograd_util._interpolation_points(tile_size+2), tile_size, KH)
A, B, G=nn.winograd_util.winograd_transform_matrices(tile_size,KH,'float32')
r_kh = te.reduce_axis((0, KH), "r_kh")
r_kw = te.reduce_axis((0, KW), "r_kw")
U = te.compute(
    (alpha, alpha, CO // bna, CI, bna),
    lambda eps, nu, co, ci, vco: te.sum(
        kernel[co * bna + vco][ci][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw],
        axis=[r_kh, r_kw],
    ),
    name="U",
)


In [12]:
BOUT=U
c_tvm = tvm.nd.empty(BOUT.shape, ctx=ctx,dtype='float32')
s = te.create_schedule(BOUT.op)
print(tvm.lower(s,[data,kernel,input_tile], simple_mode=True))
func=tvm.build(s,[kernel,BOUT])
print(func.imported_modules[0].get_source()) if len(func.imported_modules) > 0 else print("source not imported")

func(w_tvm,c_tvm)
print(c_tvm.shape)

primfn(data_1: handle, filter_1: handle, d_1: handle) -> ()
  attr = {"global_symbol": "main", "tir.noalias": True}
  buffers = {d: Buffer(d_2: Pointer(float32), float32, [16, 20, 4, 4, 20], []),
             data: Buffer(data_2: Pointer(float32), float32, [1, 16, 40, 40], []),
             filter: Buffer(filter_2: Pointer(float32), float32, [16, 16, 3, 3], [])}
  buffer_map = {data_1: data, filter_1: filter, d_1: d} {
  attr [G: Pointer(float32)] "storage_scope" = "global";
  allocate(G, float32, [12]);
  attr [U: Pointer(float32)] "storage_scope" = "global";
  allocate(U, float32, [0]);
  for (i: int32, 0, 4) {
    for (j: int32, 0, 3) {
      G[((i*3) + j)] = select(((i == 3) && (j == 2)), 1f32, select(((i == 3) && (j == 1)), 0f32, select(((i == 3) && (j == 0)), 0f32, select(((i == 2) && (j == 2)), 0.5f32, select(((i == 2) && (j == 1)), 0.5f32, select(((i == 2) && (j == 0)), 0.5f32, select(((i == 1) && (j == 2)), 0.5f32, select(((i == 1) && (j == 1)), -0.5f32, select(((i == 1) && (j

In [24]:
ugg=G_data.dot(w_tvm.asnumpy()[0,0,:,:]).dot(G_data.T)
#print(ugg)
print("===")
rugg=c_tvm.asnumpy()[:,:,0,0,0]
#print(rugg)
np.sum(ugg-rugg)<1e2

===


IndexError: index 0 is out of bounds for axis 2 with size 0

In [25]:
print(c_tvm.shape)
print(a_np_tvm.shape)
#print(a_np_tvm)
c_tvm.asnumpy().astype(np.int)[0,1,:,:,:]

(4, 4, 0, 16, 20)
(1, 16, 40, 40)


array([], shape=(0, 16, 20), dtype=int64)

In [26]:
# transform image
print(alpha, alpha, P_round // bnb, CI, bnb)
r_a = te.reduce_axis((0, alpha), "r_a")
r_b = te.reduce_axis((0, alpha), "r_b")
V = te.compute(
    (alpha, alpha, P_round // bnb, CI, bnb),
    lambda eps, nu, p, ci, vp: te.sum(
        input_tile[ci][p][r_a][r_b][vp] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]
    ),
    name="V",
)

4 4 20 16 20


In [82]:
data_tr = te.placeholder((H, W),
                         name='data', dtype=ddtype)
r_a = te.reduce_axis((0, alpha), "r_a")
r_b = te.reduce_axis((0, alpha), "r_b")
r_c = te.reduce_axis((0, alpha), "r_c")
V = te.compute(
    (H//2*4,W//2*4),
    lambda hi,wi: te.sum(
        data_tr[hi//2+r_a][wi//2+r_b] * B[r_a][wi%4]*B[r_b][hi%4], axis=[r_a, r_b]
    ),
    name="V",
)
BOUT=V
ctx1=tvm.context("llvm", 0)
c_tvm = tvm.nd.empty(BOUT.shape, ctx=ctx1,dtype='float32')
s = te.create_schedule(BOUT.op)
thread_x = te.thread_axis((0, 1), "threadIdx.x")
hi, wi = s[BOUT].op.axis
s[BOUT].bind(hi, thread_x)
ra,rb= s[BOUT].op.reduce_axis
s[BOUT].unroll(ra)
s[BOUT].unroll(rb)
#print(tvm.lower(s,[data_tr,BOUT], simple_mode=True))
func=tvm.build(s,[data_tr,BOUT],"opencl")
print(func.imported_modules[0].get_source()) if len(func.imported_modules) > 0 else print("source not imported")
a_np_tvm1=np.arange(H*W).reshape(H,W)
a_tvm1 = tvm.nd.array(a_np_tvm1, ctx=ctx1, dtype=data_pl.dtype)
#print(a_np_tvm1.astype('int32'))
func(a_tvm1,c_tvm)

np.set_printoptions(threshold=10000)
#print(c_tvm.asnumpy().astype('int32'))
#print(B_data)

__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE| CLK_FILTER_NEAREST;
__kernel void default_function_kernel0(__global float* restrict V, __global float* restrict data, __global float* restrict B) {
  for (int wi = 0; wi < 80; ++wi) {
    V[(((((int)get_local_id(0)) * 80) + wi))] = 0.000000e+00f;
    V[(((((int)get_local_id(0)) * 80) + wi))] = (V[(((((int)get_local_id(0)) * 80) + wi))] + ((data[((((((int)get_local_id(0)) >> 1) * 40) + (wi >> 1)))] * B[((wi & 3))]) * B[((((int)get_local_id(0)) & 3))]));
    V[(((((int)get_local_id(0)) * 80) + wi))] = (V[(((((int)get_local_id(0)) * 80) + wi))] + ((data[(((((((int)get_local_id(0)) >> 1) * 40) + (wi >> 1)) + 1))] * B[((wi & 3))]) * B[(((((int)get_local_id(0)) & 3) + 4))]));
    V[(((((int)get_local_id(0)) * 80) + wi))] = (V[(((((int)get_local_id(0)) * 80) + wi))] + ((data[(((((((int)get_local_id(0)) >> 1) * 40) + (wi >> 1)) + 2))] * B[((wi & 3))]) * B[(((((int)get_local_id(0)) & 3) + 8))]));
    V[(((((

TVMError: Traceback (most recent call last):
  [bt] (5) /home/azureuser/work/tvm/build/libtvm.so(TVMFuncCall+0x95) [0x7fc58139b203]
  [bt] (4) /home/azureuser/work/tvm/build/libtvm.so(tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const+0x30) [0x7fc58058005c]
  [bt] (3) /home/azureuser/work/tvm/build/libtvm.so(std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const+0x5a) [0x7fc58044e6d8]
  [bt] (2) /home/azureuser/work/tvm/build/libtvm.so(+0x28f70de) [0x7fc5813a80de]
  [bt] (1) /home/azureuser/work/tvm/build/libtvm.so(+0x28f59f5) [0x7fc5813a69f5]
  [bt] (0) /home/azureuser/work/tvm/build/libtvm.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x4e) [0x7fc580287c8c]
  File "/home/azureuser/work/tvm/src/runtime/library_module.cc", line 78
TVMError: 
---------------------------------------------------------------
An internal invariant was violated during the execution of TVM.
Please read TVM's error reporting guidelines.
More details can be found here: https://discuss.tvm.ai/t/error-reporting/7793.
---------------------------------------------------------------

  Check failed: ret == 0 (-1 vs. 0) : Assert fail: (4 == tir.tvm_struct_get(arg0, 0, 10)), Argument arg0.device_type has an unsatisfied constraint: (4 == tir.tvm_struct_get(arg0, 0, 10))

In [12]:
BOUT=V
c_tvm = tvm.nd.empty(BOUT.shape, ctx=ctx,dtype='float32')
s = te.create_schedule(BOUT.op)
print(tvm.lower(s,[data,BOUT], simple_mode=True))
func=tvm.build(s,[data,BOUT])
print(func.imported_modules[0].get_source()) if len(func.imported_modules) > 0 else print("source not imported")

func(a_tvm,c_tvm)
print(c_tvm.shape)

NameError: name 'V' is not defined

In [13]:
idxdiv = tvm.tir.indexdiv
idxmod = tvm.tir.indexmod

# batch gemm
ci = te.reduce_axis((0, CI), name="c")
M = te.compute(
    (alpha, alpha, CO, P_round),
    lambda eps, nu, co, p: te.sum(
        U[eps][nu][idxdiv(co, bna)][ci][idxmod(co, bna)]
        * V[eps][nu][idxdiv(p, bnb)][ci][idxmod(p, bnb)],
        axis=ci,
    ),
    name="M",
)

NameError: name 'tvm' is not defined

In [73]:
from sympy import Matrix
import numpy as np
import sympy as sym
ab=sym.symbols('a11,a12,a13,a21,a22,a23,a31,a32,a33,d11,d12,d13,d21,d22,d23,d31,d32,d33')

In [74]:
Bm=Matrix(np.array(['a'+str(i//4)+str(i%4) for i in range(16)]).reshape(4,4))
#Bm=Matrix([[1,0,-1,0],[0,1,1,0],[0,-1,1,0],[0,1,0,1]])
dm=Matrix(np.array(['d'+str(i//4)+str(i%4) for i in range(16)]).reshape(4,4))
#dm=sym.ones(3,3)
em=sym.expand(Bm*dm*Bm.T)
em[0]
#em.reshape(9,1)


a00**2*d00 + a00*a01*d01 + a00*a01*d10 + a00*a02*d02 + a00*a02*d20 + a00*a03*d03 + a00*a03*d30 + a01**2*d11 + a01*a02*d12 + a01*a02*d21 + a01*a03*d13 + a01*a03*d31 + a02**2*d22 + a02*a03*d23 + a02*a03*d32 + a03**2*d33

In [None]:
sym.expand("(a31+a32+a33)*(a11+a12+a13)")

In [16]:
bm=Matrix(B_data)
dm=Matrix(np.array(['a'+str(i//8)+str(i%8) for i in range(64)]).reshape(8,8))
#dm=sym.ones(8,8)
em=(bm.T*dm*bm)
em.reshape(64,1)

NameError: name 'B_data' is not defined

In [17]:
from tvm import relay
from tvm.relay import testing
from tvm.contrib import utils
import tvm

# Resnet18 workload
resnet18_mod, resnet18_params = relay.testing.resnet.get_workload(num_layers=18)

with relay.build_config(opt_level=0):
    _, resnet18_lib, _ = relay.build_module.build(resnet18_mod, "llvm", params=resnet18_params)

# print relay ir
print(resnet18_mod.astext(show_meta_data=False))

# print source code
print(resnet18_lib.get_source())

ModuleNotFoundError: No module named 'tvm'