# TOPI test: sparse 3x3 conv2d vs. dense conv2d

In [1]:
from tvm import topi, testing, nd, target, te
from sparse_utils import random_bsr_sparse
import numpy as np
import tvm

# Problem size: batch N, channel count C (used for both input and output
# channels), spatial extent HW x HW, BSR block height VL, and SP, the fraction
# of weight blocks that are kept.
N = 10
C = 64
HW = 56
VL = 16
SP = 0.5

# Number of (VL, 1) blocks requested for the flattened (C, 9*C) 3x3 kernel.
NNZ = int((C * C * 9 // VL) * SP)

# Random block-sparse weight produced by the project helper in sparse_utils.
spweight = random_bsr_sparse((C, 9 * C), (VL, 1), NNZ)

In [2]:
# NHWC: the sparse 3x3 kernel must reproduce the dense conv2d result.

data = np.random.rand(N, HW, HW, C).astype('float32')
# Densify the sparse weight and reshape it into a (3, 3, C, C) kernel.
weight = spweight.toarray().T.copy().reshape(3, 3, C, C)
Data = te.placeholder(data.shape, 'float32')

## dense reference

Weight = te.placeholder(weight.shape, 'float32')
with target.Target('llvm -mcpu=cascadelake'):
    # Positional 1, 1, 1 are presumably stride, padding, dilation -- verify
    # against the TOPI signature.
    CC = topi.nn.conv2d_nhwc(Data, Weight, 1, 1, 1)
    s = topi.generic.schedule_conv2d_nhwc(CC)
    func = tvm.build(s, [Data, Weight, CC])
# Inputs first, then an uninitialised buffer that receives the output.
args = [nd.array(host_buf) for host_buf in (data, weight)]
args.append(nd.empty(CC.shape))
func(*args)

## sparse kernel under test

Wdat = te.placeholder(spweight.data.shape, 'float32')
Wind = te.placeholder(spweight.indices.shape, 'int32')
Wptr = te.placeholder(spweight.indptr.shape, 'int32')
with target.Target('llvm -mcpu=cascadelake'):
    CC = topi.x86.sparse.spconv2d_3x3_nhwc(Data, Wdat, Wind, Wptr)
    s = topi.x86.sparse.schedule_spconv2d_3x3_nhwc(CC)
    func = tvm.build(s, [Data, Wdat, Wind, Wptr, CC])
# Same layout as `args`: activations, the three BSR arrays, output buffer.
args2 = [
    nd.array(host_buf)
    for host_buf in (data, spweight.data, spweight.indices, spweight.indptr)
]
args2.append(nd.empty(CC.shape))
func(*args2)

## compare the two outputs (last entry of each argument list)

testing.assert_allclose(args[-1].numpy(), args2[-1].numpy())
print('NHWC passed')

Cannot find config for target=llvm -keys=cpu -link-params=0 -mcpu=cascadelake, workload=('conv3x3_spNHWC.x86', ('TENSOR', (10, 56, 56, 64), 'float32'), ('TENSOR', (1152, 16, 1), 'float32'), ('TENSOR', (1152,), 'int32'), ('TENSOR', (5,), 'int32')). A fallback configuration is used, which may bring great performance regression.


NHWC passed


In [3]:
# NCHW: the sparse 3x3 kernel must reproduce the dense conv2d result.

data = np.random.rand(N, C, HW, HW).astype('float32')
# Densify the sparse weight and reshape it into a (C, C, 3, 3) kernel.
weight = spweight.toarray().reshape(C, C, 3, 3)
Data = te.placeholder(data.shape, 'float32')

## dense reference

Weight = te.placeholder(weight.shape, 'float32')
with target.Target('llvm -mcpu=cascadelake'):
    # Positional 1, 1, 1, 'float32' are presumably stride, padding, dilation
    # and output dtype -- verify against the TOPI signature.
    CC = topi.nn.conv2d_nchw(Data, Weight, 1, 1, 1, 'float32')
    s = topi.generic.schedule_conv2d_nchw(CC)
    func = tvm.build(s, [Data, Weight, CC])
# Inputs first, then an uninitialised buffer that receives the output.
args = [nd.array(host_buf) for host_buf in (data, weight)]
args.append(nd.empty(CC.shape))
func(*args)

## sparse kernel under test

Wdat = te.placeholder(spweight.data.shape, 'float32')
Wind = te.placeholder(spweight.indices.shape, 'int32')
Wptr = te.placeholder(spweight.indptr.shape, 'int32')
with target.Target('llvm -mcpu=cascadelake'):
    CC = topi.x86.sparse.spconv2d_3x3_nchw(Data, Wdat, Wind, Wptr)
    s = topi.x86.sparse.schedule_spconv2d_3x3_nchw(CC)
    func = tvm.build(s, [Data, Wdat, Wind, Wptr, CC])
# Same layout as `args`: activations, the three BSR arrays, output buffer.
args2 = [
    nd.array(host_buf)
    for host_buf in (data, spweight.data, spweight.indices, spweight.indptr)
]
args2.append(nd.empty(CC.shape))
func(*args2)

## compare the two outputs (last entry of each argument list)

testing.assert_allclose(args[-1].numpy(), args2[-1].numpy())
print('NCHW passed')

Cannot find config for target=llvm -keys=cpu -link-params=0 -mcpu=cascadelake, workload=('conv3x3_spNCHW.x86', ('TENSOR', (10, 64, 56, 56), 'float32'), ('TENSOR', (1152, 16, 1), 'float32'), ('TENSOR', (1152,), 'int32'), ('TENSOR', (5,), 'int32')). A fallback configuration is used, which may bring great performance regression.


NCHW passed


# baseline model

In [4]:
# Restrict OpenMP-style thread pools to a single thread for the runs below.
# NOTE(review): this is set after tvm/numpy were imported and after compiled
# kernels already ran in the test cells above; a runtime that reads the
# variable only once at start-up would presumably not pick it up -- confirm,
# or move this line into the first cell.
%env OMP_NUM_THREADS 1
import onnx
# Load the ONNX export of the ResNet-18 checkpoint (a sparse/pruned model,
# judging by the file name).
onnx_model = onnx.load("sparse_resnet18_best_onnx/resnet18_GL_16_PR_0.6_ckpt_best.onnx")

env: OMP_NUM_THREADS=1


In [5]:
from tvm import relay
# Import the ONNX graph into Relay with a fixed (10, 3, 224, 224) input named
# 'data'. freeze_params=True is requested so the weights are embedded in the
# module; they appear as meta[relay.Constant] entries in the printed IR.
const_mod, params = relay.frontend.from_onnx(onnx_model, {'data': (10, 3, 224, 224)}, freeze_params=True)

In [6]:
# Preview the first 100 lines of the dense model's Relay text form.
const_mod.astext().splitlines()[:100]

['#[version = "0.0.5"]',
 'def @main(%data: Tensor[(10, 3, 224, 224), float32]) {',
 '  %0 = nn.conv2d(%data, meta[relay.Constant][0], strides=[2, 2], padding=[3, 3, 3, 3], kernel_size=[7, 7]);',
 '  %1 = nn.bias_add(%0, meta[relay.Constant][1]);',
 '  %2 = nn.relu(%1);',
 '  %3 = nn.max_pool2d(%2, pool_size=[3, 3], strides=[2, 2], padding=[1, 1, 1, 1]);',
 '  %4 = nn.conv2d(%3, meta[relay.Constant][2], padding=[1, 1, 1, 1], kernel_size=[3, 3]);',
 '  %5 = nn.bias_add(%4, meta[relay.Constant][3]);',
 '  %6 = nn.relu(%5);',
 '  %7 = nn.conv2d(%6, meta[relay.Constant][4], padding=[1, 1, 1, 1], kernel_size=[3, 3]);',
 '  %8 = nn.bias_add(%7, meta[relay.Constant][5]);',
 '  %9 = add(%8, %3);',
 '  %10 = nn.relu(%9);',
 '  %11 = nn.conv2d(%10, meta[relay.Constant][6], padding=[1, 1, 1, 1], kernel_size=[3, 3]);',
 '  %12 = nn.bias_add(%11, meta[relay.Constant][7]);',
 '  %13 = nn.relu(%12);',
 '  %14 = nn.conv2d(%13, meta[relay.Constant][8], padding=[1, 1, 1, 1], kernel_size=[3, 3]);',
 '  %

In [7]:
from tvm import autotvm

# Collect the AutoTVM tuning tasks of the dense model for the
# 'llvm -mcpu=cascadelake' target. The weights are already constants inside
# the module, so no separate params are passed.
tasks = autotvm.task.extract_from_program(
    const_mod['main'], params={}, target='llvm -mcpu=cascadelake')

In [8]:
# Measurement setup: build candidates locally and time them with a local
# runner (20 s timeout, at least 200 ms of repeated execution per measurement).
opts = autotvm.measure_option(
    builder='local',
    runner=autotvm.LocalRunner(timeout=20, min_repeat_ms=200),
)

# Tune every extracted task with the GA tuner.
for tsk in tasks:
    print(tsk.name, len(tsk.config_space))
    # Trial budget: 200, or the whole config space when it is smaller.
    nsamples = min(200, len(tsk.config_space))
    tuner = autotvm.tuner.GATuner(tsk)
    tuner.tune(
        nsamples,
        measure_option=opts,
        callbacks=[
            autotvm.callback.progress_bar(nsamples),
            # All tasks log into the same raw file.
            # NOTE(review): presumably the log is appended to on re-runs --
            # confirm, and remove a stale file before tuning again if so.
            autotvm.callback.log_to_file('test_dense.log'),
        ],
    )
# Reduce the raw log to the best records; the build step reads this file.
autotvm.record.pick_best('test_dense.log', 'test_dense.best.log')

conv2d_NCHWc.x86 308
 Current/Best:  115.19/ 197.50 GFLOPS | Progress: (200/200) | 448.85 s Done.
conv2d_NCHWc.x86 980
 Current/Best:    9.94/ 203.54 GFLOPS | Progress: (200/200) | 390.84 s Done.
conv2d_NCHWc.x86 896
 Current/Best:    7.72/ 183.42 GFLOPS | Progress: (200/200) | 328.03 s Done.
conv2d_NCHWc.x86 896
 Current/Best:   68.05/ 143.04 GFLOPS | Progress: (200/200) | 290.39 s Done.
conv2d_NCHWc.x86 1024
 Current/Best:   30.71/ 212.58 GFLOPS | Progress: (200/200) | 397.88 s Done.
conv2d_NCHWc.x86 864
 Current/Best:   40.97/ 178.58 GFLOPS | Progress: (200/200) | 312.93 s Done.
conv2d_NCHWc.x86 864
 Current/Best:   43.79/ 166.37 GFLOPS | Progress: (200/200) | 286.90 s Done.
conv2d_NCHWc.x86 972
 Current/Best:  144.97/ 211.71 GFLOPS | Progress: (200/200) | 443.07 s Done.
conv2d_NCHWc.x86 720
 Current/Best:   64.33/ 159.66 GFLOPS | Progress: (200/200) | 367.98 s Done.
conv2d_NCHWc.x86 720
 Current/Best:   89.31/ 181.01 GFLOPS | Progress: (200/200) | 278.24 s Done.
conv2d_NCHWc.x86 80

In [9]:
from tvm import autotvm
from tvm.contrib.debugger import debug_executor
from tvm.contrib import graph_executor
import numpy as np
import tvm

# Compile the dense model at opt_level=3 using the tuned records.
with autotvm.apply_history_best('test_dense.best.log'), \
        tvm.transform.PassContext(opt_level=3):
    lib = relay.build_module.build(
        const_mod,
        params={},
        target='llvm -mcpu=cascadelake',
    )

# Random input batch and the device the graph runs on.
data = tvm.nd.array(np.random.rand(10, 3, 224, 224).astype('float32'))
dev = tvm.device('llvm -mcpu=cascadelake', 0)

# Run through the debug executor, which reports a per-node timing table.
graph_dense = debug_executor.create(lib.graph_json, lib.module, dev)
graph_dense.set_input(data=data, **lib.params)

graph_dense.run()
# Alternative end-to-end timing, left disabled:
#ftimer = graph_dense.module.time_evaluator("run", dev, number=100, repeat=1)
#ftimer().mean

Node Name                                                        Ops                                                             Time(us)    Time(%)  Shape                  Inputs  Outputs  
---------                                                        ---                                                             --------    -------  -----                  ------  -------  
tvmgen_default_fused_nn_contrib_conv2d_NCHWc_add_nn_relu         tvmgen_default_fused_nn_contrib_conv2d_NCHWc_add_nn_relu        16015.9     7.553    (10, 2, 112, 112, 32)  3       1        
tvmgen_default_fused_nn_contrib_conv2d_NCHWc_add_add_nn_relu_31  tvmgen_default_fused_nn_contrib_conv2d_NCHWc_add_add_nn_relu_3  15272.8     7.203    (10, 16, 7, 7, 32)     4       1        
tvmgen_default_fused_nn_contrib_conv2d_NCHWc_add_nn_relu_7       tvmgen_default_fused_nn_contrib_conv2d_NCHWc_add_nn_relu_7      15067.0     7.106    (10, 16, 7, 7, 32)     3       1        
tvmgen_default_fused_nn_contrib_conv2d_NCHWc_

# convert to sparse model

In [10]:
# Rewrite nn.conv2d calls of the NCHW model into nn.sparse_conv2d through a
# private FFI pass. The arguments are presumably (layout, kernel_size,
# block_h, block_w, sparsity_threshold) -- TODO confirm against the TVM pass
# signature.
newfunc = relay.data_dep_optimization.utils._run_opt_pass(
    const_mod['main'],
    relay.transform._ffi_api.Conv2dToSparse2("NCHW", 3, 16, 1, 0.4)
)
# Wrap the rewritten function in a fresh IRModule.
spconst_mod = tvm.ir.IRModule.from_expr(newfunc)

In [11]:
# Preview the first 100 lines of the NCHW sparse model's Relay text form.
spconst_mod.astext().splitlines()[:100]

['#[version = "0.0.5"]',
 'def @main(%data: Tensor[(10, 3, 224, 224), float32]) -> Tensor[(10, 1000), float32] {',
 '  %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(64, 3, 7, 7), float32] */, strides=[2, 2], padding=[3, 3, 3, 3], kernel_size=[7, 7]) /* ty=Tensor[(10, 64, 112, 112), float32] */;',
 '  %1 = nn.bias_add(%0, meta[relay.Constant][1] /* ty=Tensor[(64), float32] */) /* ty=Tensor[(10, 64, 112, 112), float32] */;',
 '  %2 = nn.relu(%1) /* ty=Tensor[(10, 64, 112, 112), float32] */;',
 '  %3 = nn.max_pool2d(%2, pool_size=[3, 3], strides=[2, 2], padding=[1, 1, 1, 1]) /* ty=Tensor[(10, 64, 56, 56), float32] */;',
 '  %4 = nn.sparse_conv2d(%3, meta[relay.Constant][2] /* ty=Tensor[(927, 16, 1), float32] */, meta[relay.Constant][3] /* ty=Tensor[(927), int32] */, meta[relay.Constant][4] /* ty=Tensor[(5), int32] */, layout="NCHW", kernel_size=3) /* ty=Tensor[(10, 64, 56, 56), float32] */;',
 '  %5 = nn.bias_add(%4, meta[relay.Constant][5] /* ty=Tensor[(64), float32] */) /*

In [12]:
# Convert the dense model's nn.conv2d ops to NHWC data layout ('default'
# leaves the kernel-layout choice to the pass), then fold the resulting
# constant transforms.
desired_layouts = {'nn.conv2d': ['NHWC', 'default']}
seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(),
                                relay.transform.ConvertLayout(desired_layouts),
                                relay.transform.FoldConstant()])
with tvm.transform.PassContext(opt_level=3):
    const_mod2 = seq(const_mod)

In [13]:
# Same sparse rewrite as for the NCHW model, applied to the NHWC-converted
# module. The arguments are presumably (layout, kernel_size, block_h, block_w,
# sparsity_threshold) -- TODO confirm against the TVM pass signature.
newfunc = relay.data_dep_optimization.utils._run_opt_pass(
    const_mod2['main'],
    relay.transform._ffi_api.Conv2dToSparse2("NHWC", 3, 16, 1, 0.4)
)
# Wrap the rewritten function in a fresh IRModule.
spconst_mod2 = tvm.ir.IRModule.from_expr(newfunc)

In [14]:
# Preview the first 100 lines of the NHWC sparse model's Relay text form.
spconst_mod2.astext().splitlines()[:100]

['#[version = "0.0.5"]',
 'def @main(%data: Tensor[(10, 3, 224, 224), float32]) -> Tensor[(10, 1000), float32] {',
 '  %0 = layout_transform(%data, src_layout="NCHW", dst_layout="NHWC") /* ty=Tensor[(10, 224, 224, 3), float32] */;',
 '  %1 = nn.conv2d(%0, meta[relay.Constant][0] /* ty=Tensor[(7, 7, 3, 64), float32] */, strides=[2, 2], padding=[3, 3, 3, 3], kernel_size=[7, 7], data_layout="NHWC", kernel_layout="HWIO") /* ty=Tensor[(10, 112, 112, 64), float32] */;',
 '  %2 = add(%1, meta[relay.Constant][1] /* ty=Tensor[(1, 1, 1, 64), float32] */) /* ty=Tensor[(10, 112, 112, 64), float32] */;',
 '  %3 = nn.relu(%2) /* ty=Tensor[(10, 112, 112, 64), float32] */;',
 '  %4 = nn.max_pool2d(%3, pool_size=[3, 3], strides=[2, 2], padding=[1, 1, 1, 1], layout="NHWC") /* ty=Tensor[(10, 56, 56, 64), float32] */;',
 '  %5 = nn.sparse_conv2d(%4, meta[relay.Constant][2] /* ty=Tensor[(927, 16, 1), float32] */, meta[relay.Constant][3] /* ty=Tensor[(927), int32] */, meta[relay.Constant][4] /* ty=Tensor[(5

In [22]:
# Move the remaining dense nn.conv2d ops of the NHWC sparse model back to
# NCHW. Only nn.conv2d is listed here, so nn.sparse_conv2d is not given a
# target layout by this dictionary.
# NOTE(review): the printed IR of the result starts with an NCHW->NHWC
# layout_transform immediately followed by NHWC->NCHW -- confirm this pair is
# simplified away at build time.
desired_layouts = {'nn.conv2d': ['NCHW', 'default']}
seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(),
                                relay.transform.ConvertLayout(desired_layouts),
                                relay.transform.FoldConstant()])
with tvm.transform.PassContext(opt_level=3):
    spconst_mod3 = seq(spconst_mod2)

In [23]:
# Preview the first 100 lines of the mixed-layout sparse model's Relay text form.
spconst_mod3.astext().splitlines()[:100]

['#[version = "0.0.5"]',
 'def @main(%data: Tensor[(10, 3, 224, 224), float32]) -> Tensor[(10, 1000), float32] {',
 '  %0 = layout_transform(%data, src_layout="NCHW", dst_layout="NHWC") /* ty=Tensor[(10, 224, 224, 3), float32] */;',
 '  %1 = layout_transform(%0, src_layout="NHWC", dst_layout="NCHW") /* ty=Tensor[(10, 3, 224, 224), float32] */;',
 '  %2 = nn.conv2d(%1, meta[relay.Constant][0] /* ty=Tensor[(64, 3, 7, 7), float32] */, strides=[2, 2], padding=[3, 3, 3, 3], kernel_size=[7, 7]) /* ty=Tensor[(10, 64, 112, 112), float32] */;',
 '  %3 = add(%2, meta[relay.Constant][1] /* ty=Tensor[(1, 64, 1, 1), float32] */) /* ty=Tensor[(10, 64, 112, 112), float32] */;',
 '  %4 = nn.relu(%3) /* ty=Tensor[(10, 64, 112, 112), float32] */;',
 '  %5 = nn.max_pool2d(%4, pool_size=[3, 3], strides=[2, 2], padding=[1, 1, 1, 1]) /* ty=Tensor[(10, 64, 56, 56), float32] */;',
 '  %6 = layout_transform(%5, src_layout="NCHW", dst_layout="NHWC") /* ty=Tensor[(10, 56, 56, 64), float32] */;',
 '  %7 = nn.spar

# sparse tuning

In [15]:
from tvm import autotvm

# Collect the AutoTVM tuning tasks of the NCHW sparse model (spconst_mod).
# NOTE(review): the final build in this notebook compiles spconst_mod3, not
# spconst_mod -- confirm that the workloads tuned here are the ones that build
# looks up.
tasks = autotvm.task.extract_from_program(
    spconst_mod['main'], params={}, target='llvm -mcpu=cascadelake')

In [16]:
from scipy import sparse

# Maps a sparse-conv signature -- a tuple of ('TENSOR', shape, dtype) entries,
# one per call argument -- to the list of BSR weights of every
# nn.sparse_conv2d call that has this signature.
spconv_data = {}


def fvisit(e):
    """Record the constant BSR weight of each ``nn.sparse_conv2d`` call.

    Intended as the callback of ``relay.analysis.post_order_visit``; it is
    invoked for every expression node and ignores everything that is not a
    call to ``nn.sparse_conv2d``.

    Parameters
    ----------
    e : relay.Expr
        The node currently being visited.

    Returns
    -------
    None
        Matches are appended to the module-level ``spconv_data`` dict.
    """
    if not isinstance(e, relay.Call):
        return
    # Not every callee has a `.name` attribute (e.g. a relay.Function or a
    # GlobalVar), so look it up defensively instead of raising AttributeError
    # in the middle of the traversal.
    if getattr(e.op, 'name', None) != 'nn.sparse_conv2d':
        return
    args_type = tuple(
        ('TENSOR', arg.checked_type.concrete_shape, arg.checked_type.dtype)
        for arg in e.args
    )
    # args[0] is the activation; args[1:] are the constant arrays, passed to
    # scipy in the (data, indices, indptr) order bsr_matrix expects.
    bsr_parts = tuple(arg.data.numpy() for arg in e.args[1:])
    spconv_data.setdefault(args_type, []).append(sparse.bsr_matrix(bsr_parts))


relay.analysis.post_order_visit(spconst_mod['main'], fvisit)

In [None]:
import random
import numpy as np

# Measurement setup: build candidates locally and time them with a local
# runner (20 s timeout, at least 200 ms of repeated execution per measurement).
opts = autotvm.measure_option(
    builder='local',
    runner=autotvm.LocalRunner(timeout=20, min_repeat_ms=200),
)

for tsk in tasks:
    # Clear any fixed input left over from the previous task.
    opts['runner'].ref_input = None
    if tsk.name.startswith('conv3x3_sp'):
        # Sparse tasks are measured with a real weight captured from the
        # model, presumably because arbitrary values in indices/indptr would
        # not describe a valid sparse matrix -- confirm.
        data, wdat, wind, wptr, *attrs = tsk.args
        # The task args are ('TENSOR', shape, dtype) tuples and key spconv_data.
        weight = random.choice(spconv_data[data, wdat, wind, wptr])
        wdat, wind, wptr = weight.data, weight.indices, weight.indptr
        # Replace the type tuple with an actual random array of that shape/dtype.
        data = np.random.rand(*data[1]).astype(data[2])
        # assumes the kernel output has the same shape as its input -- TODO confirm
        ret = np.zeros_like(data)
        # NOTE(review): the order [output, indptr, indices, data, input] must
        # match the argument order AutoTVM builds the kernel with -- confirm.
        opts['runner'].ref_input = [ret, wptr, wind, wdat, data]
    print(tsk.name, len(tsk.config_space))
    # Trial budget: 1000, or the whole config space when it is smaller.
    nsamples = min(1000, len(tsk.config_space))
    tuner = autotvm.tuner.GATuner(tsk)
    tuner.tune(
        nsamples,
        measure_option=opts,
        callbacks=[
            autotvm.callback.progress_bar(nsamples),
            # All tasks log into the same raw file.
            autotvm.callback.log_to_file('test_sparse.log'),
        ],
    )
# Reduce the raw log to the best records; the build step reads this file.
autotvm.record.pick_best('test_sparse.log', 'test_sparse.best.log')

conv2d_NCHWc.x86 308
 Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/308) | 0.00 s

In [24]:
from tvm import autotvm
from tvm.contrib.debugger import debug_executor
from tvm.contrib import graph_executor
import numpy as np
import tvm

# Compile the sparse model at opt_level=3 using the tuned records.
# NOTE(review): the tuning tasks in this notebook were extracted from
# spconst_mod, while spconst_mod3 is what gets built here -- confirm the
# tuned workloads match the ones looked up during this build.
with autotvm.apply_history_best('test_sparse.best.log'), \
        tvm.transform.PassContext(opt_level=3):
    lib = relay.build_module.build(
        spconst_mod3,
        params={},
        target='llvm -mcpu=cascadelake',
    )

# Random input batch and the device the graph runs on.
data = tvm.nd.array(np.random.rand(10, 3, 224, 224).astype('float32'))
dev = tvm.device('llvm -mcpu=cascadelake', 0)

# Run through the debug executor, which reports a per-node timing table.
graph_sparse = debug_executor.create(lib.graph_json, lib.module, dev)
graph_sparse.set_input(data=data, **lib.params)

graph_sparse.run()
# Alternative end-to-end timing, left disabled:
#ftimer = graph_sparse.module.time_evaluator("run", dev, number=100, repeat=1)
#ftimer().mean

Node Name                                                       Ops                                                             Time(us)    Time(%)  Shape                  Inputs  Outputs  
---------                                                       ---                                                             --------    -------  -----                  ------  -------  
tvmgen_default_fused_nn_contrib_conv2d_NCHWc_add_nn_relu        tvmgen_default_fused_nn_contrib_conv2d_NCHWc_add_nn_relu        15464.8     8.613    (10, 2, 112, 112, 32)  3       1        
tvmgen_default_fused_nn_sparse_conv2d_add_add_nn_relu           tvmgen_default_fused_nn_sparse_conv2d_add_add_nn_relu           10226.9     5.696    (10, 56, 56, 64)       6       1        
tvmgen_default_fused_nn_sparse_conv2d_add_add_nn_relu1          tvmgen_default_fused_nn_sparse_conv2d_add_add_nn_relu           10086.6     5.618    (10, 56, 56, 64)       6       1        
tvmgen_default_fused_nn_sparse_conv2d_add_add_nn_r