# baseline model

In [1]:
import onnx

# Load the ONNX checkpoint that the rest of the notebook tunes and converts.
# The path is relative to the notebook's working directory.
onnx_model = onnx.load(
    "sparse_resnet18_best_onnx/resnet18_GL_16_PR_0.5_ckpt_best.onnx"
)

In [34]:
from tvm import relay

# Import the ONNX graph loaded in the first cell into Relay with a fixed
# (10, 3, 224, 224) input named 'data'.
# This cell previously passed `onnx_model2`, a name no cell in this notebook
# defines, so it raised NameError on a fresh kernel (Restart & Run All).
mod, params = relay.frontend.from_onnx(onnx_model, {'data': (10, 3, 224, 224)})

In [35]:
# Dump the imported Relay module in its text form for inspection.
relay_text = mod.astext()
print(relay_text)

#[version = "0.0.5"]
def @main(%data: Tensor[(10, 3, 224, 224), float32], %fc.weight: Tensor[(1000, 512), float32], %fc.bias: Tensor[(1000), float32], %v220: Tensor[(64, 3, 7, 7), float32], %v221: Tensor[(64), float32], %v223: Tensor[(64, 64, 3, 3), float32], %v224: Tensor[(64), float32], %v226: Tensor[(64, 64, 3, 3), float32], %v227: Tensor[(64), float32], %v229: Tensor[(64, 64, 3, 3), float32], %v230: Tensor[(64), float32], %v232: Tensor[(64, 64, 3, 3), float32], %v233: Tensor[(64), float32], %v235: Tensor[(128, 64, 3, 3), float32], %v236: Tensor[(128), float32], %v238: Tensor[(128, 128, 3, 3), float32], %v239: Tensor[(128), float32], %v241: Tensor[(128, 64, 1, 1), float32], %v242: Tensor[(128), float32], %v244: Tensor[(128, 128, 3, 3), float32], %v245: Tensor[(128), float32], %v247: Tensor[(128, 128, 3, 3), float32], %v248: Tensor[(128), float32], %v250: Tensor[(256, 128, 3, 3), float32], %v251: Tensor[(256), float32], %v253: Tensor[(256, 256, 3, 3), float32], %v254: Tensor[(256), f

In [3]:
from tvm import autotvm

# Collect the AutoTVM tuning tasks of the baseline graph for the
# 'llvm -mcpu=cascadelake' target.
tasks = autotvm.task.extract_from_program(
    mod['main'],
    target='llvm -mcpu=cascadelake',
    params=params,
)

In [6]:
%env OMP_NUM_THREADS 1
opts = autotvm.measure_option(
    builder='local',
    runner=autotvm.LocalRunner(timeout=20, min_repeat_ms=200),
)

for tsk in tasks:
    print(tsk.name, len(tsk.config_space))
    nsamples = min(200, len(tsk.config_space))
    tuner = autotvm.tuner.GATuner(tsk)
    tuner.tune(
        nsamples,
        measure_option=opts,
        callbacks=[
            autotvm.callback.progress_bar(nsamples),
            autotvm.callback.log_to_file('test_dense.log'),
        ],
    )
autotvm.record.pick_best('test_dense.log', 'test_dense.best.log')

env: OMP_NUM_THREADS=1
dense_nopack.x86 640
 Current/Best:    4.76/  62.57 GFLOPS | Progress: (200/200) | 311.62 s Done.
dense_pack.x86 36000
 Current/Best:   33.21/  94.15 GFLOPS | Progress: (200/200) | 340.50 s Done.
conv2d_NCHWc.x86 800
 Current/Best:   26.54/ 175.18 GFLOPS | Progress: (200/200) | 436.28 s Done.
conv2d_NCHWc.x86 720
 Current/Best:   47.00/ 177.62 GFLOPS | Progress: (200/200) | 336.56 s Done.
conv2d_NCHWc.x86 972
 Current/Best:  168.19/ 194.72 GFLOPS | Progress: (200/200) | 419.68 s Done.
conv2d_NCHWc.x86 864
 Current/Best:   37.79/ 179.25 GFLOPS | Progress: (200/200) | 331.13 s Done.
conv2d_NCHWc.x86 1024
 Current/Best:  167.73/ 213.92 GFLOPS | Progress: (200/200) | 372.62 s Done.
conv2d_NCHWc.x86 896
 Current/Best:   11.69/ 183.36 GFLOPS | Progress: (200/200) | 310.05 s Done.
conv2d_NCHWc.x86 980
 Current/Best:   50.44/ 207.11 GFLOPS | Progress: (200/200) | 472.12 s Done.
conv2d_NCHWc.x86 308
 Current/Best:    8.81/ 178.09 GFLOPS | Progress: (200/200) | 408.92 s Do

In [4]:
from tvm.autotvm import graph_tuner
# Graph-level tuning of the baseline model: run a DPTuner over the nn.conv2d
# ops using the per-op records in 'test_dense.best.log', then write the chosen
# schedules to 'test_dense.graph.log'.
# NOTE(review): the output saved with this cell ends in
# "AttributeError: <class 'tvm.tir.expr.Any'> has no attribute value", so the
# recorded run did not complete and 'test_dense.graph.log' was presumably not
# written. The root cause is not visible in this notebook -- TODO investigate
# or remove this cell. No later cell reads 'test_dense.graph.log'.
executor = graph_tuner.DPTuner(
    mod['main'],
    {'data': (10, 3, 224, 224)},
    'test_dense.best.log',
    [ relay.op.get("nn.conv2d") ],
    'llvm -mcpu=cascadelake',
)
executor.benchmark_layout_transform(min_exec_num=2000)
executor.run()
executor.write_opt_sch2record_file('test_dense.graph.log')

[Op(nn.conv2d)] [('conv2d_NCHWc.x86', (('TENSOR', (10, 3, 224, 224), 'float32'), ('TENSOR', (64, 3, 7, 7), 'float32'), (2, 2), (3, 3, 3, 3), (1, 1), 'NCHW', 'NCHW', 'float32')), ('conv2d_NCHWc.x86', (('TENSOR', (10, 64, 56, 56), 'float32'), ('TENSOR', (64, 64, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'NCHW', 'NCHW', 'float32')), ('conv2d_NCHWc.x86', (('TENSOR', (10, 64, 56, 56), 'float32'), ('TENSOR', (64, 64, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'NCHW', 'NCHW', 'float32')), ('conv2d_NCHWc.x86', (('TENSOR', (10, 64, 56, 56), 'float32'), ('TENSOR', (64, 64, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'NCHW', 'NCHW', 'float32')), ('conv2d_NCHWc.x86', (('TENSOR', (10, 64, 56, 56), 'float32'), ('TENSOR', (64, 64, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'NCHW', 'NCHW', 'float32')), ('conv2d_NCHWc.x86', (('TENSOR', (10, 64, 56, 56), 'float32'), ('TENSOR', (128, 64, 3, 3), 'float32'), (2, 2), (1, 1, 1, 1), (1, 1), 'NCHW', 'NCHW', 'float32')), ('conv2d_NCHWc.x

AttributeError: <class 'tvm.tir.expr.Any'> has no attribute value

In [45]:
# `tvm` itself must be imported here: the earlier cells only do
# `from tvm import relay` / `from tvm import autotvm`, and
# `import tvm.contrib.graph_executor as runtime` binds only `runtime`,
# so on a fresh kernel the bare name `tvm` would be undefined in this cell.
import numpy as np
import tvm
import tvm.contrib.graph_executor as runtime

# Compile the baseline model with the best tuned schedules applied.
with autotvm.apply_history_best('test_dense.best.log'):
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build_module.build(mod, target='llvm -mcpu=cascadelake', params=params)

# Instantiate the compiled graph and feed it one random batch.
dev = tvm.device('llvm -mcpu=cascadelake', 0)
graph_dense = runtime.GraphModule(lib['default'](dev))
data = tvm.nd.array(np.random.rand(10, 3, 224, 224).astype('float32'))
graph_dense.set_input('data', data)

# Time the "run" function (number=100, repeat=1) and display the mean.
ftimer = graph_dense.module.time_evaluator("run", dev, number=100, repeat=1)
ftimer().mean

0.01883478648

# convert to sparse model

In [7]:
import tvm

# Bind the weights in `params` to the matching parameters of 'main' by name,
# then wrap the bound function in a fresh IRModule.
const_main = relay.build_module.bind_params_by_name(mod['main'], params)
const_mod = tvm.ir.IRModule(dict(main=const_main))

In [8]:
# Ask ConvertLayout to move every nn.conv2d to NHWC data layout, leaving the
# kernel layout at 'default', then fold the resulting constant expressions.
desired_layouts = {'nn.conv2d': ['NHWC', 'default']}
layout_passes = [
    relay.transform.RemoveUnusedFunctions(),
    relay.transform.ConvertLayout(desired_layouts),
    relay.transform.FoldConstant(),
]
seq = tvm.transform.Sequential(layout_passes)
with tvm.transform.PassContext(opt_level=3):
    const_mod2 = seq(const_mod)

In [5]:
# Show the NHWC-converted function body, one Relay statement per list entry.
nhwc_body_text = const_mod2['main'].body.astext()
nhwc_body_text.splitlines()

['#[version = "0.0.5"]',
 'free_var %data: Tensor[(10, 3, 224, 224), float32];',
 '%0 = layout_transform(%data, src_layout="NCHW", dst_layout="NHWC") /* ty=Tensor[(10, 224, 224, 3), float32] */;',
 '%1 = nn.conv2d(%0, meta[relay.Constant][0] /* ty=Tensor[(7, 7, 3, 64), float32] */, strides=[2, 2], padding=[3, 3, 3, 3], kernel_size=[7, 7], data_layout="NHWC", kernel_layout="HWIO") /* ty=Tensor[(10, 112, 112, 64), float32] */;',
 '%2 = add(%1, meta[relay.Constant][1] /* ty=Tensor[(1, 1, 1, 64), float32] */) /* ty=Tensor[(10, 112, 112, 64), float32] */;',
 '%3 = nn.relu(%2) /* ty=Tensor[(10, 112, 112, 64), float32] */;',
 '%4 = nn.max_pool2d(%3, pool_size=[3, 3], strides=[2, 2], padding=[1, 1, 1, 1], layout="NHWC") /* ty=Tensor[(10, 56, 56, 64), float32] */;',
 '%5 = nn.conv2d(%4, meta[relay.Constant][2] /* ty=Tensor[(3, 3, 64, 64), float32] */, padding=[1, 1, 1, 1], kernel_size=[3, 3], data_layout="NHWC", kernel_layout="HWIO") /* ty=Tensor[(10, 56, 56, 64), float32] */;',
 '%6 = add(%5, 

In [9]:
# Run the Conv2dToSparse2 pass over the NHWC function to obtain `newfunc`.
# NOTE(review): the pass arguments are presumably (layout, kernel size,
# block height, block width, sparsity threshold) -- confirm against the TVM
# source; both helpers used here are private TVM APIs.
sparsify_pass = relay.transform._ffi_api.Conv2dToSparse2("NHWC", 3, 16, 1, 0.4)
newfunc = relay.data_dep_optimization.utils._run_opt_pass(
    const_mod2['main'], sparsify_pass
)

In [7]:
# Show the body of the sparsified function, one Relay statement per list entry.
sparse_body_text = newfunc.body.astext()
sparse_body_text.splitlines()

['#[version = "0.0.5"]',
 'free_var %data: Tensor[(10, 3, 224, 224), float32];',
 '%0 = layout_transform(%data, src_layout="NCHW", dst_layout="NHWC") /* ty=Tensor[(10, 224, 224, 3), float32] */;',
 '%1 = nn.conv2d(%0, meta[relay.Constant][0] /* ty=Tensor[(7, 7, 3, 64), float32] */, strides=[2, 2], padding=[3, 3, 3, 3], kernel_size=[7, 7], data_layout="NHWC", kernel_layout="HWIO") /* ty=Tensor[(10, 112, 112, 64), float32] */;',
 '%2 = add(%1, meta[relay.Constant][1] /* ty=Tensor[(1, 1, 1, 64), float32] */) /* ty=Tensor[(10, 112, 112, 64), float32] */;',
 '%3 = nn.relu(%2) /* ty=Tensor[(10, 112, 112, 64), float32] */;',
 '%4 = nn.max_pool2d(%3, pool_size=[3, 3], strides=[2, 2], padding=[1, 1, 1, 1], layout="NHWC") /* ty=Tensor[(10, 56, 56, 64), float32] */;',
 '%5 = nn.sparse_conv2d(%4, meta[relay.Constant][2] /* ty=Tensor[(1152, 16, 1), float32] */, meta[relay.Constant][3] /* ty=Tensor[(1152), int32] */, meta[relay.Constant][4] /* ty=Tensor[(5), int32] */, kernel_size=3) /* ty=Tensor[(1

# sparse tuning

In [10]:
from tvm import autotvm

# Extract tuning tasks from the sparsified function. An empty params dict is
# passed because `newfunc` was derived from a function whose weights were
# already bound as constants.
tasks = autotvm.task.extract_from_program(
    newfunc,
    target='llvm -mcpu=cascadelake',
    params={},
)

conv2d NHWC layout is not optimized for x86 with autotvm.
conv2d NHWC layout is not optimized for x86 with autotvm.
conv2d NHWC layout is not optimized for x86 with autotvm.
conv2d NHWC layout is not optimized for x86 with autotvm.
conv2d NHWC layout is not optimized for x86 with autotvm.
conv2d NHWC layout is not optimized for x86 with autotvm.
conv2d NHWC layout is not optimized for x86 with autotvm.


In [9]:
# Display the tasks extracted from the sparsified function.
tasks

[Task(func_name=dense_nopack.x86, args=(('TENSOR', (10, 512), 'float32'), ('TENSOR', (1000, 512), 'float32'), None, 'float32'), kwargs={}, workload=('dense_nopack.x86', ('TENSOR', (10, 512), 'float32'), ('TENSOR', (1000, 512), 'float32'), None, 'float32')),
 Task(func_name=dense_pack.x86, args=(('TENSOR', (10, 512), 'float32'), ('TENSOR', (1000, 512), 'float32'), None, 'float32'), kwargs={}, workload=('dense_pack.x86', ('TENSOR', (10, 512), 'float32'), ('TENSOR', (1000, 512), 'float32'), None, 'float32')),
 Task(func_name=conv3x3_spNHWC.x86, args=(('TENSOR', (10, 7, 7, 512), 'float32'), ('TENSOR', (73728, 16, 1), 'float32'), ('TENSOR', (73728,), 'int32'), ('TENSOR', (33,), 'int32'), 'NHWC'), kwargs={}, workload=('conv3x3_spNHWC.x86', ('TENSOR', (10, 7, 7, 512), 'float32'), ('TENSOR', (73728, 16, 1), 'float32'), ('TENSOR', (73728,), 'int32'), ('TENSOR', (33,), 'int32'), 'NHWC')),
 Task(func_name=conv3x3_spNHWC.x86, args=(('TENSOR', (10, 14, 14, 256), 'float32'), ('TENSOR', (18432, 16, 1

In [10]:
from scipy import sparse

# Filled by `fvisit`: one BSR matrix per nn.sparse_conv2d call, in visit order.
spconv_data = []


def fvisit(e):
    """Record the weight of an ``nn.sparse_conv2d`` call as a SciPy BSR matrix.

    Any other expression is ignored. For a matching call, the NumPy values of
    every argument after the first are passed together, as one tuple, to
    ``scipy.sparse.bsr_matrix`` and the result is appended to ``spconv_data``.
    """
    if not isinstance(e, relay.Call):
        return
    if e.op.name != 'nn.sparse_conv2d':
        return
    bsr_parts = tuple(arg.data.numpy() for arg in e.args[1:])
    spconv_data.append(sparse.bsr_matrix(bsr_parts))


relay.analysis.post_order_visit(newfunc, fvisit)

In [11]:
spconv_data

[<64x576 sparse matrix of type '<class 'numpy.float32'>'
 	with 18432 stored elements (blocksize = 16x1) in Block Sparse Row format>,
 <64x576 sparse matrix of type '<class 'numpy.float32'>'
 	with 18432 stored elements (blocksize = 16x1) in Block Sparse Row format>,
 <64x576 sparse matrix of type '<class 'numpy.float32'>'
 	with 18432 stored elements (blocksize = 16x1) in Block Sparse Row format>,
 <64x576 sparse matrix of type '<class 'numpy.float32'>'
 	with 18432 stored elements (blocksize = 16x1) in Block Sparse Row format>,
 <128x1152 sparse matrix of type '<class 'numpy.float32'>'
 	with 73728 stored elements (blocksize = 16x1) in Block Sparse Row format>,
 <128x1152 sparse matrix of type '<class 'numpy.float32'>'
 	with 73728 stored elements (blocksize = 16x1) in Block Sparse Row format>,
 <128x1152 sparse matrix of type '<class 'numpy.float32'>'
 	with 73728 stored elements (blocksize = 16x1) in Block Sparse Row format>,
 <256x2304 sparse matrix of type '<class 'numpy.float32'

In [None]:
import random
import numpy as np
import logging
%env OMP_NUM_THREADS 1
%rm test_sparse.log
opts = autotvm.measure_option(
    builder='local',
    runner=autotvm.LocalRunner(timeout=20, min_repeat_ms=200),
)
#random.shuffle(tasks)
#logging.basicConfig(level=logging.DEBUG, filename='test_sparse2.log')

for tsk in tasks:
    #if tsk.name.startswith('dense'): continue
    opts['runner'].ref_input = None
    if tsk.name == 'conv3x3_spNHWC.x86':
        data, wdat, wind, wptr, *attrs = tsk.args
        data = np.random.rand(*data[1]).astype(data[2])
        weight = random.choice([
            it for it in spconv_data
            if it.data.shape == wdat[1] and it.indptr.shape == wptr[1]
        ])
        wdat = weight.data.astype(wdat[2])
        wind = weight.indices.astype(wind[2])
        wptr = weight.indptr.astype(wptr[2])
        ret = np.zeros_like(data) #.reshape(-1, data.shape[-1]))
        opts['runner'].ref_input = reversed([data, wdat, wind, wptr, ret])
    print(tsk.name, len(tsk.config_space))
    nsamples = min(200, len(tsk.config_space))
    tuner = autotvm.tuner.GATuner(tsk)
    tuner.tune(
        nsamples,
        measure_option=opts,
        callbacks=[
            autotvm.callback.progress_bar(nsamples),
            autotvm.callback.log_to_file('test_sparse.log'),
        ],
    )

In [None]:
# Distil the raw sparse tuning log into 'test_sparse.best.log', which the later
# cells read, then delete the raw log. The order matters: pick_best must read
# 'test_sparse.log' before it is removed.
autotvm.record.pick_best('test_sparse.log', 'test_sparse.best.log')
%rm test_sparse.log

In [11]:
import tvm.autotvm.graph_tuner
# Graph-level tuning of the sparsified function, targeting the nn.dense ops and
# using the per-op records in 'test_sparse.best.log'.
# NOTE(review): the output saved with this cell shows a TVMError raised in a
# build thread, the line "[Op(nn.dense)] []", and finally
# "IndexError: list index out of range" -- the recorded run did not complete,
# so 'test_sparse2.log' was presumably not written. The cause is not visible
# here -- TODO investigate or remove this cell. The later build cell reads
# 'test_sparse.best.log', not 'test_sparse2.log'.
gtuner = autotvm.graph_tuner.DPTuner(newfunc, {'data': (10, 3, 224, 224)}, 'test_sparse.best.log', 
    [relay.op.get('nn.dense')], 'llvm -mcpu=cascadelake')
gtuner.benchmark_layout_transform()
gtuner.run()
gtuner.write_opt_sch2record_file('test_sparse2.log')

Exception in thread Thread-27:
Traceback (most recent call last):
  File "/lustre/home/acct-hpc/hpcjsl/.conda/envs/tvm-build/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/lustre/home/acct-hpc/hpcjsl/.conda/envs/tvm-build/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/lustre/home/acct-hpc/hpcjsl/tvm/python/tvm/relay/build_module.py", line 341, in build
    executor_config, runtime_mod, params = bld_mod.build(
  File "/lustre/home/acct-hpc/hpcjsl/tvm/python/tvm/relay/build_module.py", line 156, in build
    self._build(mod, target, target_host, executor, mod_name)
  File "/lustre/home/acct-hpc/hpcjsl/tvm/python/tvm/_ffi/_ctypes/packed_func.py", line 237, in __call__
    raise get_last_ffi_error()
tvm._ffi.base.TVMError: Traceback (most recent call last):
  10: TVMFuncCall
  9: _ZNSt17_Function_handlerIFvN
  8: tvm::relay::backend::RelayBuildModule::GetFunction(std::__cxx11::basic_string<char, std::

[Op(nn.dense)] []


IndexError: list index out of range

In [46]:
import numpy as np
import tvm.contrib.graph_executor as runtime

# Compile the sparsified function with the best tuned sparse schedules applied.
with autotvm.apply_history_best('test_sparse.best.log'), \
        tvm.transform.PassContext(opt_level=3):
    sparse_mod = tvm.ir.IRModule({'main': newfunc})
    lib = relay.build_module.build(
        sparse_mod,
        target='llvm -mcpu=cascadelake',
        params=params,
    )

# Instantiate the compiled graph and feed it one random batch.
dev = tvm.device('llvm -mcpu=cascadelake', 0)
graph_sparse = runtime.GraphModule(lib['default'](dev))
random_batch = np.random.rand(10, 3, 224, 224).astype('float32')
data = tvm.nd.array(random_batch)
graph_sparse.set_input('data', data)

# Time the "run" function (number=100, repeat=1) and display the mean.
ftimer = graph_sparse.module.time_evaluator("run", dev, number=100, repeat=1)
ftimer().mean

conv2d NHWC layout is not optimized for x86 with autotvm.
conv2d NHWC layout is not optimized for x86 with autotvm.
conv2d NHWC layout is not optimized for x86 with autotvm.
conv2d NHWC layout is not optimized for x86 with autotvm.
conv2d NHWC layout is not optimized for x86 with autotvm.
conv2d NHWC layout is not optimized for x86 with autotvm.
conv2d NHWC layout is not optimized for x86 with autotvm.
conv2d NHWC layout is not optimized for x86 with autotvm.
conv2d NHWC layout is not optimized for x86 with autotvm.
conv2d NHWC layout is not optimized for x86 with autotvm.
conv2d NHWC layout is not optimized for x86 with autotvm.
conv2d NHWC layout is not optimized for x86 with autotvm.
conv2d NHWC layout is not optimized for x86 with autotvm.
conv2d NHWC layout is not optimized for x86 with autotvm.


0.19715437504