In [1]:
from IPython.core.magic import register_cell_magic
import subprocess
import argparse
import sys
import os

@register_cell_magic
def withsave(line, cell):
    """Cell magic that saves the cell body to a ``.py`` file and then runs it.

    Usage::

        %%withsave fname [-f | -a] [--subp VAR] [--no-capout]

    Options parsed from ``line`` (split on whitespace):

    fname
        Target file name; ``.py`` is appended when it is missing.
    -f, --force
        Overwrite ``fname`` if it already exists.
    -a, --append
        Append the cell to ``fname`` instead of truncating it.
    --subp VAR
        Run the saved file with ``sys.executable`` in a child process instead
        of executing the cell in the current kernel, and store the resulting
        ``subprocess.CompletedProcess`` in the user namespace as ``VAR``.
    --no-capout
        Only meaningful with ``--subp``: do not capture the child's
        stdout/stderr.

    Raises:
        ValueError: both ``--force`` and ``--append`` were given.
        FileExistsError: ``fname`` exists and neither flag was given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('fname')
    parser.add_argument('-f', '--force', action='store_true')
    parser.add_argument('-a', '--append', action='store_true')
    parser.add_argument('--subp')
    # store_false: capout defaults to True and is switched off by the flag.
    parser.add_argument('--no-capout', action='store_false', dest='capout')
    args = parser.parse_args(line.split())

    if not args.fname.endswith('.py'):
        args.fname += '.py'

    # Explicit check instead of `assert`: assertions are removed under
    # `python -O` and would otherwise let the conflict through silently.
    if args.force and args.append:
        raise ValueError('--force and --append are mutually exclusive')
    if os.path.exists(args.fname) and not (args.force or args.append):
        raise FileExistsError(args.fname)

    with open(args.fname, 'a' if args.append else 'w') as f:
        f.write(cell)

    if not args.subp:
        # Execute only this cell's text in the kernel (not the whole file,
        # which matters when appending).
        get_ipython().run_cell(cell)
    else:
        get_ipython().user_ns[args.subp] = subprocess.run(
            [sys.executable, args.fname],
            capture_output=args.capout
        )

In [2]:
%%withsave tmp_convshape -f
# Convolution workloads shared by the tuning cells (imported with
# `from tmp_convshape import *`). The original note labels them "resnet50".
# N, H, W are used downstream as batch size and base spatial extent.
N, H, W = 10, 512, 512

# One (N, C, H, W) tuple per channel count C. The spatial extent is scaled by
# 32 / C, i.e. 256, 128, 64 and 32 for C = 64, 128, 256 and 512.
convshape = [
    (N, C, H * 32 // C, W * 32 // C)
    for C in [64, 128, 256, 512]
]

# Set OMP_NUM_THREADS=1 (original note: "disable openmp"). This runs as a side
# effect of importing this module, so it applies to every script that imports it.
import os
os.environ['OMP_NUM_THREADS'] = '1'

# topi.x86.conv2d_NCHWc (AutoTVM)

    {N, OC, H}:para, ow =>
        IC, kh, iic, kw:unroll, iw:unroll, oc:vec =>
            @CC = N, OC, H, {ow, iw}/W, oc  // {IC, iic}/ic, kh, kw
        iw, oc:vec =>
            @CO = N, OC, H, {ow, iw}/W, oc

    {N, OC, H}:para, ow =>
        IC, kh, kw, iic, iw:unroll, oc:vec =>
            @CC = N, OC, H, {ow, iw}/W, oc  // {IC, iic}/ic, kh, kw
        iw, oc:vec =>
            @CO = N, OC, H, {ow, iw}/W, oc

If resumed (continued) tuning fails with a strange error while acquiring the device, see this thread: https://discuss.tvm.apache.org/t/solved-autotvm-cannot-get-remote-devices-from-the-tracker/2692/10

In [10]:
%%withsave tmp_topiconv_tune -f --subp task
from tvm import topi, autotvm, te
import logging

from tmp_convshape import *
# Width of the innermost channel block used for the NCHWc placeholders.
Csplit = 16
# Number of measurement trials spent on each workload.
ntrials = 400

with open('conv2d_nchwc.dbg', 'w') as logfile:
    # Send AutoTVM's debug log to the same file that receives the config spaces.
    autotvm_log = logging.getLogger("autotvm")
    autotvm_log.setLevel(logging.DEBUG)
    autotvm_log.addHandler(logging.StreamHandler(logfile))

    for batch, channels, height, width in convshape:
        nblocks = channels // Csplit
        data = te.placeholder(
            (batch, nblocks, height, width, Csplit), name='A')
        kernel = te.placeholder(
            (nblocks, nblocks, 3, 3, Csplit, Csplit), name='B')
        # The three 1s are presumably stride, padding and dilation -- confirm
        # against the conv2d_NCHWc.x86 template signature.
        conv_args = (data, kernel, 1, 1, 1, 'NCHWc', 'NCHWc', 'float32')
        task = autotvm.task.create(
            'conv2d_NCHWc.x86',
            args=conv_args,
            target="llvm -mcpu=cascadelake",
        )

        print(task.config_space, file=logfile)

        local_runner = autotvm.LocalRunner(number=4, repeat=3, timeout=20)
        measure_option = autotvm.measure_option(
            builder="local", runner=local_runner)

        tuner = autotvm.tuner.GATuner(task)
        record_callbacks = [autotvm.callback.log_to_file("conv2d_nchwc.log")]
        tuner.tune(
            n_trial=ntrials,
            measure_option=measure_option,
            callbacks=record_callbacks,
        )

In [11]:
# Show the subprocess.CompletedProcess that `%%withsave ... --subp task` stored
# for the NCHWc tuning script (check returncode / stderr here).
task

CompletedProcess(args=['/lustre/home/acct-hpc/hpcjsl/.conda/envs/tvm-build/bin/python', 'tmp_topiconv_tune.py'], returncode=0, stdout=b'', stderr=b'')

In [23]:
# `autotvm` must be imported here: the tuning script above ran in a subprocess,
# so no earlier cell has bound the name in the kernel on a fresh Run All.
from tvm import autotvm
import tvm.topi.x86.conv2d

# Copy the best records of the NCHWc tuning log into newconv.log.
autotvm.record.pick_best('conv2d_nchwc.log', 'newconv.log')

# topi.\*.conv2d_nhwc

- topi.nn.conv2d_nhwc (w/ ansor)
- topi.x86.schedule_conv2d_nhwc (?)
- topi.nn.conv2d_winograd_nhwc (w/ ansor)

# My Conv2d_3x3_gemm
                
    yt, xt, yo =>
        yi, k9, ci:vec =>
            @im2col = {yt, yo, yi}/y, {k9, ci}/k
        xo =>
            ko, ki:unroll, yi:unroll, xi:vec =>
                @ccache = {yt, yo, yi}/y, {xt, xo, xi}/x  // {ko, ki}/k
            yi:unroll, xi:vec =>
                @cout = {yt, yo, yi}/y, {xt, xo, xi}/x

In [2]:
%%withsave tmp_myconv -f
from tvm import autotvm, te, tir
from functools import partial, reduce
import tvm

@autotvm.template('conv2d_3x3_gemm')
def conv2d_3x3_gemm(N, H, W, CI, CO, dtype='float32'):
    """AutoTVM template: 3x3 convolution expressed as im2col followed by a GEMM.

    The problem is flattened to a (Y, K) x (K, X) product with
    ``Y = N*H*W``, ``X = CO`` and ``K = 9*CI``.

    Args:
        N, H, W, CI: dimensions of the ``(N, H, W, CI)`` input placeholder.
        CO: number of output columns (``X``) of the GEMM.
        dtype: dtype of the input and weight placeholders.

    Returns:
        ``(s, [data, weight, C])``: the schedule and the tensors of the
        operator, where ``weight`` has shape ``(CO, 9*CI)`` and ``C`` has
        shape ``(N*H*W, CO)``.
    """
    Y, X, K = N*H*W, CO, 9*CI
    cfg = autotvm.get_config()
    # Tunable tilings: y and x are split three ways, the reduction two ways.
    cfg.define_split("tile_y", Y, num_outputs=3)
    cfg.define_split("tile_x", X, num_outputs=3)
    cfg.define_split("tile_k", K, num_outputs=2)
    # No template-specific fallback configuration is provided.
    if cfg.is_fallback:
        pass

    data = te.placeholder((N, H, W, CI), dtype=dtype)
    weight = te.placeholder((X, K), dtype=dtype)
    # Split a flat index into mixed-radix digits, least significant first:
    # idxsplit(i, [a, b]) == [i % a, (i // a) % b, (i // a) // b].
    idxsplit = lambda x,y: reduce(lambda a,b: a[:-1]+[a[-1]%b,a[-1]//b], y, [x])

    # The decorator calls te.compute((Y, K), <function>, name='im2col'), so the
    # name `im2col` is bound to the resulting tensor, not to the function.
    @partial(te.compute, (Y, K), name='im2col')
    def im2col(row, col):
        # row -> (x position, y position, image); col -> (channel, kernel x, kernel y)
        jw, jh, jn = idxsplit(row, [W, H])
        jc, kw, kh = idxsplit(col, [CI, 3])
        # Input coordinates for this kernel tap; the -1 centres the 3x3 window.
        ih, iw = jh + kh - 1, jw + kw - 1
        # Taps that fall outside the image read as 0.
        return tir.if_then_else(
            tir.all(0 <= ih, ih < H, 0 <= iw, iw < W),
            data[jn, ih, iw, jc], 0)
    
    # Repack the weight as (X // bn, K, bn), bn being the innermost tile_x factor.
    packw_bn = cfg["tile_x"].size[-1]
    packw = te.compute((X//packw_bn, K, packw_bn),
        lambda xo, k, xi: weight[xo * packw_bn + xi, k],
        name="packed_weight")
    
    # GEMM: C[y, x] = sum_k im2col[y, k] * weight[x, k], read via the packed weight.
    k = te.reduce_axis((0, K), name="k")
    C = te.compute((Y, X),
        lambda y, x: te.sum(im2col[y, k] * packw[x//packw_bn, k, x%packw_bn], axis=k),
        name="dense_pack")
    
    s = te.create_schedule(C.op)
    # CC is the cache-write stage that carries the reduction; C copies it out.
    CC = s.cache_write(C, "global")
    
    # Output stage: tile y and x, interleave the tiles, unroll/vectorize the
    # innermost pair.
    y, x = s[C].op.axis
    yt, yo, yi = cfg["tile_y"].apply(s, C, y)
    xt, xo, xi = cfg["tile_x"].apply(s, C, x)
    s[C].reorder(yt, xt, yo, xo, yi, xi)
    # Disabled experiments: fusing / parallelising the outer tiles.
    #xyt = s[C].fuse(yt, xt)
    #s[C].parallel(xyt)
    #xyo = s[C].fuse(yo, xo)
    s[C].unroll(yi)
    s[C].vectorize(xi)

    # Accumulation stage: computed per xo iteration, reduction split by tile_k
    # and placed outermost.
    s[CC].compute_at(s[C], xo)
    yi, xi = s[CC].op.axis
    (k,) = s[CC].op.reduce_axis
    ko, ki = cfg["tile_k"].apply(s, CC, k)
    s[CC].reorder(ko, ki, yi, xi)
    s[CC].vectorize(xi)
    s[CC].unroll(yi)
    s[CC].unroll(ki)
    
    # im2col is produced per yo iteration; splitting its column axis by CI makes
    # the inner axis run over the input channels, which is then vectorized.
    s[im2col].compute_at(s[C], yo)
    yi, k = s[im2col].op.axis
    ko, ki = s[im2col].split(k, factor=CI)
    s[im2col].vectorize(ki)
    #s[im2col].unroll(yi)

    # Weight packing loop order: (xo, xi, k).
    xo, k, xi = s[packw].op.axis
    s[packw].reorder(xo, xi, k)
    #s[packw].parallel(xo)
    return s, [data, weight, C]

In [None]:
%%withsave tmp_myconv_tune -f --subp task_myconv
from tmp_convshape import *
from tmp_myconv import *
import logging

# Tune the conv2d_3x3_gemm template for every workload in `convshape`, writing
# AutoTVM's debug output and each config space to conv2d_3x3_gemm.dbg.
with open('conv2d_3x3_gemm.dbg', 'w') as logfile:
    autotvm_log = logging.getLogger("autotvm")
    autotvm_log.setLevel(logging.DEBUG)
    autotvm_log.addHandler(logging.StreamHandler(logfile))

    for batch, channels, height, width in convshape:
        # Input and output channel counts are both `channels`.
        template_args = (batch, height, width, channels, channels, 'float32')
        task = autotvm.task.create(
            'conv2d_3x3_gemm',
            args=template_args,
            target="llvm -mcpu=cascadelake",
        )
        print(task.config_space, file=logfile)

        local_builder = autotvm.LocalBuilder()
        local_runner = autotvm.LocalRunner(number=4, repeat=3, timeout=20)
        measure_option = autotvm.measure_option(
            builder=local_builder, runner=local_runner)

        tuner = autotvm.tuner.GATuner(task)
        record_callbacks = [autotvm.callback.log_to_file("conv2d_3x3_gemm.log")]
        tuner.tune(
            n_trial=500,
            measure_option=measure_option,
            callbacks=record_callbacks,
        )

In [18]:
# Show the subprocess.CompletedProcess that `%%withsave ... --subp task_myconv`
# stored for the conv2d_3x3_gemm tuning script.
task_myconv

CompletedProcess(args=['/lustre/home/acct-hpc/hpcjsl/.conda/envs/tvm-build/bin/python', 'tmp_myconv_tune.py'], returncode=0, stdout=b'', stderr=b'')

In [20]:
# Copy the best records of the conv2d_3x3_gemm tuning log into newconv.log
# (the same output file the NCHWc cell writes to).
autotvm.record.pick_best('conv2d_3x3_gemm.log', 'newconv.log')

In [None]:
from tvm import autotvm
import numpy as np
import tvm

# Replay one hand-picked configuration of conv2d_3x3_gemm and time it.
# NOTE: `conv2d_3x3_gemm` must already be defined in the kernel, i.e. the
# `%%withsave tmp_myconv` cell has to be run first.

# Serialized config entity: one split ("sp") per knob defined by the template.
# The -1 entries are presumably the factors left for TVM to infer -- confirm.
json_dict = {
    "index": 77156, 
    "code_hash": None, 
    "entity": [
        ["tile_y", "sp", [-1, 320, 128]], 
        ["tile_x", "sp", [-1, 8, 2]], 
        ["tile_k", "sp", [-1, 1]]
    ]
}
tgtstr = "llvm -mcpu=cascadelake"
ce = autotvm.task.ConfigEntity.from_json_dict(json_dict)
# Instantiate the template under this fixed config and target.
with autotvm.task.ApplyConfig(ce), tvm.target.Target(tgtstr):
    dev = tvm.device(tgtstr, 0)
    s, params = conv2d_3x3_gemm(10, 256, 256, 64, 64, "float32")
    # One random float32 array per tensor in `params` (inputs and output),
    # shaped from the tensor's own shape.
    args = [
        tvm.nd.array(
            np.random.rand(*[a.value for a in p.shape]).astype('float32'),
            dev)
        for p in params]
    # Print the lowered IR for inspection before building.
    print(tvm.lower(s, params, simple_mode=True))
    func = tvm.build(s, params, target=tgtstr, name="conv2d_3x3_gemm")
    # Run once directly, then time with number=10 and print the mean.
    func(*args)
    evt = func.time_evaluator(func.entry_name, dev, number=10)
    print(evt(*args).mean)

# My SpConv2d_3x3_gemm

    yt, xt, yo =>
        yi, k9, ci:vec =>
            @im2col = {yt, yo, yi}/y, {k9, ci}/k
        xo =>
            x1:1, ko:dyn(xr), yi:unroll, xi:vec, ki:unroll =>
                @CC = {yt, yo, yi}/y, {xt, xo, x1}/xr, xi, ki  // ko
            yi:unroll, xi:vec, ki:unroll =>
                @C = {yt, yo, yi}/y, {xt, xo, xi}/x  // ki

In [3]:
%%withsave tmp_myspconv -f
from tvm import autotvm, te, tir
from functools import partial, reduce

@autotvm.template('spconv2d_3x3_gemm')
def spconv2d_3x3_gemm(N, H, W, CI, CO, nElems, bsrR, bsrC, dtype='float32'):
    """AutoTVM template: 3x3 convolution as im2col times a block-sparse weight.

    The dense side is the ``(Y, K)`` im2col matrix with ``Y = N*H*W`` and
    ``K = 9*CI``; the weight is given in BSR form as three placeholders.

    Args:
        N, H, W, CI: dimensions of the ``(N, H, W, CI)`` input placeholder.
        CO: number of output columns (``X``).
        nElems: number of stored weight blocks.
        bsrR, bsrC: rows and columns of one weight block.
        dtype: dtype of the input and of the weight-block data.

    Returns:
        ``(s, [Data, Wdat, Wind, Wptr, C])`` where ``Wdat`` is
        ``(nElems, bsrR, bsrC)``, ``Wind`` is ``(nElems,)``, ``Wptr`` is
        ``(CO // bsrR + 1,)`` and ``C`` is ``(N*H*W, CO)``.
    """
    Y, X, K = N*H*W, CO, 9*CI
    cfg = autotvm.get_config()
    cfg.define_split("tile_y", Y, num_outputs=3)
    # x is tiled in units of whole block rows.
    cfg.define_split("tile_x", X // bsrR, num_outputs=2)
    # The reduction extent is data dependent, so the FLOP count is given
    # explicitly: 2 ops per stored weight element per row of Y, minus X.
    cfg.add_flop(Y * (nElems * bsrC * bsrR * 2 - X))
    #cfg.define_split("tile_k", K, num_outputs=2)
    if cfg.is_fallback:
        cfg['tile_y'] = autotvm.task.space.SplitEntity([-1, 160, 8])
        cfg['tile_x'] = autotvm.task.space.SplitEntity([-1, 4])
    
    Data = te.placeholder((N, H, W, CI), dtype=dtype, name='Data')
    # Fix: the weight blocks follow `dtype` as well. Previously this placeholder
    # ignored the parameter and was always float32, unlike `Data`.
    Wdat = te.placeholder((nElems, bsrR, bsrC), dtype=dtype, name='Wdat')
    # Block column indices and per-block-row offsets into Wdat / Wind.
    Wind = te.placeholder((nElems,), dtype='int', name='Wind')
    Wptr = te.placeholder((X // bsrR + 1,), dtype='int', name='Wptr')
    # Split a flat index into mixed-radix digits, least significant first:
    # idxsplit(i, [a, b]) == [i % a, (i // a) % b, (i // a) // b].
    idxsplit = lambda x,y: reduce(lambda a,b: a[:-1]+[a[-1]%b,a[-1]//b], y, [x])

    # The decorator calls te.compute(...), so `Im2Col` is bound to the tensor.
    @partial(te.compute, (Y, K), name='Im2Col')
    def Im2Col(row, col):
        # row -> (x position, y position, image); col -> (channel, kernel x, kernel y)
        jw, jh, jn = idxsplit(row, [W, H])
        jc, kw, kh = idxsplit(col, [CI, 3])
        ih, iw = jh + kh - 1, jw + kw - 1
        # Taps that fall outside the image read as 0.
        return tir.if_then_else(
            tir.all(0 <= ih, ih < H, 0 <= iw, iw < W),
            Data[jn, ih, iw, jc], 0)
    
    # Per (row, block row, block-local row, block-local column): sum over the
    # blocks stored for that block row. The reduction extent comes from Wptr.
    @partial(te.compute, (Y, X // bsrR, bsrR, bsrC), name='CC')
    def CC(drow, wrow, brow, bcol):
        row_start, row_end = Wptr[wrow], Wptr[wrow+1]
        elem_idx = te.reduce_axis((0, row_end - row_start), name='elem_idx')
        elem = row_start + elem_idx
        return te.sum(Im2Col[drow, Wind[elem]*bsrC + bcol] * Wdat[elem, brow, bcol], axis=elem_idx)

    # Final stage: sum the block-column partials into the (Y, X) output.
    k = te.reduce_axis((0, bsrC), name='k')
    C = te.compute((Y, X), lambda y, x: te.sum(CC[y, x // bsrR, x % bsrR, k], axis=k), name='C')
    
    s = te.create_schedule(C.op)
    y, x = s[C].op.axis
    yt, yo, yi = cfg['tile_y'].apply(s, C, y)
    # Peel off the block-local row first, then tile the block-row index.
    xo, xi = s[C].split(x, factor=bsrR)
    xt, xo = cfg['tile_x'].apply(s, C, xo)
    (k,) = s[C].op.reduce_axis
    s[C].reorder(yt, xt, yo, xo, yi, xi, k)
    s[C].unroll(k)
    s[C].vectorize(xi)
    s[C].unroll(yi)

    # Partial sums are computed per xo iteration; the stored-block loop (k)
    # runs outside the unrolled / vectorized block body.
    s[CC].compute_at(s[C], xo)
    yi, xi, r, c = s[CC].op.axis
    (k,) = s[CC].op.reduce_axis
    s[CC].reorder(xi, k, yi, r, c)
    s[CC].unroll(c)
    s[CC].vectorize(r)
    s[CC].unroll(yi)
    
    # Im2Col is produced per yo iteration; its column axis is split by CI so
    # the inner axis runs over the input channels, which is then vectorized.
    s[Im2Col].compute_at(s[C], yo)
    yi, k = s[Im2Col].op.axis
    ko, ki = s[Im2Col].split(k, factor=CI)
    s[Im2Col].vectorize(ki)
    #s[Im2Col].unroll(yi)
    return s, [Data, Wdat, Wind, Wptr, C]

- [autotvm ref_input removed](https://github.com/apache/tvm/commit/b5a7de879e67aca80aa25bf9ea9c46315dccb026#diff-f8cbe8a70063c3692732fa42db6f11779f92eb2afeb5576b68b7ede8064a8222L596)
- [ansor task_inputs](https://tvm.apache.org/docs/tutorials/auto_scheduler/tune_sparse_x86.html#create-the-search-task)

In [4]:
%%withsave tmp_mysputils -f
import scipy.sparse
import numpy as np
import tvm
from tvm.rpc import RPCSession

def make_bsr_sparse(dense, sprate, blocksize):
    """Drop the lowest-sum blocks of ``dense`` and return the rest as BSR.

    ``dense`` is converted to BSR with the given ``blocksize``; the stored
    blocks are ranked by the plain (signed) sum of their entries and roughly
    the ``sprate`` fraction with the smallest sums is removed.

    Args:
        dense: 2-D array whose shape is divisible by ``blocksize``.
        sprate: fraction of stored blocks to drop; the count is
            ``int(sprate * nblocks + 0.5)``.
        blocksize: ``(rows, cols)`` of one block.

    Returns:
        ``scipy.sparse.bsr_matrix`` with the shape of ``dense`` that holds
        only the surviving blocks. Blocks tied with the threshold are kept.
    """
    bsr = scipy.sparse.bsr_matrix(dense, blocksize=blocksize)
    block_sums = bsr.data.sum((1, 2))
    nblocks = len(block_sums)

    ndrop = int(sprate * nblocks + 0.5)
    if ndrop < nblocks:
        # The ndrop-th smallest sum is the smallest one that survives.
        threshold = np.partition(block_sums, ndrop)[ndrop]
        # Compare the same sums the threshold was taken from, so a block
        # cannot fall on the wrong side through a differently rounded sum.
        keep = block_sums >= threshold
    else:
        # Everything is pruned (sprate == 1, or nothing was stored at all).
        keep = np.zeros(nblocks, dtype=bool)

    # kept_before[i] = number of surviving blocks among the first i stored
    # ones; sampling it at the old row boundaries gives the new indptr. This
    # works for any number of blocks per row, including empty block rows.
    kept_before = np.concatenate(([0], np.cumsum(keep)))
    indptr = kept_before[bsr.indptr]

    # Masking keeps the (n, R, C) layout even when n == 0.
    return scipy.sparse.bsr_matrix(
        (bsr.data[keep], bsr.indices[keep], indptr), shape=dense.shape)


def unpack_bsr(bsrdata):
    """Return the ``(data, indices, indptr)`` attributes of ``bsrdata`` as a tuple."""
    fields = ('data', 'indices', 'indptr')
    return tuple(getattr(bsrdata, field) for field in fields)


def hook_method(obj, attr):
    """Decorator factory that replaces ``obj.<attr>`` with the decorated function.

    The decorated function is returned unchanged apart from two extra
    attributes: ``orig`` holds the value that was replaced, and ``revert()``
    puts that value back on ``obj``.
    """
    def real_decorator(func):
        # Look up the current value at decoration time, before overwriting it.
        replaced = getattr(obj, attr)
        setattr(obj, attr, func)

        def restore():
            setattr(obj, attr, replaced)

        func.orig = replaced
        func.revert = restore
        return func

    return real_decorator


class NonRandomFill:
    """Callable that fills targets from a preset list of arrays, in order.

    The list is stored on the class by ``set_srclst``. Every instance walks it
    from the start, copying one stored array into the target per call.
    """
    # Source arrays shared by all instances; replaced by set_srclst().
    srclst_ = []

    @classmethod
    def set_srclst(cls, srclst):
        """Wrap each item of ``srclst`` with ``tvm.nd.array`` and store the list."""
        cls.srclst_ = list(map(tvm.nd.array, srclst))

    def __init__(self):
        # Private cursor over the list that is current at construction time.
        self.srclst = iter(self.srclst_)

    def __call__(self, tgt):
        """Copy the next stored array into ``tgt``; StopIteration when exhausted."""
        upcoming = next(self.srclst)
        tgt.copyfrom(upcoming)


@hook_method(RPCSession, 'get_function')
def new_get_function(self, fname):
    """Replacement for ``RPCSession.get_function``.

    A lookup of ``tvm.contrib.random.random_fill`` gets a fresh
    ``NonRandomFill`` instead; every other name goes to the original method.
    """
    if fname != 'tvm.contrib.random.random_fill':
        return new_get_function.orig(self, fname)
    return NonRandomFill()

In [None]:
%%withsave tmp_myspconv_tune -f --subp task_spconv
import numpy as np
import logging

from tmp_convshape import *
from tmp_myspconv import *
from tmp_mysputils import *

# Tune spconv2d_3x3_gemm for every workload in `convshape` at several pruning
# rates, writing AutoTVM's debug output to spconv2d_3x3_gemm.dbg.
with open('spconv2d_3x3_gemm.dbg', 'w') as logfile:
    autotvm_log = logging.getLogger("autotvm")
    autotvm_log.setLevel(logging.DEBUG)
    autotvm_log.addHandler(logging.StreamHandler(logfile))

    for batch, channels, height, width in convshape:
        for sprate in [0.5, 0.6, 0.7, 0.8, 0.9]:
            # Fresh random input and weight per run; the weight is pruned to
            # 16x1 blocks at the requested rate.
            input_shape = (batch, height, width, channels)
            nhwc_data = np.random.randint(0, 256, input_shape).astype('float32')
            weight_ohwi = np.random.rand(channels, 3*3*channels).astype('float32')
            spweight_ohwi = make_bsr_sparse(weight_ohwi, sprate, (16, 1))
            ret = np.zeros((batch*height*width, channels), dtype='float32')

            # spweight_ohwi.data.shape supplies (nElems, bsrR, bsrC).
            bsr_dims = spweight_ohwi.data.shape
            args = (batch, height, width, channels, channels, *bsr_dims, 'float32')
            task = autotvm.task.create(
                'spconv2d_3x3_gemm',
                args=args,
                target="llvm -mcpu=cascadelake",
            )
            print(task.config_space, file=logfile)

            runner = autotvm.LocalRunner(number=4, repeat=3, timeout=20)
            # Arrays handed out by the hooked random_fill, in the template's
            # argument order: input, weight data / indices / indptr, output.
            fill_sources = [nhwc_data, *unpack_bsr(spweight_ohwi), ret]
            NonRandomFill.set_srclst(fill_sources)
            local_builder = autotvm.LocalBuilder()
            measure_option = autotvm.measure_option(
                builder=local_builder, runner=runner)

            tuner = autotvm.tuner.GATuner(task)
            record_callbacks = [autotvm.callback.log_to_file("spconv2d_3x3_gemm.log")]
            tuner.tune(
                n_trial=500,
                measure_option=measure_option,
                callbacks=record_callbacks,
            )

In [None]:
# Show the subprocess.CompletedProcess that `%%withsave ... --subp task_spconv`
# stored for the sparse tuning script.
task_spconv

# Ansor

> [Ansor sparse_dense tutorial](https://tvm.apache.org/docs/tutorials/auto_scheduler/tune_sparse_x86.html)

sparse_dense_sp_rhs_bsrmm
```
BM:= m, nblk, br, R{elem:dyn(nblk), bc:1}
RS:= m, n{nblk, br}
BM,RS> {m, nblk}:para
  BM> {br, elem:dyn(nblk), bc:1}:reorder
    BM> elem:dyn(nblk), br:vec, bc:1
  RS> br:vec
```

1. task_input：提供下标输入，避免按随机输入搜索时暴毙
2. custom sketch：提供一个手写的init search policy
    - 作用于什么op
    - 如何进行init sketch，需要用ansor专门的一套LoopState API；

# The new TVM Schedule Representation

## TVM基本原语

TVM提供了一套在算子定义之外、对算子的循环层次进行编辑的操作指令。比较主要的包括：

- compute dag变换
    - `cache_*`和`rfactor`：一些会增加新stage的高层变换
- 轴重组
    - `split`和`reorder`：进行`tile`这样的访存顺序调整的原语
    - `compute_*`：一组用来对多个stage进行重组、嵌套的原语，会引起隐式的`split`
    - `fuse`：组合若干循环维度，本身没有任何用，作为其他变换的前置条件
- 单一轴
    - `vectorize`，`unroll`和`parallel`：指定某一循环维度具体实现方式的原语
    
实际上令人耗费精力的主要是前两类schedule，第三类schedule基本可以通过启发式的方法做出简单的少数几种策略。

另有gpu schedule原语 https://tvm.apache.org/docs/tutorials/optimize/opt_conv_cuda.html

这套原语是命令式的：可以灵活地进行版本分支，但缺乏直观性和持久性。

## 现有TVM DSL/可视化

[Tensor Expression Debug Display](https://tvm.apache.org/docs/tutorials/language/tedd.html)或[Operational Model (TVMConf'19)](https://sampl.cs.washington.edu/tvmconf/slides/2019/E03-Yuan-Lin-Yongfeng-Gu.pdf)，包括三张图：

- ComputeDAG: Stage间的依赖
- IterVarRelation: IterVar间的演化
- ScheduleTree: 实现策略，包括Stage融合、LeafIterVar行为

评价：

- 把IterVar变换和Schedule变化进行了分离；前者具有很强的操作性特征，而后者基本是声明式的；
    - 如果只进行一轮`split`-`reorder`-`fuse`，那么IterVar变换也是声明式的；然而没有保证；
- 是否需要一个文字版的schedule tree？增加可写性，改善直观性；一种更简单的IR；
- IterVarRel的多版本需要改进的树形表示；
- Compute_\*依然是抽象问题；源于[InferBound](https://tvm.apache.org/docs/dev/inferbound.html)机制，即TVM通过infer机制尽可能减少计算的域的大小；但这套机制很难处理复杂的fuse和split的组合。

事实上这种面向轴的schedule语言可以产出非常多没什么道理的schedule；DSL的使命应该是利用domain knowledge，减少开发过程的自由度。

- [Fireiron (TVMConf'19)](https://tvmconf.org/slides/2019/E04-Vinod-Grover.pdf)是一种面向GPU的DSL。除了常见的decomp原语（split，tile）之外，还对结构化的bind、cache进行了支持；
- Ansor

    - 论文中Ansor拒绝了线性决策式的schedule构造（schedule原语的线性累加），自我标榜为一种“层次化”方法。
    - high-level structure不仅不确定参数，衍生规则也相对高层：inline、tiling、cachewrite、rfact。IterVar的重组完全由多层tile的RS表达式确定。
    - ansor中有layout free tensor的概念：对于weight数据，可以直接一次性改写为匹配搜索结果的数据格式。即在合适的rule设计下，ansor具有数据格式搜索能力。
    - ansor论文在single op部分的eval中提到了一些ansor具有优势的例子，包括rfact的广泛使用、在tile和inline上的灵活性。tile的灵活性体现在层次数量而非排列；当层次数量足够多时，即使排列固定，也能有非常丰富的组合。