In [1]:
import sys
sys.path.append("..")
import numpy as np
import os
from functools import reduce
import json

from layers import *
from blocks import *
from utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# test tvm block evaluator on multiple points
# Results file for this sweep. The loop in this cell opens it in append mode and
# writes one JSON object per line, so re-running the cell adds records to the
# file instead of replacing it.
logpath = "blockResult.json"

def run_tvm_block(*, n_trial=100, **kwargs):
    """Set up, autotune, build and evaluate one TVMBlockEvaluator configuration.

    Args:
        n_trial: Forwarded to ``TVMBlockEvaluator`` as its ``n_trial`` keyword.
            Defaults to 100, the value this helper used to hard-code (the
            original note on that value was "faster"). Exposing it also means
            a caller-supplied ``n_trial`` no longer collides with the
            hard-coded keyword.
        **kwargs: Every other keyword argument is forwarded unchanged to
            ``TVMBlockEvaluator`` (callers in this notebook pass ``mode`` plus
            the sampled block and sparsity settings).

    Returns:
        Whatever ``TVMBlockEvaluator.evaluate(verbose=True)`` returns. The
        sweep in this notebook treats it as a mapping and calls ``.values()``
        and ``.items()`` on it.
    """
    evaluator = TVMBlockEvaluator(**kwargs, n_trial=n_trial)
    # The stages are kept in the original order: setup -> autotune -> build.
    evaluator.setup()
    evaluator.autotune()
    evaluator.build()
    return evaluator.evaluate(verbose=True)

# Sweep the search space: 10 sampled block settings, each combined with 3
# sampled sparsity settings. Every combination is run once in "sparse" mode and
# once in "dense" mode, and one JSON record per combination is appended to
# `logpath`.
sampler = SearchSpaceSampler()
for i in range(10):
    block_setting = sampler.get_block_setting()
    for j in range(3):
        sparsity_setting = sampler.get_sparsity_setting(block_setting)

        res_sparse = run_tvm_block(mode="sparse", **block_setting, **sparsity_setting)
        res_dense = run_tvm_block(mode="dense", **block_setting, **sparsity_setting)

        # Each result maps a key to a numeric cost; the totals are their sums.
        sparse_total = sum(res_sparse.values())
        dense_total = sum(res_dense.values())

        # Replace the raw per-key values with their share of the matching total.
        # The dense shares must be derived from the dense result: the previous
        # code iterated over `res_sparse`, which by that point already held
        # formatted strings, so the division raised a TypeError.
        res_sparse = {k: f"{v / sparse_total:.4%}" for k, v in res_sparse.items()}
        res_dense = {k: f"{v / dense_total:.4%}" for k, v in res_dense.items()}

        record = {
            "setting": [*block_setting.values(), *sparsity_setting.values()],
            # Totals are scaled by 1000 and labelled "ms"
            # (assumes evaluate() reports seconds -- TODO confirm).
            "sparse_total": f"{sparse_total*1000:.4f}ms",
            "dense_total": f"{dense_total*1000:.4f}ms",
            "sparse_detail": res_sparse,
            "dense_detail": res_dense,
        }

        # Append immediately so finished configurations survive an interrupted
        # sweep; the file is only written here, so plain append mode suffices.
        with open(logpath, "a") as f:
            f.write(json.dumps(record) + "\n")
        

scatter_add: 100%|██████████| 7/7 [01:30<00:00, 12.95s/it]


All layers in TVMBlockEvaluator are autotuned.


In [2]:
# test tvm block evaluator
# Single fixed configuration with "sparse" as the first positional argument
# (the sweep cell passes the same value as mode=) and n_trial=20.
# The meaning of the numeric positional arguments is not visible in this
# notebook -- TODO confirm against TVMBlockEvaluator's signature.
tvm_sparse_block = TVMBlockEvaluator("sparse", 512, 4, 56, 2, 28, 4, n_trial=20)
# Stage order used throughout this notebook: setup -> autotune -> build -> evaluate.
tvm_sparse_block.setup()
tvm_sparse_block.autotune()
tvm_sparse_block.build()
# The return value of the verbose call is discarded here.
tvm_sparse_block.evaluate(verbose=True)
# Last expression of the cell, so its return value is what the notebook displays.
tvm_sparse_block.evaluate()

  from pandas import MultiIndex, Int64Index
scatter_add: 100%|██████████| 7/7 [01:58<00:00, 16.98s/it]


All layers in TVMBlockEvaluator are autotuned.


0.0024408402700000003

In [3]:
# Dense counterpart of the sparse evaluator cell: identical positional settings
# and n_trial=20, with "dense" as the first argument instead of "sparse".
tvm_dense_block = TVMBlockEvaluator("dense", 512, 4, 56, 2, 28, 4, n_trial=20)
# Stage order used throughout this notebook: setup -> autotune -> build -> evaluate.
tvm_dense_block.setup()
tvm_dense_block.autotune()
tvm_dense_block.build()
# The return value of the verbose call is discarded here.
tvm_dense_block.evaluate(verbose=True)
# Last expression of the cell, so its return value is what the notebook displays.
tvm_dense_block.evaluate()

conv3: 100%|██████████| 3/3 [00:53<00:00, 17.91s/it]


All layers in TVMBlockEvaluator are autotuned.


0.02364724026

In [2]:
# test conv dense scheduler
# 3x3 variant: the 4th constructor argument is 3 and the log file is named
# conv3x3_dense.log (presumably the kernel size and the autotuning log path --
# TODO confirm against ConvDenseScheduler's signature).
conv_dense_scheduler = ConvDenseScheduler(128, 512, 64, 3, "log/conv3x3_dense.log")
conv_dense_scheduler.n_trial = 50
# Tolerances are set before check(); presumably they bound the comparison
# against the reference result -- TODO confirm in ConvDenseScheduler.check.
conv_dense_scheduler.rtol = 1e-5
conv_dense_scheduler.atol = 1e-3
conv_dense_scheduler.autotune(refresh=True)
conv_dense_scheduler.build(display=False)
conv_dense_scheduler.check(runtype="pytorch")
# Last expression of the cell, so its return value is what the notebook displays.
conv_dense_scheduler.evaluate()

  from pandas import MultiIndex, Int64Index


0.00235651856

In [3]:
# test conv dense scheduler
# 1x1 variant: the 4th constructor argument is 1 and the log file is named
# conv1x1_dense.log (presumably the kernel size and the autotuning log path --
# TODO confirm against ConvDenseScheduler's signature).
# Note: this assigns the same global name, `conv_dense_scheduler`, as the 3x3 cell.
conv_dense_scheduler = ConvDenseScheduler(128, 512, 64, 1, "log/conv1x1_dense.log")
conv_dense_scheduler.n_trial = 50
# Tolerances are set before check(); presumably they bound the comparison
# against the reference result -- TODO confirm in ConvDenseScheduler.check.
conv_dense_scheduler.rtol = 1e-5
conv_dense_scheduler.atol = 1e-3
conv_dense_scheduler.autotune(refresh=True)
conv_dense_scheduler.build(display=False)
conv_dense_scheduler.check(runtype="pytorch")
# Last expression of the cell, so its return value is what the notebook displays.
conv_dense_scheduler.evaluate()

0.00020397229

In [5]:
# test conv1x1 gathered scheduler
# The meaning of the four numeric constructor arguments is not visible in this
# notebook -- TODO confirm against Conv1x1GatheredScheduler's signature.
conv1x1_gathered_scheduler = Conv1x1GatheredScheduler(512, 128, 128, 2, "log/conv1x1_gathered.log")
conv1x1_gathered_scheduler.n_trial = 50
# Tolerances are set before check(); presumably they bound the comparison
# against the reference result -- TODO confirm in the scheduler's check().
conv1x1_gathered_scheduler.rtol = 1e-5
conv1x1_gathered_scheduler.atol = 1e-3
conv1x1_gathered_scheduler.autotune(refresh=True)
conv1x1_gathered_scheduler.build(display=False)
conv1x1_gathered_scheduler.check(runtype="pytorch")
# Last expression of the cell, so its return value is what the notebook displays.
conv1x1_gathered_scheduler.evaluate()

2.348793e-05

In [2]:
# test scatter add
# The meaning of the four numeric constructor arguments is not visible in this
# notebook -- TODO confirm against ScatterAddScheduler's signature.
scatter_add_scheduler = ScatterAddScheduler(64, 512, 128, 2, "log/scatter_add.log")
scatter_add_scheduler.n_trial = 10
scatter_add_scheduler.autotune(refresh=True)
scatter_add_scheduler.build(display=False)
# Unlike the conv cells, check() is called with its defaults: no runtype is
# passed and rtol/atol are not overridden on the scheduler.
scatter_add_scheduler.check()
# Last expression of the cell, so its return value is what the notebook displays.
scatter_add_scheduler.evaluate()

  from pandas import MultiIndex, Int64Index


9.848044e-05

In [None]:
# test conv3x3 gathered scheduler
# The meaning of the four numeric constructor arguments is not visible in this
# notebook -- TODO confirm against Conv3x3GatheredScheduler's signature.
conv3x3_gathered_scheduler = Conv3x3GatheredScheduler(512, 4, 128, 2, "log/conv3x3_gathered.log")
conv3x3_gathered_scheduler.n_trial = 50
# Tolerances are set before check(); presumably they bound the comparison
# against the reference result -- TODO confirm in the scheduler's check().
conv3x3_gathered_scheduler.rtol = 1e-5
conv3x3_gathered_scheduler.atol = 1e-3
conv3x3_gathered_scheduler.autotune(refresh=True)
conv3x3_gathered_scheduler.build(display=False)
conv3x3_gathered_scheduler.check(runtype="pytorch")

# Disabled debugging aid: runs one generated sample through both the TVM and
# PyTorch paths and prints the (i, j) positions whose results differ.
# NOTE(review): the comparison sums signed differences without abs(), so
# positive and negative errors can cancel -- confirm before re-enabling.
# sample = conv3x3_gathered_scheduler._generate_sample()
# res_tvm = conv3x3_gathered_scheduler._run_tvm(sample)
# res_pytorch = conv3x3_gathered_scheduler._run_pytorch(sample)
# for i in range(128):
#     for j in range(512):
#         if np.sum(res_tvm[i, j, :] - res_pytorch[i, j, :]) > 1e-2:
#             print(i, j)

# Last expression of the cell, so its return value is what the notebook displays.
conv3x3_gathered_scheduler.evaluate()

0.0005954454

In [None]:
# test gather scheduler
# The meaning of the four numeric constructor arguments is not visible in this
# notebook -- TODO confirm against GatherScheduler's signature.
gather_scheduler = GatherScheduler(64, 512, 128, 2, "log/gather.log")
gather_scheduler.n_trial = 10
gather_scheduler.autotune(refresh=True)
gather_scheduler.build(display=False)
# Unlike the conv cells, check() is called with its defaults: no runtype is
# passed and rtol/atol are not overridden on the scheduler.
gather_scheduler.check()
# Last expression of the cell, so its return value is what the notebook displays.
gather_scheduler.evaluate()

  from pandas import MultiIndex, Int64Index


2.398621e-05