In [2]:
from brt.jit.tvm import TVMTuner
from brt.common import BRT_CACHE_PATH
from brt.common import log

log.set_level("jit", "DEBUG")

tuner = TVMTuner()
onnx_model = "sparse_fusion_2_thor_model"
tuner.import_onnx_netlet(onnx_model)
tuner.export_netlet_template()

Get devices for measurement successfully!
DEBUG:brainstorm.jit:kernel args: [[8, 64, 64], [8, 64, 64], [8, 64, 64]]


In [None]:
from brt.jit import CUDACompiler
from brt.common import BRT_KERNEL_TEMPLATE_PATH
import torch

kernel_name = "sparse_fusion_2_thor_model"

kernel_template_filename = str(BRT_KERNEL_TEMPLATE_PATH / (kernel_name + ".cu"))

kernel_template_source = open(kernel_template_filename, "r").read()
kernel_func = CUDACompiler.generate_kernel(
    keyword_dict=None, template=kernel_template_source
)
data = torch.ones((8, 64, 64), device="cuda")
weight = torch.ones((8, 64, 64), device="cuda")
outdata = torch.ones((8, 64, 64), device="cuda")
kernel_func(data, weight, outdata)
print(outdata.shape)

In [1]:
from brt.jit.compiler import CUDACompiler
from brt.common import BRT_KERNEL_TEMPLATE_PATH
import time
import torch

kernel_name = "sample"

kernel_template_filename = str(BRT_KERNEL_TEMPLATE_PATH / (kernel_name + ".cu"))

kernel_template_source = open(kernel_template_filename, "r").read()
# print(kernel_template_source)
kernel_func = CUDACompiler.generate_kernel(
    {"batch_num": 2, "num_samples": 2}, kernel_template_source,
)
data = torch.ones((8, 64, 64), device="cuda")
weight = torch.ones((8, 64, 64), device="cuda")
outdata = torch.ones((8, 64, 64), device="cuda")
start_stamp = time.time()
kernel_func(data, weight, outdata)
end_stamp = time.time()
print("first time: {:.3f}".format((end_stamp - start_stamp) * 1000))

start_stamp = time.time()
kernel_func(data, weight, outdata)
end_stamp = time.time()
print("second time: {:.3f}".format((end_stamp - start_stamp) * 1000))


first time: 479.166
second time: 0.067


In [2]:
from brt.common import BRT_KERNEL_TEMPLATE_PATH
from brt.jit import RawFunction
from brt.jit.compiler import CUDACompiler
import torch

kernel_name = "sample"

kernel_template_filename = str(BRT_KERNEL_TEMPLATE_PATH / (kernel_name + ".cu"))

kernel_template_source = open(kernel_template_filename, "r").read()

# print(kernel_template_source)
raw_function = RawFunction(kernel_template_source)

# code = generic_function.get_code("device", 0, "asm")
code = raw_function.get_code("global")
processed_template_fname = str(
    BRT_KERNEL_TEMPLATE_PATH / ("processed_" + kernel_name + ".cu")
)
with open(processed_template_fname, "w") as f:
    f.write(code)

data = torch.ones((8, 64, 64), device="cuda")
weight = torch.ones((8, 64, 64), device="cuda")
outdata = torch.ones((8, 64, 64), device="cuda")
func = CUDACompiler.generate_kernel(None, code)

torch.cuda.synchronize()
stream = torch.cuda.default_stream()
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

start_event.record(stream)
func(data, weight, outdata)
end_event.record(stream)
stream.synchronize()
print("first time: {:.3f}".format(start_event.elapsed_time(end_event)))

start_event.record(stream)
for i in range(100):
    func(data, weight, outdata)
end_event.record(stream)
stream.synchronize()
print("forward time: {:.3f}".format(start_event.elapsed_time(end_event)/100))


print(outdata)

first time: 468.030
forward time: 0.003
tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        ...,

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  

In [2]:
import torch
from brt.jit import HorizFuseFunction
from brt.common import BRT_KERNEL_TEMPLATE_PATH, log
from brt.jit.compiler import CUDACompiler
import time

log.set_level("jit", "DEBUG")

kernel_name = "sample"

kernel_template_filename = str(BRT_KERNEL_TEMPLATE_PATH / (kernel_name + ".cu"))

kernel_template_source = open(kernel_template_filename, "r").read()

fuser = HorizFuseFunction([kernel_template_source, kernel_template_source])

code = fuser.get_code()

processed_template_fname = str(
    BRT_KERNEL_TEMPLATE_PATH / ("processed_" + kernel_name + ".cu")
)
with open(processed_template_fname, "w") as f:
    f.write(code)

fused_matmul = CUDACompiler.generate_kernel(None, code)

data_0 = torch.ones((8, 64, 64), device="cuda")
weight_0 = torch.ones((8, 64, 64), device="cuda")
outdata_0 = torch.ones((8, 64, 64), device="cuda")
data_1 = torch.ones((8, 64, 64), device="cuda")
weight_1 = torch.ones((8, 64, 64), device="cuda")
outdata_1 = torch.ones((8, 64, 64), device="cuda")
torch.cuda.synchronize()
stream = torch.cuda.default_stream()
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record(stream)
fused_matmul(
    data_0, weight_0, outdata_0, data_1, weight_1, outdata_1,
)
end_event.record(stream)
stream.synchronize()
print("first time: {:.3f}".format(start_event.elapsed_time(end_event)))

start_event.record(stream)
for i in range(100):
    fused_matmul(
        data_0,
        weight_0,
        outdata_0,
        data_1,
        weight_1,
        outdata_1,
    )
end_event.record(stream)
stream.synchronize()
print("forward time: {:.3f}".format(start_event.elapsed_time(end_event) / 100))

DEBUG:brainstorm.jit:Fusing blocks from 0 to 255 for 0-th block
DEBUG:brainstorm.jit:Fusing blocks from 256 to 511 for 1-th block
captured blocks: 512, 1, 1
captured threads: 32, 1, 1
gridsize: 0
blocksize: 0
first time: 869.802
forward time: 0.007


In [1]:
import torch
from brt.jit import HeteroFuseFunction
from brt.common import BRT_KERNEL_TEMPLATE_PATH, log
from brt.jit.compiler import CUDACompiler
import time

log.set_level("jit", "DEBUG")

kernel_name = "sample"

kernel_template_filename = str(BRT_KERNEL_TEMPLATE_PATH / (kernel_name + ".cu"))

kernel_template_source = open(kernel_template_filename, "r").read()

fuser = HeteroFuseFunction([kernel_template_source, kernel_template_source])

code = fuser.get_code()

processed_template_fname = str(
    BRT_KERNEL_TEMPLATE_PATH / ("processed_" + kernel_name + ".cu")
)
with open(processed_template_fname, "w") as f:
    f.write(code)

fused_matmul = CUDACompiler.generate_kernel(None, code)

data_0 = torch.ones((8, 64, 64), device="cuda")
weight_0 = torch.ones((8, 64, 64), device="cuda")
outdata_0 = torch.ones((8, 64, 64), device="cuda")
data_1 = torch.ones((8, 64, 64), device="cuda")
weight_1 = torch.ones((8, 64, 64), device="cuda")
outdata_1 = torch.ones((8, 64, 64), device="cuda")
torch.cuda.synchronize()
stream = torch.cuda.default_stream()
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record(stream)
active_blocks = [1, 0]
fused_matmul(
    data_0,
    weight_0,
    outdata_0,
    data_1,
    weight_1,
    outdata_1,
    active_blocks=active_blocks,
)
end_event.record(stream)
stream.synchronize()
print("first time: {:.3f}".format(start_event.elapsed_time(end_event)))

start_event.record(stream)
for i in range(100):
    fused_matmul(
        data_0,
        weight_0,
        outdata_0,
        data_1,
        weight_1,
        outdata_1,
        active_blocks=active_blocks,
    )
end_event.record(stream)
stream.synchronize()
print("forward time: {:.3f}".format(start_event.elapsed_time(end_event) / 100))


first time: 843.554
forward time: 0.005


In [1]:
import torch
from brt.jit import HomoFuseFunctionV1
from brt.common import BRT_KERNEL_TEMPLATE_PATH, log
from brt.jit.compiler import CUDACompiler
import time

log.set_level("jit", "DEBUG")

kernel_name = "sample"

fuser = HomoFuseFunction(
    homo_func_name=kernel_name,
    branch_num=4,
    capacities=[1, 2, 3],
    shared_arg_indices=[0, 2],
)

fuser.fuse()
print(fuser.args)

code = fuser.get_code()
processed_template_fname = str(
    BRT_KERNEL_TEMPLATE_PATH / ("processed_" + kernel_name + ".cu")
)
with open(processed_template_fname, "w") as f:
    f.write(code)


ImportError: cannot import name 'HomoFuseFunction' from 'brt.jit.homo_fuse' (/home/whcui/brainstorm_project/brainstorm/python/brt/jit/homo_fuse.py)

In [2]:

fused_matmul = CUDACompiler.generate_kernel(None, code)

data_0 = torch.ones((8, 64, 64), device="cuda")
weight_0 = torch.ones((8, 64, 64), device="cuda")
outdata_0 = torch.ones((8, 64, 64), device="cuda")
data_1 = torch.ones((8, 64, 64), device="cuda")
weight_1 = torch.ones((8, 64, 64), device="cuda")
outdata_1 = torch.ones((8, 64, 64), device="cuda")
shared_inputs = [data_0, outdata_0]
weights=[weight_0, weight_1]
torch.cuda.synchronize()
stream = torch.cuda.default_stream()
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record(stream)
active_blocks = [1, 0]
fused_matmul(
    shared_inputs=shared_inputs,
    weights=weights,
    active_blocks=active_blocks,
)
end_event.record(stream)
stream.synchronize()
print("first time: {:.3f}".format(start_event.elapsed_time(end_event)))

start_event.record(stream)
for i in range(100):
    fused_matmul(
    shared_inputs=shared_inputs,
    weights=weights,
    active_blocks=active_blocks,
)
end_event.record(stream)
stream.synchronize()
print("forward time: {:.3f}".format(start_event.elapsed_time(end_event) / 100))

: 

: 