In [40]:
import numpy as np
import tvm
from tvm import relax
from tvm.ir.module import IRModule
from tvm.script import relax as R
from tvm.script import tir as T

In [41]:
import IPython

def code2html(code):
    """Render a Python source string as syntax-highlighted HTML.

    Parameters
    ----------
    code : str
        Source text to highlight; it is lexed with pygments' ``Python3Lexer``.

    Returns
    -------
    str
        A ``<style>`` element holding the formatter's ``.highlight`` CSS rules,
        immediately followed by the highlighted markup and a trailing newline.
    """
    # Function-scope imports: pygments is loaded when the helper is called.
    from pygments import highlight
    from pygments.formatters import HtmlFormatter
    from pygments.lexers import Python3Lexer

    html_formatter = HtmlFormatter()
    highlighted = highlight(code, Python3Lexer(), html_formatter)
    css_rules = html_formatter.get_style_defs(".highlight")
    return "<style>%s</style>%s\n" % (css_rules, highlighted)

### 变换张量函数

In [42]:
# TVMScript IRModule containing one TensorIR function, `main`.
@tvm.script.ir_module
class MyModule:
    # `main` is a naive triple-loop matrix multiplication, C = A x B, over
    # 128x128 float32 buffers.
    @T.prim_func
    def main(
        A: T.Buffer((128, 128), "float32"),
        B: T.Buffer((128, 128), "float32"),
        C: T.Buffer((128, 128), "float32"),
    ):
        # Function attributes: exported symbol name "main"; "tir.noalias" set to True.
        T.func_attr({"global_symbol": "main", "tir.noalias": True})
        for i, j, k in T.grid(128, 128, 128):
            with T.block("C"):
                # "SSR": vi and vj are bound as spatial axes, vk as the reduction axis.
                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
                # Initialisation of the accumulator for this output element.
                with T.init():
                    C[vi, vj] = 0.0
                # Accumulate one product term of the dot product over vk.
                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]

In [43]:
def schedule_mm(sch: tvm.tir.Schedule, jfactor=4):
    """Apply a fixed tiling schedule to block "C" of function "main".

    The ``j`` loop is split into an outer and an inner loop whose inner extent
    is ``jfactor``, the loops are reordered to ``i, j_outer, k, j_inner``, and
    the reduction of block "C" is decomposed at the ``k`` loop.

    Parameters
    ----------
    sch : tvm.tir.Schedule
        Schedule to transform; it is modified in place.
    jfactor : int
        Extent of the inner ``j`` loop produced by the split (default 4).

    Returns
    -------
    tvm.tir.Schedule
        The same ``sch`` object, returned for convenience.
    """
    matmul_block = sch.get_block(name="C", func_name="main")
    loop_i, loop_j, loop_k = sch.get_loops(block=matmul_block)
    # `None` lets the outer extent be derived from the loop extent and jfactor.
    j_outer, j_inner = sch.split(loop=loop_j, factors=[None, jfactor])
    sch.reorder(loop_i, j_outer, loop_k, j_inner)
    sch.decompose_reduction(block=matmul_block, loop=loop_k)
    return sch

# Create a fresh schedule for MyModule, apply the fixed tiling, and show the
# transformed module as highlighted HTML (the cell's last expression).
sch = schedule_mm(tvm.tir.Schedule(MyModule))
IPython.display.HTML(code2html(sch.mod.script()))

In [44]:
# Inputs used by every timing cell below: two random 128x128 float32 matrices
# and an empty output array, each wrapped as a TVM NDArray.
dtype = "float32"
a_np, b_np = (np.random.rand(128, 128).astype(dtype) for _ in range(2))
a_nd, b_nd = tvm.nd.array(a_np), tvm.nd.array(b_np)
c_nd = tvm.nd.empty((128, 128), dtype=dtype)

In [45]:
# Compile the scheduled module for the "llvm" target and print the mean run
# time of "main" (reported by the time evaluator in seconds) in milliseconds.
lib = tvm.build(sch.mod, target="llvm")
f_timer_after = lib.time_evaluator("main", tvm.cpu())
print(
    "Time cost of MyModule=>schedule_mm: %.3f ms"
    % (1000 * f_timer_after(a_nd, b_nd, c_nd).mean)
)

Time cost of MyModule=>schedule_mm: 0.408 ms


### 跟踪变换历史轨迹

In [46]:
# Print the schedule primitives recorded on `sch` as a replayable `apply_trace` function.
print(sch.trace)

# from tvm import tir
def apply_trace(sch: tir.Schedule) -> None:
  b0 = sch.get_block(name="C", func_name="main")
  l1, l2, l3 = sch.get_loops(block=b0)
  l4, l5 = sch.split(loop=l2, factors=[None, 4], preserve_unit_iters=True, disable_predication=False)
  sch.reorder(l1, l4, l3, l5)
  b6 = sch.decompose_reduction(block=b0, loop=l3)


### Stochastic Schedule Transformation

In [47]:
def stochastic_schedule_mm(sch: tvm.tir.Schedule):
    """Tile block "C" of "main" with randomly sampled ``j`` tile factors.

    Performs the same steps as ``schedule_mm`` except that the two split
    factors for the ``j`` loop come from ``sample_perfect_tile`` rather than a
    fixed constant, so repeated calls can produce different schedules.

    Parameters
    ----------
    sch : tvm.tir.Schedule
        Schedule to transform; it is modified in place.

    Returns
    -------
    tvm.tir.Schedule
        The same ``sch`` object.
    """
    matmul_block = sch.get_block(name="C", func_name="main")
    loop_i, loop_j, loop_k = sch.get_loops(block=matmul_block)
    # Stochastic step: sample the two tile factors instead of hard-coding them.
    tile_factors = sch.sample_perfect_tile(loop=loop_j, n=2)
    j_outer, j_inner = sch.split(loop=loop_j, factors=tile_factors)
    sch.reorder(loop_i, j_outer, loop_k, j_inner)
    sch.decompose_reduction(block=matmul_block, loop=loop_k)
    return sch

In [48]:
# Apply the stochastic schedule to a fresh Schedule of MyModule. The sampled
# tile sizes, and therefore the displayed code, may differ between runs.
sch = stochastic_schedule_mm(tvm.tir.Schedule(MyModule))

IPython.display.HTML(code2html(sch.mod.script()))

In [49]:
# Print the trace; the sampling step shows the concrete tile sizes as `decision=[...]`.
print(sch.trace)

# from tvm import tir
def apply_trace(sch: tir.Schedule) -> None:
  b0 = sch.get_block(name="C", func_name="main")
  l1, l2, l3 = sch.get_loops(block=b0)
  v4, v5 = sch.sample_perfect_tile(loop=l2, n=2, max_innermost_factor=16, decision=[16, 8])
  l6, l7 = sch.split(loop=l2, factors=[v4, v5], preserve_unit_iters=True, disable_predication=False)
  sch.reorder(l1, l6, l3, l7)
  b8 = sch.decompose_reduction(block=b0, loop=l3)


### 随机变换搜索

In [50]:
def random_search(mod: tvm.IRModule, num_trials=5):
    """Sample random schedules, time each one, and return the fastest.

    Each trial applies ``stochastic_schedule_mm`` to a fresh schedule of
    ``mod``, builds it for the "llvm" target, and times "main" on the CPU
    device using the notebook-level arrays ``a_nd``, ``b_nd`` and ``c_nd``.
    The measured time and the trace of every trial are printed.

    Parameters
    ----------
    mod : tvm.IRModule
        Module whose "main" function is scheduled and timed.
    num_trials : int
        Number of random schedules to try (default 5).

    Returns
    -------
    tvm.tir.Schedule or None
        The schedule with the lowest mean run time; the earliest one wins on
        a tie. ``None`` when no trial is executed.
    """
    fastest_time, fastest_sch = None, None

    for trial in range(num_trials):
        candidate = stochastic_schedule_mm(tvm.tir.Schedule(mod))
        compiled = tvm.build(candidate.mod, target="llvm")
        timer = compiled.time_evaluator("main", tvm.cpu())
        elapsed = timer(a_nd, b_nd, c_nd).mean

        print("=====Attempt %d, time-cost: %.3f ms====" % (trial, elapsed * 1000))
        print(candidate.trace)

        # Keep the candidate only when it is strictly faster than the best so far.
        is_improvement = fastest_time is None or elapsed < fastest_time
        if is_improvement:
            fastest_time, fastest_sch = elapsed, candidate

    return fastest_sch

# Run the random search with its default of 5 trials; `sch` becomes the fastest schedule found.
sch = random_search(MyModule)

=====Attempt 0, time-cost: 0.276 ms====
# from tvm import tir
def apply_trace(sch: tir.Schedule) -> None:
  b0 = sch.get_block(name="C", func_name="main")
  l1, l2, l3 = sch.get_loops(block=b0)
  v4, v5 = sch.sample_perfect_tile(loop=l2, n=2, max_innermost_factor=16, decision=[16, 8])
  l6, l7 = sch.split(loop=l2, factors=[v4, v5], preserve_unit_iters=True, disable_predication=False)
  sch.reorder(l1, l6, l3, l7)
  b8 = sch.decompose_reduction(block=b0, loop=l3)
=====Attempt 1, time-cost: 1.596 ms====
# from tvm import tir
def apply_trace(sch: tir.Schedule) -> None:
  b0 = sch.get_block(name="C", func_name="main")
  l1, l2, l3 = sch.get_loops(block=b0)
  v4, v5 = sch.sample_perfect_tile(loop=l2, n=2, max_innermost_factor=16, decision=[128, 1])
  l6, l7 = sch.split(loop=l2, factors=[v4, v5], preserve_unit_iters=True, disable_predication=False)
  sch.reorder(l1, l6, l3, l7)
  b8 = sch.decompose_reduction(block=b0, loop=l3)
=====Attempt 2, time-cost: 0.143 ms====
# from tvm import tir
def

=====Attempt 4, time-cost: 1.585 ms====
# from tvm import tir
def apply_trace(sch: tir.Schedule) -> None:
  b0 = sch.get_block(name="C", func_name="main")
  l1, l2, l3 = sch.get_loops(block=b0)
  v4, v5 = sch.sample_perfect_tile(loop=l2, n=2, max_innermost_factor=16, decision=[128, 1])
  l6, l7 = sch.split(loop=l2, factors=[v4, v5], preserve_unit_iters=True, disable_predication=False)
  sch.reorder(l1, l6, l3, l7)
  b8 = sch.decompose_reduction(block=b0, loop=l3)


In [51]:
print(sch.trace) ## The search went through several sampled choices and returned the best run of the five trials.

# from tvm import tir
def apply_trace(sch: tir.Schedule) -> None:
  b0 = sch.get_block(name="C", func_name="main")
  l1, l2, l3 = sch.get_loops(block=b0)
  v4, v5 = sch.sample_perfect_tile(loop=l2, n=2, max_innermost_factor=16, decision=[8, 16])
  l6, l7 = sch.split(loop=l2, factors=[v4, v5], preserve_unit_iters=True, disable_predication=False)
  sch.reorder(l1, l6, l3, l7)
  b8 = sch.decompose_reduction(block=b0, loop=l3)


### TVM's Meta-Schedule

meta_schedule 是支持搜索可能变换空间的命名空间。Meta-Schedule 在幕后做了很多额外的事情：

- 跨越多个进程的并行基准测试。

- 使用代价模型 (cost model) 来避免每次都进行基准测试。

- 基于历史轨迹进行遗传搜索 (evolutionary search)，而不是每次都随机采样。

In [55]:
from tvm import meta_schedule as ms

# Tune MyModule with meta-schedule, restricting the search space to the
# schedules generated by `stochastic_schedule_mm` (i.e. only the tiling of
# the j loop is tuned).
database = ms.tune_tir(
    mod=MyModule,
    space=ms.space_generator.ScheduleFn(stochastic_schedule_mm),  # tune tiling only
    target="llvm --num-cores=1",  # TODO: look into what this target option does
    work_dir="./tune_tmp",
    max_trials_global=64,
    num_trials_per_iter=64,
    # task_name="main"
)

# Retrieve a schedule for MyModule from the tuning database.
sch = ms.tir_integration.compile_tir(
    database=database,
    mod=MyModule,
    target="llvm --num-cores=1",
)



2024-08-01 08:47:33 [INFO] Logging directory: ./tune_tmp/logs
2024-08-01 08:47:33 [INFO] LocalBuilder: max_workers = 8


[08:47:33] /home/ningzhangcaltech/Github_Repo/tvm/src/target/llvm/llvm_instance.cc:226: Error: Using LLVM 19.0.0git with `-mcpu=apple-latest` is not valid in `-mtriple=arm64-apple-macos`, using default `-mcpu=generic`
[08:47:33] /home/ningzhangcaltech/Github_Repo/tvm/src/target/llvm/llvm_instance.cc:226: Error: Using LLVM 19.0.0git with `-mcpu=apple-latest` is not valid in `-mtriple=arm64-apple-macos`, using default `-mcpu=generic`
[08:47:33] /home/ningzhangcaltech/Github_Repo/tvm/src/target/llvm/llvm_instance.cc:226: Error: Using LLVM 19.0.0git with `-mcpu=apple-latest` is not valid in `-mtriple=arm64-apple-macos`, using default `-mcpu=generic`


2024-08-01 08:47:34 [INFO] LocalRunner: max_workers = 1
2024-08-01 08:47:36 [INFO] [task_scheduler.cc:159] Initializing Task #0: "main"


Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,main,4194304,1,,,,0,


2024-08-01 08:47:36 [DEBUG] [task_scheduler.cc:318] 
 ID | Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
------------------------------------------------------------------------------------------------------
  0 | main | 4194304 |      1 |            N/A |          N/A |                   N/A |      0 |      
------------------------------------------------------------------------------------------------------
Total trials: 0
Total latency (us): 0


Total trials: 0
Total latency (us): 0

2024-08-01 08:47:36 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: "main"
2024-08-01 08:47:36 [INFO] [task_scheduler.cc:193] Sending 5 sample(s) to builder


[08:47:37] /home/ningzhangcaltech/Github_Repo/tvm/src/target/llvm/llvm_instance.cc:226: Error: Using LLVM 19.0.0git with `-mcpu=apple-latest` is not valid in `-mtriple=arm64-apple-macos`, using default `-mcpu=generic`
[08:47:37] /home/ningzhangcaltech/Github_Repo/tvm/src/target/llvm/llvm_instance.cc:226: Error: Using LLVM 19.0.0git with `-mcpu=apple-latest` is not valid in `-mtriple=arm64-apple-macos`, using default `-mcpu=generic`
[08:47:37] /home/ningzhangcaltech/Github_Repo/tvm/src/target/llvm/llvm_instance.cc:226: Error: Using LLVM 19.0.0git with `-mcpu=apple-latest` is not valid in `-mtriple=arm64-apple-macos`, using default `-mcpu=generic`
[08:47:37] /home/ningzhangcaltech/Github_Repo/tvm/src/target/llvm/llvm_instance.cc:226: Error: Using LLVM 19.0.0git with `-mcpu=apple-latest` is not valid in `-mtriple=arm64-apple-macos`, using default `-mcpu=generic`
[08:47:37] /home/ningzhangcaltech/Github_Repo/tvm/src/target/llvm/llvm_instance.cc:226: Error: Using LLVM 19.0.0git with `-mcpu=

2024-08-01 08:47:38 [INFO] [task_scheduler.cc:195] Sending 5 sample(s) to runner
2024-08-01 08:47:40 [DEBUG] XGB iter   0: tr-p-rmse: 0.305532	tr-a-peak@32: 0.936407	tr-rmse: 0.326114	tr-rmse: 0.326114
2024-08-01 08:47:40 [DEBUG] XGB iter  25: tr-p-rmse: 0.135544	tr-a-peak@32: 1.000000	tr-rmse: 0.078977	tr-rmse: 0.078977
2024-08-01 08:47:40 [DEBUG] XGB iter  50: tr-p-rmse: 0.130743	tr-a-peak@32: 1.000000	tr-rmse: 0.076376	tr-rmse: 0.076376
2024-08-01 08:47:40 [DEBUG] XGB iter  75: tr-p-rmse: 0.130704	tr-a-peak@32: 1.000000	tr-rmse: 0.076376	tr-rmse: 0.076376
2024-08-01 08:47:40 [DEBUG] XGB iter 100: tr-p-rmse: 0.130703	tr-a-peak@32: 1.000000	tr-rmse: 0.076376	tr-rmse: 0.076376
2024-08-01 08:47:40 [DEBUG] XGB iter 125: tr-p-rmse: 0.130703	tr-a-peak@32: 1.000000	tr-rmse: 0.076376	tr-rmse: 0.076376
2024-08-01 08:47:40 [DEBUG] XGB stopped. Best iteration: [81] tr-p-rmse:0.13070	tr-a-peak@32:1.00000	tr-rmse:0.07638	tr-rmse:0.07638 
2024-08-01 08:47:40 [INFO] [task_scheduler.cc:237] [Updated

Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,main,4194304,1,24.708,169.7549,169.7549,5,



Total trials: 5
Total latency (us): 169.755

2024-08-01 08:47:40 [DEBUG] [task_scheduler.cc:318] 
 ID | Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
------------------------------------------------------------------------------------------------------
  0 | main | 4194304 |      1 |        24.7080 |     169.7549 |              169.7549 |      5 |      
------------------------------------------------------------------------------------------------------
Total trials: 5
Total latency (us): 169.755

2024-08-01 08:47:40 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: "main"
2024-08-01 08:47:40 [INFO] [task_scheduler.cc:193] Sending 0 sample(s) to builder
2024-08-01 08:47:40 [INFO] [task_scheduler.cc:195] Sending 0 sample(s) to runner
2024-08-01 08:47:40 [INFO] [task_scheduler.cc:237] [Updated] Task #0: "main"


Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,main,4194304,1,24.708,169.7549,169.7549,5,


2024-08-01 08:47:40 [DEBUG] [task_scheduler.cc:318] 
 ID | Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
------------------------------------------------------------------------------------------------------
  0 | main | 4194304 |      1 |        24.7080 |     169.7549 |              169.7549 |      5 |      
------------------------------------------------------------------------------------------------------
Total trials: 5
Total latency (us): 169.755


Total trials: 5
Total latency (us): 169.755

2024-08-01 08:47:40 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: "main"
2024-08-01 08:47:41 [INFO] [task_scheduler.cc:193] Sending 0 sample(s) to builder
2024-08-01 08:47:41 [INFO] [task_scheduler.cc:195] Sending 0 sample(s) to runner
2024-08-01 08:47:41 [INFO] [task_scheduler.cc:237] [Updated] Task #0: "main"


Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,main,4194304,1,24.708,169.7549,169.7549,5,


2024-08-01 08:47:41 [DEBUG] [task_scheduler.cc:318] 
 ID | Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
------------------------------------------------------------------------------------------------------
  0 | main | 4194304 |      1 |        24.7080 |     169.7549 |              169.7549 |      5 |      
------------------------------------------------------------------------------------------------------
Total trials: 5
Total latency (us): 169.755


Total trials: 5
Total latency (us): 169.755

2024-08-01 08:47:41 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: "main"
2024-08-01 08:47:41 [INFO] [task_scheduler.cc:193] Sending 0 sample(s) to builder
2024-08-01 08:47:41 [INFO] [task_scheduler.cc:195] Sending 0 sample(s) to runner
2024-08-01 08:47:41 [INFO] [task_scheduler.cc:237] [Updated] Task #0: "main"


Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,main,4194304,1,24.708,169.7549,169.7549,5,



Total trials: 5
Total latency (us): 169.755

2024-08-01 08:47:41 [DEBUG] [task_scheduler.cc:318] 
 ID | Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
------------------------------------------------------------------------------------------------------
  0 | main | 4194304 |      1 |        24.7080 |     169.7549 |              169.7549 |      5 |      
------------------------------------------------------------------------------------------------------
Total trials: 5
Total latency (us): 169.755

2024-08-01 08:47:41 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: "main"
2024-08-01 08:47:42 [INFO] [task_scheduler.cc:193] Sending 0 sample(s) to builder
2024-08-01 08:47:42 [INFO] [task_scheduler.cc:195] Sending 0 sample(s) to runner
2024-08-01 08:47:42 [INFO] [task_scheduler.cc:237] [Updated] Task #0: "main"


Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,main,4194304,1,24.708,169.7549,169.7549,5,


2024-08-01 08:47:42 [DEBUG] [task_scheduler.cc:318] 
 ID | Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
------------------------------------------------------------------------------------------------------
  0 | main | 4194304 |      1 |        24.7080 |     169.7549 |              169.7549 |      5 |      
------------------------------------------------------------------------------------------------------
Total trials: 5
Total latency (us): 169.755


Total trials: 5
Total latency (us): 169.755

2024-08-01 08:47:42 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: "main"
2024-08-01 08:47:42 [INFO] [task_scheduler.cc:260] Task #0 has finished. Remaining task(s): 0


Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,main,4194304,1,24.708,169.7549,169.7549,5,Y


2024-08-01 08:47:42 [DEBUG] [task_scheduler.cc:318] 
 ID | Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
------------------------------------------------------------------------------------------------------
  0 | main | 4194304 |      1 |        24.7080 |     169.7549 |              169.7549 |      5 |    Y 
------------------------------------------------------------------------------------------------------
Total trials: 5
Total latency (us): 169.755


Total trials: 5
Total latency (us): 169.755



In [56]:
# Display the trace of the current `sch` (assigned from `compile_tir` in the tuning cell).
sch.trace.show()

In [54]:
# Render the module of the current `sch` as highlighted HTML.
# NOTE(review): this cell's execution count (54) precedes the tuning cell (55);
# re-run it after tuning to confirm the output reflects the tuned schedule.
IPython.display.HTML(code2html(sch.mod.script())) 

In [57]:
# Build the tuned schedule for the "llvm" target and print the mean run time
# of "main" in milliseconds (the evaluator reports seconds).
lib = tvm.build(sch.mod, target="llvm")
f_timer_after = lib.time_evaluator("main", tvm.cpu())
print(
    "Time cost of MyModule after tuning: %.3f ms"
    % (1000 * f_timer_after(a_nd, b_nd, c_nd).mean)
)

Time cost of MyModule after tuning: 0.136 ms


### 默认调度器

In [59]:
# Tune MyModule again, this time without a `space=` argument, so meta-schedule
# uses its default search-space generator instead of `stochastic_schedule_mm`.
#
# A dedicated work_dir is used on purpose. tune_tir keeps its JSON tuning
# database inside work_dir, and the ScheduleFn-restricted experiment already
# writes records for this same workload to "./tune_tmp". Sharing the directory
# would mix the records of both experiments, so on a re-run `compile_tir` in
# the restricted experiment could return a schedule found by this one.
database = ms.tune_tir(
    mod=MyModule,
    target="llvm --num-cores=1",
    max_trials_global=64,
    num_trials_per_iter=64,
    work_dir="./tune_tmp_default",
    # task_name="main",
)
# Retrieve a schedule for MyModule from this experiment's tuning database.
sch = ms.tir_integration.compile_tir(database, MyModule, "llvm --num-cores=1")

2024-08-01 08:55:17 [INFO] Logging directory: ./tune_tmp/logs
2024-08-01 08:55:17 [INFO] LocalBuilder: max_workers = 8


[08:55:17] /home/ningzhangcaltech/Github_Repo/tvm/src/target/llvm/llvm_instance.cc:226: Error: Using LLVM 19.0.0git with `-mcpu=apple-latest` is not valid in `-mtriple=arm64-apple-macos`, using default `-mcpu=generic`
[08:55:17] /home/ningzhangcaltech/Github_Repo/tvm/src/target/llvm/llvm_instance.cc:226: Error: Using LLVM 19.0.0git with `-mcpu=apple-latest` is not valid in `-mtriple=arm64-apple-macos`, using default `-mcpu=generic`
[08:55:17] /home/ningzhangcaltech/Github_Repo/tvm/src/target/llvm/llvm_instance.cc:226: Error: Using LLVM 19.0.0git with `-mcpu=apple-latest` is not valid in `-mtriple=arm64-apple-macos`, using default `-mcpu=generic`


2024-08-01 08:55:18 [INFO] LocalRunner: max_workers = 1
2024-08-01 08:55:20 [INFO] [task_scheduler.cc:159] Initializing Task #0: "main"


Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,main,4194304,1,,,,0,



Total trials: 0
Total latency (us): 0

2024-08-01 08:55:20 [DEBUG] [task_scheduler.cc:318] 
 ID | Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
------------------------------------------------------------------------------------------------------
  0 | main | 4194304 |      1 |            N/A |          N/A |                   N/A |      0 |      
------------------------------------------------------------------------------------------------------
Total trials: 0
Total latency (us): 0

2024-08-01 08:55:20 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: "main"
2024-08-01 08:55:21 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder


[08:55:22] /home/ningzhangcaltech/Github_Repo/tvm/src/target/llvm/llvm_instance.cc:226: Error: Using LLVM 19.0.0git with `-mcpu=apple-latest` is not valid in `-mtriple=arm64-apple-macos`, using default `-mcpu=generic`
[08:55:22] /home/ningzhangcaltech/Github_Repo/tvm/src/target/llvm/llvm_instance.cc:226: Error: Using LLVM 19.0.0git with `-mcpu=apple-latest` is not valid in `-mtriple=arm64-apple-macos`, using default `-mcpu=generic`
[08:55:22] /home/ningzhangcaltech/Github_Repo/tvm/src/target/llvm/llvm_instance.cc:226: Error: Using LLVM 19.0.0git with `-mcpu=apple-latest` is not valid in `-mtriple=arm64-apple-macos`, using default `-mcpu=generic`
[08:55:22] /home/ningzhangcaltech/Github_Repo/tvm/src/target/llvm/llvm_instance.cc:226: Error: Using LLVM 19.0.0git with `-mcpu=apple-latest` is not valid in `-mtriple=arm64-apple-macos`, using default `-mcpu=generic`
[08:55:22] /home/ningzhangcaltech/Github_Repo/tvm/src/target/llvm/llvm_instance.cc:226: Error: Using LLVM 19.0.0git with `-mcpu=

2024-08-01 08:55:29 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner
2024-08-01 08:55:46 [DEBUG] XGB iter   0: tr-p-rmse: 0.401148	tr-a-peak@32: 0.987840	tr-rmse: 0.290605	tr-rmse: 0.290605
2024-08-01 08:55:46 [DEBUG] XGB iter  25: tr-p-rmse: 0.043304	tr-a-peak@32: 0.999240	tr-rmse: 0.319908	tr-rmse: 0.319908
2024-08-01 08:55:46 [DEBUG] XGB iter  50: tr-p-rmse: 0.043307	tr-a-peak@32: 0.999240	tr-rmse: 0.319904	tr-rmse: 0.319904
2024-08-01 08:55:46 [DEBUG] XGB stopped. Best iteration: [20] tr-p-rmse:0.04328	tr-a-peak@32:0.99924	tr-rmse:0.31994	tr-rmse:0.31994 
2024-08-01 08:55:46 [INFO] [task_scheduler.cc:237] [Updated] Task #0: "main"


Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,main,4194304,1,119.6098,35.0666,35.0666,64,



Total trials: 64
Total latency (us): 35.0666

2024-08-01 08:55:46 [DEBUG] [task_scheduler.cc:318] 
 ID | Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
------------------------------------------------------------------------------------------------------
  0 | main | 4194304 |      1 |       119.6098 |      35.0666 |               35.0666 |     64 |      
------------------------------------------------------------------------------------------------------
Total trials: 64
Total latency (us): 35.0666

2024-08-01 08:55:46 [INFO] [task_scheduler.cc:260] Task #0 has finished. Remaining task(s): 0


Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,main,4194304,1,119.6098,35.0666,35.0666,64,Y



Total trials: 64
Total latency (us): 35.0666

2024-08-01 08:55:46 [DEBUG] [task_scheduler.cc:318] 
 ID | Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
------------------------------------------------------------------------------------------------------
  0 | main | 4194304 |      1 |       119.6098 |      35.0666 |               35.0666 |     64 |    Y 
------------------------------------------------------------------------------------------------------
Total trials: 64
Total latency (us): 35.0666



In [66]:
# Build the schedule found with the default search space and print the mean
# run time of "main" in milliseconds (the evaluator reports seconds).
lib = tvm.build(sch.mod, target="llvm")
f_timer_after = lib.time_evaluator("main", tvm.cpu())
print(
    "Time cost of MyModule after tuning: %.3f ms"
    % (1000 * f_timer_after(a_nd, b_nd, c_nd).mean)
)

#for _ in range(1000000):
#    f_timer_after = lib.time_evaluator("main", tvm.cpu())

Time cost of MyModule after tuning: 0.034 ms


In [61]:
# Display the trace of the current `sch`.
# NOTE(review): execution count 61 is lower than the preceding timing cell (66);
# confirm the cell order on a fresh run.
sch.trace.show()

In [67]:
# Render the module of the current `sch` as highlighted HTML.
IPython.display.HTML(code2html(sch.mod.script()))