In [1]:
import tvm
from tvm import te
from tvm import rpc
import numpy as np
from tvm import topi
from tvm import testing
from tvm.contrib import utils

In [2]:
# Address of the RPC endpoint that the later cells upload to and run on.
host, port = "192.168.55.1", 9090

# Session handle reused below for upload, module loading and device access.
remote = rpc.connect(host, port)

In [3]:
# Element-wise vector add over a symbolic length `n`, so one compiled kernel
# can be called with arrays of different sizes.
n = te.var("n")

# Two 1-D inputs of length n, created in one pass; `name` is the only difference.
A, B = (te.placeholder((n,), name=label) for label in ("A", "B"))

# Output has the same shape as A; each element is the sum of the matching inputs.
C = te.compute(
    A.shape,
    lambda i: A[i] + B[i],
    name="C",
)

# Default schedule for C's op.
s = te.create_schedule(C.op)

In [4]:
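# Manual GPU schedule (split the axis by 64 and bind to block/thread indices).
# Left disabled: the build cell below uses topi.cuda.schedule_injective instead.
# bx, tx = s[C].split(C.op.axis[0], factor=64)
# s[C].bind(bx, te.thread_axis("blockIdx.x"))
# s[C].bind(tx, te.thread_axis("threadIdx.x"))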

In [8]:
# Clear any previous result first, so `func` is None if anything below raises.
func = None

# Cross-compilation target: CUDA device code for -arch=sm_62, with host-side
# code generated by LLVM for the aarch64-linux-gnu triple.
cuda_tgt = tvm.target.Target(
    target='cuda -arch=sm_62',
    host='llvm -mtriple=aarch64-linux-gnu',
)

with cuda_tgt:
    # Replace the default schedule with TOPI's injective CUDA schedule,
    # created while the target scope is active.
    s = topi.cuda.schedule_injective([C])
    func = tvm.build(s, [A, B, C], cuda_tgt)

In [9]:
# Lower the scheduled computation and print the resulting IR so the generated
# blockIdx/threadIdx bindings can be inspected.
lowered_ir = tvm.lower(s, [A, B, C])
print(lowered_ir)

@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [(stride: int32*n: int32)], [], type="auto"),
             B: Buffer(B_2: Pointer(float32), float32, [(stride_1: int32*n)], [], type="auto"),
             C: Buffer(C_2: Pointer(float32), float32, [(stride_2: int32*n)], [], type="auto")}
  buffer_map = {A_1: A, B_1: B, C_1: C}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [n], [stride], type="auto"), B_1: B_3: Buffer(B_2, float32, [n], [stride_1], type="auto"), C_1: C_3: Buffer(C_2, float32, [n], [stride_2], type="auto")} {
  attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = floordiv((n + 511), 512);
  attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 512;
  if (blockIdx.x < floordiv(n, 512)) {
    C[(((blockIdx.x*512) + threadIdx.x)*str

In [10]:
# Export the locally built module as a tarball in a scratch directory so the
# next cell can upload it through the RPC session.
# NOTE(review): presumably the directory is cleaned up once `temp` is released,
# so keep this reference alive until the upload has happened — confirm.
temp = utils.tempdir()
# The same file name ("lib2.tar") is passed to remote.load_module later on.
path = temp.relpath("lib2.tar")
func.export_library(path)

In [11]:
# Send the exported tarball through the RPC session, then load it remotely.
remote.upload(path)
# NOTE(review): this rebinds `func` from the locally built module to the value
# returned by remote.load_module. Re-running the export cell after this point
# would call export_library on the remote handle; re-run the build cell first
# if the library needs to be exported again.
func = remote.load_module("lib2.tar")

In [15]:
# Run the remotely loaded kernel on the remote CUDA device and check the
# result against a NumPy reference computed from the same inputs.
dev = remote.cuda()

# Concrete vector length for this run. It has its own name so the symbolic
# `n` (te.var) used to declare the placeholders is not overwritten.
num_elems = 1024

# Seeded generator: the random inputs are identical on every run, so a
# failing comparison can be reproduced.
rng = np.random.default_rng(0)
a = tvm.nd.array(rng.uniform(size=num_elems).astype(A.dtype), dev)
b = tvm.nd.array(rng.uniform(size=num_elems).astype(B.dtype), dev)
c = tvm.nd.array(np.zeros(num_elems, dtype=C.dtype), dev)

# The module takes the two inputs followed by the output buffer, matching the
# [A, B, C] argument list it was built with.
func(a, b, c)
tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())