- RPC
    - SSH port forwarding
        - Object (Machine): `ssh -L [LOCAL_IP:]LOCAL_PORT:DESTINATION:DESTINATION_PORT [USER@]SSH_SERVER`
        - Tracker (Host): `ssh -N -R tcp:9190:0.0.0.0:9190 tx2`
        - Server (Tx2): `ssh -N -R tcp:9090:0.0.0.0:9090 pc`
    - Python script
        - Tracker
        - Server

In [1]:
import tvm
from tvm import te
import tvm.relay as relay
from tvm import rpc
from tvm.contrib import utils, graph_executor as runtime
from tvm.contrib.download import download_testdata
from mxnet.gluon.model_zoo.vision import get_model
from PIL import Image
import numpy as np
from tvm.relay import testing

# Load a pretrained "resnet18_v1" from the MXNet Gluon model zoo in one line.
# NOTE(review): `block` is not referenced by any of the cells shown in this notebook;
# the module that gets compiled is built separately via relay.testing — confirm
# whether this load is still needed.
block = get_model("resnet18_v1", pretrained=True)

In [2]:
# Alternative (disabled): connect straight to a single RPC server instead of a tracker.
# remote = rpc.connect("192.168.55.1", 9090)
# Connect to the RPC tracker listening on localhost:9190.
# NOTE(review): presumably reachable through the SSH port forwarding set up
# before running the notebook — confirm.
tracker = tvm.rpc.connect_tracker("localhost", 9190)

In [3]:
# Display the tracker's current state (per-key queue info and registered servers)
# to check that the expected device key is available before requesting a session.
tracker.summary()

{'queue_info': {'tx2': {'free': 1, 'pending': 0}},
 'server_info': [{'key': 'server:tx2', 'addr': ['127.0.0.1', 9090]}]}

In [4]:
# Ask the tracker for a session on a server registered under the key "tx2".
remote = tracker.request("tx2")

In [5]:
# Sanity check: does the remote session expose CPU device 0 and CUDA device 0?
remote.cpu(0).exist, remote.cuda(0).exist

(True, True)

In [6]:
# Workload configuration shared by the model definition and the input tensor.
batch_size = 1
num_class = 1000
image_shape = (3, 224, 224)
data_shape = (batch_size,) + image_shape
out_shape = (batch_size, num_class)

# Pass the class count and image shape explicitly so the Relay workload always
# agrees with data_shape / out_shape above instead of silently relying on
# get_workload's own defaults.
mod, params = relay.testing.resnet.get_workload(
    batch_size=batch_size,
    num_classes=num_class,
    num_layers=18,
    image_shape=image_shape,
)
# mod, params = relay.testing.vgg.get_workload(batch_size=batch_size, num_layers=11)

# set show_meta_data=True if you want to show meta data
# print(mod.astext(show_meta_data=False))

In [7]:
from tvm.relay.expr_functor import ExprMutator

class ScheduleConv2d(ExprMutator):
    """Relay expression mutator that annotates selected calls with a CUDA device.

    Despite the name, both ``nn.conv2d`` and ``nn.softmax`` calls are wrapped in
    an ``on_device`` annotation targeting ``tvm.cuda(0)``; every other call is
    returned as produced by the base ``ExprMutator``.
    """

    # Operator names whose calls receive the on_device annotation.
    _CUDA_OP_NAMES = ("nn.conv2d", "nn.softmax")

    def visit_call(self, expr):
        """Rewrite one call node.

        Parameters
        ----------
        expr : relay call expression being visited.

        Returns
        -------
        The call rebuilt by the base mutator, wrapped in
        ``relay.annotation.on_device(..., tvm.cuda(0))`` when the original
        call's operator is one of ``_CUDA_OP_NAMES``.
        """
        # Let the base class rebuild the call (and its arguments) first.
        rebuilt = super().visit_call(expr)

        wants_cuda = any(
            expr.op == tvm.relay.op.get(op_name) for op_name in self._CUDA_OP_NAMES
        )
        if not wants_cuda:
            return rebuilt

        # cannot relay.vm.compile when annotate on remote cuda, tvm cuda instead
        return relay.annotation.on_device(rebuilt, tvm.cuda(0))

In [8]:
# Run the device-annotation mutator over every global function in the module,
# using a fresh mutator instance per function, then re-run type inference on
# the rewritten module.
for global_var in mod.get_global_vars():
    annotated_func = ScheduleConv2d().visit(mod[global_var])
    mod[global_var] = annotated_func

infer_type = relay.transform.InferType()
mod = infer_type(mod)

In [9]:
# Compile the annotated module for the Relay VM with one target per device kind.
# Fallback device type: 1 for CPU, 2 for GPU.
build_config = {"relay.fallback_device_type": 1}

with tvm.transform.PassContext(opt_level=3, config=build_config):
    build_targets = {
        "cpu": tvm.target.Target("llvm -mtriple=x86_64-linux-gnu"),
        "cuda": tvm.target.Target("cuda -arch=sm_86"),
        # Alternative target pair (disabled):
        # "cpu": tvm.target.Target("llvm -mtriple=aarch64-linux-gnu"),
        # "cuda": tvm.target.Target("cuda -arch=sm_62")
    }
    exe = relay.vm.compile(mod, target=build_targets, params=params)

One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.


In [10]:
# temp = utils.tempdir()
# path = temp.relpath("vm_lib.tar")
# exe.mod.export_library(path)

In [11]:
# remote.upload(path)
# rexe = remote.load_module("vm_lib.tar")

In [12]:
# NOTE(review): the original note here read "remote devs here", but the devices
# below are created with tvm.cpu / tvm.cuda rather than remote.cpu / remote.cuda —
# presumably a placeholder for running the executable over RPC later; confirm.
devs = [tvm.cpu(0), tvm.cuda(0)]
# Build a VM around the compiled executable using the device list above.
vm = tvm.runtime.vm.VirtualMachine(exe, devs)

In [14]:
# Seed NumPy's global RNG so the random input tensor is the same on every run.
np.random.seed(0)
data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
# NOTE(review): `p` is not used by any cell shown in this notebook — confirm
# whether it can be dropped.
p = list(params.values())
# Place the input on CPU device 0 and run the VM's "main" function on it.
input_data = tvm.nd.array(data, tvm.cpu(0))
out = vm.invoke("main", input_data)

# Print first 10 elements of output
print(out.numpy().flatten()[0:10])

[0.00090203 0.0010247  0.00090637 0.00102216 0.00109492 0.00107084
 0.00105591 0.00095502 0.0011055  0.00113423]


In [None]:
import time

In [None]:
# Time one round trip of an input-sized float32 tensor to the remote CUDA device:
# allocation on the remote device, transfer over the bus and RPC, and the copy
# back to the host via .numpy().
# The random payload is created before the timer starts so that host-side data
# generation is not counted in the measurement.
payload = np.random.uniform(-1, 1, size=data_shape).astype("float32")

# perf_counter is a monotonic, high-resolution clock intended for measuring intervals.
st = time.perf_counter()
tvm.nd.array(payload, remote.cuda()).numpy()
ed = time.perf_counter()

# Elapsed time in milliseconds (displayed as the cell's result).
(ed - st) * 1000

82.9918384552002