# comparisons

This notebook has some timing comparisons between versions of models optimized with TVM and ones that went without. For the sake of simplicity I'm not going to retrain the models first, I'm just going to grab existing model artifacts.

In [12]:
%%writefile .gitignore

mobilenet/
unet/
tweet-sentiment-extraction/

Writing .gitignore


## testing

In [6]:
# %pip install spell

In [7]:
import spell

In [8]:
import spell.client
# Build the Spell API client used below to download run artifacts.
# NOTE(review): from_environment() presumably reads the credentials of the
# current Spell session — confirm against the spell.client documentation.
client = spell.client.from_environment()

In [22]:
# Copy the resources stored under "runs/480" into the local mobilenet/ directory
# (the cell output lists the checkpoint files that get extracted).
client.resources.cp("runs/480", "mobilenet/")

Extracting checkpoints/model_10.pth...
Extracting checkpoints/model_5.pth...


In [13]:
# https://github.com/spellml/mobilenet-cifar10/blob/master/models/model_1.py
import torch.nn as nn
import math
import torch
import os


def conv_bn(inp, oup, stride):
    """Return a 3x3 convolution -> batch norm -> ReLU6 stack.

    Args:
        inp: number of input channels.
        oup: number of output channels.
        stride: stride of the 3x3 convolution (padding is fixed at 1, no bias).

    Returns:
        An ``nn.Sequential`` holding the three layers in that order.
    """
    conv = nn.Conv2d(inp, oup, kernel_size=3, stride=stride, padding=1, bias=False)
    norm = nn.BatchNorm2d(oup)
    activation = nn.ReLU6(inplace=True)
    return nn.Sequential(conv, norm, activation)

def conv_1x1_bn(inp, oup):
    """Return a 1x1 convolution -> batch norm -> ReLU6 stack.

    Args:
        inp: number of input channels.
        oup: number of output channels.

    Returns:
        An ``nn.Sequential`` holding the three layers in that order. The
        convolution uses stride 1, no padding and no bias.
    """
    pointwise = nn.Conv2d(inp, oup, kernel_size=1, stride=1, padding=0, bias=False)
    norm = nn.BatchNorm2d(oup)
    activation = nn.ReLU6(inplace=True)
    return nn.Sequential(pointwise, norm, activation)

def make_divisible(x, divisible_by=8):
    """Round ``x`` up to the nearest multiple of ``divisible_by``.

    Args:
        x: the value to round (int or float).
        divisible_by: the step the result must be a multiple of.

    Returns:
        The smallest multiple of ``divisible_by`` that is >= ``x``, as an int.
    """
    # math.ceil handles the scalar case directly, so numpy no longer has to be
    # imported on every call.
    return int(math.ceil(x * 1. / divisible_by) * divisible_by)


class InvertedResidual(nn.Module):
    """Inverted-residual block: optional 1x1 expansion, 3x3 depthwise conv, 1x1 projection.

    Args:
        inp: number of input channels.
        oup: number of output channels.
        stride: stride of the depthwise convolution; must be 1 or 2.
        expand_ratio: multiplier applied to ``inp`` to get the hidden width.
            When it equals 1 the expansion layers are left out.

    The input is added to the output only when ``stride == 1`` and
    ``inp == oup``.
    """

    def __init__(self, inp, oup, stride, expand_ratio):
        super().__init__()
        self.stride = stride
        assert stride in [1, 2]

        hidden_dim = int(inp * expand_ratio)
        self.use_res_connect = self.stride == 1 and inp == oup

        # Layers are created in execution order so the Sequential indices
        # (and therefore the state_dict keys) stay the same as before.
        layers = []
        if expand_ratio != 1:
            # pw: 1x1 expansion to hidden_dim channels
            layers += [
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
            ]
        layers += [
            # dw: one 3x3 filter per channel (groups == channels)
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # pw-linear: 1x1 projection with no activation afterwards
            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
            nn.BatchNorm2d(oup),
        ]
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the block, adding the input back when the skip connection is enabled."""
        if not self.use_res_connect:
            return self.conv(x)
        return x + self.conv(x)


class MobileNetV2(nn.Module):
    """MobileNetV2 classifier assembled from ``InvertedResidual`` blocks.

    Args:
        n_class: number of outputs of the final linear layer.
        input_size: expected input resolution; only checked to be a multiple of 32.
        width_mult: multiplier applied to the channel counts of the stages whose
            expansion ratio is greater than 1 (and to the head when > 1.0).

    Attributes:
        features: ``nn.Sequential`` with the stem, all residual blocks and the 1x1 head.
        classifier: ``nn.Linear`` mapping ``last_channel`` features to ``n_class`` outputs.
        last_channel: channel count produced by ``features``.
    """

    def __init__(self, n_class=1000, input_size=224, width_mult=1.):
        super().__init__()
        stem_channels = 32
        head_channels = 1280
        # One row per stage:
        # t = expansion ratio, c = output channels, n = number of blocks, s = stride of the first block
        stage_settings = [
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]

        assert input_size % 32 == 0
        # The stem always keeps 32 channels; the head is only widened for width_mult > 1.
        if width_mult > 1.0:
            self.last_channel = make_divisible(head_channels * width_mult)
        else:
            self.last_channel = head_channels

        # Stem: 3-channel input, stride-2 3x3 convolution.
        layers = [conv_bn(3, stem_channels, 2)]

        in_channels = stem_channels
        for t, c, n, s in stage_settings:
            out_channels = make_divisible(c * width_mult) if t > 1 else c
            for block_idx in range(n):
                # Only the first block of a stage uses the stage stride.
                block_stride = s if block_idx == 0 else 1
                layers.append(InvertedResidual(in_channels, out_channels, block_stride, expand_ratio=t))
                in_channels = out_channels

        # Head: 1x1 convolution up to last_channel.
        layers.append(conv_1x1_bn(in_channels, self.last_channel))
        self.features = nn.Sequential(*layers)

        self.classifier = nn.Linear(self.last_channel, n_class)

        self._initialize_weights()

    def forward(self, x):
        """Run the feature extractor, average over dims 3 and 2, then classify."""
        feature_map = self.features(x)
        pooled = feature_map.mean(3).mean(2)
        return self.classifier(pooled)

    def _initialize_weights(self):
        """Re-initialise every Conv2d, BatchNorm2d and Linear submodule in place."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                kernel_h, kernel_w = module.kernel_size
                fan = kernel_h * kernel_w * module.out_channels
                # Zero-mean normal with std sqrt(2 / (k_h * k_w * out_channels)).
                module.weight.data.normal_(0, math.sqrt(2. / fan))
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()
            elif isinstance(module, nn.Linear):
                module.weight.data.normal_(0, 0.01)
                module.bias.data.zero_()


# Instantiate the network with 10 output classes and a 32x32 input size
# (the model code comes from the mobilenet-cifar10 repository linked at the top of this cell).
mobilenet = MobileNetV2(width_mult=1, n_class=10, input_size=32)

In [24]:
# Load the weights from the checkpoint that was copied into ./mobilenet/ earlier.
checkpoint_path = os.path.join(os.getcwd(), "mobilenet", "checkpoints", "model_10.pth")
mobilenet.load_state_dict(torch.load(checkpoint_path))

<All keys matched successfully>

In [42]:
# IPython help lookup (`?`): shows the signature and docstring of the converter.
relay.frontend.from_pytorch?

Signature:
relay.frontend.from_pytorch(
    script_module,
    input_infos,
    custom_convert_map=None,
    default_dtype='float32',
)
Docstring:
Load PyTorch model in the form of a scripted PyTorch model and convert into relay.
The companion parameters will be handled automatically.

Parameters
----------
script_module : TopLevelTracedModule object
    TorchScripted PyTorch graph
    Note: We currently only support traces (ie: torch.jit.trace(model, input))

input_infos : List of tuples
    Can be (input name, input shape) or (input name, (input shape, input types))
    Graph level input shape and type list
    The same input names need to be u

So Relay only supports JIT-traced versions of PyTorch models. [I wrote about tracing in this article](https://spell.ml/blog/pytorch-jit-YBmYuBEAACgAiv71). The easiest way to turn this model into a JIT traced version of itself is to use PyTorch's automatic tracing. This requires no code editing, but it doesn't work with dropout or batchnorm layers. However, idgaf because the accuracy of the model we're brewing here doesn't matter, only its performance delta does.

In [45]:
# %pip install torchvision

In [46]:
import torchvision
from torch.utils.data import DataLoader
# Transforms applied to every CIFAR-10 image: random flip, random perspective
# warp, then conversion to a tensor.
augmentations = [
    torchvision.transforms.RandomHorizontalFlip(),
    torchvision.transforms.RandomPerspective(),
    torchvision.transforms.ToTensor(),
]
transform = torchvision.transforms.Compose(augmentations)

# Training split, downloaded into /mnt/cifar10/ when it is not there yet.
dataset = torchvision.datasets.CIFAR10(
    "/mnt/cifar10/",
    train=True,
    transform=transform,
    download=True,
)

# Shuffled batches of 32 images.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /mnt/cifar10/cifar-10-python.tar.gz


80.3%IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

85.8%IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

91.0%IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

96.6%IOPub message rate exceede

In [51]:
# Pull a single batch from the loader; X_ex is the example input used for tracing,
# compilation and timing in the cells below.
X_ex, y_ex = next(iter(dataloader))

In [55]:
# Trace the model's forward pass on the example batch.
# NOTE(review): `(X_ex)` is just X_ex (no trailing comma), not a one-element tuple.
traced_mobilenet = torch.jit.trace(mobilenet.forward, (X_ex))

In [57]:
# Check the traced object's type: the from_pytorch docstring asks for a TopLevelTracedModule.
type(traced_mobilenet)

torch.jit._trace.TopLevelTracedModule

TVM needs chipset information. This is pretty much out of my ballzone, but copying the instructions from their tutorial gives me the following.

In [27]:
# Show the "model name" lines of /proc/cpuinfo to identify the CPU of this machine.
!less /proc/cpuinfo | grep "model name"

model name	: Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz
model name	: Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz
model name	: Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz
model name	: Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz


Deets from [this overview](https://www.cpu-world.com/CPUs/Xeon/Intel-Xeon%208259CL.html) for this chip. [An example in the documentation](https://tvm.apache.org/docs/tutorials/autotvm/tune_relay_x86.html#define-network) points out that the following target is appropriate for the chip of the type we're looking at here:

In [68]:
# TVM target string: LLVM backend with -mcpu=skylake-avx512, taken from the
# TVM x86 tuning tutorial linked in the paragraph above.
target = "llvm -mcpu=skylake-avx512"

In [38]:
import tvm.relay as relay

In [76]:
# Convert the traced model to Relay. The single graph input is registered under
# the name 'input0' with the shape of the example batch; the same name is used
# later when feeding the compiled module.
mod, params = relay.frontend.from_pytorch(traced_mobilenet, input_infos=[('input0', X_ex.shape)])

This throws a lot of warnings but...I mean, this is what they document, so this is what I'm going to use lol.

In [69]:
import tvm

# Compile the Relay module for `target` under a pass context with opt_level=3.
# No tuning records are applied here, hence the "Cannot find config" fallback
# warnings in the output.
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target=target, params=params)

Cannot find config for target=llvm -keys=cpu -link-params=0 -mcpu=skylake-avx512, workload=('conv2d_NCHWc.x86', ('TENSOR', (32, 3, 32, 32), 'float32'), ('TENSOR', (32, 3, 3, 3), 'float32'), (2, 2), (1, 1, 1, 1), (1, 1), 'NCHW', 'NCHW', 'float32'). A fallback configuration is used, which may bring great performance regression.
Cannot find config for target=llvm -keys=cpu -link-params=0 -mcpu=skylake-avx512, workload=('depthwise_conv2d_NCHWc.x86', ('TENSOR', (32, 32, 16, 16), 'float32'), ('TENSOR', (32, 1, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'NCHW', 'NCHW', 'float32'). A fallback configuration is used, which may bring great performance regression.
Cannot find config for target=llvm -keys=cpu -link-params=0 -mcpu=skylake-avx512, workload=('conv2d_NCHWc.x86', ('TENSOR', (32, 32, 16, 16), 'float32'), ('TENSOR', (16, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'NCHW', 'NCHW', 'float32'). A fallback configuration is used, which may bring great performance regression.
Can

In [81]:
from tvm.contrib import graph_executor

# Create device 0 for the target and wrap the compiled library's "default"
# entry in a GraphModule, which is what gets run and timed below.
dev = tvm.device(str(target), 0)
module = graph_executor.GraphModule(lib["default"](dev))

In [85]:
# NOTE(review): `dtype` is not read by any later visible cell.
dtype = "float32"
# Bind the example batch to the input name given to from_pytorch, then run once.
module.set_input("input0", X_ex)
module.run()

In [132]:
# Run the PyTorch model once just to learn the shape of its output.
output_shape = mobilenet(X_ex).shape
# NOTE(review): the name is rebound here from a shape to an empty TVM NDArray of
# that shape; the next cell passes it to get_output().
output_shape = tvm.nd.empty(output_shape)

In [133]:
# Fetch output 0 of the last run (passing the NDArray allocated above) and
# convert it to a NumPy array.
tvm_output = module.get_output(0, output_shape).asnumpy()

Numbers for after TVM but before the optimization pass:

In [135]:
import timeit

# numpy is used below but no other visible cell imports it at notebook scope,
# so import it here to keep the cell runnable on a fresh kernel.
import numpy as np

timing_number = 10  # calls per measurement
timing_repeat = 10  # number of measurements

# repeat() returns one total (in seconds) per measurement; convert each to
# milliseconds per call of the untuned TVM module.
unoptimized = (
    np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number))
    * 1000
    / timing_number
)
unoptimized = {
    "mean": np.mean(unoptimized),
    "median": np.median(unoptimized),
    "std": np.std(unoptimized),
}

print(unoptimized)

{'mean': 8.886561410008653, 'median': 8.756740450007783, 'std': 0.3270634636492541}


Pre-TVM, post-JIT numbers:

In [138]:
import timeit

# numpy is used below but no other visible cell imports it at notebook scope,
# so import it here to keep the cell runnable on a fresh kernel.
import numpy as np

timing_number = 10  # calls per measurement
timing_repeat = 10  # number of measurements

# repeat() returns one total (in seconds) per measurement; convert each to
# milliseconds per call of the JIT-traced PyTorch model.
unoptimized = (
    np.array(timeit.Timer(lambda: traced_mobilenet(X_ex)).repeat(repeat=timing_repeat, number=timing_number))
    * 1000
    / timing_number
)
unoptimized = {
    "mean": np.mean(unoptimized),
    "median": np.median(unoptimized),
    "std": np.std(unoptimized),
}

print(unoptimized)

{'mean': 357.99847124001644, 'median': 357.83253340005103, 'std': 8.665247280124268}


Pre-JIT numbers:

In [141]:
import timeit

# numpy is used below but no other visible cell imports it at notebook scope,
# so import it here to keep the cell runnable on a fresh kernel.
import numpy as np

timing_number = 10  # calls per measurement
timing_repeat = 10  # number of measurements

# repeat() returns one total (in seconds) per measurement; convert each to
# milliseconds per call of the plain (untraced) PyTorch model.
unoptimized = (
    np.array(timeit.Timer(lambda: mobilenet(X_ex)).repeat(repeat=timing_repeat, number=timing_number))
    * 1000
    / timing_number
)
unoptimized = {
    "mean": np.mean(unoptimized),
    "median": np.median(unoptimized),
    "std": np.std(unoptimized),
}

print(unoptimized)

{'mean': 362.23234594999667, 'median': 363.61967815000753, 'std': 7.793487259223359}


Hmm, same speed pretty much. So let's just omit the JIT version from the benchmarks.

In [143]:
import tvm.auto_scheduler as auto_scheduler
from tvm.autotvm.tuner import XGBTuner
from tvm import autotvm

# Runner configuration. The runner executes each candidate build and measures it.
# Per the TVM LocalRunner documentation:
#   number        - how many times the generated code is run and averaged to
#                   form one measurement
#   repeat        - how many such measurements are taken per configuration
#   min_repeat_ms - minimum duration of one measurement; `number` is raised
#                   automatically when a measurement is shorter. Needed for
#                   accurate GPU tuning, not for CPU, so 0 disables it here.
#   timeout       - upper limit, in seconds, for running one configuration
number = 10
repeat = 1
min_repeat_ms = 0  # since we're tuning on a CPU, can be set to 0
timeout = 10  # in seconds

# create a TVM runner
runner = autotvm.LocalRunner(
    number=number,
    repeat=repeat,
    timeout=timeout,
    min_repeat_ms=min_repeat_ms,
)

# Candidates are built with a LocalBuilder and measured with the LocalRunner above.
measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(build_func="default"), runner=runner
)

# Tuning options, read by the tuning loop further down.
#   tuner          - an XGBoost-based algorithm guides the search
#   trials         - trial budget per task. 10 keeps this run short; the TVM
#                    tutorial recommends about 1500 for CPU and 3000-4000 for
#                    GPU in a production job, and suggests evaluating a range
#                    of values for the model and processor at hand.
#   early_stopping - early-stopping threshold handed to the tuner. At 100 it is
#                    larger than the trial budget, so it cannot trigger here.
#   measure_option - where candidates are built and run (see above)
#   tuning_records - file the tuning log is written to
tuning_option = {
    "tuner": "xgb",
    "trials": 10,
    "early_stopping": 100,
    "measure_option": measure_option,
    "tuning_records": "resnet-50-v2-autotuning.json",
}

In [144]:
# Begin by extracting the tuning tasks from the Relay module converted from the PyTorch model.
tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params)

In [146]:
# Tune the extracted tasks sequentially.
for i, task in enumerate(tasks):
    prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
    # A task's config space can be smaller than the trial budget. Use the same
    # capped count for the tuner and for the progress bar total so the bar can
    # actually reach 100%.
    n_trial = min(tuning_option["trials"], len(task.config_space))
    tuner_obj = XGBTuner(task, loss_type="rank")
    tuner_obj.tune(
        n_trial=n_trial,
        early_stopping=tuning_option["early_stopping"],
        measure_option=tuning_option["measure_option"],
        callbacks=[
            autotvm.callback.progress_bar(n_trial, prefix=prefix),
            # Append every measurement to the tuning log.
            autotvm.callback.log_to_file(tuning_option["tuning_records"]),
        ],
    )

[Task  1/32]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s

Process Process-10:
Traceback (most recent call last):
  File "/opt/conda/envs/spell/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/conda/envs/spell/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/root/.local/lib/python3.9/site-packages/tvm-0.8.dev915+g09df4edb2-py3.9-linux-x86_64.egg/tvm/rpc/tracker.py", line 366, in _tracker_server
    handler.run()
  File "/root/.local/lib/python3.9/site-packages/tvm-0.8.dev915+g09df4edb2-py3.9-linux-x86_64.egg/tvm/rpc/tracker.py", line 361, in run
    self._ioloop.start()
  File "/opt/conda/envs/spell/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "/opt/conda/envs/spell/lib/python3.9/asyncio/base_events.py", line 586, in run_forever
    self._check_running()
  File "/opt/conda/envs/spell/lib/python3.9/asyncio/base_events.py", line 578, in _check_running
    raise 

KeyboardInterrupt: 

  File "/opt/conda/envs/spell/lib/python3.9/multiprocessing/queues.py", line 365, in get
    with self._rlock:
  File "/opt/conda/envs/spell/lib/python3.9/multiprocessing/connection.py", line 221, in recv_bytes
    buf = self._recv_bytes(maxlength)
KeyboardInterrupt
  File "/opt/conda/envs/spell/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
    buf = self._recv(4)
  File "/opt/conda/envs/spell/lib/python3.9/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/opt/conda/envs/spell/lib/python3.9/multiprocessing/connection.py", line 384, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt
KeyboardInterrupt


Ah, this uses an `asyncio` event loop, which cannot be run in this interactive REPL. It _has_ to appear in a `__main__` block, e.g. in a scripting context. The rest of this stuff is going to have to happen in a script.

## boilerplate

In [165]:
%%writefile ../scripts/tvm_funcs.py
import tvm
import tvm.relay as relay
from tvm.contrib import graph_executor
import tvm.auto_scheduler as auto_scheduler
from tvm.autotvm.tuner import XGBTuner
from tvm import autotvm

import numpy as np

# TVM target string shared by the build and tuning helpers in this module:
# LLVM backend with -mcpu=skylake-avx512.
TARGET = "llvm -mcpu=skylake-avx512"


def time_it(model_func, number=10, repeat=10):
    """Time a zero-argument callable and summarise the per-call latency.

    Args:
        model_func: callable taking no arguments; it is invoked
            ``number * repeat`` times in total.
        number: calls per measurement (default 10).
        repeat: number of measurements (default 10).

    Returns:
        A dict with the keys ``"mean"``, ``"median"`` and ``"std"``, each in
        milliseconds per call, computed over the ``repeat`` measurements.

    Raises:
        ValueError: if ``number`` or ``repeat`` is smaller than 1.
    """
    import timeit

    if number < 1 or repeat < 1:
        raise ValueError("number and repeat must both be at least 1")

    # repeat() yields one total in seconds per measurement; convert each total
    # to milliseconds per single call.
    totals_s = timeit.Timer(model_func).repeat(repeat=repeat, number=number)
    per_call_ms = np.array(totals_s) * 1000 / number

    return {
        "mean": np.mean(per_call_ms),
        "median": np.median(per_call_ms),
        "std": np.std(per_call_ms),
    }


def get_tvm_model(traced_model, X_ex):
    """Convert a traced PyTorch model to Relay, compile it for ``TARGET`` and smoke-test it.

    Args:
        traced_model: the traced PyTorch model handed to ``relay.frontend.from_pytorch``.
        X_ex: example input batch; its ``.shape`` declares the graph input
            ``'input0'`` and the batch itself is fed to the compiled module.

    Returns:
        A ``(mod, params, module)`` tuple:
        ``mod`` is the Relay module returned by ``from_pytorch``,
        ``params`` is the second value returned by ``from_pytorch``
        (presumably the converted model parameters — confirm against the TVM docs),
        ``module`` is the ``GraphModule`` built from the compiled library, with
        ``X_ex`` already set as input and run once.
    """
    input_infos = [('input0', X_ex.shape)]
    mod, params = relay.frontend.from_pytorch(traced_model, input_infos=input_infos)

    # Compile without any tuning records.
    with tvm.transform.PassContext(opt_level=3):
        compiled_lib = relay.build(mod, target=TARGET, params=params)

    device = tvm.device(str(TARGET), 0)
    factory = compiled_lib["default"]
    module = graph_executor.GraphModule(factory(device))

    # One test run to make sure the compiled module executes.
    module.set_input("input0", X_ex)
    module.run()

    return mod, params, module


def tune(mod, params, X_ex, n_trials=10, tuning_records="resnet-50-v2-autotuning.json"):
    """Tune every AutoTVM task of ``mod`` and return a module rebuilt with the best records.

    Args:
        mod: Relay module (as returned by ``get_tvm_model``); its ``"main"``
            function is what the tuning tasks are extracted from.
        params: parameters returned alongside ``mod``.
        X_ex: example input batch, fed to the rebuilt module for a dry run.
        n_trials: trial budget per task. The default of 10 keeps a run short;
            the TVM tutorial recommends about 1500 for CPU in a production job.
        tuning_records: path of the log file measurements are appended to and
            the best configurations are read back from. The default keeps the
            file name this function has always used.

    Returns:
        A ``GraphModule`` compiled for ``TARGET`` with the best logged
        configurations applied, with ``X_ex`` set as input and run once.
    """
    # The runner executes and measures each candidate build: 10 runs averaged
    # into one measurement, one measurement per configuration, 10 s timeout.
    runner = autotvm.LocalRunner(
        number=10,
        repeat=1,
        timeout=10,  # in seconds
        min_repeat_ms=0,  # since we're tuning on a CPU, can be set to 0
    )
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(build_func="default"), runner=runner
    )
    # Early-stopping threshold handed to the tuner. With the default budget of
    # 10 trials this is never reached.
    early_stopping = 100

    tasks = autotvm.task.extract_from_program(mod["main"], target=TARGET, params=params)

    # Tune the extracted tasks sequentially with the XGBoost-based tuner.
    for i, task in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
        # A task's config space can be smaller than the budget; use the same
        # capped count for the tuner and for the progress bar total.
        n_trial = min(n_trials, len(task.config_space))
        tuner_obj = XGBTuner(task, loss_type="rank")
        tuner_obj.tune(
            n_trial=n_trial,
            early_stopping=early_stopping,
            measure_option=measure_option,
            callbacks=[
                autotvm.callback.progress_bar(n_trial, prefix=prefix),
                autotvm.callback.log_to_file(tuning_records),
            ],
        )

    # Rebuild with the best configuration found for each task. This must use
    # the module-level TARGET: no lowercase `target` exists in this file.
    with autotvm.apply_history_best(tuning_records):
        with tvm.transform.PassContext(opt_level=3, config={}):
            lib = relay.build(mod, target=TARGET, params=params)

    dev = tvm.device(str(TARGET), 0)
    optimized_module = graph_executor.GraphModule(lib["default"](dev))

    optimized_module.set_input("input0", X_ex)
    optimized_module.run()  # dry run test

    return optimized_module

Overwriting ../scripts/tvm_funcs.py


In [164]:
%%writefile ../scripts/test_mobilenet.py
# Model code adapted from:
# https://github.com/spellml/mobilenet-cifar10/blob/master/models/model_1.py
import math
import torch
import os

import torch
import torch.nn as nn
import torchvision
from torch.utils.data import DataLoader

from tvm_funcs import *


def conv_bn(inp, oup, stride):
    """Return a 3x3 convolution -> batch norm -> ReLU6 stack.

    Args:
        inp: number of input channels.
        oup: number of output channels.
        stride: stride of the 3x3 convolution (padding is fixed at 1, no bias).

    Returns:
        An ``nn.Sequential`` holding the three layers in that order.
    """
    conv = nn.Conv2d(inp, oup, kernel_size=3, stride=stride, padding=1, bias=False)
    norm = nn.BatchNorm2d(oup)
    activation = nn.ReLU6(inplace=True)
    return nn.Sequential(conv, norm, activation)

def conv_1x1_bn(inp, oup):
    """Return a 1x1 convolution -> batch norm -> ReLU6 stack.

    Args:
        inp: number of input channels.
        oup: number of output channels.

    Returns:
        An ``nn.Sequential`` holding the three layers in that order. The
        convolution uses stride 1, no padding and no bias.
    """
    pointwise = nn.Conv2d(inp, oup, kernel_size=1, stride=1, padding=0, bias=False)
    norm = nn.BatchNorm2d(oup)
    activation = nn.ReLU6(inplace=True)
    return nn.Sequential(pointwise, norm, activation)

def make_divisible(x, divisible_by=8):
    """Round ``x`` up to the nearest multiple of ``divisible_by``.

    Args:
        x: the value to round (int or float).
        divisible_by: the step the result must be a multiple of.

    Returns:
        The smallest multiple of ``divisible_by`` that is >= ``x``, as an int.
    """
    # math.ceil handles the scalar case directly, so numpy no longer has to be
    # imported on every call.
    return int(math.ceil(x * 1. / divisible_by) * divisible_by)


class InvertedResidual(nn.Module):
    """Inverted-residual block: optional 1x1 expansion, 3x3 depthwise conv, 1x1 projection.

    Args:
        inp: number of input channels.
        oup: number of output channels.
        stride: stride of the depthwise convolution; must be 1 or 2.
        expand_ratio: multiplier applied to ``inp`` to get the hidden width.
            When it equals 1 the expansion layers are left out.

    The input is added to the output only when ``stride == 1`` and
    ``inp == oup``.
    """

    def __init__(self, inp, oup, stride, expand_ratio):
        super().__init__()
        self.stride = stride
        assert stride in [1, 2]

        hidden_dim = int(inp * expand_ratio)
        self.use_res_connect = self.stride == 1 and inp == oup

        # Layers are created in execution order so the Sequential indices
        # (and therefore the state_dict keys) stay the same as before.
        layers = []
        if expand_ratio != 1:
            # pw: 1x1 expansion to hidden_dim channels
            layers += [
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
            ]
        layers += [
            # dw: one 3x3 filter per channel (groups == channels)
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # pw-linear: 1x1 projection with no activation afterwards
            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
            nn.BatchNorm2d(oup),
        ]
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the block, adding the input back when the skip connection is enabled."""
        if not self.use_res_connect:
            return self.conv(x)
        return x + self.conv(x)


class MobileNetV2(nn.Module):
    """MobileNetV2 classifier assembled from ``InvertedResidual`` blocks.

    Args:
        n_class: number of outputs of the final linear layer.
        input_size: expected input resolution; only checked to be a multiple of 32.
        width_mult: multiplier applied to the channel counts of the stages whose
            expansion ratio is greater than 1 (and to the head when > 1.0).

    Attributes:
        features: ``nn.Sequential`` with the stem, all residual blocks and the 1x1 head.
        classifier: ``nn.Linear`` mapping ``last_channel`` features to ``n_class`` outputs.
        last_channel: channel count produced by ``features``.
    """

    def __init__(self, n_class=1000, input_size=224, width_mult=1.):
        super().__init__()
        stem_channels = 32
        head_channels = 1280
        # One row per stage:
        # t = expansion ratio, c = output channels, n = number of blocks, s = stride of the first block
        stage_settings = [
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]

        assert input_size % 32 == 0
        # The stem always keeps 32 channels; the head is only widened for width_mult > 1.
        if width_mult > 1.0:
            self.last_channel = make_divisible(head_channels * width_mult)
        else:
            self.last_channel = head_channels

        # Stem: 3-channel input, stride-2 3x3 convolution.
        layers = [conv_bn(3, stem_channels, 2)]

        in_channels = stem_channels
        for t, c, n, s in stage_settings:
            out_channels = make_divisible(c * width_mult) if t > 1 else c
            for block_idx in range(n):
                # Only the first block of a stage uses the stage stride.
                block_stride = s if block_idx == 0 else 1
                layers.append(InvertedResidual(in_channels, out_channels, block_stride, expand_ratio=t))
                in_channels = out_channels

        # Head: 1x1 convolution up to last_channel.
        layers.append(conv_1x1_bn(in_channels, self.last_channel))
        self.features = nn.Sequential(*layers)

        self.classifier = nn.Linear(self.last_channel, n_class)

        self._initialize_weights()

    def forward(self, x):
        """Run the feature extractor, average over dims 3 and 2, then classify."""
        feature_map = self.features(x)
        pooled = feature_map.mean(3).mean(2)
        return self.classifier(pooled)

    def _initialize_weights(self):
        """Re-initialise every Conv2d, BatchNorm2d and Linear submodule in place."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                kernel_h, kernel_w = module.kernel_size
                fan = kernel_h * kernel_w * module.out_channels
                # Zero-mean normal with std sqrt(2 / (k_h * k_w * out_channels)).
                module.weight.data.normal_(0, math.sqrt(2. / fan))
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()
            elif isinstance(module, nn.Linear):
                module.weight.data.normal_(0, 0.01)
                module.bias.data.zero_()


def get_model(checkpoint_path="/spell/notebooks/mobilenet/checkpoints/model_10.pth"):
    """Build the 10-class, 32x32-input MobileNetV2 and load trained weights into it.

    Args:
        checkpoint_path: path of the state-dict checkpoint to load. The default
            is the location used so far; pass another path (for example
            ``/mnt/checkpoints/model_10.pth``, the alternative that used to be
            commented out here) when the checkpoints live elsewhere.

    Returns:
        The ``MobileNetV2`` instance with the checkpoint's weights loaded.
    """
    mobilenet = MobileNetV2(width_mult=1, n_class=10, input_size=32)
    # map_location="cpu" lets a checkpoint that was saved from GPU tensors load
    # on a machine without CUDA; the TVM target in this benchmark is a CPU.
    state_dict = torch.load(checkpoint_path, map_location="cpu")
    mobilenet.load_state_dict(state_dict)
    return mobilenet


def get_dataloader():
    """Return a shuffled DataLoader (batch size 32) over the CIFAR-10 training split.

    The dataset is stored under ``/mnt/cifar10/`` and downloaded when missing.
    Each image goes through a random horizontal flip and a random perspective
    warp before being converted to a tensor.
    """
    augmentations = [
        torchvision.transforms.RandomHorizontalFlip(),
        torchvision.transforms.RandomPerspective(),
        torchvision.transforms.ToTensor(),
    ]
    dataset = torchvision.datasets.CIFAR10(
        "/mnt/cifar10/",
        train=True,
        transform=torchvision.transforms.Compose(augmentations),
        download=True,
    )
    return DataLoader(dataset, batch_size=32, shuffle=True)


if __name__ == "__main__":
    mobilenet = get_model()
    # Benchmark inference, not training: in train mode every forward call
    # recomputes batch statistics and updates the BatchNorm running stats,
    # which mutates the model while it is being timed. The TVM PyTorch tutorial
    # likewise switches the model to eval mode before tracing.
    mobilenet.eval()

    dataloader = get_dataloader()
    X_ex, y_ex = next(iter(dataloader))

    # Example inputs are passed as a real one-element tuple.
    traced_mobilenet = torch.jit.trace(mobilenet.forward, (X_ex,))

    def run_pytorch():
        """One PyTorch forward pass with autograd bookkeeping disabled."""
        with torch.no_grad():
            return traced_mobilenet(X_ex)

    # tvm part
    mod, params, module = get_tvm_model(traced_mobilenet, X_ex)
    tvm_optimized_module = tune(mod, params, X_ex)

    # timing part
    print("PyTorch timings:")
    print(time_it(run_pytorch))
    print("TVM (Relay) timings:")
    print(time_it(lambda: module.run()))
    print("TVM (Tuned) timings:")
    print(time_it(lambda: tvm_optimized_module.run()))

Overwriting ../scripts/test_mobilenet.py


In [None]:
# !spell run \
#     --machine-type t4 \
#     --github-url 