In [1]:
import torch
from torchvision import models
from py_auto_fact import auto_fact

# Init Model

In [2]:
# Instantiate torchvision's VGG-16 with its default arguments (no weights are requested here).
model = models.vgg16()
# Bare last expression: the notebook shows the module's repr (its layer listing) as the cell output.
model

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

# Factorize Model

In [3]:
%%time
fact_model = auto_fact(model, rank=64, deepcopy=True, solver='random')

CPU times: user 576 ms, sys: 144 ms, total: 720 ms
Wall time: 96.1 ms


In [4]:
%%time
fact_model = auto_fact(model, rank=64, deepcopy=True, solver='svd')

CPU times: user 11 s, sys: 404 ms, total: 11.4 s
Wall time: 1.19 s


In [5]:
%%time
fact_model = auto_fact(model, rank=64, deepcopy=True, solver='snmf')

CPU times: user 39 s, sys: 18 s, total: 57 s
Wall time: 6.65 s


# Test on CPU

### Test Inference CPU

In [6]:
%%timeit
with torch.no_grad():
    y = model(torch.zeros(16,3,224,224, dtype=torch.float))

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


705 ms ± 2.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%%timeit
with torch.no_grad():
    y = fact_model(torch.zeros(16,3,224,224, dtype=torch.float))

292 ms ± 1.27 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Test Forward-Backward CPU

In [8]:
%%timeit
y = model(torch.zeros(4,3,224,224, dtype=torch.float))
y.sum().backward()

533 ms ± 110 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
%%timeit
y = fact_model(torch.zeros(4,3,224,224, dtype=torch.float))
y.sum().backward()

295 ms ± 607 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Test on GPU

### Move models to GPU

In [10]:
# Put both networks on the current CUDA device before the GPU benchmarks;
# each name is rebound to whatever its own .cuda() call returns.
model, fact_model = model.cuda(), fact_model.cuda()

### Test Inference GPU

In [11]:
# All-zero 4x3x224x224 float32 batch for the GPU inference benchmarks, allocated
# directly on the CUDA device instead of on the host followed by a copy.
x = torch.zeros(4, 3, 224, 224, dtype=torch.float, device="cuda")

In [12]:
%%timeit
with torch.no_grad():
    y = model(x)

17.5 ms ± 8.91 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
%%timeit
with torch.no_grad():
    y = fact_model(x)

7.39 ms ± 31.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Test Forward-Backward GPU

In [14]:
# Re-creates the same all-zero 4x3x224x224 float32 batch used by the inference
# section, so this section can be run on its own. Allocated directly on the
# CUDA device instead of on the host followed by a copy.
x = torch.zeros(4, 3, 224, 224, dtype=torch.float, device="cuda")

In [15]:
%%timeit
y = model(x)
y.sum().backward()

45.6 ms ± 104 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
%%timeit
y = fact_model(x)
y.sum().backward()

25.1 ms ± 182 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
