In [1]:
import os, sys
import torch
import torch.nn as nn
from torchvision import models
from greenformer import auto_fact

In [2]:
def count_param(module, trainable=False):
    """Return the total number of parameter elements in ``module``.

    Args:
        module: Object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
        trainable: When true, only parameters whose ``requires_grad`` flag is
            set are counted; otherwise every parameter is counted.

    Returns:
        Sum of ``numel()`` over the selected parameters.
    """
    total = 0
    for param in module.parameters():
        # Frozen parameters are skipped only when the caller asked for trainable ones.
        if trainable and not param.requires_grad:
            continue
        total += param.numel()
    return total

# Init Model

In [3]:
# Load torchvision's VGG16 with pretrained weights; this is the baseline model
# that the cells below factorize and benchmark against.
# NOTE(review): newer torchvision releases replace `pretrained=` with a
# `weights=` argument — confirm against the installed version.
model = models.vgg16(pretrained=True)
# Bare expression so the notebook displays the module structure.
model

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [4]:
# Baseline: total parameter count of the unfactorized model.
count_param(model)

138357544

# Factorize Model

### Apply absolute rank

In [5]:
%%time
fact_model = auto_fact(model, rank=256, deepcopy=True, solver='random', num_iter=20)
count_param(fact_model)

CPU times: user 519 ms, sys: 491 ms, total: 1.01 s
Wall time: 185 ms




25596712

In [6]:
%%time
fact_model = auto_fact(model, rank=256, deepcopy=True, solver='svd', num_iter=20)
count_param(fact_model)

CPU times: user 1min 57s, sys: 3.52 s, total: 2min 1s
Wall time: 9.1 s


25596712

In [7]:
%%time
fact_model = auto_fact(model, rank=256, deepcopy=True, solver='snmf', num_iter=20)
count_param(fact_model)

CPU times: user 3min 29s, sys: 32.9 s, total: 4min 2s
Wall time: 18.7 s


25596712

### Apply percentage rank

In [8]:
%%time
fact_model = auto_fact(model, rank=0.4, deepcopy=True, solver='random', num_iter=20)
count_param(fact_model)

CPU times: user 1 s, sys: 417 ms, total: 1.42 s
Wall time: 442 ms


52613904

In [9]:
%%time
fact_model = auto_fact(model, rank=0.4, deepcopy=True, solver='svd', num_iter=20)
count_param(fact_model)

CPU times: user 12min 36s, sys: 1min 8s, total: 13min 44s
Wall time: 56.4 s


52613904

In [10]:
%%time
fact_model = auto_fact(model, rank=0.4, deepcopy=True, solver='snmf', num_iter=20)
count_param(fact_model)

CPU times: user 14min 20s, sys: 1min 51s, total: 16min 11s
Wall time: 1min 5s


52613904

### Apply factorization only on specific modules

In [11]:
# Candidate modules for factorization: everything from index 24 onward in
# `model.features` plus every module in `model.classifier`
# (i.e. the last 3 convolution layers and the 3 linear layers of the model).
factorizable_submodules = [*model.features[24:], *model.classifier]

In [12]:
%%time
fact_model = auto_fact(model, rank=0.2, deepcopy=True, solver='random', num_iter=20, submodules=factorizable_submodules)
count_param(fact_model)

CPU times: user 720 ms, sys: 489 ms, total: 1.21 s
Wall time: 328 ms


33140776

In [13]:
%%time
fact_model = auto_fact(model, rank=0.2, deepcopy=True, solver='svd', num_iter=20, submodules=factorizable_submodules)
count_param(fact_model)

CPU times: user 4min 55s, sys: 5.66 s, total: 5min 1s
Wall time: 22.9 s


33140776

In [14]:
%%time
fact_model = auto_fact(model, rank=0.2, deepcopy=True, solver='snmf', num_iter=20, submodules=factorizable_submodules)
count_param(fact_model)

CPU times: user 6min 44s, sys: 42.6 s, total: 7min 27s
Wall time: 34.2 s


33140776

# Speed test on CPU

### Test Inference CPU

In [15]:
%%timeit
# Time one no-grad forward pass of the baseline model on a zero-filled
# 16x3x224x224 float batch; the input allocation is part of the timed body.
# NOTE(review): `model.eval()` is never called in this notebook, so any
# train-mode-dependent layers run in their current mode — confirm intended.
with torch.no_grad():
    y = model(torch.zeros(16,3,224,224, dtype=torch.float))

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


1.18 s ± 47 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%%timeit
# Time one no-grad forward pass of the factorized model on a zero-filled
# 16x3x224x224 float batch; the input allocation is part of the timed body.
# NOTE(review): `fact_model.eval()` is never called in this notebook, so any
# train-mode-dependent layers run in their current mode — confirm intended.
with torch.no_grad():
    y = fact_model(torch.zeros(16,3,224,224, dtype=torch.float))

1.03 s ± 12.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Test Forward-Backward CPU

In [17]:
%%timeit
# Time a forward pass plus backward pass of the baseline model on a
# zero-filled 8x3x224x224 float batch; summing the output gives a scalar to
# call backward() on.
# Note: no zero_grad() call here, so gradients are not cleared between
# timing loops.
y = model(torch.zeros(8,3,224,224, dtype=torch.float))
y.sum().backward()

1.93 s ± 49.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
%%timeit
# Time a forward pass plus backward pass of the factorized model on a
# zero-filled 8x3x224x224 float batch; summing the output gives a scalar to
# call backward() on.
# Note: no zero_grad() call here, so gradients are not cleared between
# timing loops.
y = fact_model(torch.zeros(8,3,224,224, dtype=torch.float))
y.sum().backward()

1.75 s ± 22.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Speed test on GPU

### Move models to GPU

In [19]:
# Move both the baseline and the factorized model onto the GPU for the
# timing cells that follow.
model, fact_model = model.cuda(), fact_model.cuda()

### Test Inference GPU

In [20]:
# Zero-filled input batch (16 x 3 x 224 x 224, float32) placed on the GPU once,
# so the inference timings below exclude allocation and host-to-device copy.
x = torch.zeros((16, 3, 224, 224), dtype=torch.float32).cuda()

In [21]:
%%timeit
with torch.no_grad():
    y = model(x)

73.6 ms ± 553 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [22]:
%%timeit
with torch.no_grad():
    y = fact_model(x)

67.3 ms ± 1.32 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Test Forward-Backward GPU

In [23]:
# Rebind `x` to a smaller zero-filled GPU batch (8 x 3 x 224 x 224, float32)
# for the forward-backward timings below.
x = torch.zeros((8, 3, 224, 224), dtype=torch.float32).cuda()

In [24]:
%%timeit
y = model(x)
y.sum().backward()

128 ms ± 5.11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [25]:
%%timeit
y = fact_model(x)
y.sum().backward()

111 ms ± 4.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
