In [1]:
!pip install flopco-pytorch

Collecting flopco-pytorch
  Downloading flopco_pytorch-0.1.4-py3-none-any.whl.metadata (411 bytes)
Downloading flopco_pytorch-0.1.4-py3-none-any.whl (4.6 kB)
Installing collected packages: flopco-pytorch
Successfully installed flopco-pytorch-0.1.4


In [2]:
import torch
from torch import nn
import flopco
from flopco import FlopCo

from tqdm import tqdm as tqdm
import math
import time
import sys
import gc
import statistics

In [3]:
torch.random.manual_seed(10)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# device = torch.device("cpu")
device_cpu = torch.device("cpu")
print(device)

cuda


In [4]:
l = torch.nn.Linear(10, 20)
for pname, p in l.named_parameters():
  print(pname, p.shape)

t = torch.randn((1, 20), dtype=torch.float16)
t.dtype

torch.finfo(torch.float16)

weight torch.Size([20, 10])
bias torch.Size([20])


finfo(resolution=0.001, min=-65504, max=65504, eps=0.000976562, smallest_normal=6.10352e-05, tiny=6.10352e-05, dtype=float16)

## Auxiliary functions

In [5]:
def print_mem_usage(layer):
  print('\t Shape')
  for name, tens in layer.named_parameters():
    print(name + ':\t', tuple(tens.shape))
  print()

  sum_bytes = 0
  sum_params = 0
  print('Parameters:')
  print('\tElems\t Bits\t\tBytes\t Kb\t\t Mb')
  for name, tens in layer.named_parameters():
    if tens.dtype is torch.float32:
      bytes_in_dtype = 4
    elif tens.dtype is torch.float64:
      bytes_in_dtype = 8
    else:
      print(name + " has unsupported type")
      continue
    elems = tens.numel()
    sum_params += elems
    sum_bytes += elems * bytes_in_dtype

    print(name + ':\t',
          elems, '\t', elems * bytes_in_dtype * 8,
          '\t', elems * bytes_in_dtype,
          '\t', elems * bytes_in_dtype / 1024,
          '\t', round(elems * bytes_in_dtype / 1024 / 1024, 5))
  print()
  print('Buffers:')
  print('\tElems\t Bits\t\tBytes\t Kb\t\t Mb')
  for name, tens in layer.named_buffers():
    if tens.dtype is torch.float32:
      bytes_in_dtype = 4
    elif tens.dtype is torch.float64:
      bytes_in_dtype = 8
    else:
      print(name + " has unsupported type")
      continue
    elems = tens.numel()
    sum_params += elems
    sum_bytes += elems * bytes_in_dtype

    print(name + ':\t',
          elems, '\t', elems * bytes_in_dtype * 8,
          '\t', elems * bytes_in_dtype,
          '\t', elems * bytes_in_dtype / 1024,
          '\t', round(elems * bytes_in_dtype / 1024 / 1024, 5))
  print()
  print('All:\t',
          sum_params, '\t', sum_bytes * 8,
          '\t', sum_bytes,
          '\t', sum_bytes / 1024,
          '\t', round(sum_bytes / 1024 / 1024, 5))


def gpu_mem_usage():
    gc.collect()

    if hasattr(torch.cuda, "reset_peak_memory_stats"):  # pytorch 1.4+
        torch.cuda.reset_peak_memory_stats()

    ma = torch.cuda.memory_allocated() / (1024 * 1024)
    max_ma = torch.cuda.max_memory_allocated() / (1024 * 1024)
    ca = torch.cuda.memory_reserved() / (1024 * 1024)
    max_ca = torch.cuda.max_memory_reserved() / (1024 * 1024)


    print(
        f"MA {round(ma, 4)} MB \
        Max_MA {round(max_ma, 4)} MB \
        CA {round(ca, 4)} MB \
        Max_CA {round(max_ca, 4)} MB "
        )
    if hasattr(torch.cuda, "reset_peak_memory_stats"):  # pytorch 1.4+
        torch.cuda.reset_peak_memory_stats()

    return (ma, max_ma, ca, max_ca)

## Custom Linear layer

In [6]:
class CustomLinear(nn.Module):
    """
    A module which applies a linear transformation
    A common name is fully-connected layer, InnerProductLayer in caffe.

    The module should work with 2D input of shape (n_samples, n_feature).
    """
    def __init__(self, n_in, n_out, dtype=torch.float32, device=None):
        super().__init__()
        if device is None:
          device = torch.device("cpu")
        self.W = nn.Parameter(torch.Tensor(n_out, n_in).type(dtype).to(device))
        self.b = nn.Parameter(torch.Tensor(n_out).type(dtype).to(device))

        # This is a nice initialization
        stdv = 1./math.sqrt(n_in)
        nn.init.uniform_(self.W, -stdv, stdv)
        nn.init.uniform_(self.b, -stdv, stdv)

    def forward(self, input):
        self.output = torch.add(torch.mm(input, self.W.T), self.b)

        return self.output

In [7]:
in_ch, out_ch = 100, 200 # example numbers

In [8]:
custom_linear_tf32 = CustomLinear(in_ch, out_ch, dtype=torch.float32, device=device_cpu)
print('Theoretical memory usage of linear layer in float32')
print_mem_usage(custom_linear_tf32)

Theoretical memory usage of linear layer in float32
	 Shape
W:	 (200, 100)
b:	 (200,)

Parameters:
	Elems	 Bits		Bytes	 Kb		 Mb
W:	 20000 	 640000 	 80000 	 78.125 	 0.07629
b:	 200 	 6400 	 800 	 0.78125 	 0.00076

Buffers:
	Elems	 Bits		Bytes	 Kb		 Mb

All:	 20200 	 646400 	 80800 	 78.90625 	 0.07706


In [9]:
custom_linear_tf64 = CustomLinear(in_ch, out_ch, dtype=torch.float64, device=device_cpu)
print('Theoretical memory usage of linear layer in float64')
print_mem_usage(custom_linear_tf64)

Theoretical memory usage of linear layer in float64
	 Shape
W:	 (200, 100)
b:	 (200,)

Parameters:
	Elems	 Bits		Bytes	 Kb		 Mb
W:	 20000 	 1280000 	 160000 	 156.25 	 0.15259
b:	 200 	 12800 	 1600 	 1.5625 	 0.00153

Buffers:
	Elems	 Bits		Bytes	 Kb		 Mb

All:	 20200 	 1292800 	 161600 	 157.8125 	 0.15411


In [None]:
in_channel, out_channel = in_ch, out_ch
custom_linear = CustomLinear(in_channel, out_channel, device=device)

print("Theoretical memory usage")
print_mem_usage(custom_linear)
print()
print("Practical memory usage")
gpu_mem_usage()

Theoretical memory usage
	 Shape
W:	 (200, 100)
b:	 (200,)

Parameters:
	Elems	 Bits		Bytes	 Kb		 Mb
W:	 20000 	 640000 	 80000 	 78.125 	 0.07629
b:	 200 	 6400 	 800 	 0.78125 	 0.00076

Buffers:
	Elems	 Bits		Bytes	 Kb		 Mb

All:	 20200 	 646400 	 80800 	 78.90625 	 0.07706

Practical memory usage
MA 0.0776 MB         Max_MA 0.0776 MB         CA 2.0 MB         Max_CA 2.0 MB 


In [11]:

(torch_linear.weight.numel()*4 + torch_linear.bias.numel()*4)/1024/1024

NameError: name 'torch_linear' is not defined

In [None]:
inp = torch.randn(1,in_channel).to(device)

torch_linear = nn.Linear(in_channel, out_channel).to(device)
torch_linear.weight = custom_linear.W
torch_linear.bias = custom_linear.b

torch.linalg.norm(custom_linear(inp) - torch_linear(inp)).item()

## Custom Conv2d layer

In [None]:
class CustomConv2d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size,
                 dtype=torch.float32, device=None):
        super().__init__()
        if device is None:
          device = torch.device("cpu")

        self.kernel_size = kernel_size
        if isinstance(kernel_size, int):
            self.kernel_size = (kernel_size, kernel_size)
        self.out_channels = out_channels
        self.in_channels = in_channels
        self.conv = nn.Parameter(torch.Tensor(self.out_channels, self.in_channels, *self.kernel_size).type(dtype).to(device))
        self.b = nn.Parameter(torch.Tensor(self.out_channels).type(dtype).to(device))

        # This is a nice initialization
        stdv = 1./math.sqrt(self.in_channels)
        nn.init.uniform_(self.conv, -stdv, stdv)
        nn.init.uniform_(self.b, -stdv, stdv)

    def forward(self, x):
        return nn.functional.conv2d(x, self.conv, self.b)

In [None]:
in_channel, out_channel = 5, 10
h, w = 300, 300
kernel_size = 3
inp = torch.randn(1, in_channel, h, w).to(device)
custom_conv = CustomConv2d(in_channel, out_channel, kernel_size, device=device)

In [None]:
torch_conv = nn.Conv2d(in_channel, out_channel, kernel_size).to(device)
torch_conv.weight = custom_conv.conv
torch_conv.bias = custom_conv.b

In [None]:
torch.linalg.norm(custom_conv(inp) - torch_conv(inp))

In [None]:
print_mem_usage(custom_conv)
print()
print_mem_usage(torch_conv)

In [None]:
nn.Conv2d?

# FLOP and Time estimation and dependence on input size

In [None]:
in_channel, out_channel = 3, 21
h, w = 300, 300
kernel_size = 3

In [None]:
torch_conv = nn.Conv2d(in_channel, out_channel, kernel_size).to(device_cpu)
stats = FlopCo(torch_conv,
               img_size = (1, in_channel, h, w),
               device = device_cpu,
               instances = [nn.Conv2d])

print("MFlop:\t", stats.flops[''][0]/10e6)
print("Parameters:\t", stats.params[''])

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

## Linear Flop estimation

In [None]:
flops = []
for in_chan in range(200,4000,200):
  torch_linear = nn.Linear(in_chan, 1000).to(device_cpu)
  stats = FlopCo(torch_linear,
                img_size = (1, in_chan),
                device = device_cpu,
                instances = [nn.Linear])
  flops.append(stats.flops[''][0]/10e6)

In [None]:
import matplotlib.pylab as plt

plt.plot(list(range(200,4000,200)), flops, '*')
plt.xlabel('vector length')
plt.ylabel('MFlop')
plt.title('MFlop vs vector length for Linear layer')
plt.show()

## Convd2d Flop estimation

In [None]:
torch_conv = nn.Conv2d(in_channel, out_channel, kernel_size).to(device_cpu)
flops = []
for h in tqdm(range(100,2000,100)):
  stats = FlopCo(torch_conv,
                img_size = (1, in_channel, h, h),
                device = device_cpu,
                instances = [nn.Conv2d])
  flops.append(stats.flops[''][0]/10e6)

In [None]:
plt.plot(list(range(100,2000,100)), flops, '*')
plt.xlabel('weight/height of "image"')
plt.ylabel('MFlop')
plt.title('MFlop vs weight for Conv2d layer')
plt.show()

## Convd2d time estimation

In [None]:
torch_conv = nn.Conv2d(in_channel, out_channel, kernel_size).to(device_cpu)
time_empir = []
warmups = 5
repeat = 45
for h in tqdm(range(100,2000,100)):
  time_h = []
  input = torch.randn(1, in_channel, h, h).to(device_cpu)
  for _ in range(warmups):
    tmp = torch_conv(input)
  for _ in range(repeat):
    start = time.time()
    tmp = torch_conv(input)
    end = time.time()
    time_h.append(end-start)
  time_empir.append(statistics.mean(time_h))

In [None]:
plt.plot(list(range(100,2000,100)), time_empir, '*')
plt.xlabel('weight/height of "image"')
plt.ylabel('Time, s')
plt.title('Time vs weight for Conv2d layer on CPU')
plt.show()

In [None]:
torch_conv = nn.Conv2d(in_channel, out_channel, kernel_size).to(device)
time_empir = []
warmups = 5
repeat = 30
for h in tqdm(range(100,2000,100)):
  time_h = []
  input = torch.randn(10, in_channel, h, h).to(device)
  for _ in range(warmups):
    tmp = torch_conv(input)
  for _ in range(repeat):
    start = time.time()
    tmp = torch_conv(input)
    torch.cuda.synchronize()
    end = time.time()
    time_h.append(end-start)
  time_empir.append(statistics.mean(time_h))

In [None]:
plt.plot(list(range(100,2000,100)), time_empir, '*')
plt.xlabel('weight/height of "image"')
plt.ylabel('Time, s')
plt.title('Time vs weight for Conv2d layer on GPU')
plt.show()

## ReLU time estimation

In [None]:
torch_relu = nn.ReLU().to(device_cpu)
time_empir = []
vector_len = []
warmups = 5
repeat = 150
for h in tqdm(range(100,2000,100)):
  time_h = []
  input = torch.randn(1, h*10000).to(device_cpu)
  for _ in range(warmups):
    tmp = torch_relu(input)
  for _ in range(repeat):
    start = time.time()
    tmp = torch_relu(input)
    end = time.time()
    time_h.append(end-start)
  time_empir.append(statistics.mean(time_h))
  vector_len.append(input.numel())

In [None]:
plt.plot(vector_len, time_empir, '*')
plt.xlabel('length of vector')
plt.ylabel('Time, s')
plt.title('Time vs length of vector for ReLU layer on CPU')
plt.show()

## Custom BatchNorm2d layer

In [None]:
class CustomBatchNorm2d(nn.Module):
    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        super().__init__()

        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum
        self.register_buffer('running_mean', torch.zeros(num_features))
        self.register_buffer('running_var', torch.ones(num_features))

    def forward(self, input):
        momentum = self.momentum

        # calculate running estimates
        if self.training:
            mean = input.mean([0, 2, 3])
            # use biased var in train
            var = input.var([0, 2, 3], unbiased=False)
            n = input.numel() / input.size(1)
            with torch.no_grad():
                self.running_mean = momentum * mean \
                                + (1 - momentum) * self.running_mean
                # update running_var with unbiased var
                self.running_var = momentum * var * n / (n - 1) \
                                + (1 - momentum) * self.running_var
        else:
            mean = self.running_mean
            var = self.running_var

        input = (input - mean[None, :, None, None]) / (torch.sqrt(var[None, :, None, None] + self.eps))

        return input

In [None]:
n_channels = 100
custom_bn = CustomBatchNorm2d(n_channels)
torch_bn = nn.BatchNorm2d(n_channels, affine=False)

inp = torch.randn(20, n_channels, 25, 25, dtype=torch.float32)

In [None]:
torch.linalg.norm(custom_bn(inp) - torch_bn(inp))

In [None]:
print_mem_usage(custom_bn)
print()
print_mem_usage(torch_bn)

## Linear+ReLU flops and time estimation

In [None]:

time_empir = []
flops = []
all_stats = []
warmups = 5
repeat = 45
for in_chan in range(200,4000,200):
  time_h = []
  torch_linear = nn.Linear(in_chan, 1000).to(device_cpu)
  relu = nn.ReLU()
  tiny_model = nn.Sequential(torch_linear, relu)
  input = torch.randn(1, in_chan).to(device_cpu)

  for _ in range(warmups):
    tmp = tiny_model(input)
  for _ in range(repeat):
    start = time.time()
    tmp = tiny_model(input)
    end = time.time()
    time_h.append(end-start)
  time_empir.append(statistics.mean(time_h))

  stats = FlopCo(tiny_model,
                img_size = (1, in_chan),
                device = device_cpu,
                instances = [nn.Linear, nn.ReLU])
  flops.append(stats.total_flops/10e6)
  all_stats.append(stats)


In [None]:
plt.plot(list(range(200,4000,200)), flops, '*')
plt.xlabel('input size')
plt.ylabel('MFlop')
plt.title('MFlop vs weight for Linear+ReLU layers on CPU')
plt.show()

In [None]:
plt.plot(list(range(200,4000,200)), time_empir, '*')
plt.xlabel('input size')
plt.ylabel('Time, s')
plt.title('Time vs weight for Linear+ReLU layers on CPU')
plt.show()

## Conv2d+BN+ReLU flops and time estimation

In [None]:
in_channel, out_channel = 3, 21
h, w = 300, 300
kernel_size = 3

time_empir = []
flops = []
all_stats = []
warmups = 5
repeat = 25
for h in tqdm(range(100,2000,100)):
  time_h = []
  torch_conv = nn.Conv2d(in_channel, out_channel, kernel_size).to(device_cpu)
  torch_bn = nn.BatchNorm2d(out_channel)
  relu = nn.ReLU()
  tiny_model = nn.Sequential(torch_conv, torch_bn, relu)
  input = torch.randn(1, in_channel, h, h).to(device_cpu)

  for _ in range(warmups):
    tmp = tiny_model(input)
  for _ in range(repeat):
    start = time.time()
    tmp = tiny_model(input)
    end = time.time()
    time_h.append(end-start)
  time_empir.append(statistics.mean(time_h))

  stats = FlopCo(tiny_model,
                img_size = input.shape,
                device = device_cpu,
                instances = [nn.Conv2d, nn.BatchNorm2d, nn.ReLU])
  flops.append(stats.total_flops/10e6)
  all_stats.append(stats)



In [None]:
plt.plot(list(range(100,2000,100)), flops, '*')
plt.xlabel('input size')
plt.ylabel('MFlop')
plt.title('MFlop vs weight for Conv2d+BN+ReLU layers on CPU')
plt.show()

In [None]:
plt.plot(list(range(100,2000,100)), time_empir, '*')
plt.xlabel('input size')
plt.ylabel('Time, s')
plt.title('Time vs weight for Conv2d+BN+ReLU layers on CPU')
plt.show()