In [5]:
from torchvision import models
import torch
import torch.nn as nn
from torch.profiler import profile, ProfilerActivity
import torch.distributed as dist
import torch.multiprocessing as mp
from torchsummary import summary
from deepspeed.profiling.flops_profiler import get_model_profile

In [2]:
# Total element count across every parameter tensor of a ResNet-18 built
# with default constructor arguments.
num_parameters = sum(
    weight.numel() for weight in models.resnet18().parameters()
)
# Bare trailing expression so the notebook renders the count as cell output.
num_parameters

11689512

In [4]:
# Build a ResNet-18 with default constructor arguments. As the cell's last
# expression, its module repr (the layer-by-layer listing) becomes the output.
models.resnet18()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [7]:
# Profile a single forward pass of VGG-16 with the DeepSpeed flops profiler
# while CUDA device 0 is the selected device, then print a short summary.
with torch.cuda.device(0):
    model = models.vgg16()
    batch_size = 1
    # Arguments, in order:
    #   model         - the module to profile
    #   input_shape   - input shape (or input handed to an input_constructor)
    #   print_profile - print the model graph with the measured profile
    #                   attached to each module
    #   top_modules   - number of top modules in the aggregated profile
    #   warm_up       - number of warm-ups before timing each module
    flops, macs, params = get_model_profile(
        model=model,
        input_shape=(batch_size, 3, 224, 224),
        print_profile=True,
        top_modules=3,
        warm_up=3,
    )
    # One left-aligned "label  value" row per metric; joined so the rows are
    # emitted in the same order and layout as four consecutive prints.
    print(
        "\n".join(
            "{:<30}  {:<8}".format(label, value)
            for label, value in (
                ("Batch size: ", batch_size),
                ("Number of MACs: ", macs),
                ("Number of parameters: ", params),
                ("Number of FLOPs: ", flops),
            )
        )
    )


-------------------------- DeepSpeed Flops Profiler --------------------------
Profile Summary at step 3:
Notations:
data parallel size (dp_size), model parallel size(mp_size),
number of parameters (params), number of multiply-accumulate operations(MACs),
number of floating-point operations (flops), floating-point operations per second (FLOPS),
fwd latency (forward propagation latency), bwd latency (backward propagation latency),
step (weights update latency), iter latency (sum of fwd, bwd and step latency)

params per gpu:                                               138.36 M
params of model = params per GPU * mp_size:                   1       
fwd MACs per GPU:                                             15.47 GMACs
fwd flops per GPU:                                            30.97 G 
fwd flops of model = fwd flops per GPU * mp_size:             1       
fwd latency:                                                  59.17 ms
fwd FLOPS per GPU = fwd flops per GPU / fwd latency:    