# Notebook to experiment with optimizing a model for inference

In [13]:
# import dependencies
import torch
import cv2
import numpy as np
from time import time
from torchvision.models import detection
import torch.utils.benchmark as benchmark

# Print the installed PyTorch version so the timings recorded below can be tied
# to a specific build.
print(torch.__version__)

2.3.1+cu121


In [14]:
# Look up how many CPU threads PyTorch is configured to use; every benchmark
# Timer in this notebook is given this same value.
num_threads: int = torch.get_num_threads()
print(f"Benchmarking on {num_threads} threads")

Benchmarking on 2 threads


In [15]:
# Build the pretrained SSDLite320 / MobileNetV3-Large detector that serves as
# the un-optimized reference, and switch it to eval mode in the same expression
# (`.eval()` returns the module itself).
baseline_model = detection.ssdlite320_mobilenet_v3_large(
    weights=detection.SSDLite320_MobileNet_V3_Large_Weights.DEFAULT,
).eval()
# Last expression of the cell: display the module structure.
baseline_model

SSD(
  (backbone): SSDLiteFeatureExtractorMobileNet(
    (features): Sequential(
      (0): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (2): Hardswish()
        )
        (1): InvertedResidual(
          (block): Sequential(
            (0): Conv2dNormActivation(
              (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
              (1): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): ReLU(inplace=True)
            )
            (1): Conv2dNormActivation(
              (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            )
          )
        )
        (2): Invert

In [29]:
# Time the eager baseline model on one random (1, 3, 640, 480) input tensor.
# The setup snippet is executed by the Timer before measuring; it pulls the
# model out of the notebook namespace and creates the input `x`.
# NOTE(review): the forward pass is timed with autograd enabled; if a
# torch.no_grad() wrapper is added, add it to every benchmark in this notebook
# together so the comparisons stay like-for-like.
setup = """
import torch
from __main__ import baseline_model

x = torch.rand(1, 3, 640, 480)
"""

baseline_benchmark = benchmark.Timer(
    stmt="baseline_model(x)",
    setup=setup,
    label="Baseline model",
    num_threads=num_threads,
)

# 100 timed executions of `stmt`; the printed Measurement reports the per-run time.
print(baseline_benchmark.timeit(number=100))

<torch.utils.benchmark.utils.common.Measurement object at 0x7c8685c7dc60>
Baseline model
setup:
  import torch
  from __main__ import baseline_model

  x = torch.rand(1, 3, 640, 480)

  582.75 ms
  1 measurement, 100 runs , 2 threads


In [30]:
# Repeat the baseline benchmark with a (1, 3, 320, 240) input to see whether a
# smaller image reduces inference time.
# NOTE(review): torchvision's SSDlite320 appears to resize every input to a
# fixed 320x320 internally -- confirm against the torchvision docs; if so,
# shrinking the input here changes very little of the work being timed.
reduced_resolution_setup = """
import torch
from __main__ import baseline_model

x = torch.rand(1, 3, 320, 240)
"""

baseline_benchmark_reduced_resolution = benchmark.Timer(
    stmt="baseline_model(x)",
    setup=reduced_resolution_setup,
    label="Baseline model",
    sub_label="reduced resolution",
    num_threads=num_threads,
)

# Same number of timed runs as the full-resolution benchmark.
print(baseline_benchmark_reduced_resolution.timeit(number=100))

<torch.utils.benchmark.utils.common.Measurement object at 0x7c8685c7d4b0>
Baseline model: reduced resolution
setup:
  import torch
  from __main__ import baseline_model

  x = torch.rand(1, 3, 320, 240)

  615.39 ms
  1 measurement, 100 runs , 2 threads


### Torch Scripting
General points to note: scripting and tracing apply optimizations to the model to improve inference speed in a production environment. Tracing effectively freezes the model's conditional logic to match the data given during tracing, whereas scripting compiles the model's code and therefore keeps data-dependent control flow. There are further, more subtle differences between the two approaches.

In [None]:
# Apply TorchScript scripting: make sure the eager model is in eval mode, then
# compile it with torch.jit.script.
baseline_model.eval()
scripted_model = torch.jit.script(baseline_model)
# Last expression of the cell: put the scripted module in eval mode and display it.
scripted_model.eval()

RecursiveScriptModule(
  original_name=SSD
  (backbone): RecursiveScriptModule(
    original_name=SSDLiteFeatureExtractorMobileNet
    (features): RecursiveScriptModule(
      original_name=Sequential
      (0): RecursiveScriptModule(
        original_name=Sequential
        (0): RecursiveScriptModule(
          original_name=Conv2dNormActivation
          (0): RecursiveScriptModule(original_name=Conv2d)
          (1): RecursiveScriptModule(original_name=BatchNorm2d)
          (2): RecursiveScriptModule(original_name=Hardswish)
        )
        (1): RecursiveScriptModule(
          original_name=InvertedResidual
          (block): RecursiveScriptModule(
            original_name=Sequential
            (0): RecursiveScriptModule(
              original_name=Conv2dNormActivation
              (0): RecursiveScriptModule(original_name=Conv2d)
              (1): RecursiveScriptModule(original_name=BatchNorm2d)
              (2): RecursiveScriptModule(original_name=ReLU)
            )
     

In [None]:
# Benchmark the TorchScript model with the same Timer settings as the eager
# baseline so the two measurements can be compared directly.
setup = '''
import torch
from __main__ import scripted_model

x = torch.rand(3, 640, 480)
'''
scripted_model_benchmark = benchmark.Timer(
    # Note: the scripted model has to be called with a *list* of 3-D (C, H, W)
    # image tensors -- torchvision detection models are documented to take a
    # list of images, and the scripted forward() enforces that type -- so the
    # single image `x` is wrapped in a list instead of being passed as a
    # batched 4-D tensor like in the eager benchmarks.
    stmt="scripted_model([x])",
    setup=setup,
    num_threads=num_threads,
    label="Scripted baseline model")

# Use 100 runs, matching the baseline benchmarks and the recorded output below
# ("100 runs"); the previous value of 1000 made the cell ~10x slower to run
# and did not reproduce the saved measurement.
print(scripted_model_benchmark.timeit(100))



<torch.utils.benchmark.utils.common.Measurement object at 0x7c867b6124a0>
Scripted baseline model
setup:
  import torch
  from __main__ import scripted_model

  x = torch.rand(3, 640, 480)

  529.44 ms
  1 measurement, 100 runs , 2 threads


Scripting the model results in some improvement in the time taken to run inference, but the resulting speedup is not consistent across multiple runs. As a side note, I've also tried reducing the resolution of the input image passed to the model to see if it improves inference time; it doesn't seem to result in any significant speedup. A likely reason is that the SSDlite320 model resizes every input to a fixed 320×320 internally, so the resolution of the tensor passed in barely changes the amount of work done. The results reported above were obtained after briefly going through notes on TorchScript; there might be more details/approaches to scripting and tracing that improve inference time.

I'll try other approaches:
- quantization
- experimenting with ONNX Runtime

https://www.reddit.com/r/MachineLearning/comments/yg1mpz/d_how_to_get_the_fastest_pytorch_inference_and/

As a side note, or maybe more of a concluding thought, on scripting as an optimization approach:   
The main advantage of scripted models is that they can run independently of a Python environment, so they are designed to be flexible and portable, enabling deployment into non-Python environments. Scripting is not necessarily an optimization technique for speeding up inference; it is more focused on flexible and portable deployment.

### Quantization

# Onnx