In [1]:
import os
import torch

from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import time
import numpy as np
import lightning
import pytorch_lightning 
import onnx
import onnxruntime as ort
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.quantization import quantize_dynamic
import seaborn as sns
import random
from PIL import Image
from IPython.display import clear_output

import sys
sys.path.insert(0, "./mlflow-scripts")
from model import LightningModel, get_depthpro_model
from dataloader import get_dataloaders, Urban100Dataset, collate_fn
clear_output()

In [2]:
model_path = "./model_weights_10.pth"
model = LightningModel(get_depthpro_model(32))
checkpoint = torch.load(model_path, map_location="cpu")
state_dict = checkpoint # ["state_dict"]  
model.load_state_dict(state_dict)
model.eval()  
clear_output()

### model size on disk

In [3]:
model_size = os.path.getsize(model_path) 
print(f"Model Size on Disk: {model_size/ (1e6) :.2f} MB")

Model Size on Disk: 654.93 MB


### Inference latency

In [8]:

def compute_inference_latency(model):
    test_dataset = Urban100Dataset()
    test_loader = DataLoader(
        test_dataset,
        batch_size=1,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=0
    )
    clear_output()
    
    def offline_eval(batch):
        with torch.no_grad():
            lr, hr = batch  # adjust depending on dataset output
            lr = lr.to("cpu")
            hr = hr.to("cpu")
            sr = model(lr)
            sr = F.interpolate(sr, size=hr.shape[2:])
            return model.mse(sr, hr), model.psnr(sr, hr), model.ssim(sr, hr), model.snr(sr, hr)
    
    
    latencies = []
    count = 0
    num_trials = 10
    for batch in test_loader:
        print(count)
        if count > num_trials:
            break
        t1 = time.time()
        _ = offline_eval(batch)
        t2 = time.time()
        latencies.append(t2-t1)
        count += 1
    clear_output() 

    print(f"average inferenace latency is {np.mean(latencies)} seconds.")
    print(f"Inference Latency (single sample, median): {np.percentile(latencies, 50) * 1000:.2f} ms")
    print(f"Inference Latency (single sample, 95th percentile): {np.percentile(latencies, 95) * 1000:.2f} ms")
    print(f"Inference Latency (single sample, 99th percentile): {np.percentile(latencies, 99) * 1000:.2f} ms")
    print(f"Inference Throughput (single sample): {num_trials/np.sum(latencies):.2f} FPS")


In [7]:
compute_inference_latency(model)

average inferenace latency is 8.018798221241344 seconds.
Inference Latency (single sample, median): 4520.34 ms
Inference Latency (single sample, 95th percentile): 21563.01 ms
Inference Latency (single sample, 99th percentile): 25347.59 ms
Inference Throughput (single sample): 0.11 FPS


### Optimization with Quantization

In [4]:
torch.backends.quantized.engine = 'qnnpack'
quantized_model = torch.quantization.quantize_dynamic(
    model,  # the original model
    {torch.nn.Linear},  # a set of layers to dynamically quantize
    dtype=torch.qint8  # the target dtype for quantized weights
)

In [9]:
compute_inference_latency(quantized_model)

average inferenace latency is 6.026599504730918 seconds.
Inference Latency (single sample, median): 3908.45 ms
Inference Latency (single sample, 95th percentile): 14256.32 ms
Inference Latency (single sample, 99th percentile): 14458.82 ms
Inference Throughput (single sample): 0.15 FPS
