# Description

Measurements of model inference time performed on a laptop.

In [1]:
import torch
import pandas as pd
import numpy as np

from product_classificator import Classificator
from product_classificator.utils import SpeedTest
from product_classificator.training.utils import get_images_from_zip

import warnings
warnings.filterwarnings("ignore")

In [2]:
# All input artifacts live in one directory; edit DATA_DIR to run the notebook
# on another machine instead of touching every call site.
DATA_DIR = 'D:/HorizontalML'
TEST_PARQUET = f'{DATA_DIR}/test_short.parquet'
IMAGES_ZIP = f'{DATA_DIR}/wb_school_horizcv_images.zip'

# Test-split table; this cell reads its `nm` and `description` columns.
test = pd.read_parquet(TEST_PARQUET)

# Image file names are built from `nm` by appending the '.jpg' extension,
# then the matching images are pulled out of the zip archive.
image_names = test['nm'].map(lambda nm: str(nm) + '.jpg').values
images = get_images_from_zip(image_names, IMAGES_ZIP)
texts = test['description'].values

# Base model

In [3]:
# Build the classifier on the GPU; `device` is kept as a named variable so the
# target device is visible in one place.
device = 'cuda'
clf = Classificator(device=device)

# Wrap the classifier in the SpeedTest harness; the cells below use its
# `test_inference`, `tests` and `machine_info` members.
st = SpeedTest(clf)

In [4]:
# Result table: one row per model variant ('base', 'onnx'), columns keyed by
# (device, batch mode). Every column written by the benchmark cells is declared
# up front -- including 'batch32', which was previously created only implicitly
# by the first assignment to it.
test_results = pd.DataFrame(
    columns=pd.MultiIndex.from_product(
        [['CUDA', 'CPU'], ['sample', 'batch32', 'batch64']]
    ),
)

# Single-sample run on the GPU, preceded by 100 warm-up iterations.
# The stored value is the mean of the latest run's `log` entries
# (presumably per-iteration timings in seconds -- confirm against SpeedTest).
st.test_inference(texts, images, batch_size=1, warm_iterations=100)
test_results.loc['base', ('CUDA', 'sample')] = np.mean(st.tests[-1]['log'])

# Batched runs on the GPU (no explicit warm-up argument is passed here).
for size, col in zip([32, 64], ['batch32', 'batch64']):
    st.test_inference(texts, images, batch_size=size)
    test_results.loc['base', ('CUDA', col)] = np.mean(st.tests[-1]['log'])

Warming up: 100%|████████████████████████████████████████████████████████████████████| 100/100 [00:05<00:00, 18.86it/s]
Inference (batch: 1): 100%|████████████████████████████████████████████████████████| 1255/1255 [01:07<00:00, 18.68it/s]
Inference (batch: 32): 100%|███████████████████████████████████████████████████████████| 39/39 [00:50<00:00,  1.30s/it]
Inference (batch: 64): 100%|███████████████████████████████████████████████████████████| 19/19 [00:49<00:00,  2.58s/it]


In [5]:
# Move the classifier to the CPU and repeat the benchmark. Only the first 640
# texts/images are used here, whereas the CUDA runs use the full arrays.
clf = clf.to('cpu')

# Column label -> batch size; each run's mean `log` value is written to the
# 'base' row under the 'CPU' column group.
for col, size in {'sample': 1, 'batch32': 32, 'batch64': 64}.items():
    st.test_inference(texts[:640], images[:640], batch_size=size)
    test_results.loc['base', ('CPU', col)] = np.mean(st.tests[-1]['log'])

Inference (batch: 1): 100%|██████████████████████████████████████████████████████████| 640/640 [08:16<00:00,  1.29it/s]
Inference (batch: 32): 100%|███████████████████████████████████████████████████████████| 20/20 [07:30<00:00, 22.51s/it]
Inference (batch: 64): 100%|███████████████████████████████████████████████████████████| 10/10 [07:03<00:00, 42.40s/it]


# Model converted to ONNX

In [6]:
# Convert the classifier to ONNX. The return value is not used, so the
# conversion presumably happens in place on `clf` -- confirm against
# Classificator.to_onnx_clip.
clf.to_onnx_clip()

In [7]:
# Repeat the CPU benchmark after the ONNX conversion on the same 640-item
# subset; results go to the 'onnx' row under the 'CPU' column group.
for col, size in {'sample': 1, 'batch32': 32, 'batch64': 64}.items():
    st.test_inference(texts[:640], images[:640], batch_size=size)
    test_results.loc['onnx', ('CPU', col)] = np.mean(st.tests[-1]['log'])

Inference (batch: 1): 100%|██████████████████████████████████████████████████████████| 640/640 [05:17<00:00,  2.02it/s]
Inference (batch: 32): 100%|███████████████████████████████████████████████████████████| 20/20 [04:22<00:00, 13.11s/it]
Inference (batch: 64): 100%|███████████████████████████████████████████████████████████| 10/10 [04:19<00:00, 25.95s/it]


In [8]:
# Switch the converted classifier back to the GPU. The return value is
# discarded here (the CPU cell rebinds `clf` instead), so `.to` presumably
# moves the model in place -- confirm.
clf.to('cuda')

# Full-dataset runs; results go to the 'onnx' row under the 'CUDA' column group.
# NOTE(review): no `warm_iterations` is passed for the single-sample run here,
# unlike the base-model CUDA run -- confirm the two numbers are comparable.
for col, size in {'sample': 1, 'batch32': 32, 'batch64': 64}.items():
    st.test_inference(texts, images, batch_size=size)
    test_results.loc['onnx', ('CUDA', col)] = np.mean(st.tests[-1]['log'])

Inference (batch: 1): 100%|████████████████████████████████████████████████████████| 1255/1255 [01:02<00:00, 19.95it/s]
Inference (batch: 32): 100%|███████████████████████████████████████████████████████████| 39/39 [00:52<00:00,  1.34s/it]
Inference (batch: 64): 100%|███████████████████████████████████████████████████████████| 19/19 [14:17<00:00, 45.12s/it]


# Results

In [9]:
# Display the OS / CPU / GPU / RAM description recorded by the SpeedTest
# instance so the timings below can be read in context.
st.machine_info

{'OS': 'Windows',
 'CPU': 'Intel64 Family 6 Model 158 Stepping 13, GenuineIntel',
 'CPU cores': 12,
 'GPUs': {0: 'GPU 0: NVIDIA GeForce RTX 2060 (6 GB)'},
 'GPU driver': '555.85',
 'RAM': '32 GB'}

In [30]:
# Sort the columns by device and then by batch label, and highlight the
# smallest (fastest) value in each column.
(
    test_results
    .sort_index(axis=1, level=[0, 1], ascending=[True, True])
    .style
    .highlight_min(axis=0, color='lightgreen')
)

Unnamed: 0_level_0,CPU,CPU,CPU,CUDA,CUDA,CUDA
Unnamed: 0_level_1,batch32,batch64,sample,batch32,batch64,sample
base,22.506521,42.39495,0.775298,1.297878,2.579755,0.052864
onnx,13.113996,25.950253,0.49493,1.336286,45.120367,0.049569


Using the model converted to ONNX reduces inference time on CPU by a factor of 1.5–1.7, whereas on GPU there is no meaningful improvement. Moreover, with the ONNX model a large batch size can significantly worsen inference time on GPU: at batch size 64 a batch takes about 45.1 s, versus about 2.6 s for the base model.