# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cene555/ru-clip-tiny/blob/main/notebooks/ru_CLIP_tiny_onnx.ipynb)

## Select a runtime GPU to continue:

Click Runtime -> Change Runtime Type -> switch "Harware accelerator" to be GPU. Save it, and you maybe connect to GPU

In [1]:
#@title Allowed Resources
import multiprocessing
import torch
from psutil import virtual_memory

ram_gb = round(virtual_memory().total / 1024**3, 1)

print('CPU:', multiprocessing.cpu_count())
print('RAM GB:', ram_gb)
print("PyTorch version:", torch.__version__)
print("CUDA version:", torch.version.cuda)
print("cuDNN version:", torch.backends.cudnn.version())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device.type)

!nvidia-smi

CPU: 2
RAM GB: 12.7
PyTorch version: 1.10.0+cu111
CUDA version: 11.1
cuDNN version: 8005
device: cuda
Tue Feb  1 17:26:24 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   61C    P8    11W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                  

## Restart colab session after installation
Reload session if something doesn't work (may need multiple times)

## Install requirements

In [2]:
%%capture
!gdown -O ru-clip-tiny.pkl https://drive.google.com/uc?id=1-3g3J90pZmHo9jbBzsEmr7ei5zm3VXOL

!pip install git+https://github.com/cene555/ru-clip-tiny.git
!pip install git+https://github.com/Lednik7/CLIP-ONNX.git
!pip install onnxruntime-gpu

!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true

In [3]:
import onnxruntime

# priority device (if available)
print(onnxruntime.get_device())

GPU


## Import libraries

In [1]:
import torch
from rucliptiny import RuCLIPtiny
from rucliptiny.utils import get_transform
from rucliptiny.tokenizer import Tokenizer

In [2]:
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

## Load model

In [3]:
#@title speed_test function

import time

def speed_test(func, data_gen, n=5, empty_cache=True, is_text=False,
               first_run=True):
    if empty_cache: torch.cuda.empty_cache()
    if first_run:
        if is_text:
            input_data1, input_data2 = data_gen()
            func(input_data1, input_data2)
        else:
            input_data = data_gen()
            func(input_data)
        torch.cuda.empty_cache()
    
    values = []
    for _ in range(n):
        if is_text:
            input_data1, input_data2 = data_gen()
        else:
            input_data = data_gen()
        if is_text:
            t = time.time()
            func(input_data1, input_data2)
        else:
            t = time.time()
            func(input_data)
        values.append(time.time() - t)
        if empty_cache: torch.cuda.empty_cache()
    return sum(values) / n

In [4]:
torch.manual_seed(1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
model = RuCLIPtiny()
model.load_state_dict(torch.load('ru-clip-tiny.pkl',
                                 map_location=device))
model = model.to(device).eval()
for x in model.parameters(): x.requires_grad = False
torch.cuda.empty_cache()

In [6]:
transforms = get_transform()
tokenizer = Tokenizer()

Downloading:   0%|          | 0.00/373k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/175 [00:00<?, ?B/s]

## [Speed test] Batch 64

In [7]:
speed_test(model.encode_image, lambda: torch.randint(1, 255, (64, 3, 224, 224)).to(device))

0.011787748336791993

In [8]:
speed_test(model.encode_text,
           lambda: (torch.randint(1, 255, (64, 77)).to(device),
                    torch.randint(0, 2, (64, 77)).to(device)),
           is_text=True)

0.004021787643432617

## Prepare functions

In [9]:
from PIL import Image
import numpy as np

In [10]:
# batch first
image = transforms(Image.open("CLIP.png")).unsqueeze(0).cpu() # [1, 3, 224, 224]

# batch first
texts = ['диаграмма', 'собака', 'кошка']
text_tokens, attention_mask = tokenizer.tokenize(texts, max_len=77)
text_tokens, attention_mask = text_tokens.cpu(), attention_mask.cpu() # [3, 77]

# batch second
dummy_input_text = torch.stack([text_tokens, attention_mask]).detach().cpu()

In [11]:
text_tokens_onnx = text_tokens.detach().cpu().numpy().astype(np.int64)
attention_mask_onnx = attention_mask.detach().cpu().numpy().astype(np.int64)

image_onnx = image.detach().cpu().numpy().astype(np.float32)
text_onnx = torch.stack([text_tokens, attention_mask]).detach().cpu()\
                                                    .numpy().astype(np.int64)

## Convert RuCLIP model to ONNX

In [12]:
class Textual(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_data):
        input_ids, attention_mask = input_data
        x = self.model.transformer(input_ids=input_ids, attention_mask=attention_mask)
        x = x.last_hidden_state[:, 0, :]
        x = self.model.final_ln(x)
        return x

In [13]:
from clip_onnx import clip_onnx
from clip_onnx.utils import DEFAULT_EXPORT

visual_path = "clip_visual.onnx"
textual_path = "clip_textual.onnx"

textual_export_params = DEFAULT_EXPORT.copy()
textual_export_params["dynamic_axes"] = {'input': {1: 'batch_size'},
                                         'output': {0: 'batch_size'}}

onnx_model = clip_onnx(model.cpu(), visual_path=visual_path, textual_path=textual_path)
onnx_model.convert2onnx(image, dummy_input_text, verbose=True,
                        textual_wrapper=Textual,
                        textual_export_params=textual_export_params)

[CLIP ONNX] Start convert visual model
[CLIP ONNX] Start check visual model
[CLIP ONNX] Start convert textual model


  import sys


[CLIP ONNX] Start check textual model
[CLIP ONNX] Models converts successfully


## [ONNX] CUDA inference mode

In [14]:
# Optional cell, can be skipped

visual_path = "clip_visual.onnx"
textual_path = "clip_textual.onnx"

onnx_model.load_onnx(visual_path,
                     textual_path,
                     29.9119) # model.logit_scale.exp()

In [15]:
# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
onnx_model.start_sessions(providers=["CUDAExecutionProvider"]) # cuda mode

In [16]:
onnx_model.visual_session.get_providers()

['CUDAExecutionProvider', 'CPUExecutionProvider']

## [Speed test] Batch 64

In [17]:
speed_test(onnx_model.encode_image,
           lambda: np.random.uniform(1, 255, (64, 3, 224, 224))\
                                                .astype(np.float32))

0.28517956733703614

In [18]:
speed_test(onnx_model.encode_text,
           lambda: np.stack([np.random.randint(1, 255, (64, 77)),
                             np.random.randint(0, 2, (64, 77))]))

0.012344837188720703

## [Speed test] Compare Pytorch and ONNX

In [19]:
import random
import torch
import time

def set_seed():
    torch.manual_seed(12)
    torch.cuda.manual_seed(12)
    np.random.seed(12)
    random.seed(12)

    torch.backends.cudnn.deterministic=True

In [20]:
n = 20
model = model.to(device)

clip_results = {"encode_image": [],
                "encode_text": []}

onnx_results = {"encode_image": [],
                "encode_text": []}
                
for batch in [2, 8, 16, 32, 64]:
    set_seed()
    result = speed_test(onnx_model.encode_image,
                        lambda: np.random.uniform(1, 255, (batch, 3, 224, 224))\
                        .astype(np.float32), n=n)
    result = round(result, 3)
    onnx_results["encode_image"].append([batch, result])
    print("onnx", batch, "encode_image", result)

    set_seed()
    with torch.inference_mode():
        result = speed_test(model.encode_image,
                            lambda: torch.randint(1, 255, (batch, 3, 224, 224))\
                            .to(device), n=n)
        result = round(result, 3)
    print("torch", batch, "encode_image", result)
    clip_results["encode_image"].append([batch, result])

    set_seed()
    result = speed_test(onnx_model.encode_text,
                        lambda: np.stack([np.random.randint(1, 255, (batch, 77)),
                                          np.random.randint(0, 2, (batch, 77))]),
                        n=n)
    result = round(result, 3)
    onnx_results["encode_text"].append([batch, result])
    print("onnx", batch, "encode_text", result)

    set_seed()
    with torch.inference_mode():
        result = speed_test(model.encode_text,
                            lambda: (torch.randint(1, 255, (batch, 77)).to(device),
                                     torch.randint(0, 2, (batch, 77)).to(device)),
                            is_text=True, n=n)
        result = round(result, 3)
    print("torch", batch, "encode_text", result)
    clip_results["encode_text"].append([batch, result])

    print("-" * 78)

onnx 2 encode_image 0.011
torch 2 encode_image 0.018
onnx 2 encode_text 0.001
torch 2 encode_text 0.003
------------------------------------------------------------------------------
onnx 8 encode_image 0.035
torch 8 encode_image 0.01
onnx 8 encode_text 0.002
torch 8 encode_text 0.003
------------------------------------------------------------------------------
onnx 16 encode_image 0.07
torch 16 encode_image 0.01
onnx 16 encode_text 0.004
torch 16 encode_text 0.003
------------------------------------------------------------------------------
onnx 32 encode_image 0.145
torch 32 encode_image 0.012
onnx 32 encode_text 0.007
torch 32 encode_text 0.004
------------------------------------------------------------------------------
onnx 64 encode_image 0.294
torch 64 encode_image 0.013
onnx 64 encode_text 0.014
torch 64 encode_text 0.005
------------------------------------------------------------------------------


In [21]:
import pandas as pd

pd.DataFrame({"backend": ["onnx", "torch"] * 5,
              "batch": [2, 2, 8, 8, 16, 16, 32, 32, 64, 64],
              "encode_image": [j[1] for i in zip(onnx_results["encode_image"],
                                              clip_results["encode_image"]) for j in i],
              "encode_text": [j[1] for i in zip(onnx_results["encode_text"],
                                              clip_results["encode_text"]) for j in i]})

Unnamed: 0,backend,batch,encode_image,encode_text
0,onnx,2,0.011,0.001
1,torch,2,0.018,0.003
2,onnx,8,0.035,0.002
3,torch,8,0.01,0.003
4,onnx,16,0.07,0.004
5,torch,16,0.01,0.003
6,onnx,32,0.145,0.007
7,torch,32,0.012,0.004
8,onnx,64,0.294,0.014
9,torch,64,0.013,0.005


In [22]:
onnx_df = pd.DataFrame({"ONNX": ["RuCLIPtiny"] * 5,
              "batch": [2, 8, 16, 32, 64],
              "encode_image": [i[1] for i in onnx_results["encode_image"]],
              "encode_text": [i[1] for i in onnx_results["encode_text"]]})
onnx_df["total"] = onnx_df["encode_image"] + onnx_df["encode_text"]

print(onnx_df.to_markdown(index=False))

| ONNX       |   batch |   encode_image |   encode_text |   total |
|:-----------|--------:|---------------:|--------------:|--------:|
| RuCLIPtiny |       2 |          0.011 |         0.001 |   0.012 |
| RuCLIPtiny |       8 |          0.035 |         0.002 |   0.037 |
| RuCLIPtiny |      16 |          0.07  |         0.004 |   0.074 |
| RuCLIPtiny |      32 |          0.145 |         0.007 |   0.152 |
| RuCLIPtiny |      64 |          0.294 |         0.014 |   0.308 |


In [23]:
clip_df = pd.DataFrame({"TORCH": ["RuCLIPtiny"] * 5,
              "batch": [2, 8, 16, 32, 64],
              "encode_image": [i[1] for i in clip_results["encode_image"]],
              "encode_text": [i[1] for i in clip_results["encode_text"]]})
clip_df["total"] = clip_df["encode_image"] + clip_df["encode_text"]
print(clip_df.to_markdown(index=False))

| TORCH      |   batch |   encode_image |   encode_text |   total |
|:-----------|--------:|---------------:|--------------:|--------:|
| RuCLIPtiny |       2 |          0.018 |         0.003 |   0.021 |
| RuCLIPtiny |       8 |          0.01  |         0.003 |   0.013 |
| RuCLIPtiny |      16 |          0.01  |         0.003 |   0.013 |
| RuCLIPtiny |      32 |          0.012 |         0.004 |   0.016 |
| RuCLIPtiny |      64 |          0.013 |         0.005 |   0.018 |
