In [4]:
import torch
from transformers import AutoImageProcessor, AutoTokenizer, TimesformerModel
from optimum.exporters.onnx import TextEncoderOnnxConfig #VisionOnnxConfig # Нужно добавить VisionOnnxConfig в импорт пакета
from optimum.exporters.onnx.config import VisionOnnxConfig
from optimum.utils.input_generators import DummyVisionInputGenerator
from optimum.utils import NormalizedConfig
from optimum.exporters.onnx import export
from transformers import PretrainedConfig
from typing import *
from pathlib import Path

In [5]:
WEIGHTS = '../weights/csl_transformers_base'

# Генератор входов для модели

In [6]:
class VideoInputGenerator(DummyVisionInputGenerator):
    def generate(self, input_name: str, int_dtype, float_dtype, framework: str = "pt"):
        return super().random_float_tensor(shape=[2, 8, 3, 224, 224], framework=framework)

# Конфиг для конвертации

In [7]:
class MyTimesformerOnnxConfig(VisionOnnxConfig):

    NORMALIZED_CONFIG_CLASS = NormalizedConfig
    DUMMY_INPUT_GENERATOR_CLASSES = (VideoInputGenerator,)

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        return {
            "pixel_values": {0: "batch_size", 1: "num_frames", 2: "num_channels", 3: "height", 4: "width"},
        }

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        return {
            "last_hidden_state": {0: "batch_size"},
        }

# Конвертация

In [8]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [9]:
model = TimesformerModel.from_pretrained(WEIGHTS)
config = PretrainedConfig.from_pretrained(WEIGHTS)

onnx_path = Path("model.onnx")
onnx_config = MyTimesformerOnnxConfig(config)

You are using a model of type timesformer to instantiate a model of type . This is not supported for all configurations of models and can yield errors.


In [10]:
onnx_inputs, onnx_outputs = export(model, onnx_config, onnx_path, onnx_config.DEFAULT_ONNX_OPSET)

Using framework PyTorch: 2.4.1+cu121
  if embeddings.size(1) != self.position_embeddings.size(1):
  if num_frames != self.time_embeddings.size(1):


# Запуск сессии ORT

In [11]:
import torch
import numpy as np
import onnxruntime as ort

In [12]:
ort.get_available_providers()

['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']

In [13]:
path = '../weights/csl_transformers_base/model.onnx'
options = ort.SessionOptions()
options.graph_optimization_level = \
    ort.GraphOptimizationLevel.ORT_ENABLE_ALL 
sess = ort.InferenceSession(path, providers=['CUDAExecutionProvider'], sess_options=options)

model.cuda()

[0;93m2024-09-27 21:44:00.223146496 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2024-09-27 21:44:00.223187615 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


TimesformerModel(
  (embeddings): TimesformerEmbeddings(
    (patch_embeddings): TimesformerPatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (time_drop): Dropout(p=0.0, inplace=False)
  )
  (encoder): TimesformerEncoder(
    (layer): ModuleList(
      (0): TimesformerLayer(
        (drop_path): Identity()
        (attention): TimeSformerAttention(
          (attention): TimesformerSelfAttention(
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (attn_drop): Dropout(p=0.0, inplace=False)
          )
          (output): TimesformerSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): TimesformerIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (dropout): Dropout(p=0.0, inplace=False

# Сранение скоростей работы

In [14]:
import time

onnx_time = []
torch_time = []
total = 32
batch = 16

for i in range(total):
    data = torch.rand([batch, 8, 3, 224, 224])
    
    t1 = time.time()
    data = data.cuda()
    with torch.no_grad():
        orig = model(data)
        orig.last_hidden_state.cpu()
        torch_time.append(time.time() - t1)

In [15]:
for i in range(total):
    inputs = {
        "pixel_values": torch.rand([batch, 8, 3, 224, 224]).numpy().astype(np.float32),
    }
    
    t1 = time.time()
    outputs = sess.run(None, inputs)
    onnx_time.append(time.time() - t1)
    

In [16]:
print(np.array(onnx_time).mean())
print(np.array(torch_time).mean())

0.33456847816705704
0.40667273104190826


# Сравнение точности конвертации

In [17]:
data = torch.rand([batch, 8, 3, 224, 224])
inputs = {
    "pixel_values": data.numpy().astype(np.float32),
}

In [18]:
outputs = sess.run(None, inputs)
with torch.no_grad():
    orig = model(data.cuda())

In [19]:
orig.last_hidden_state[:, 0].cpu().numpy().shape

(16, 768)

In [20]:
print(np.abs(orig.last_hidden_state[:, 0].cpu().numpy() - outputs[0][:, 0]).max())
print(np.abs(orig.last_hidden_state[:, 0].cpu().numpy() - outputs[0][:, 0]).min())

0.0002488792
0.0
