In [1]:
!pip install onnx
!pip install onnxruntime-gpu
!pip install transformers
!pip install sentence-transformers



In [2]:
# Load pretrained model and tokenizer
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

def mean_pooling(model_output, attention_mask):
    """Mask-aware mean pooling of token embeddings followed by L2 normalization.

    Args:
        model_output: Indexable container whose element 0 holds the per-token
            embeddings (indexed as batch, token, feature).
        attention_mask: Per-token mask (batch, token); entries equal to 0 are
            excluded from the average.

    Returns:
        One embedding per batch row, scaled to unit L2 norm along dim 1.
    """
    token_embeddings = model_output[0]

    # Broadcast the mask across the feature axis so it can weight every component.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

    summed = (token_embeddings * mask).sum(1)
    # Clamp the token count so a fully masked row divides by 1e-9 instead of 0.
    counts = mask.sum(1).clamp(min=1e-9)

    return F.normalize(summed / counts, p=2, dim=1)


# Checkpoint identifier passed to both from_pretrained() calls below.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Tokenizer (with lower-casing enabled) and the matching encoder.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    do_lower_case=True,
)
model = AutoModel.from_pretrained(model_name)

In [3]:
# Tokenize one example sentence; the resulting tensors are the example input
# used to run the model when exporting it to ONNX.
sample = ['Hey, how are you today?']

tokenizer_kwargs = dict(padding=True, truncation=True, return_tensors="pt")
inputs = tokenizer(sample, **tokenizer_kwargs)
inputs

{'input_ids': tensor([[ 101, 4931, 1010, 2129, 2024, 2017, 2651, 1029,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [4]:
## Convert Model to ONNX Format
import os
device = torch.device("cpu")

# Set model to inference mode, which is required before exporting the model because some operators behave differently in
# inference and training mode.
model.eval()
model.to(device)

output_dir = os.path.join(".", "onnx_models")
# exist_ok=True makes this idempotent, so no separate existence check is needed.
os.makedirs(output_dir, exist_ok=True)

export_model_path = os.path.join(output_dir, 'all_MiniLM_L6-v2.onnx')

# torch.onnx.export passes `args` to the model positionally and assigns
# `input_names` to the graph inputs in that same order. The tokenizer returns
# its tensors as (input_ids, token_type_ids, attention_mask), which does NOT
# match that order, so build the example tuple explicitly by name instead of
# relying on tuple(inputs.values()).
onnx_input_names = ['input_ids', 'attention_mask', 'token_type_ids']
example_args = tuple(inputs[name] for name in onnx_input_names)

# Outputs are named after what the encoder returns: per-token hidden states
# and the pooled vector (these were previously labelled 'start'/'end').
onnx_output_names = ['last_hidden_state', 'pooler_output']

# Inputs and the per-token output vary in batch size and sequence length.
# The pooled output has no sequence axis, so only its batch axis is dynamic.
sequence_axes = {0: 'batch_size', 1: 'max_seq_len'}
dynamic_axes = {name: sequence_axes for name in onnx_input_names}
dynamic_axes['last_hidden_state'] = sequence_axes
dynamic_axes['pooler_output'] = {0: 'batch_size'}

with torch.no_grad():
    torch.onnx.export(model,                                # model being run
                      args=example_args,                    # example inputs, in forward() order
                      f=export_model_path,                  # where to save the model (can be a file or file-like object)
                      opset_version=11,                     # the ONNX version to export the model to
                      do_constant_folding=True,             # whether to execute constant folding for optimization
                      input_names=onnx_input_names,         # the model's input names
                      output_names=onnx_output_names,       # the model's output names
                      dynamic_axes=dynamic_axes)            # variable length axes
    print("Model exported at ", export_model_path)

verbose: False, log level: Level.ERROR

Model exported at  ./onnx_models/all_MiniLM_L6-v2.onnx


In [5]:
import pandas as pd
import numpy as np
# Read only the 'Overview' column of the CSV; it supplies the benchmark sentences.
df = pd.read_csv(
    './imdb_top_1000.csv',
    usecols=['Overview'],
)
df

Unnamed: 0,Overview
0,Two imprisoned men bond over a number of years...
1,An organized crime dynasty's aging patriarch t...
2,When the menace known as the Joker wreaks havo...
3,The early life and career of Vito Corleone in ...
4,A jury holdout attempts to prevent a miscarria...
...,...
995,A young New York socialite becomes interested ...
996,Sprawling epic covering the life of a Texas ca...
997,"In Hawaii in 1941, a private is cruelly punish..."
998,Several survivors of a torpedoed merchant ship...


## Vanilla SBERT CPU

In [6]:
import time
from tqdm import tqdm
# Benchmark: encode every overview one at a time with the PyTorch model and
# record the per-sample latency (model forward + pooling + conversion to numpy).
total_samples = len(df)

latency = []
outputs_cpu = []
with torch.no_grad():
    for i in tqdm(range(total_samples)):

        data = [df.loc[i, "Overview"]]

        # Tokenization is deliberately kept outside the timed region.
        inputs = tokenizer(data,
                           padding=True,
                           truncation=True,
                           return_tensors="pt"
                           )

        # perf_counter() is monotonic and high-resolution; time.time() can jump
        # with system clock adjustments, which distorts millisecond timings.
        start = time.perf_counter()
        outputs_cpu.append(mean_pooling(model(**inputs), inputs['attention_mask']).cpu().detach().numpy())
        latency.append(time.perf_counter() - start)

print("\n")
print("PyTorch {} Inference time = {} ms".format(device.type, np.round(np.average(latency)*1000, 4)))

100%|██████████| 1000/1000 [00:36<00:00, 27.62it/s]



PyTorch cpu Inference time = 34.2605 ms





## ONNX Converted Model CPU

In [7]:
import onnxruntime
import numpy as np

# Benchmark: run the exported ONNX model with ONNX Runtime's CPU provider and
# record the per-sample latency (session run + pooling + conversion to numpy).
sess_options = onnxruntime.SessionOptions()

session = onnxruntime.InferenceSession(export_model_path, sess_options, providers=['CPUExecutionProvider'])

latency = []
ort_outputs_cpu = []

for i in tqdm(range(total_samples)):

    data = [df.loc[i, "Overview"]]

    inputs = tokenizer(data,
                       padding=True,
                       truncation=True,
                       return_tensors="pt"
                       )

    # session.run() takes a dict of numpy arrays keyed by the ONNX input names.
    ort_inputs = {k: v.cpu().numpy() for k, v in inputs.items()}

    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # with system clock adjustments, which distorts millisecond timings.
    start = time.perf_counter()
    op = session.run(None, ort_inputs)
    # Output 0 holds the token embeddings; wrap it so mean_pooling can index it
    # the same way it indexes the PyTorch model output.
    op = torch.from_numpy(op[0])
    ort_outputs_cpu.append(mean_pooling([op], inputs['attention_mask']).cpu().detach().numpy())
    latency.append(time.perf_counter() - start)

print("\n")
print("OnnxRuntime {} Inference time = {} ms".format(device.type, np.round(np.average(latency)*1000, 4)))

100%|██████████| 1000/1000 [00:16<00:00, 60.80it/s]



OnnxRuntime cpu Inference time = 15.5696 ms





In [8]:
# Show the first 10 columns of the first stored PyTorch CPU result, for a visual
# comparison against the ONNX Runtime result.
outputs_cpu[0][:,:10]

array([[-0.06326339,  0.0414625 , -0.04707527, -0.03361899, -0.02562934,
         0.03499832,  0.00804075, -0.05042004,  0.00215668, -0.03816812]],
      dtype=float32)

In [9]:
# Show the first 10 columns of the first stored ONNX Runtime CPU result, for a
# visual comparison against the PyTorch result.
ort_outputs_cpu[0][:,:10]

array([[-0.06326343,  0.04146247, -0.04707528, -0.033619  , -0.02562926,
         0.03499835,  0.0080408 , -0.05042008,  0.00215669, -0.03816817]],
      dtype=float32)

## Vanilla SBERT GPU

In [10]:
device = torch.device("cuda")

# Put the model in inference mode (some operators behave differently in
# training mode) and move its weights to the GPU for this benchmark.
model.eval()
model.to(device)

import time
from tqdm import tqdm
# Derive the sample count from the data instead of hard-coding 1000, so the
# loop neither raises on a shorter CSV nor silently skips rows of a longer one.
total_samples = len(df)

# NOTE(review): no warm-up pass is run, so any one-time GPU start-up cost of
# the first iteration is presumably included in the reported average.
latency = []
outputs_gpu = []
with torch.no_grad():
    for i in tqdm(range(total_samples)):

        data = [df.loc[i, "Overview"]]

        # Tokenization and the host-to-device copy stay outside the timed region.
        inputs = tokenizer(data,
                           padding=True,
                           truncation=True,
                           return_tensors="pt"
                          ).to(device)

        # perf_counter() is monotonic and high-resolution; time.time() can jump
        # with system clock adjustments, which distorts millisecond timings.
        # The trailing .cpu() copies the result back to the host inside the
        # timed region.
        start = time.perf_counter()
        outputs_gpu.append(mean_pooling(model(**inputs), inputs['attention_mask']).cpu().detach().numpy())
        latency.append(time.perf_counter() - start)

print("\n")
print("PyTorch {} Inference time = {} ms".format(device.type, np.round(np.average(latency)*1000, 4)))

100%|██████████| 1000/1000 [00:07<00:00, 135.29it/s]



PyTorch cuda Inference time = 6.737 ms





## ONNX Converted Model GPU

In [11]:
import onnxruntime
import numpy as np

# Benchmark: run the exported ONNX model with ONNX Runtime's CUDA provider and
# record the per-sample latency (session run + pooling + conversion to numpy).
sess_options = onnxruntime.SessionOptions()

session = onnxruntime.InferenceSession(export_model_path, sess_options, providers=['CUDAExecutionProvider'])

# Fail loudly if the CUDA provider is not active for this session; otherwise a
# CPU run would be reported below as a "cuda" result.
if 'CUDAExecutionProvider' not in session.get_providers():
    raise RuntimeError(
        "CUDAExecutionProvider is not active for this session "
        "(active providers: {})".format(session.get_providers())
    )

latency = []
ort_outputs_gpu = []
for i in tqdm(range(total_samples)):

    data = [df.loc[i, "Overview"]]

    # session.run() is fed numpy (host) arrays, so the tokenizer output is kept
    # on the CPU: moving it to the GPU only to copy it straight back added
    # transfers, one of which was inside the timed region.
    inputs = tokenizer(data,
                       padding=True,
                       truncation=True,
                       return_tensors="pt"
                      )

    ort_inputs = {k: v.numpy() for k, v in inputs.items()}

    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # with system clock adjustments, which distorts millisecond timings.
    start = time.perf_counter()
    op = session.run(None, ort_inputs)
    # Output 0 holds the token embeddings; wrap it so mean_pooling can index it
    # the same way it indexes the PyTorch model output.
    op = torch.from_numpy(op[0])
    ort_outputs_gpu.append(mean_pooling([op], inputs['attention_mask']).numpy())
    latency.append(time.perf_counter() - start)

print("\n")
print("OnnxRuntime {} Inference time = {} ms".format(device.type, np.round(np.average(latency)*1000, 4)))

100%|██████████| 1000/1000 [00:02<00:00, 373.49it/s]



OnnxRuntime cuda Inference time = 1.9466 ms





In [12]:
# Show the first 10 columns of the first stored PyTorch GPU result, for a visual
# comparison against the ONNX Runtime result.
outputs_gpu[0][:,:10]

array([[-0.06326333,  0.04146247, -0.0470753 , -0.03361904, -0.02562935,
         0.03499833,  0.00804079, -0.05042002,  0.00215669, -0.03816818]],
      dtype=float32)

In [13]:
# Show the first 10 columns of the first stored ONNX Runtime GPU result, for a
# visual comparison against the PyTorch result.
ort_outputs_gpu[0][:,:10]

array([[-0.06326336,  0.04146249, -0.04707528, -0.03361899, -0.02562931,
         0.03499832,  0.0080408 , -0.05042004,  0.00215668, -0.03816817]],
      dtype=float32)