In [1]:
!pip install onnx
!pip install onnxruntime-gpu
!pip install transformers
!pip install sentence-transformers



In [2]:
# Load pretrained model and tokenizer
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

def mean_pooling(model_output, attention_mask):
    """Attention-mask-aware mean pooling followed by L2 normalisation.

    Args:
        model_output: Indexable whose first element holds the token embeddings.
        attention_mask: Mask with one entry per token; it is expanded along a
            trailing axis to the shape of the token embeddings.

    Returns:
        Tensor of pooled embeddings, normalised to unit L2 norm along dim 1.
    """
    hidden_states = model_output[0]
    # Broadcast the mask over the embedding axis so masked-out tokens contribute zero.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = (hidden_states * mask).sum(dim=1)
    # Clamp the token count so a fully masked row cannot divide by zero.
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return F.normalize(masked_sum / token_count, p=2, dim=1)


# Identifier of the sentence-embedding checkpoint; reused later when the
# SentenceTransformer wrappers are built.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Load the matching tokenizer (lower-casing enabled) and the bare encoder.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
model = AutoModel.from_pretrained(model_name)


In [3]:
# Tokenise a single example sentence. The resulting tensors serve as the dummy
# input for the ONNX export and as the payload for the benchmark cells.
sample = ['Hey, how are you today?']
inputs = tokenizer(sample, padding=True, truncation=True, return_tensors="pt")
inputs

{'input_ids': tensor([[ 101, 4931, 1010, 2129, 2024, 2017, 2651, 1029,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [4]:
## Convert Model to ONNX Format
import os
device = torch.device("cpu")

# Set model to inference mode, which is required before exporting the model because some operators behave differently in
# inference and training mode.
model.eval()
model.to(device)

output_dir = os.path.join(".", "onnx_models")
# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs(output_dir, exist_ok=True)

export_model_path = os.path.join(output_dir, 'all_MiniLM_L6-v2.onnx')

# torch.onnx.export passes `args` to model.forward() positionally and assigns
# `input_names` in that same order. The tokenizer returns its tensors as
# (input_ids, token_type_ids, attention_mask), so `tuple(inputs.values())` would
# trace the model with the mask and the segment ids swapped. Select the tensors
# by key instead so each one lands on the parameter it is named after.
onnx_input_names = ['input_ids', 'attention_mask', 'token_type_ids']
export_args = tuple(inputs[name].to(device) for name in onnx_input_names)

# Axes of the token-level tensors that may vary between calls.
symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}

with torch.no_grad():
    torch.onnx.export(model,                                            # model being run
                      args=export_args,                                 # positional inputs, in forward() order
                      f=export_model_path,                              # where to save the model (can be a file or file-like object)
                      opset_version=11,                                 # the ONNX version to export the model to
                      do_constant_folding=True,                         # whether to execute constant folding for optimization
                      input_names=onnx_input_names,                     # the model's input names
                      # The encoder returns (token embeddings, pooled output); name them
                      # accordingly. Consumers in this notebook read outputs by position
                      # (session.run(None, ...)), so the names are informational here.
                      output_names=['last_hidden_state', 'pooler_output'],
                      dynamic_axes={'input_ids': symbolic_names,        # variable length axes
                                    'attention_mask': symbolic_names,
                                    'token_type_ids': symbolic_names,
                                    'last_hidden_state': symbolic_names,
                                    # The pooled output has no sequence axis; only the batch varies.
                                    'pooler_output': {0: 'batch_size'}})
    print("Model exported at ", export_model_path)

verbose: False, log level: Level.ERROR

Model exported at  ./onnx_models/all_MiniLM_L6-v2.onnx


In [5]:
# Generate Quantised ONNX model
import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType

quantized_model_path = "./onnx_models/all_MiniLM_L6-v2_quantised.onnx"

# Dynamic quantisation with int8 weights. quantize_dynamic is given the source
# and destination *paths*, so the exported model does not need to be loaded
# into memory here first.
quantize_dynamic(export_model_path,
                 quantized_model_path,
                 weight_type=QuantType.QInt8)

print(f"quantized model saved to:{quantized_model_path}")

Ignore MatMul due to non constant B: /[/encoder/layer.0/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/encoder/layer.0/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/encoder/layer.1/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/encoder/layer.1/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/encoder/layer.2/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/encoder/layer.2/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/encoder/layer.3/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/encoder/layer.3/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/encoder/layer.4/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/encoder/layer.4/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/encoder/layer.5/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/encoder/layer.5/attention/self/MatMul_1]
quantized model saved to:./onnx_models/a

## Using Sentence Transformers library (CPU)

In [6]:
from sentence_transformers import SentenceTransformer

import time
from tqdm import tqdm

# Pin the device locally so the report below reflects where this model actually
# runs, rather than whatever the global `device` currently holds.
sbert_cpu_device = torch.device("cpu")
model_sbert_cpu = SentenceTransformer(model_name, device=sbert_cpu_device)

total_samples = 1000
warmup_runs = 10

# Untimed warm-up so one-off initialisation cost does not skew the mean latency.
for _ in range(warmup_runs):
    model_sbert_cpu.encode(sample)

latency = []

for i in tqdm(range(total_samples)):
    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
    start = time.perf_counter()
    embeddings_cpu = model_sbert_cpu.encode(sample)
    latency.append(time.perf_counter() - start)

print("\n")
print("Sentence Transformer SBERT {} Inference time = {} ms".format(sbert_cpu_device.type, format(sum(latency) * 1000 / len(latency), '.2f')))

100%|██████████| 1000/1000 [00:28<00:00, 34.97it/s]



Sentence Transformer SBERT cpu Inference time = 27.50 ms





## Vanilla SBERT CPU

In [7]:
import time
from tqdm import tqdm
total_samples = 1000
warmup_runs = 10

latency = []
with torch.no_grad():
    # Untimed warm-up so one-off initialisation cost does not skew the mean latency.
    for _ in range(warmup_runs):
        mean_pooling(model(**inputs), inputs['attention_mask'])

    for i in tqdm(range(total_samples)):
        # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
        start = time.perf_counter()
        # Timed region: forward pass + pooling + conversion to a NumPy array.
        outputs_cpu = mean_pooling(model(**inputs), inputs['attention_mask']).cpu().detach().numpy()
        latency.append(time.perf_counter() - start)
print("\n")
print("PyTorch {} Inference time = {} ms".format(device.type, format(sum(latency) * 1000 / len(latency), '.2f')))

100%|██████████| 1000/1000 [00:14<00:00, 69.30it/s]



PyTorch cpu Inference time = 14.14 ms





## ONNX Converted Model CPU

In [8]:
import onnxruntime
import numpy

sess_options = onnxruntime.SessionOptions()

session = onnxruntime.InferenceSession(export_model_path, sess_options, providers=['CPUExecutionProvider'])

# The feed does not change between iterations, so build it once outside the loop.
ort_inputs = {k: v.cpu().numpy() for k, v in inputs.items()}
# Pooling runs on the CPU tensor returned by ONNX Runtime, so keep the mask on CPU too.
attention_mask_cpu = inputs['attention_mask'].cpu()

# Untimed warm-up so one-off initialisation cost does not skew the mean latency.
warmup_runs = 10
for _ in range(warmup_runs):
    session.run(None, ort_inputs)

latency = []
for i in tqdm(range(total_samples)):
    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
    start = time.perf_counter()
    op = session.run(None, ort_inputs)
    # First output holds the token embeddings consumed by mean_pooling.
    op = torch.from_numpy(op[0])
    ort_outputs_cpu = mean_pooling([op], attention_mask_cpu).cpu().detach().numpy()
    latency.append(time.perf_counter() - start)
print("\n")
print("OnnxRuntime cpu Inference time = {} ms".format(format(sum(latency) * 1000 / len(latency), '.2f')))

100%|██████████| 1000/1000 [00:05<00:00, 174.17it/s]



OnnxRuntime cpu Inference time = 5.63 ms





## Quantised ONNX Converted Model CPU

In [9]:
import onnxruntime
import numpy

sess_options = onnxruntime.SessionOptions()

session = onnxruntime.InferenceSession(quantized_model_path, sess_options, providers=['CPUExecutionProvider'])

# The feed does not change between iterations, so build it once outside the loop.
ort_inputs = {k: v.cpu().numpy() for k, v in inputs.items()}
# Pooling runs on the CPU tensor returned by ONNX Runtime, so keep the mask on CPU too.
attention_mask_cpu = inputs['attention_mask'].cpu()

# Untimed warm-up so one-off initialisation cost does not skew the mean latency.
warmup_runs = 10
for _ in range(warmup_runs):
    session.run(None, ort_inputs)

latency = []
for i in tqdm(range(total_samples)):
    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
    start = time.perf_counter()
    op = session.run(None, ort_inputs)
    # First output holds the token embeddings consumed by mean_pooling.
    op = torch.from_numpy(op[0])
    ort_outputs_quantised_cpu = mean_pooling([op], attention_mask_cpu).cpu().detach().numpy()
    latency.append(time.perf_counter() - start)
print("\n")
print("OnnxRuntime Quantised cpu Inference time = {} ms".format(format(sum(latency) * 1000 / len(latency), '.2f')))

100%|██████████| 1000/1000 [00:03<00:00, 271.16it/s]



OnnxRuntime Quantised cpu Inference time = 3.58 ms





In [10]:
# First 10 embedding dimensions from the sentence-transformers CPU run (reference values).
embeddings_cpu[:,:10]

array([[-0.03050258,  0.0481415 ,  0.10223   ,  0.0486282 ,  0.01371921,
        -0.05840193,  0.10347056, -0.03206377, -0.10671089, -0.00481879]],
      dtype=float32)

In [11]:
# First 10 embedding dimensions from the plain PyTorch CPU run, for comparison.
outputs_cpu[:,:10]

array([[-0.03050258,  0.0481415 ,  0.10223   ,  0.0486282 ,  0.01371921,
        -0.05840193,  0.10347056, -0.03206377, -0.10671089, -0.00481879]],
      dtype=float32)

In [12]:
# First 10 embedding dimensions from the ONNX Runtime CPU run, for comparison.
ort_outputs_cpu[:,:10]

array([[-0.03050255,  0.04814148,  0.10222996,  0.04862824,  0.01371919,
        -0.05840201,  0.10347056, -0.03206379, -0.10671096, -0.00481878]],
      dtype=float32)

In [13]:
# First 10 embedding dimensions from the quantised ONNX model on CPU, for comparison.
ort_outputs_quantised_cpu[:,:10]

array([[-0.03625318,  0.02991685,  0.11215121,  0.03071814,  0.0087405 ,
        -0.05622218,  0.09673943, -0.01424983, -0.11920619, -0.0072534 ]],
      dtype=float32)

## Using Sentence Transformers library (GPU)

In [14]:
device = torch.device("cuda")
model_sbert_gpu = SentenceTransformer(model_name, device=device)
total_samples = 1000
warmup_runs = 10

# Untimed warm-up so one-off initialisation cost (e.g. the first GPU call)
# does not skew the mean latency.
for _ in range(warmup_runs):
    model_sbert_gpu.encode(sample)

latency = []

for i in tqdm(range(total_samples)):
    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
    start = time.perf_counter()
    embeddings_gpu = model_sbert_gpu.encode(sample)
    latency.append(time.perf_counter() - start)

print("\n")
print("Sentence Transformer SBERT {} Inference time = {} ms".format(device.type, format(sum(latency) * 1000 / len(latency), '.2f')))

100%|██████████| 1000/1000 [00:08<00:00, 117.30it/s]



Sentence Transformer SBERT cuda Inference time = 8.40 ms





## Vanilla SBERT GPU

In [15]:
device = torch.device("cuda")

# Keep the model in inference mode (some operators behave differently in
# inference and training mode) and move it to the GPU for this benchmark.
model.eval()
model.to(device)


sample = ['Hey, how are you today?']
inputs = tokenizer(sample,
                   padding=True,
                   truncation=True,
                   return_tensors="pt"
                   )
# Move the tokenised tensors to the same device as the model.
inputs = inputs.to(device)

import time
from tqdm import tqdm
total_samples = 1000
warmup_runs = 10

latency = []
with torch.no_grad():
    # Untimed warm-up so one-off initialisation cost (e.g. the first GPU call)
    # does not skew the mean latency.
    for _ in range(warmup_runs):
        mean_pooling(model(**inputs), inputs['attention_mask']).cpu()

    for i in tqdm(range(total_samples)):
        # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
        start = time.perf_counter()
        # The copy back to host memory is part of the timed region, so the
        # measurement covers the complete forward pass and pooling.
        outputs_gpu = mean_pooling(model(**inputs), inputs['attention_mask']).cpu().detach().numpy()
        latency.append(time.perf_counter() - start)
print("\n")
print("PyTorch {} Inference time = {} ms".format(device.type, format(sum(latency) * 1000 / len(latency), '.2f')))

100%|██████████| 1000/1000 [00:05<00:00, 180.17it/s]



PyTorch cuda Inference time = 5.47 ms





## ONNX Converted Model GPU

In [16]:
import onnxruntime
import numpy

sess_options = onnxruntime.SessionOptions()

session = onnxruntime.InferenceSession(export_model_path, sess_options, providers=['CUDAExecutionProvider'])
# Fail loudly if the CUDA provider could not be registered; otherwise a CPU run
# would be reported under the "GPU" label.
assert 'CUDAExecutionProvider' in session.get_providers(), "CUDAExecutionProvider is not active for this session"

# The feed does not change between iterations, so build it once outside the loop.
ort_inputs = {k: v.cpu().numpy() for k, v in inputs.items()}
# Pooling runs on the CPU tensor returned by ONNX Runtime, so keep the mask on CPU too.
attention_mask_cpu = inputs['attention_mask'].cpu()

# Untimed warm-up so one-off initialisation cost (e.g. the first GPU call)
# does not skew the mean latency.
warmup_runs = 10
for _ in range(warmup_runs):
    session.run(None, ort_inputs)

latency = []
for i in tqdm(range(total_samples)):
    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
    start = time.perf_counter()
    op = session.run(None, ort_inputs)
    # First output holds the token embeddings consumed by mean_pooling.
    op = torch.from_numpy(op[0])
    ort_outputs_gpu = mean_pooling([op], attention_mask_cpu).cpu().detach().numpy()
    latency.append(time.perf_counter() - start)
print("\n")
print("OnnxRuntime GPU Inference time = {} ms".format(format(sum(latency) * 1000 / len(latency), '.2f')))

100%|██████████| 1000/1000 [00:02<00:00, 439.91it/s]



OnnxRuntime GPU Inference time = 2.09 ms





## Quantised ONNX Converted Model GPU

In [17]:
import onnxruntime
import numpy

sess_options = onnxruntime.SessionOptions()

# NOTE(review): the dynamically quantised int8 graph is presumably not fully
# supported by CUDAExecutionProvider, in which case unsupported nodes fall back
# to CPU. That would explain this run being slower than the FP32 CUDA run.
# Confirm with verbose session logging before drawing conclusions.
session = onnxruntime.InferenceSession(quantized_model_path, sess_options, providers=['CUDAExecutionProvider'])
# Fail loudly if the CUDA provider could not be registered; otherwise a CPU run
# would be reported under the "GPU" label.
assert 'CUDAExecutionProvider' in session.get_providers(), "CUDAExecutionProvider is not active for this session"

# The feed does not change between iterations, so build it once outside the loop.
ort_inputs = {k: v.cpu().numpy() for k, v in inputs.items()}
# Pooling runs on the CPU tensor returned by ONNX Runtime, so keep the mask on CPU too.
attention_mask_cpu = inputs['attention_mask'].cpu()

# Untimed warm-up so one-off initialisation cost (e.g. the first GPU call)
# does not skew the mean latency.
warmup_runs = 10
for _ in range(warmup_runs):
    session.run(None, ort_inputs)

latency = []
for i in tqdm(range(total_samples)):
    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
    start = time.perf_counter()
    op = session.run(None, ort_inputs)
    # First output holds the token embeddings consumed by mean_pooling.
    op = torch.from_numpy(op[0])
    ort_outputs_quantised_gpu = mean_pooling([op], attention_mask_cpu).cpu().detach().numpy()
    latency.append(time.perf_counter() - start)
print("\n")
print("OnnxRuntime Quantised GPU Inference time = {} ms".format(format(sum(latency) * 1000 / len(latency), '.2f')))

100%|██████████| 1000/1000 [00:05<00:00, 172.96it/s]



OnnxRuntime Quantised GPU Inference time = 5.55 ms





In [18]:
# First 10 embedding dimensions from the sentence-transformers GPU run (reference values).
embeddings_gpu[:,:10]

array([[-0.03050257,  0.04814149,  0.10222998,  0.04862824,  0.01371918,
        -0.05840195,  0.10347056, -0.03206379, -0.10671094, -0.00481877]],
      dtype=float32)

In [19]:
# First 10 embedding dimensions from the plain PyTorch GPU run, for comparison.
outputs_gpu[:,:10]

array([[-0.03050257,  0.04814149,  0.10222998,  0.04862824,  0.01371918,
        -0.05840195,  0.10347056, -0.03206379, -0.10671094, -0.00481877]],
      dtype=float32)

In [20]:
# First 10 embedding dimensions from the ONNX Runtime run created with the CUDA provider, for comparison.
ort_outputs_gpu[:,:10]

array([[-0.03050256,  0.04814154,  0.10222998,  0.04862824,  0.01371915,
        -0.05840195,  0.10347054, -0.0320638 , -0.10671094, -0.00481879]],
      dtype=float32)

In [21]:
# First 10 embedding dimensions from the quantised ONNX model run created with the CUDA provider, for comparison.
ort_outputs_quantised_gpu[:,:10]

array([[-0.03625319,  0.02991689,  0.1121512 ,  0.03071815,  0.0087405 ,
        -0.05622218,  0.09673939, -0.01424985, -0.11920615, -0.0072534 ]],
      dtype=float32)