# distilBERT

In [None]:
!optimum-cli export onnx --model distilbert-base-uncased-finetuned-sst-2-english --optimize O4 distilbert_onnx/ --device cuda

In [3]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
import logging
logging.disable(logging.WARNING)

In [4]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
onnx_model = ORTModelForSequenceClassification.from_pretrained('distilbert_onnx', provider='CUDAExecutionProvider')

2023-05-16 18:25:37.881117089 [W:onnxruntime:, session_state.cc:1136 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2023-05-16 18:25:37.881128100 [W:onnxruntime:, session_state.cc:1138 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_ = model.to(device).eval()

In [6]:
import random
import time

sentences = [
    "Mars is the fourth planet from the Sun.",
    "has a crust primarily composed of elements",
    "However, it is unknown",
    "can be viewed from Earth",
    "It was the Romans",
]

len_dataset = 100

texts = []
for _ in range(len_dataset):
    n_times = random.randint(1, 30)
    texts.append(" ".join(random.choice(sentences) for _ in range(n_times)))

In [9]:
classification = pipeline("text-classification", model=model, tokenizer=tokenizer, device="cuda:0")
start = time.time()
classification(texts)
end = time.time()
print(f"Average response time for original distilBERT: {(end-start)/100} ms")

Average response time for original distilBERT: 0.003664400577545166 ms


In [10]:
onnx_classification = pipeline("text-classification", model=onnx_model, tokenizer=tokenizer, device="cuda:0")
start = time.time()
onnx_classification(texts)
end = time.time()
print(f"Average response time for original distilBERT: {(end-start)/100} ms")

Average response time for original distilBERT: 0.0016323709487915039 ms


2023-05-16 18:25:46.483073433 [W:onnxruntime:, session_state.cc:1136 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2023-05-16 18:25:46.483083402 [W:onnxruntime:, session_state.cc:1138 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.


# GPT2

In [None]:
!optimum-cli export onnx --model gpt2 --optimize O4 gpt2_onnx/ --device cuda

In [11]:
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
from optimum.onnxruntime import ORTModelForCausalLM
import logging
logging.disable(logging.WARNING)

In [12]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')
onnx_model = ORTModelForCausalLM.from_pretrained('gpt2_onnx', provider='CUDAExecutionProvider')

2023-05-16 18:25:56.538311185 [W:onnxruntime:, session_state.cc:1136 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2023-05-16 18:25:56.538323819 [W:onnxruntime:, session_state.cc:1138 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.


In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_ = model.to(device).eval()

In [16]:
generation = pipeline("text-generation", model=model, tokenizer=tokenizer, device="cuda:0")
start = time.time()
generation(texts)
end = time.time()
print(f"Average response time for original GPT2: {(end-start)/100} ms")

Average response time for original GPT2: 0.03029601812362671 ms


In [17]:
onnx_generation = pipeline("text-generation", model=onnx_model, tokenizer=tokenizer, device="cuda:0")
start = time.time()
onnx_generation(texts)
end = time.time()
print(f"Average response time for onnx GPT2: {(end-start)/100} ms")

2023-05-16 18:26:15.795406358 [W:onnxruntime:, session_state.cc:1136 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2023-05-16 18:26:15.795419262 [W:onnxruntime:, session_state.cc:1138 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.


Average response time for onnx GPT2: 0.022464430332183837 ms


# T5

In [None]:
!optimum-cli export onnx --model t5-small --optimize O4 t5_onnx/ --device cuda

In [18]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from optimum.onnxruntime import ORTModelForSeq2SeqLM

In [None]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained('t5-small')
onnx_model = ORTModelForSeq2SeqLM.from_pretrained("t5_onnx", use_io_binding=True, provider='CUDAExecutionProvider')

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_ = model.to(device).eval()

In [21]:
summarization = pipeline("summarization", model=model, tokenizer=tokenizer, device="cuda:0")
start = time.time()
summarization(texts)
end = time.time()
print(f"Average response time for original T5: {(end-start)/100} ms")

Average response time for original T5: 0.22183520793914796 ms


In [22]:
onnx_summarization = pipeline("summarization", model=onnx_model, tokenizer=tokenizer, device="cuda:0")
start = time.time()
onnx_summarization(texts)
end = time.time()
print(f"Average response time for onnx T5: {(end-start)/100} ms")

Average response time for onnx T5: 0.1483987784385681 ms
