In [None]:
import torch
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', torchscript=True)

# Move the model to gpu if available and set eval mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_ = model.to(device).eval()

In [2]:
import random

sentences = [
    "Mars is the fourth planet from the Sun.",
    "has a crust primarily composed of elements",
    "However, it is unknown",
    "can be viewed from Earth",
    "It was the Romans",
]

len_dataset = 100

texts = []
for _ in range(len_dataset):
    n_times = random.randint(1, 30)
    texts.append(" ".join(random.choice(sentences) for _ in range(n_times)))

In [None]:
from speedster import optimize_model, save_model, load_model

In [4]:
dynamic_info = {
    "inputs": [
        {0: 'batch', 1: 'num_tokens'},
        {0: 'batch', 1: 'num_tokens'},
        {0: 'batch', 1: 'num_tokens'},
    ],
    "outputs": [
        {0: 'batch', 1: 'num_tokens'},
        {0: 'batch'},
    ]
}

In [5]:
import time

# Move inputs to gpu if available
encoded_inputs = [tokenizer(text, return_tensors="pt").to(device) for text in texts]

In [None]:
optimized_model = optimize_model(
    model=model,
    input_data=encoded_inputs,
    optimization_time="constrained",
    ignore_compilers=["tensor_rt", "tvm"],  # TensorRT does not work for this model
    dynamic_info=dynamic_info,
)

In [7]:
_ = model.to(device).eval()

In [14]:
times = []

# Warmup for 30 iterations
for encoded_input in encoded_inputs[:30]:
    with torch.no_grad():
        final_out = model(**encoded_input)

# Benchmark
for encoded_input in encoded_inputs:
    st = time.time()
    with torch.no_grad():
        final_out = model(**encoded_input)
    times.append(time.time()-st)
original_model_time = sum(times)/len(times)*1000
print(f"Average response time for original BERT: {original_model_time} ms")

Average response time for original BERT: 5.4140496253967285 ms


In [15]:
times = []

# Warmup for 30 iterations
for encoded_input in encoded_inputs[:30]:
    with torch.no_grad():
        final_out = optimized_model(**encoded_input)

# Benchmark
for encoded_input in encoded_inputs:
    st = time.time()
    with torch.no_grad():
        final_out = optimized_model(**encoded_input)
    times.append(time.time()-st)
optimized_model_time = sum(times)/len(times)*1000
print(f"Average response time for optimized BERT (no metric drop): {optimized_model_time} ms")

Average response time for optimized BERT (no metric drop): 4.281759262084961 ms


In [None]:
optimized_model = optimize_model(
    model=model,
    input_data=encoded_inputs,
    optimization_time="constrained",
    ignore_compilers=["tensor_rt", "tvm"],
    dynamic_info=dynamic_info,
    metric_drop_ths=0.1,
)

In [27]:
times = []

# Warmup for 30 iterations
for encoded_input in encoded_inputs[:30]:
    with torch.no_grad():
        final_out = optimized_model(**encoded_input)

# Benchmark
for encoded_input in encoded_inputs:
    st = time.time()
    with torch.no_grad():
        final_out = optimized_model(**encoded_input)
    times.append(time.time()-st)
optimized_model_time = sum(times)/len(times)*1000
print(f"Average response time for optimized BERT (metric drop): {optimized_model_time} ms")

Average response time for optimized BERT (metric drop): 2.9406261444091797 ms


## GPT2

In [28]:
import torch
from transformers import GPT2Tokenizer, GPT2Model

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2', torchscript=True)

# Move the model to gpu if available and set eval mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_ = model.to(device).eval()

In [None]:
dynamic_info = {
    "inputs": [
        {0: 'batch', 1: 'num_tokens'},
        {0: 'batch', 1: 'num_tokens'}
    ],
    "outputs": [
        {0: 'batch', 1: 'num_tokens'},
    ] + [{0: 'batch', 2: 'num_tokens'} for i in range(24)]
}

optimized_model = optimize_model(
    model=model,
    input_data=encoded_inputs,
    optimization_time="constrained",
    ignore_compilers=["tensor_rt", "tvm"],
    dynamic_info=dynamic_info,
)

In [33]:
_ = model.to(device)

In [40]:
times = []

# Warmup for 30 iterations
for encoded_input in encoded_inputs[:30]:
    with torch.no_grad():
        final_out = model(**encoded_input)

# Benchmark
for encoded_input in encoded_inputs:
    st = time.time()
    with torch.no_grad():
        final_out = model(**encoded_input)
    times.append(time.time()-st)
original_model_time = sum(times)/len(times)*1000
print(f"Average response time for original GPT2: {original_model_time} ms")

Average response time for original GPT2: 6.764638423919678 ms


In [41]:
times = []

# Warmup for 30 iterations
for encoded_input in encoded_inputs[:30]:
    with torch.no_grad():
        final_out = optimized_model(**encoded_input)

# Benchmark
for encoded_input in encoded_inputs:
    st = time.time()
    with torch.no_grad():
        final_out = optimized_model(**encoded_input)
    times.append(time.time()-st)
optimized_model_time = sum(times)/len(times)*1000
print(f"Average response time for optimized GPT2 (no metric drop): {optimized_model_time} ms")

Average response time for optimized GPT2 (no metric drop): 4.888675212860107 ms


In [None]:
optimized_model = optimize_model(
    model=model,
    input_data=encoded_inputs,
    optimization_time="constrained",
    ignore_compilers=["tensor_rt", "tvm"],
    dynamic_info=dynamic_info,
    metric_drop_ths=0.1,
)

In [49]:
times = []

# Warmup for 30 iterations
for encoded_input in encoded_inputs[:30]:
    with torch.no_grad():
        final_out = optimized_model(**encoded_input)

# Benchmark
for encoded_input in encoded_inputs:
    st = time.time()
    with torch.no_grad():
        final_out = optimized_model(**encoded_input)
    times.append(time.time()-st)
optimized_model_time = sum(times)/len(times)*1000
print(f"Average response time for optimized GPT2 (metric drop): {optimized_model_time} ms")

Average response time for optimized GPT2 (metric drop): 3.5162806510925293 ms


## T5

In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import time

model_name = "google/t5-efficient-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torchscript=True)

# Move the model to gpu if available and set eval mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_ = model.to(device).eval()

In [9]:
encoder = model.get_encoder()
decoder = model.get_decoder()

In [10]:
texts = [
    """BERT is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labeling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts.""",
    """GPT-2 is a transformers model pretrained on a very large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, it was trained to guess the next word in sentences.""",
    """With T5, we propose reframing all NLP tasks into a unified text-to-text-format where the input and output are always text strings, in contrast to BERT-style models that can only output either a class label or a span of the input. Our text-to-text framework allows us to use the same model, loss function, and hyperparameters on any NLP task.""",
    """LayoutLMv3 is a pre-trained multimodal Transformer for Document AI with unified text and image masking. The simple unified architecture and training objectives make LayoutLMv3 a general-purpose pre-trained model. For example, LayoutLMv3 can be fine-tuned for both text-centric tasks, including form understanding, receipt understanding, and document visual question answering, and image-centric tasks such as document image classification and document layout analysis.""",
    """XLNet is a new unsupervised language representation learning method based on a novel generalized permutation language modeling objective. Additionally, XLNet employs Transformer-XL as the backbone model, exhibiting excellent performance for language tasks involving long context. Overall, XLNet achieves state-of-the-art (SOTA) results on various downstream language tasks including question answering, natural language inference, sentiment analysis, and document ranking."""
]
texts = texts*20
encoded_inputs = [tokenizer(text, padding="longest", return_tensors="pt").to(device) for text in texts]

In [None]:
encoder_dynamic_info = {
    "inputs": [
        {0: 'batch', 1: 'num_tokens'},
        {0: 'batch', 1: 'num_tokens'}
    ],
    "outputs": [
        {0: 'batch', 1: 'num_tokens'},
    ]
}

# Create the optimized encoder model seperately
optimized_encoder_model = optimize_model(
    model=encoder,
    input_data=encoded_inputs,
    optimization_time="constrained",
    ignore_compilers=["tensor_rt", "tvm"],
    dynamic_info=encoder_dynamic_info,
)

In [None]:
decoder_dynamic_info = {
    "inputs": [
        {0: 'batch', 1: 'num_tokens'},
        {0: 'batch', 1: 'num_tokens'}
    ],
    "outputs": [
        {0: 'batch', 1: 'num_tokens'},
    ] + [{0: 'batch', 2: 'num_tokens'} for i in range(24)]
}

# Create the optimized decoder model seperately
optimized_decoder_model = optimize_model(
    model=decoder,
    input_data=encoded_inputs,
    optimization_time="constrained",
    ignore_compilers=["tensor_rt", "tvm"],
    dynamic_info=decoder_dynamic_info,
)

In [13]:
_ = encoder.to(device)
_ = decoder.to(device)

In [26]:
times = []
# Warmup for 30 iterations
for encoded_input in encoded_inputs[:30]:
    with torch.no_grad():
        encoder_out = encoder(**encoded_input)
        decoder_out = decoder(**encoded_input,encoder_hidden_states=encoder_out[0])

# Benchmark
for encoded_input in encoded_inputs:
    st = time.time()
    with torch.no_grad():
        encoder_out = encoder(**encoded_input)
        decoder_out = decoder(**encoded_input,encoder_hidden_states=encoder_out[0])
    times.append(time.time()-st)
original_model_time = sum(times)/len(times)*1000
print(f"Average response time for original T5: {original_model_time} ms")

Average response time for original T5: 14.36943769454956 ms


In [29]:
times = []

# Warmup for 30 iterations
for encoded_input in encoded_inputs[:30]:
    with torch.no_grad():
        encoder_out = optimized_encoder_model(**encoded_input)
        decoder_out = optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])

# Benchmark
for encoded_input in encoded_inputs:
    st = time.time()
    with torch.no_grad():
        encoder_out = optimized_encoder_model(**encoded_input)
        decoder_out = optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])
    times.append(time.time()-st)
optimized_model_time = sum(times)/len(times)*1000
print(f"Average response time for optimized T5 (no metric drop): {optimized_model_time} ms")

Average response time for optimized T5 (no metric drop): 9.04134750366211 ms


In [None]:
optimized_encoder_model = optimize_model(
    model=encoder,
    input_data=encoded_inputs,
    optimization_time="constrained",
    ignore_compilers=["tensor_rt", "tvm"],
    dynamic_info=encoder_dynamic_info,
    metric_drop_ths=0.1,
)

In [None]:
optimized_decoder_model = optimize_model(
    model=decoder,
    input_data=encoded_inputs,
    optimization_time="constrained",
    ignore_compilers=["tensor_rt", "tvm"],
    dynamic_info=decoder_dynamic_info,
    metric_drop_ths=0.1,
)

In [32]:
_ = encoder.to(device)
_ = decoder.to(device)

In [38]:
times = []

# Warmup for 30 iterations
for encoded_input in encoded_inputs[:30]:
    with torch.no_grad():
        encoder_out = optimized_encoder_model(**encoded_input)
        decoder_out = optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])

# Benchmark
for encoded_input in encoded_inputs:
    st = time.time()
    with torch.no_grad():
        encoder_out = optimized_encoder_model(**encoded_input)
        decoder_out = optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])
    times.append(time.time()-st)
optimized_model_time = sum(times)/len(times)*1000
print(f"Average response time for optimized T5 (metric drop): {optimized_model_time} ms")

Average response time for optimized T5 (metric drop): 7.490894794464111 ms
