In [1]:
################Just Test#######################

import torch
import time
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model.eval()

input_text = "What are the symptoms of COVID-19?"
input_ids = tokenizer.encode(input_text, return_tensors="pt")


# Warmup
for _ in range(5):
    _ = model(input_ids)

# Measure latency
latencies = []
for _ in range(100):
    start = time.time()
    _ = model(input_ids)
    latencies.append(time.time() - start)

print(f"Inference Latency (median): {np.percentile(latencies, 50)*1000:.2f} ms")
print(f"95th percentile: {np.percentile(latencies, 95)*1000:.2f} ms")
print(f"Throughput: {100/np.sum(latencies):.2f} req/s")


###################Still to do batch throughput

Inference Latency (median): 19.06 ms
95th percentile: 21.46 ms
Throughput: 52.46 req/s


In [7]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model.eval()

onnx_model_path = "models/gpt-2.onnx"
input_text = "Hello"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

torch.onnx.export(
    model, input_ids, onnx_model_path,
    input_names=["input_ids"], output_names=["logits"],
    dynamic_axes={"input_ids": {0: "batch", 1: "sequence"}, "logits": {0: "batch"}},
    opset_version=13
)
print("✅ Exported gpt2.onnx")

#______________Just a test to convert ONNX, need to scale to lab level when working on docker______________

✅ Exported gpt2.onnx


In [9]:
import onnx

onnx_model = onnx.load(onnx_model_path)
onnx.checker.check_model(onnx_model)

In [6]:
############Inferenece Session on CPU#########
import onnxruntime as ort
import numpy as np
from transformers import GPT2Tokenizer
import time

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
session = ort.InferenceSession("gpt2.onnx", providers=["CPUExecutionProvider"])

def benchmark_session(session):

    input_text = "Tell me about diabetes."
    input_ids = tokenizer.encode(input_text, return_tensors="np").astype(np.int64)
    
    # Warm-up
    for _ in range(5):
        session.run(None, {"input_ids": input_ids})
    
    # Benchmark
    latencies = []
    for _ in range(100):
        start = time.time()
        session.run(None, {"input_ids": input_ids})
        latencies.append(time.time() - start)
    
    print(f"ONNX Inference Latency (median): {np.percentile(latencies, 50)*1000:.2f} ms")
    print(f"Throughput: {100/np.sum(latencies):.2f} req/s")


benchmark_session(session)


ONNX Inference Latency (median): 9.63 ms
Throughput: 101.40 req/s


In [None]:
###############Lots of Metrics Missing above, Just a template we'll scale on docker later################b

In [13]:
###########Apply Graph Optimisations#######

import onnxruntime as ort

sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_options.optimized_model_filepath = "models/gpt2_optimized.onnx"

session = ort.InferenceSession("gpt2.onnx", sess_options=sess_options, providers=["CPUExecutionProvider"])
print("✅ Saved optimized graph to gpt2_optimized.onnx")


✅ Saved optimized graph to gpt2_optimized.onnx


[0;93m2025-04-07 15:45:54.928811 [W:onnxruntime:, inference_session.cc:2039 Initialize] Serializing optimized model with Graph Optimization level greater than ORT_ENABLE_EXTENDED and the NchwcTransformer enabled. The generated model may contain hardware specific optimizations, and should only be used in the same environment the model was optimized in.[m


In [15]:
###############Checking metrics for graph optimisedb
onnx_model_path = "models/gpt2_optimized.onnx"
ort_session = ort.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])
benchmark_session(ort_session)

ONNX Inference Latency (median): 8.97 ms
Throughput: 110.01 req/s


In [2]:
import neural_compressor
from neural_compressor import quantization

# Load ONNX model into Intel Neural Compressor
model_path = "models/gpt2.onnx"
fp32_model = neural_compressor.model.onnx_model.ONNXModel(model_path)

# Configure the quantizer
config_ptq = neural_compressor.PostTrainingQuantConfig(
    approach="dynamic"
)

# Fit the quantized model
q_model = quantization.fit(
    model=fp32_model, 
    conf=config_ptq
)

# Save quantized model
q_model.save_model_to_file("models/food11_quantized_dynamic.onnx")

2025-04-07 16:14:40 [INFO] Start auto tuning.
2025-04-07 16:14:40 [INFO] Quantize model without tuning!
2025-04-07 16:14:40 [INFO] Quantize the model with default configuration without evaluating the model.                To perform the tuning process, please either provide an eval_func or provide an                    eval_dataloader an eval_metric.
2025-04-07 16:14:40 [INFO] Adaptor has 5 recipes.
2025-04-07 16:14:40 [INFO] 0 recipes specified by user.
2025-04-07 16:14:40 [INFO] 3 recipes require future tuning.
2025-04-07 16:14:40 [INFO] *** Initialize auto tuning
2025-04-07 16:14:40 [INFO] {
2025-04-07 16:14:40 [INFO]     'PostTrainingQuantConfig': {
2025-04-07 16:14:40 [INFO]         'AccuracyCriterion': {
2025-04-07 16:14:40 [INFO]             'criterion': 'relative',
2025-04-07 16:14:40 [INFO]             'higher_is_better': True,
2025-04-07 16:14:40 [INFO]             'tolerable_loss': 0.01,
2025-04-07 16:14:40 [INFO]             'absolute': None,
2025-04-07 16:14:40 [INFO]     

In [None]:
####################Do static quantisation on the DOCKER container#############
################Run benchmarks on all the quantised models like before#########bb

In [None]:
############Below things wont work now but once we have gpu they will#############

In [7]:
import onnxruntime as ort
import numpy as np
from transformers import GPT2Tokenizer
import time
onnx_model_path = "models/gpt2.onnx"
ort_session = ort.InferenceSession(onnx_model_path, providers=['CUDAExecutionProvider'])
benchmark_session(ort_session)
ort.get_device()

ONNX Inference Latency (median): 9.14 ms
Throughput: 108.20 req/s


'CPU'

In [9]:
onnx_model_path = "models/gpt2.onnx"
ort_session = ort.InferenceSession(onnx_model_path, providers=['TensorrtExecutionProvider'])
benchmark_session(ort_session)
ort.get_device()

ONNX Inference Latency (median): 9.17 ms
Throughput: 107.88 req/s


'CPU'