In [2]:
# Install Dependencies

In [1]:
!pip install transformers onnx onnxruntime onnxruntime-tools psutil


Collecting onnx
  Downloading onnx-1.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting onnxruntime-tools
  Downloading onnxruntime_tools-1.7.0-py3-none-any.whl.metadata (14 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting py3nvml (from onnxruntime-tools)
  Downloading py3nvml-0.2.7-py3-none-any.whl.metadata (13 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Collecting xmltodict (from py3nvml->onnxruntime-tools)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading onnx-1.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB

# Load DistilBERT and Tokenizer

In [3]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch

model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertModel.from_pretrained(model_name)
model.eval()

# Sample input
text = "Machine learning optimization is exciting!"
inputs = tokenizer(text, return_tensors="pt")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

#Baseline Inference & Latency

In [4]:
import time
import psutil
import os

def measure_latency(model, inputs, runs=20):
    with torch.no_grad():
        # Warm up
        for _ in range(5):
            _ = model(**inputs)

        start = time.time()
        for _ in range(runs):
            _ = model(**inputs)
        end = time.time()

    latency = ((end - start) / runs) * 1000  # ms
    ram_usage = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)
    return latency, ram_usage

latency_fp32, ram_fp32 = measure_latency(model, inputs)
print(f"Baseline Latency (FP32): {latency_fp32:.2f} ms")
print(f"RAM Usage: {ram_fp32:.2f} MB")


Baseline Latency (FP32): 43.92 ms
RAM Usage: 1441.26 MB


# Export to ONNX

In [6]:
import os

onnx_path = "distilbert.onnx"
torch.onnx.export(
    model,
    (inputs["input_ids"],),
    onnx_path,
    input_names=["input_ids"],
    output_names=["last_hidden_state"],
    dynamic_axes={"input_ids": {0: "batch_size", 1: "sequence"}},
    opset_version=17  # ✅ Fix: Supports scaled_dot_product_attention
)

print(f"Model exported to {onnx_path}")


Model exported to distilbert.onnx


# Quantize to INT8 using ONNX Runtime

In [7]:
from onnxruntime.quantization import quantize_dynamic, QuantType

quantized_model_path = "distilbert_quantized.onnx"
quantize_dynamic(
    model_input=onnx_path,
    model_output=quantized_model_path,
    weight_type=QuantType.QInt8
)

print(f"Quantized model saved to {quantized_model_path}")




Quantized model saved to distilbert_quantized.onnx


# Run Quantized Model & Benchmark

In [8]:
import onnxruntime as ort
import numpy as np

def ort_inference(session, input_ids):
    ort_inputs = {"input_ids": input_ids.cpu().numpy()}
    return session.run(None, ort_inputs)

# Load session
session = ort.InferenceSession(quantized_model_path)
input_ids = inputs["input_ids"]

# Warm-up + timing
start = time.time()
for _ in range(20):
    ort_inference(session, input_ids)
end = time.time()

latency_int8 = ((end - start) / 20) * 1000  # ms
ram_int8 = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)

print(f"Quantized Latency (INT8): {latency_int8:.2f} ms")
print(f"RAM Usage: {ram_int8:.2f} MB")


Quantized Latency (INT8): 21.83 ms
RAM Usage: 1813.94 MB


# Save metrics.json for Streamlit

In [9]:
import json
from pathlib import Path

metrics = [
    {
        "model": "DistilBERT FP32",
        "latency_ms": round(latency_fp32, 2),
        "size_mb": round(Path(onnx_path).stat().st_size / 1e6, 2),
        "ram_usage_mb": round(ram_fp32, 2),
    },
    {
        "model": "DistilBERT INT8",
        "latency_ms": round(latency_int8, 2),
        "size_mb": round(Path(quantized_model_path).stat().st_size / 1e6, 2),
        "ram_usage_mb": round(ram_int8, 2),
    },
]

with open("metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

print("Saved metrics to metrics.json")


Saved metrics to metrics.json


# Load GPT2 with KV cache setup

In [10]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model.eval()

prompt = "The future of AI lies in"
gpt2_inputs = gpt2_tokenizer(prompt, return_tensors="pt")


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

# GPT2 Latency Benchmark (with vs. without KV cache)

In [12]:
from transformers import DynamicCache

def benchmark_gpt2(model, tokenizer, prompt, use_cache=True, runs=30):
    model.config.use_cache = use_cache
    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
    ram_start = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)

    past_key_values = None
    start = time.time()

    for _ in range(runs):
        if use_cache:
            if past_key_values is not None:
                input_ids = tokenizer("!", return_tensors="pt")["input_ids"]  # 1 token continuation
            outputs = model(input_ids=input_ids, past_key_values=past_key_values, use_cache=True)
            past_key_values = DynamicCache.from_legacy_cache(outputs.past_key_values)
        else:
            input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
            outputs = model(input_ids=input_ids, use_cache=False)

    end = time.time()
    ram_end = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)

    latency = ((end - start) / runs) * 1000
    ram_usage = ram_end - ram_start
    return round(latency, 2), round(ram_usage, 2)


In [13]:
prompt = "The future of AI lies in"
lat_kv, ram_kv = benchmark_gpt2(gpt2_model, gpt2_tokenizer, prompt, use_cache=True)
lat_nokv, ram_nokv = benchmark_gpt2(gpt2_model, gpt2_tokenizer, prompt, use_cache=False)

print(f"With KV Cache: {lat_kv} ms | RAM: {ram_kv} MB")
print(f"Without KV Cache: {lat_nokv} ms | RAM: {ram_nokv} MB")


With KV Cache: 82.15 ms | RAM: 30.25 MB
Without KV Cache: 147.27 ms | RAM: 0.0 MB


In [14]:
gpt2_results = [
    {
        "model": "GPT2 (no KV cache)",
        "latency_ms": round(lat_nokv, 2),
        "size_mb": round(gpt2_model.num_parameters() * 4 / 1e6, 2),
        "ram_usage_mb": round(ram_nokv, 2),
    },
    {
        "model": "GPT2 (KV cache)",
        "latency_ms": round(lat_kv, 2),
        "size_mb": round(gpt2_model.num_parameters() * 4 / 1e6, 2),
        "ram_usage_mb": round(ram_kv, 2),
    }
]

# Append to file
with open("metrics.json", "r") as f:
    existing = json.load(f)

all_metrics = existing + gpt2_results

with open("metrics.json", "w") as f:
    json.dump(all_metrics, f, indent=2)

print("Updated metrics.json with GPT2 KV benchmarks")


Updated metrics.json with GPT2 KV benchmarks
