In [1]:
!pip install onnxruntime datasets transformers numpy

Collecting onnxruntime
  Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 M

In [3]:
import onnxruntime as ort
import numpy as np
import time
import tracemalloc
from datasets import load_dataset
from transformers import AutoTokenizer

# Function to load the tokenizer and model together

In [8]:
def load_model_and_tokenizer(model_name):
  """Loads the ONNX model and its matching tokenizer."""

  # Tokenizer

  tokenize = AutoTokenizer.from_pretrained(model_name)


  # Downloading the distilbert-sst2 model

  onnx_model_path = "/content/distilbert-sst2.onnx"

  !wget -O {onnx_model_path} "https://huggingface.co/optimum/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/model.onnx"

  # load model

  session = ort.InferenceSession(onnx_model_path)

  return tokenize,session


# Function to preprocess and run inference

In [15]:
def run_inference(sentence,tokenize,session):
   """Tokenizes input, runs inference, and tracks performance."""

   # Tokenizes input

   tokenized = tokenize(sentence,padding=True,truncation=True,return_tensors="np")

   input_ids = np.array(tokenized["input_ids"],dtype=np.int64)
   attention_mask = np.array(tokenized["attention_mask"],dtype=np.int64)


   # Measure performance
   start_time = time.time()
   tracemalloc.start()

   # Inference

   outputs = session.run(None,{"input_ids":input_ids,"attention_mask":attention_mask})

   # stop performance

   end_time = time.time()
   current,peak = tracemalloc.get_traced_memory()
   tracemalloc.stop()


   # Results
   print("\n Inference Completed!")
   print(f"🔹 Execution Time: {end_time - start_time:.4f} seconds")
   print(f"🔹 Peak Memory Usage: {peak / 1024:.2f} KB")




# Step 1: Load dataset

In [4]:
dataset = load_dataset("glue","sst2")

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [5]:
train_data = dataset["train"]

In [6]:
sentence = [sample["sentence"] for sample in train_data.select(range(10))]

In [7]:
sentence

['hide new secretions from the parental units ',
 'contains no wit , only labored gags ',
 'that loves its characters and communicates something rather beautiful about human nature ',
 'remains utterly satisfied to remain the same throughout ',
 'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ',
 "that 's far too tragic to merit such superficial treatment ",
 'demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small , personal film with an emotional wallop . ',
 'of saucy ',
 "a depressed fifteen-year-old 's suicidal poetry ",
 "are more deeply thought through than in most ` right-thinking ' films "]

# Step 2: Load model and tokenizer

In [9]:
model_name = "distilbert-base-uncased"

tokenize,session = load_model_and_tokenizer(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

--2025-03-19 13:51:05--  https://huggingface.co/optimum/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/model.onnx
Resolving huggingface.co (huggingface.co)... 3.166.152.44, 3.166.152.65, 3.166.152.105, ...
Connecting to huggingface.co (huggingface.co)|3.166.152.44|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.hf.co/repos/d0/6e/d06eae48d74816d2ffbbe257ebf858e495147e679482d296e25f4e8bc92ad50d/d59a5577be6a248acf73055c5c81a7a05fc436548f302c7847a6cd1164b0c216?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.onnx%3B+filename%3D%22model.onnx%22%3B&Expires=1742395865&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0MjM5NTg2NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy9kMC82ZS9kMDZlYWU0OGQ3NDgxNmQyZmZiYmUyNTdlYmY4NThlNDk1MTQ3ZTY3OTQ4MmQyOTZlMjVmNGU4YmM5MmFkNTBkL2Q1OWE1NTc3YmU2YTI0OGFjZjczMDU1YzVjODFhN2EwNWZjNDM2NTQ4ZjMwMmM3ODQ3YTZjZDExNjRiMGMyMTY%7EcmVzcG9uc2U

# Step 3: Run inference

In [17]:
outputs = run_inference(sentence ,tokenize,session)


 Inference Completed!
🔹 Execution Time: 0.3809 seconds
🔹 Peak Memory Usage: 46.17 KB
