In [19]:
import onnxruntime
from optimum.onnxruntime import ORTModelForSequenceClassification, ORTModelForFeatureExtraction
import os

from transformers import AutoTokenizer
from transformers import AutoModel

In [4]:
# Export the cross-encoder to ONNX once and reuse the saved copy on later runs.
onnx_model_dir = "ms-marco-MiniLM-L12-v2"
model_name = "cross-encoder/ms-marco-MiniLM-L12-v2"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Any *.onnx file in the target directory counts as a finished export;
# delete the directory to force a fresh conversion.
cross_encoder_is_exported = os.path.isdir(onnx_model_dir) and any(
    entry.endswith(".onnx") for entry in os.listdir(onnx_model_dir)
)

if cross_encoder_is_exported:
    # Skip the conversion and load the ONNX model saved by a previous run.
    ort_model = ORTModelForSequenceClassification.from_pretrained(onnx_model_dir)
else:
    # export=True converts the checkpoint to ONNX while loading it.
    ort_model = ORTModelForSequenceClassification.from_pretrained(
        model_name,
        export=True,
    )
    ort_model.save_pretrained(onnx_model_dir)

# Keep the tokenizer next to the ONNX model so both load from one directory.
tokenizer.save_pretrained(onnx_model_dir)

The model cross-encoder/ms-marco-MiniLM-L12-v2 was already converted to ONNX but got `export=True`, the model will be converted to ONNX once again. Don't forget to save the resulting model with `.save_pretrained()`


('ms-marco-MiniLM-L12-v2/tokenizer_config.json',
 'ms-marco-MiniLM-L12-v2/special_tokens_map.json',
 'ms-marco-MiniLM-L12-v2/vocab.txt',
 'ms-marco-MiniLM-L12-v2/added_tokens.json',
 'ms-marco-MiniLM-L12-v2/tokenizer.json')

In [5]:
# Text pairs passed to the tokenizer as one batch; each tuple is (first text, second text).
test_pairs = [
    (
        "How is the weather today?",
        "The weather is sunny and warm.",
    ),
    (
        "What is ONNX Runtime?",
        "It is a cross-platform inferencing and training accelerator.",
    ),
    # Identical texts.
    (
        "This is a relevant document.",
        "This is a relevant document.",
    ),
    (
        "This is a relevant document.",
        "This is a completely irrelevant document.",
    ),
]

In [33]:
print("\n--- Running Inference with Optimum ORTModel ---")

# Reload the exported model and its tokenizer from the local ONNX directory.
ort_model_loaded = ORTModelForSequenceClassification.from_pretrained(onnx_model_dir)
tokenizer_for_ort = AutoTokenizer.from_pretrained(onnx_model_dir)

# Tokenize every pair as one padded, truncated batch.
# return_tensors may be "np" (NumPy) or "pt" (PyTorch); NumPy is used here.
encoded_input_optimum = tokenizer_for_ort(
    test_pairs, padding=True, truncation=True, return_tensors="np"
)

outputs_optimum = ort_model_loaded(**encoded_input_optimum)
logits_optimum = outputs_optimum.logits  # indexed below as [pair index][0]

print("Input pairs:", test_pairs)
print("Logits from Optimum ORTModel:")
for i, _ in enumerate(test_pairs):
    print(f"  Pair {i+1}: Score = {logits_optimum[i][0]:.4f}")


--- Running Inference with Optimum ORTModel ---
Input pairs: [('How is the weather today?', 'The weather is sunny and warm.'), ('What is ONNX Runtime?', 'It is a cross-platform inferencing and training accelerator.'), ('This is a relevant document.', 'This is a relevant document.'), ('This is a relevant document.', 'This is a completely irrelevant document.')]
Logits from Optimum ORTModel:
  Pair 1: Score = 0.7604
  Pair 2: Score = -8.9707
  Pair 3: Score = 7.9912
  Pair 4: Score = 1.2184


In [34]:
import numpy as np

def sigmoid(x):
    """Map logits to probabilities with the logistic function 1 / (1 + exp(-x)).

    The value is evaluated as ``exp(-logaddexp(0, -x))``, which is the same
    function but never forms ``exp(-x)`` directly, so strongly negative logits
    do not overflow or raise a RuntimeWarning.

    Args:
        x: Scalar, NumPy array, or array-like (e.g. nested list) of logits.

    Returns:
        The element-wise sigmoid of ``x``, with the same shape as the input.
    """
    # logaddexp(0, -x) == log(1 + exp(-x)), computed without overflow.
    return np.exp(-np.logaddexp(0, np.negative(x)))

# Squash the raw logits through the sigmoid and print the resulting scores.
relevance_probabilities = sigmoid(logits_optimum)
print(relevance_probabilities)

[[6.8144011e-01]
 [1.2706606e-04]
 [9.9966168e-01]
 [7.7177614e-01]]


## BiEncoder

In [35]:
# Hub checkpoint for the bi-encoder and the local directory used for its ONNX export.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
onnx_model_optimum_dir = "all-MiniLM-L6-v2"

# AutoModel yields the base transformer of the sentence-transformers checkpoint;
# eval() puts it in evaluation mode and returns the same module.
model_base = AutoModel.from_pretrained(model_name)
model_base.eval()
tokenizer_base = AutoTokenizer.from_pretrained(model_name)

In [36]:
# Export the bi-encoder to ONNX once; later runs reuse the files already on disk.
# Any *.onnx file in the target directory counts as a finished export;
# delete the directory to force a fresh conversion.
bi_encoder_is_exported = os.path.isdir(onnx_model_optimum_dir) and any(
    entry.endswith(".onnx") for entry in os.listdir(onnx_model_optimum_dir)
)

if bi_encoder_is_exported:
    # Skip the conversion and load the ONNX model saved by a previous run.
    ort_model_exporter = ORTModelForFeatureExtraction.from_pretrained(
        onnx_model_optimum_dir
    )
else:
    # export=True converts the checkpoint to ONNX while loading it.
    ort_model_exporter = ORTModelForFeatureExtraction.from_pretrained(
        model_name,
        export=True,
        # provider="CUDAExecutionProvider" # if you want to optimize for CUDA during export
    )
    ort_model_exporter.save_pretrained(onnx_model_optimum_dir)

# Keep the tokenizer next to the ONNX model so both load from one directory.
tokenizer_base.save_pretrained(onnx_model_optimum_dir)

The model sentence-transformers/all-MiniLM-L6-v2 was already converted to ONNX but got `export=True`, the model will be converted to ONNX once again. Don't forget to save the resulting model with `.save_pretrained()`


('all-MiniLM-L6-v2/tokenizer_config.json',
 'all-MiniLM-L6-v2/special_tokens_map.json',
 'all-MiniLM-L6-v2/vocab.txt',
 'all-MiniLM-L6-v2/added_tokens.json',
 'all-MiniLM-L6-v2/tokenizer.json')

In [37]:
# Example sentences that are tokenized as one batch for the feature-extraction model.
sentences = [
    "This is an example sentence.",
    "Each sentence is converted to a vector.",
    "ONNX makes deployment easier.",
]


In [38]:
print("\n--- Running Inference with Optimum ORTModel ---")

# Load the exported feature-extraction model and its tokenizer back from disk.
ort_model_loaded = ORTModelForFeatureExtraction.from_pretrained(onnx_model_optimum_dir)
tokenizer_for_ort = AutoTokenizer.from_pretrained(onnx_model_optimum_dir)

# Tokenize all sentences as one padded, truncated batch.
# return_tensors may be "np" (NumPy) or "pt" (PyTorch); NumPy is used here.
encoded_input_optimum = tokenizer_for_ort(
    sentences, padding=True, truncation=True, return_tensors="np"
)

outputs_optimum = ort_model_loaded(**encoded_input_optimum)


--- Running Inference with Optimum ORTModel ---


In [48]:
# Mean-pool the token embeddings into one vector per sentence, counting only real tokens.
# The batch was tokenized with padding=True, so a plain mean over the sequence axis
# would average padding positions into every shorter sentence's embedding.
# Assumes last_hidden_state is (batch, seq_len, hidden) and attention_mask is (batch, seq_len).
token_embeddings = np.asarray(outputs_optimum.last_hidden_state)
pooling_mask = np.asarray(encoded_input_optimum["attention_mask"])[..., np.newaxis]
pooling_mask = pooling_mask.astype(token_embeddings.dtype)  # 1.0 for tokens, 0.0 for padding

summed_embeddings = (token_embeddings * pooling_mask).sum(axis=1)
# Clamp the token count so a row made entirely of padding cannot divide by zero.
token_counts = np.maximum(pooling_mask.sum(axis=1), 1e-9)

mean_embedding = (summed_embeddings / token_counts).tolist()

In [50]:
def normalize_embeddings_numpy(embeddings_np, p=2, axis=1, epsilon=1e-12):
    """
    Scales embeddings to unit p-norm (L2 by default) along one axis.

    Args:
        embeddings_np (np.ndarray or array-like): Embeddings to normalize,
            e.g. (batch_size, hidden_dim). Nested lists are accepted, and a
            single 1-D vector is normalized over its only axis.
        p (int, optional): The order of the norm. Defaults to 2 (L2 norm).
        axis (int, optional): The axis along which to compute the norm.
            Defaults to 1. Ignored for 1-D input.
        epsilon (float, optional): Lower bound applied to the norm so that
            all-zero vectors do not cause a division by zero.

    Returns:
        np.ndarray: Normalized embeddings with the same shape as the input.
    """
    embeddings = np.asarray(embeddings_np)

    # A lone embedding vector has no batch axis, so the default axis=1 would be out of range.
    if embeddings.ndim == 1:
        axis = 0

    # keepdims=True keeps the reduced axis so the division below broadcasts.
    norm = np.linalg.norm(embeddings, ord=p, axis=axis, keepdims=True)

    # Clamp (rather than add) so non-zero norms are left exactly as computed.
    norm = np.maximum(norm, epsilon)

    return embeddings / norm
