In [1]:
!pip install onnx onnxruntime transformers torch -qq



In [4]:
import os

from transformers import AutoModel, AutoTokenizer

In [15]:
from transformers import AutoModel, AutoTokenizer
import torch

def download_and_save_onnx(model_repository, model_save_path):
    # Load model and tokenizer
    model = AutoModel.from_pretrained(model_repository, output_attentions=True)
    tokenizer = AutoTokenizer.from_pretrained(model_repository)

    # Prepare dummy input for model export
    inputs = tokenizer("Hello, this is a test.", return_tensors="pt")

    # Prepare the model for exporting
    model.eval()

    # Export the model to ONNX
    with torch.no_grad():
        torch.onnx.export(model,
                  args=(inputs['input_ids'],),
                  f="all_miniLM_L6_v2_with_attentions.onnx",
                  input_names=['input_ids'],
                  output_names=['last_hidden_state', 'attention_scores'],
                  dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence_length'},
                                'last_hidden_state': {0: 'batch_size', 1: 'sequence_length'},
                                'attention_scores': {0: 'batch_size', 1: 'num_heads', 2: 'sequence_length', 3: 'sequence_length'}},
                  opset_version=12)

# Specify the Hugging Face repository and the local path for saving the ONNX model
model_repository = "sentence-transformers/all-MiniLM-L6-v2"
model_save_path = "./all_miniLM_L6_v2.onnx"

# Download and save the model
download_and_save_onnx(model_repository, model_save_path)


## Finding layer names and graphs

In [17]:
import onnxruntime as ort
import numpy as np

def get_attention_values(model_path, input_data):
    session = ort.InferenceSession(model_path)
    input_name = session.get_inputs()[0].name
    output_name = [output.name for output in session.get_outputs() if "attention" in output.name][0] # I am getting everything, we can filter this

    # Run inference
    attention_values = session.run([output_name], {input_name: input_data})
    return attention_values
    
# Prepare your input data correctly, ensuring it matches the expected input dimensions and type
model_path = "all_miniLM_L6_v2_with_attentions.onnx"
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
text = "Example text for extracting attention."
inputs = tokenizer(text, return_tensors="np").input_ids.astype(np.int64)

# Extract and print attention values
attention = get_attention_values(model_path, inputs)
print("Attention Values:", attention)


Attention Values: [array([[ 5.91564877e-03,  3.75230201e-02,  7.84645695e-03,
        -1.85632911e-02, -9.26318318e-02,  7.92923849e-03,
         4.08564135e-02,  7.15774149e-02, -5.30700833e-02,
        -9.51065421e-02,  4.69905026e-02, -7.36653293e-03,
         4.07047793e-02, -8.41774140e-03, -1.34617900e-02,
         2.51114778e-02,  1.19788453e-01, -4.26220931e-02,
        -9.33553874e-02,  7.02253059e-02,  2.35134345e-02,
         4.73588742e-02,  6.61782622e-02, -1.46158307e-03,
        -2.89014336e-02,  7.03726411e-02, -4.20038104e-02,
         7.57771507e-02,  4.66566309e-02,  8.49057883e-02,
         5.01826145e-02,  7.13686412e-03, -4.75834981e-02,
         2.83810589e-02, -4.67140693e-03, -1.54618360e-02,
        -5.84125817e-02,  5.22408187e-02, -8.97673052e-03,
         1.20889973e-02,  2.42065359e-02,  1.03037059e-01,
        -2.83910576e-02,  2.38162447e-02, -2.47850791e-02,
        -2.99192108e-02, -1.15898184e-01,  8.29635595e-04,
        -1.13145873e-01, -1.41992150e

Inline comments are GPT generated