### Local ONNX Test

In [5]:
# Show the kernel's working directory: the relative "./gte-large/..." paths
# used by the model and tokenizer loading code below resolve against it.
import os
os.getcwd()

'/workspaces/rag_prototype/models/embedder'

In [10]:
import onnx
import onnxruntime as ort
import numpy as np

from transformers import AutoTokenizer

# --- Model: read the ONNX file, verify its integrity, open a runtime session ---
onnx_model_path = "./gte-large/model/model.onnx"  # relative to the working directory
onnx_model = onnx.load(onnx_model_path)
onnx.checker.check_model(onnx_model)
session = ort.InferenceSession(onnx_model_path)

# --- Input: tokenize a single probe sentence ---
# The tokenizer is loaded from the local directory that sits next to the model,
# and is asked for NumPy tensors so they can be passed to ONNX Runtime.
sentence = "This is a test sentence."
tokenizer = AutoTokenizer.from_pretrained("./gte-large/tokenizer")
inputs = tokenizer(
    sentence,
    return_tensors="np",
    padding=True,
    truncation=True,
    max_length=512,
)

# Pull out the three tensors that are fed to the graph, each converted to int64.
input_ids, attention_mask, token_type_ids = (
    np.array(inputs[key], dtype=np.int64)
    for key in ("input_ids", "attention_mask", "token_type_ids")
)

# --- Inference: feed all three inputs and request only `last_hidden_state` ---
outputs = session.run(
    ["last_hidden_state"],
    {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
    },
)

outputs

[array([[[ 0.08390327,  0.19795458,  0.29698628, ..., -0.0713546 ,
          -0.09769778, -0.40413493],
         [-0.1609579 ,  0.02804918,  0.15180624, ...,  0.09533711,
           0.2472866 , -0.13007842],
         [ 0.116238  , -0.04968553,  0.1525496 , ...,  0.13903278,
           0.26824445, -0.32201058],
         ...,
         [ 0.24339935, -0.09559675,  0.07080201, ..., -0.04756254,
           0.02960365, -0.6810834 ],
         [ 0.16566315, -0.22281522,  0.14389402, ..., -0.04828335,
           0.43901905, -0.37544316],
         [ 0.07758234, -0.2386691 , -0.06030796, ..., -0.03946404,
           0.15006924, -0.7060415 ]]], dtype=float32)]

In [14]:
# Shape of the single array returned by session.run (the requested "last_hidden_state").
outputs[0].shape

(1, 8, 768)

### Remote ONNX Test

In [5]:
import requests
import numpy as np

# The Triton endpoint URL is the default value of `emb_text`'s `url` parameter.

def emb_text(text, url="http://triton-direct-s3-route-triton-inference-services.apps.nebula.sl/v2/models/gte-base/infer", timeout=30):
    """Embed ``text`` via the remote inference endpoint and return one pooled vector.

    The text is sent as a single ``BYTES`` input named ``TEXT``. The first
    output of the reply is reshaped to ``(batch_size, seq_len, embedding_dim)``
    using the shape reported by the server, averaged over the sequence axis,
    and the first row of the result is returned.

    Note that the average is an unweighted mean over every sequence position;
    no attention mask is applied on the client side.

    Args:
        text: The string to embed.
        url: Full URL of the model's ``infer`` endpoint.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled endpoint would block the kernel indefinitely.

    Returns:
        A list of ``embedding_dim`` floats, or ``None`` if the request failed
        (connection error, timeout, or a non-2xx HTTP status); in that case a
        message is printed.
    """
    # Payload for the request
    payload = {
        "inputs": [
            {
                "name": "TEXT",
                "datatype": "BYTES",
                "shape": [1],
                "data": [text]
            },
        ]
    }

    try:
        response = requests.post(url, json=payload, timeout=timeout)
        # Surface HTTP errors here instead of failing later with a confusing
        # KeyError when the error body has no 'outputs' entry.
        response.raise_for_status()
        output = response.json()['outputs'][0]
    except requests.exceptions.RequestException as e:
        print("Request failed:", str(e))
        return None

    # The server returns the tensor flattened; restore its 3-D layout from the
    # shape it reports alongside the data.
    batch_size, seq_len, embedding_dim = output['shape']
    token_embeddings = np.array(output['data']).reshape(batch_size, seq_len, embedding_dim)

    # Mean-pool across the sequence axis -> [batch_size, embedding_dim]
    pooled_embeddings = np.mean(token_embeddings, axis=1)

    return pooled_embeddings.tolist()[0]

In [7]:
# Length of the returned pooled vector, i.e. the embedding width reported by the remote model.
# NOTE(review): emb_text returns None when the request fails, in which case len() raises TypeError.
len(emb_text('Hello what is your name'))

768