In [None]:
# Using nvcr.io/nvidia/tritonserver:24.01-py3, an older Triton release that ships with the ONNX Runtime backend installed (the newer release has a cuDNN upgrade issue).

# Convert model to tensorrt (comes installed with the image)
# /usr/src/tensorrt/bin/trtexec --onnx=model.onnx --saveEngine=model.trt --fp16

# Start Triton
# My folder is on network storage, so all packages should be accessible

# /opt/tritonserver/bin/tritonserver --model-repository=/workspace/model_repository

In [3]:
import importlib

# Function to check module installation and version
def check_module(module_name):
    """Print whether ``module_name`` is installed and, if so, its version.

    Prints ``"<name> is installed. Version: <version>"`` when a spec for the
    module is found, where ``<version>`` is the module's ``__version__``
    attribute or ``'Version not found'`` when it has none. Prints
    ``"<name> is NOT installed."`` when the module cannot be located.

    Args:
        module_name: Absolute module name to probe; may be dotted.

    Raises:
        ImportError: If the module is found but importing it fails.
    """
    # `import importlib` does not guarantee the `importlib.util` submodule is
    # loaded; import it explicitly so this works on a fresh kernel.
    import importlib.util

    try:
        module = importlib.util.find_spec(module_name)
    except (ImportError, ValueError):
        # find_spec imports the parents of a dotted name and raises
        # ModuleNotFoundError when one is missing; it raises ValueError when
        # an already-loaded module has no usable __spec__. Both are treated
        # as "cannot be located".
        module = None

    if module:
        imported_module = importlib.import_module(module_name)
        version = getattr(imported_module, '__version__', 'Version not found')
        print(f"{module_name} is installed. Version: {version}")
    else:
        print(f"{module_name} is NOT installed.")

# Report the installation status of each library this notebook relies on
for required_module in ("torch", "transformers"):
    check_module(required_module)


torch is installed. Version: 2.6.0+cu126
transformers is installed. Version: 4.49.0


In [11]:
import tritonclient.http as httpclient
import numpy as np
from transformers import AutoTokenizer

# Initialize tokenizer and Triton client
tokenizer = AutoTokenizer.from_pretrained("WhereIsAI/UAE-Large-V1")
client = httpclient.InferenceServerClient(url="localhost:8000")

MODEL_NAME="UAE-Large-V1"

# Example input text
text = "a black thing"
inputs = tokenizer(text, return_tensors="np")

# The request below sends token_type_ids along with input_ids and
# attention_mask. Ensure token_type_ids exist (some models use it, some don't).
if "token_type_ids" not in inputs:
    inputs["token_type_ids"] = np.zeros_like(inputs["input_ids"])

# Build one INT64 request tensor per model input. The arrays are cast
# explicitly because set_data_from_numpy rejects data whose dtype differs from
# the declared datatype, and NumPy's default integer is not int64 everywhere.
request_inputs = []
for tensor_name in ("input_ids", "attention_mask", "token_type_ids"):
    tensor_data = np.ascontiguousarray(inputs[tensor_name], dtype=np.int64)
    infer_input = httpclient.InferInput(tensor_name, list(tensor_data.shape), "INT64")
    infer_input.set_data_from_numpy(tensor_data)
    request_inputs.append(infer_input)

# Keep the individual names available for any later cell that uses them
input_ids, attention_mask, token_type_ids = request_inputs

# Define the output tensor
outputs = httpclient.InferRequestedOutput("last_hidden_state")

# Send inference request
response = client.infer(model_name=MODEL_NAME, inputs=request_inputs, outputs=[outputs])

# Print the hidden state of the first token of the first (only) sequence
print(response.as_numpy("last_hidden_state")[0][0])


[-0.34092474 -0.7468907  -0.06713652 ...  0.06433069 -0.00998584
  0.11809338]


In [2]:
# Image embeddings

In [10]:
import tritonclient.http as httpclient
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoImageProcessor
from PIL import Image
import requests

# Triton server details
TRITON_SERVER_URL = "localhost:8000"
MODEL_NAME = "nomic-embed-vision"

# Load the image processor (same as used in ONNX model)
processor = AutoImageProcessor.from_pretrained("/workspace/onnx-nomic-embed-v1.5")

# Load an image
image_url = "https://m.media-amazon.com/images/I/61sADwl+YWL._AC_UL320_.jpg"  # Replace with your own image
# Bound the download with a timeout and fail loudly on HTTP errors, so a bad
# URL does not surface later as an opaque PIL decoding error.
image_response = requests.get(image_url, stream=True, timeout=30)
image_response.raise_for_status()
# The raw urllib3 stream is not content-decoded (gzip/deflate) unless asked to
image_response.raw.decode_content = True
# Force three channels so grayscale / palette / RGBA images also work
image = Image.open(image_response.raw).convert("RGB")

# Preprocess image
inputs = processor(image, return_tensors="np")
# Shape should be (1, 3, 224, 224). Cast explicitly so the array matches the
# "FP32" datatype declared on the request tensor below.
input_data = np.ascontiguousarray(inputs["pixel_values"], dtype=np.float32)

# Initialize Triton client
client = httpclient.InferenceServerClient(url=TRITON_SERVER_URL)

# Prepare request
input_tensor = httpclient.InferInput("pixel_values", list(input_data.shape), "FP32")
input_tensor.set_data_from_numpy(input_data, binary_data=True)

# Request the single output used below
output_tensor = httpclient.InferRequestedOutput("last_hidden_state", binary_data=True)

# Run inference
response = client.infer(model_name=MODEL_NAME, inputs=[input_tensor], outputs=[output_tensor])

# Extract embeddings
output = response.as_numpy("last_hidden_state")  # Expected shape: (1, 197, 768)

# Pool by taking the first token of each sequence (presumably the CLS token;
# confirm against the model card), then L2-normalize to unit vectors.
img_embeddings = torch.tensor(output)
normalized_embeddings = F.normalize(img_embeddings[:, 0], p=2, dim=1)

# Display the normalized image embedding
normalized_embeddings

tensor([[-5.9564e-02,  1.4224e-02, -4.2698e-02,  1.7461e-02, -5.0548e-03,
         -1.6759e-02, -1.1440e-02, -2.8139e-02,  2.2130e-03, -5.4862e-02,
         -5.1926e-02, -1.5328e-02, -3.9420e-02, -5.9082e-02, -4.1330e-02,
         -5.6235e-03, -3.6205e-02,  2.5260e-02, -1.5978e-02, -7.5782e-02,
         -3.2285e-02, -4.3216e-02, -8.1534e-03,  1.3770e-03, -5.7407e-02,
         -3.4286e-02, -2.8380e-03, -1.3348e-02,  2.7440e-02, -1.5356e-02,
         -8.4235e-03, -3.0112e-02, -4.9091e-02, -1.0906e-02, -3.4976e-02,
         -3.2410e-02, -2.8491e-02, -5.8483e-02,  1.9913e-02, -3.7894e-02,
         -5.7533e-03, -1.3862e-02, -2.3517e-02, -2.1590e-02,  1.9630e-02,
         -2.6699e-02,  9.3951e-03, -2.5720e-02, -1.4409e-02, -1.1951e-02,
         -1.3215e-02, -6.8224e-02,  1.4171e-02, -2.6513e-02, -7.3316e-02,
         -2.8856e-02, -2.4893e-02, -5.4165e-02, -3.4994e-02, -3.1273e-02,
          5.0158e-03, -6.3911e-03, -4.3383e-02, -1.1057e-02, -1.6693e-02,
         -3.5838e-02,  1.5773e-02,  5.