In [None]:
# Using nvcr.io/nvidia/tritonserver:24.01-py3, an earlier Triton release that has the onnxruntime backend installed (the newer release has a cuDNN upgrade issue)

# # First check support for tensorrt for the onnx model
# /opt/tensorrt/bin/trtexec --onnx=/workspace/model_repository/nomic-embed-vision/1/model.onnx --verbose
# /opt/tensorrt/bin/trtexec --onnx=/workspace/model_repository/UAE-Large-V1/1/model.onnx --verbose

# # Convert the nomic-embed-vision ONNX model to a TensorRT plan file
# /opt/tensorrt/bin/trtexec --onnx=/workspace/model_repository/nomic-embed-vision/1/model.onnx --saveEngine=/workspace/model_repository/nomic-embed-vision/1/model.plan --fp16 --memPoolSize=workspace:12000

# # now do the same for the text embedding model
# /opt/tensorrt/bin/trtexec --onnx=/workspace/model_repository/UAE-Large-V1/1/model.onnx --saveEngine=/workspace/model_repository/UAE-Large-V1/1/model.plan --fp16 --memPoolSize=workspace:12000

# # Start Triton
# # My folder is on network storage, so all packages should be accessible

# /opt/tritonserver/bin/tritonserver --model-repository=/workspace/model_repository

In [None]:
# CMD
# /bin/bash -c "gdown --folder https://drive.google.com/drive/folders/1E2D2ekxGa4uQ2mu9zrURKb3f8l85fFjS -O /workspace/model_repository && /opt/tritonserver/bin/tritonserver --model-repository=/workspace/model_repository"

In [3]:
import importlib

# Function to check module installation and version
def check_module(module_name):
    """Print whether ``module_name`` is installed and, if so, its version.

    Exactly one line is written to stdout; nothing is returned.

    Args:
        module_name: Name of the module to look up, e.g. ``"torch"``. Dotted
            names are accepted; a missing parent package is reported as
            "NOT installed" rather than raising.
    """
    # ``import importlib`` on its own does not guarantee that the
    # ``importlib.util`` submodule is loaded, so import it explicitly here.
    import importlib.util

    try:
        spec = importlib.util.find_spec(module_name)
    except ModuleNotFoundError:
        # Raised for a dotted name whose parent package cannot be imported.
        spec = None

    if spec is None:
        print(f"{module_name} is NOT installed.")
        return

    try:
        imported_module = importlib.import_module(module_name)
    except (ImportError, OSError) as exc:
        # The package is present but unusable (e.g. a missing shared library);
        # report that instead of aborting the whole environment check.
        print(f"{module_name} is installed but failed to import: {exc}")
        return

    version = getattr(imported_module, '__version__', 'Version not found')
    print(f"{module_name} is installed. Version: {version}")

# Report the installed versions of the libraries used by the cells below.
for required_module in ("torch", "transformers"):
    check_module(required_module)


torch is installed. Version: 2.6.0+cu126
transformers is installed. Version: 4.49.0


In [21]:
import tritonclient.http as httpclient
import numpy as np
from transformers import AutoTokenizer

# Name of the text-embedding model as served by Triton.
MODEL_NAME = "UAE-Large-V1"

# Tokenizer for the model, plus an HTTP client pointed at the Triton server.
tokenizer = AutoTokenizer.from_pretrained("WhereIsAI/UAE-Large-V1")
client = httpclient.InferenceServerClient(url="213.192.2.74:40081")

# Sample sentence, tokenized directly into NumPy arrays.
text = "a black thing"
inputs = tokenizer(text, return_tensors="np")

# If the tokenizer did not produce token_type_ids, send an all-zero tensor.
if "token_type_ids" not in inputs:
    inputs["token_type_ids"] = np.zeros_like(inputs["input_ids"])

# All three tensors are declared as INT64 in the request, so cast them first.
tensor_names = ("input_ids", "attention_mask", "token_type_ids")
for tensor_name in tensor_names:
    inputs[tensor_name] = inputs[tensor_name].astype(np.int64)

# One Triton input descriptor per tensor, each then filled with its data.
input_ids, attention_mask, token_type_ids = (
    httpclient.InferInput(tensor_name, inputs[tensor_name].shape, "INT64")
    for tensor_name in tensor_names
)
for infer_input, tensor_name in zip(
    (input_ids, attention_mask, token_type_ids), tensor_names
):
    infer_input.set_data_from_numpy(inputs[tensor_name])

# The only output requested from the model.
outputs = httpclient.InferRequestedOutput("last_hidden_state")

# Run the request; any failure is reported as a message instead of a traceback.
try:
    response = client.infer(
        model_name=MODEL_NAME,
        inputs=[input_ids, attention_mask, token_type_ids],
        outputs=[outputs],
    )

    # Show the vector at position [0][0] of the returned tensor.
    print(response.as_numpy("last_hidden_state")[0][0])

except Exception as e:
    print("Error during inference:", e)


Error during inference: [Errno 10061] [WinError 10061] No connection could be made because the target machine actively refused it.


In [2]:
#image embeddings

In [48]:
import tritonclient.http as httpclient
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoImageProcessor
from PIL import Image
import requests

# Triton server details. tritonclient's HTTP client expects
# "host[:port][/base-path]" without a scheme, so the scheme is split off below
# and mapped onto the client's `ssl` flag instead.
TRITON_SERVER_URL = "http://bjcfl4pleq6n1n.runpod.io"
MODEL_NAME = "nomic-embed-vision"

# Load the image processor (same as used in ONNX model)
processor = AutoImageProcessor.from_pretrained("nomic-ai/nomic-embed-vision-v1.5")

# Load an image
image_url = "https://m.media-amazon.com/images/I/61sADwl+YWL._AC_UL320_.jpg"  # Replace with your own image
# A timeout keeps a stalled download from hanging the cell indefinitely.
image_response = requests.get(image_url, stream=True, timeout=30)
# Fail loudly on 4xx/5xx instead of handing PIL an HTML error page.
image_response.raise_for_status()
# Have the raw stream undo any gzip/deflate content encoding before PIL reads it.
image_response.raw.decode_content = True
# Force three channels so grayscale/RGBA/CMYK sources are handled uniformly.
image = Image.open(image_response.raw).convert("RGB")

# Preprocess image
inputs = processor(image, return_tensors="np")
# The request below declares FP32 and set_data_from_numpy rejects a dtype
# mismatch, so pin the dtype explicitly.
input_data = inputs["pixel_values"].astype(np.float32)  # presumably (1, 3, 224, 224) -- TODO confirm

# Initialize Triton client (scheme removed from the URL; https selects ssl)
url_scheme, _, host_and_path = TRITON_SERVER_URL.rpartition("://")
client = httpclient.InferenceServerClient(
    url=host_and_path.rstrip("/"),
    ssl=(url_scheme == "https"),
)

# Prepare request
input_tensor = httpclient.InferInput("pixel_values", input_data.shape, "FP32")
input_tensor.set_data_from_numpy(input_data, binary_data=True)

# Request the single output used below
output_tensor = httpclient.InferRequestedOutput("last_hidden_state", binary_data=True)

# Run inference
response = client.infer(model_name=MODEL_NAME, inputs=[input_tensor], outputs=[output_tensor])

# Extract embeddings
output = response.as_numpy("last_hidden_state")  # expected (1, 197, 768) -- TODO confirm
if output is None:
    raise RuntimeError("Triton response does not contain 'last_hidden_state'")

# L2-normalize the vector at sequence position 0 of each batch item
img_embeddings = torch.tensor(output)
normalized_embeddings = F.normalize(img_embeddings[:, 0], p=2, dim=1)

# Print the shape and embeddings
# print("Image Embeddings Shape:", normalized_embeddings.shape)
normalized_embeddings

KeyboardInterrupt: 

In [8]:
import requests

# Base URL of the RunPod-hosted inference API.
RUNPOD_API_ENDPOINT = "https://wg0gr9xjg19lm4-5000.proxy.runpod.net/"

# Batch of sentences to embed in a single request.
payload = ["a black thing", "hi there", "a red ball"]

# The base URL ends in "/", so strip it before joining to avoid "//infer_text".
infer_text_url = f"{RUNPOD_API_ENDPOINT.rstrip('/')}/infer_text"

# A timeout prevents the cell from hanging if the pod is asleep or unreachable.
response = requests.post(infer_text_url, json=payload, timeout=60)
# Surface HTTP errors here rather than as a confusing JSON decode failure.
response.raise_for_status()

print(response.json())  # This will return a batch output


{'result': [[[-0.3409247398376465, -0.7468907237052917, -0.06713651865720749, 0.3817450702190399, -0.12461693584918976, -0.13136720657348633, -0.2632583975791931, 0.017859455198049545, 0.221467062830925, 0.5239472985267639, 0.38581356406211853, 0.0959547832608223, 0.2978442311286926, -0.15508081018924713, -0.05440891161561012, 0.7519263029098511, -0.22261089086532593, -0.23415395617485046, -0.5023572444915771, -0.12428933382034302, 0.2853080928325653, -0.33468127250671387, -0.541479229927063, -1.0916624069213867, 0.2847224473953247, 0.5853798389434814, -0.5848057270050049, 0.3948674499988556, 0.6893212199211121, 1.090098261833191, 0.05428646132349968, 0.15277783572673798, 0.6157644391059875, -0.8397108912467957, 0.17000624537467957, -0.4304691255092621, 0.49427446722984314, -0.7189001441001892, 0.08794411271810532, -0.0031746572349220514, 0.331777423620224, 0.27496570348739624, 0.28458288311958313, -0.4254235625267029, -0.4341040551662445, -0.6661328077316284, -0.03156675025820732, 0.0

In [7]:
import requests
from io import BytesIO

# Base URL of the RunPod-hosted inference API.
RUNPOD_API_ENDPOINT = "https://wg0gr9xjg19lm4-5000.proxy.runpod.net/"

# URLs of the images
image_urls = [
    "https://m.media-amazon.com/images/I/61EWjAcjlaL._AC_UL320_.jpg",
    "https://m.media-amazon.com/images/I/61m6h0YNY8L._AC_UL320_.jpg",
    "https://m.media-amazon.com/images/I/61-14Rg8UdL._AC_UL320_.jpg"
]

# Download images and prepare them as multipart file parts
files = []
for i, url in enumerate(image_urls):
    # Time out instead of hanging on a stalled download.
    download = requests.get(url, timeout=30)
    # Stop on a failed download; otherwise the error page would be uploaded
    # to the API as if it were a JPEG.
    download.raise_for_status()
    image_bytes = BytesIO(download.content)  # Convert to file-like object
    files.append(("images", (f"image_{i}.jpg", image_bytes, "image/jpeg")))  # Append to list

# Send request. The base URL ends in "/", so strip it to avoid "//infer_image".
infer_image_url = f"{RUNPOD_API_ENDPOINT.rstrip('/')}/infer_image"
response = requests.post(infer_image_url, files=files, timeout=120)
# Surface HTTP errors here rather than as a confusing JSON decode failure.
response.raise_for_status()

# Print response
print(response.json())  # Returns batch image embeddings


{'image_embeddings': [[-0.061458081007003784, -0.008738373406231403, -0.05642508715391159, 0.016185548156499863, -0.02483333647251129, -0.0007117125205695629, -0.053615882992744446, -0.016769198700785637, -0.07541747391223907, -0.024860035628080368, -0.07287455350160599, -0.018358804285526276, -0.024585586041212082, -0.07163390517234802, -0.07236067205667496, -0.018444528803229332, 0.019712768495082855, 0.0038870361167937517, -0.019686244428157806, -0.06525561958551407, -0.023868095129728317, -0.01305601466447115, 0.002554631093516946, -0.02139314077794552, -0.02882574498653412, -0.028426658362150192, 0.02404855750501156, -0.00047413367428816855, 0.02330038696527481, 0.04185372218489647, 0.013399124145507812, -0.00859767571091652, -0.022257337346673012, -0.041325297206640244, -0.08330060541629791, 0.022760534659028053, -0.06674235314130783, -0.019924653694033623, 0.0315115712583065, -0.04445299506187439, -0.015602088533341885, -0.021107053384184837, -0.04285024479031563, -0.05666146799

In [9]:
import requests
import time

# Base URL that the /health check in this cell is sent to. The value already
# ends in "/", which matters when a path starting with "/" is appended to it.
RUNPOD_API_ENDPOINT = "https://wg0gr9xjg19lm4-5000.proxy.runpod.net/"

def wait_for_server(max_attempts=10, delay_seconds=3, request_timeout=5):
    """Poll the API's ``/health`` endpoint until it reports ready.

    The server counts as ready when the endpoint answers HTTP 200 with a JSON
    object whose ``"status"`` field equals ``"ok"``.

    Args:
        max_attempts: Maximum number of health checks to perform.
        delay_seconds: Pause between consecutive failed checks, in seconds.
        request_timeout: Per-request timeout in seconds, so that one stalled
            connection cannot block the retry loop indefinitely.

    Returns:
        True as soon as a check succeeds, False once all attempts are used up.
    """
    # The configured base URL may end in "/"; strip it to avoid "//health".
    health_url = f"{RUNPOD_API_ENDPOINT.rstrip('/')}/health"

    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(health_url, timeout=request_timeout)
            body = response.json() if response.status_code == 200 else None
            # Guard against a JSON body that is not an object (e.g. a list).
            if isinstance(body, dict) and body.get("status") == "ok":
                print("✅ Triton is ready!")
                return True
        except (requests.exceptions.RequestException, ValueError):
            # Connection problems, timeouts and non-JSON bodies all just mean
            # "not ready yet"; fall through and retry.
            pass

        # No point in waiting after the final attempt.
        if attempt < max_attempts:
            print("⏳ Waiting for Triton server to be ready...")
            time.sleep(delay_seconds)

    print("❌ Triton server is not available after multiple attempts.")
    return False

# Run the readiness check. The call is the last expression in the cell, so its
# boolean return value is also shown as the cell's output.
# The commented-out block below is an alternative that branches on the result.
# # ✅ Wait until Triton is ready before making requests
# if wait_for_server():
#     print("🚀 Proceed with inference!")
# else:
#     print("🚨 Cannot proceed with inference. Server is unavailable.")
wait_for_server()

✅ Triton is ready!


True