In [18]:
import tritonclient.http as httpclient
import numpy as np
from transformers import AutoTokenizer

# Set up the Hugging Face tokenizer and the HTTP client for the Triton server.
tokenizer = AutoTokenizer.from_pretrained("WhereIsAI/UAE-Large-V1")
client = httpclient.InferenceServerClient(url="localhost:8000")


def _make_int64_input(name, array):
    """Build a Triton ``InferInput`` declared as INT64 and fill it with *array*.

    The array is cast to ``np.int64`` first: ``set_data_from_numpy`` rejects
    data whose dtype differs from the datatype declared on the input, and the
    tokenizer's integer dtype is not guaranteed to be 64-bit on every
    platform / NumPy version.

    Args:
        name: Name of the model input tensor.
        array: Array-like of integer token data.

    Returns:
        The populated ``httpclient.InferInput``.
    """
    tensor = np.asarray(array, dtype=np.int64)
    infer_input = httpclient.InferInput(name, list(tensor.shape), "INT64")
    infer_input.set_data_from_numpy(tensor)
    return infer_input


# Example input text, tokenized straight to NumPy arrays.
text = "a black thing"
inputs = tokenizer(text, return_tensors="np")

# The request below always sends token_type_ids; if the tokenizer did not
# produce them, fall back to all zeros with the same shape as input_ids.
if "token_type_ids" not in inputs:
    inputs["token_type_ids"] = np.zeros_like(inputs["input_ids"])

# One InferInput per model input tensor.
input_ids = _make_int64_input("input_ids", inputs["input_ids"])
attention_mask = _make_int64_input("attention_mask", inputs["attention_mask"])
token_type_ids = _make_int64_input("token_type_ids", inputs["token_type_ids"])

# Ask the server to return only the "last_hidden_state" output tensor.
outputs = httpclient.InferRequestedOutput("last_hidden_state")

# Send the inference request to the "UAE-Large-V1" model.
response = client.infer(
    "UAE-Large-V1",
    inputs=[input_ids, attention_mask, token_type_ids],
    outputs=[outputs],
)

# Print the hidden-state vector at index [0][0] of the returned tensor
# (presumably first sequence, first token -- confirm against the model config).
print(response.as_numpy("last_hidden_state")[0][0])


[-0.3409501  -0.7467628  -0.06714214 ...  0.0640513  -0.00992554
  0.11812457]
