In [1]:
import numpy as np
import tritonclient.http as httpclient
from tritonclient.utils import triton_to_np_dtype
from PIL import Image
import clip
import torch

In [2]:
# Model settings (update these if needed)
MODEL_NAME = "clip_visual"
MODEL_VERSION = "1"  # or leave empty for the latest version

# Triton server URL (default HTTP port is 8000)
TRITON_URL = "localhost:8000"

In [3]:
# Load CLIP model to get the preprocess function (we only need the preprocess)
device = "cuda"  # Preprocessing can be done on CPU
model, preprocess = clip.load("ViT-B/32", device=device)

In [4]:
# Load and preprocess the image
image_path = "/home/vicky/Product_Matching_Pipeline/Dataset/Adidas-2.jpg"  # Update with your image file path
image = Image.open(image_path).convert("RGB")
input_tensor = preprocess(image)  # returns a tensor of shape [3, 224, 224] in float32
print("Input tensor shape:", input_tensor.shape)

Input tensor shape: torch.Size([3, 224, 224])


In [5]:
# Convert the tensor to numpy array and cast to FP16
input_np = input_tensor.numpy().astype(np.float16)
# Add a batch dimension: [1, 3, 224, 224]
input_np = np.expand_dims(input_np, axis=0)

In [6]:
# Create Triton client
triton_client = httpclient.InferenceServerClient(url=TRITON_URL)

In [10]:
# Prepare the inference input object (input name must match your model configuration)
infer_input = httpclient.InferInput("Input_Image", input_np.shape, "FP16")
infer_input.set_data_from_numpy(input_np)

# Prepare the inference output object (output name must match your model configuration)
infer_output = httpclient.InferRequestedOutput("Image_Embeddings")

In [11]:
# Send the inference request
response = triton_client.infer(
    model_name=MODEL_NAME,
    model_version=MODEL_VERSION,
    inputs=[infer_input],
    outputs=[infer_output]
)

# Retrieve and print the output embedding
output_embedding = response.as_numpy("Image_Embeddings")
print("Output embedding shape:", output_embedding.shape)
print("Output embedding:", output_embedding.tolist())

Output embedding shape: (1, 512)
Output embedding: [[-0.206787109375, 0.107421875, -0.034149169921875, -0.42333984375, -0.053955078125, 0.248291015625, -0.60205078125, 0.24560546875, 0.312255859375, 0.059600830078125, 0.2086181640625, -0.31787109375, -0.1197509765625, 0.130615234375, 0.1015625, 0.276123046875, 0.84814453125, -0.1502685546875, 0.2276611328125, 0.230712890625, -0.22705078125, -0.20751953125, -0.1502685546875, -0.50927734375, 0.00897216796875, 0.318359375, -0.3203125, -0.638671875, 0.046966552734375, -0.310546875, -0.0989990234375, 0.3857421875, -0.396728515625, -0.013275146484375, 0.00913238525390625, 0.1016845703125, -0.09808349609375, 0.2440185546875, -0.42919921875, 2.150390625, -0.4423828125, -0.261474609375, 0.412353515625, 0.322021484375, -0.042694091796875, -0.50537109375, -0.239990234375, 0.06695556640625, -0.1947021484375, 0.0202789306640625, 0.2021484375, -0.052520751953125, -0.27099609375, -0.382568359375, -0.49267578125, 0.4404296875, -0.06329345703125, -0.04

In [15]:
from transformers import CLIPTokenizer
import tritonclient.http as httpclient
import numpy as np

# Initialize tokenizer and Triton client
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
triton_client = httpclient.InferenceServerClient(url="localhost:8000")

# Tokenize text
text = "yo whats up my man"
inputs = tokenizer(text, return_tensors="np", padding="max_length", max_length=77, truncation=True)
input_ids = inputs["input_ids"].astype(np.int64)

# Prepare input for Triton
input_tensor = httpclient.InferInput("input", input_ids.shape, "INT64")
input_tensor.set_data_from_numpy(input_ids)

# Send inference request
outputs = [httpclient.InferRequestedOutput("output")]
response = triton_client.infer(model_name="clip_text", inputs=[input_tensor], outputs=outputs)

# Get the text embedding
embedding = response.as_numpy("output")
print(embedding.shape)  # Should be [1, 512]

(1, 512)


In [16]:
print("Output embedding shape:", embedding.shape)
print("Output embedding:", embedding.tolist())

Output embedding shape: (1, 512)
Output embedding: [[0.11036831140518188, 0.2619907259941101, 0.2587181031703949, 0.12918494641780853, 0.0820734053850174, -0.17723360657691956, 0.28629088401794434, -0.9022852182388306, -0.20147418975830078, 0.15665718913078308, -0.30644097924232483, -0.14183486998081207, -0.019492819905281067, -0.10165369510650635, -0.044385023415088654, -0.16183780133724213, 0.3703668713569641, 0.2650766372680664, 0.27244430780410767, -0.2469407021999359, 0.2232109010219574, -0.4336828291416168, -0.02304781973361969, 0.04009689390659332, -0.006294813007116318, 0.0350809171795845, 0.08644594252109528, -0.20119206607341766, -0.05422578006982803, -0.10712800920009613, 0.18166011571884155, 0.0647159218788147, -0.007670342922210693, -0.13615360856056213, 0.46679607033729553, -0.07184448838233948, 0.08853228390216827, 0.048489101231098175, 0.01720622181892395, 0.32849907875061035, 0.030947256833314896, 0.16070733964443207, -0.17894361913204193, -0.01256752759218216, 0.10163

In [15]:
import numpy as np
import tritonclient.http as httpclient

# Step 1: Create a client connected to the Triton server
client = httpclient.InferenceServerClient(url="localhost:8000")

# Step 2: Verify that the server is live and ready
if not client.is_server_live():
    print("Error: Triton server is not live.")
    exit(1)
if not client.is_server_ready():
    print("Error: Triton server is not ready.")
    exit(1)

# Step 3: Prepare input data
# For demonstration, we generate random token IDs (replace with your actual input data)
batch_size = 1
sequence_length = 77  # Example for CLIP models; adjust as needed
vocab_size = 49408    # Example vocab size; adjust as needed
input_data = np.random.randint(0, vocab_size, size=(batch_size, sequence_length), dtype=np.int64)

# Step 4: Create InferInput for the input tensor
# Adjust the name, shape, and datatype based on your model's input
inputs = [httpclient.InferInput("input", input_data.shape, "INT64")]
inputs[0].set_data_from_numpy(input_data)

# Step 5: Send the inference request
# Replace "clip_onnx" with your actual model name


results = client.infer(model_name="clip_text", inputs=inputs)


# Step 6: Retrieve and print the output
# Adjust the output name based on your model's output tensor
output = results.as_numpy("output")
print("Inference output shape:", output.shape)
print("Sample output:", output.tolist())  # Print first few elements of the first sample

Inference output shape: (1, 512)
Sample output: [[0.04384872317314148, 0.2758655548095703, 0.12861418724060059, -0.1678852140903473, -0.0757695809006691, 0.19222944974899292, -0.13032987713813782, 0.16471745073795319, 0.11584168672561646, -0.18255171179771423, 0.32220658659935, 0.2414170652627945, -0.048784930258989334, -0.31080955266952515, 0.24958744645118713, -0.17493604123592377, -0.10543036460876465, -0.14024098217487335, 0.17658784985542297, 0.13849136233329773, 0.10778851807117462, -0.35784515738487244, -0.039488863199949265, -0.1904057413339615, -0.1530546396970749, 0.1356791853904724, -0.13766801357269287, -0.1996038556098938, -0.33985352516174316, 0.1431802362203598, -0.22356364130973816, -0.33686164021492004, -0.003579959273338318, 0.09819108992815018, 0.27528443932533264, -0.11555793136358261, 0.022561997175216675, 0.42334508895874023, 0.030185341835021973, -0.05783906579017639, -0.3455367982387543, 0.32281559705734253, 0.10840123146772385, -0.06245678663253784, 0.486441701

In [16]:
print("Output shape:", output.shape)
print("Raw output sample:", output[0][:10])


Output shape: (1, 512)
Raw output sample: [ 0.04384872  0.27586555  0.12861419 -0.16788521 -0.07576958  0.19222945
 -0.13032988  0.16471745  0.11584169 -0.18255171]
