# In [None]:
import requests
import numpy as np
import torch

# (1) === Fetch from Jina API ===
import os

# The API key is read from the environment instead of being written into the
# file: a key committed to source leaks through version control and shared
# notebooks. Any key that was ever pasted here should be revoked and rotated.
JINA_API_KEY = os.environ.get("JINA_API_KEY", "")
if not JINA_API_KEY:
    raise RuntimeError(
        "Set the JINA_API_KEY environment variable before running this script."
    )

api_url = "https://api.jina.ai/v1/embeddings"
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {JINA_API_KEY}",
}

# We compare just one text and one image; the order of "input" determines
# which returned vector is the text embedding and which is the image one.
single_payload = {
    "model": "jina-clip-v1",
    "input": [
        {"text": "A blue cat"},
        {"image": "https://i.pinimg.com/600x315/21/48/7e/21487e8e0970dd366dafaed6ab25d8d8.jpg"},
    ],
}

# requests has no default timeout, so without one a stalled connection would
# block the script indefinitely.
resp = requests.post(api_url, headers=headers, json=single_payload, timeout=60)
resp.raise_for_status()
api_output = resp.json()

# NOTE(review): the embeddings endpoint is documented as returning an
# OpenAI-style body, {"data": [{"index": i, "embedding": [...]}, ...]} --
# confirm against the current API reference. The flat "embeddings" key the
# script used originally is kept as a fallback.
if "data" in api_output:
    # Sort by the per-item index (when present) so vectors line up with
    # the order of single_payload["input"]; sorted() is stable otherwise.
    _api_items = sorted(api_output["data"], key=lambda item: item.get("index", 0))
    _api_vectors = [item["embedding"] for item in _api_items]
else:
    _api_vectors = api_output["embeddings"]

if len(_api_vectors) != len(single_payload["input"]):
    raise ValueError(
        f"Expected {len(single_payload['input'])} embeddings from the API, "
        f"got {len(_api_vectors)}"
    )

api_text_emb_api = np.array(_api_vectors[0])
api_img_emb_api = np.array(_api_vectors[1])

print("API embeddings shapes:",
      api_text_emb_api.shape, api_img_emb_api.shape)

# (2) === Compute locally using transformers + jina-clip ===
# Make sure you've already: pip install transformers einops timm pillow torch
from transformers import AutoProcessor, AutoModel

# Load the Jina-CLIP model & processor. Both are loaded with
# trust_remote_code=True so the processor is resolved the same way as the
# model (the model is already loaded from the repository's custom code).
model = AutoModel.from_pretrained("jinaai/jina-clip-v1", trust_remote_code=True)
processor = AutoProcessor.from_pretrained("jinaai/jina-clip-v1", trust_remote_code=True)

# Prepare text
texts = ["A blue cat"]

# Prepare image: download the bytes ourselves and decode them with PIL.
from PIL import Image
import requests as _req
from io import BytesIO

img_url = "https://i.pinimg.com/600x315/21/48/7e/21487e8e0970dd366dafaed6ab25d8d8.jpg"
# A timeout avoids hanging on a stalled download, and raise_for_status()
# stops an HTTP error page from being handed to PIL as if it were an image.
_img_resp = _req.get(img_url, timeout=60)
_img_resp.raise_for_status()
img_data = Image.open(BytesIO(_img_resp.content)).convert("RGB")

# Tokenize/process inputs
inputs = processor(
    text=texts,
    images=[img_data],
    return_tensors="pt",
    padding=True,
)

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model(**inputs)


def _extract_embeds(model_outputs, key):
    """Return ``model_outputs[key]`` via attribute or mapping access.

    Raises:
        KeyError: if the output object does not expose ``key``; the message
            names the missing field instead of failing later on ``None``.
    """
    value = getattr(model_outputs, key, None)
    if value is None and hasattr(model_outputs, "get"):
        value = model_outputs.get(key)
    if value is None:
        raise KeyError(
            f"Model output of type {type(model_outputs).__name__} has no {key!r} field"
        )
    return value


# Take the first (and only) row of each batch as a NumPy vector.
txt_emb_loc = _extract_embeds(outputs, "text_embeds").cpu().numpy()[0]
img_emb_loc = _extract_embeds(outputs, "image_embeds").cpu().numpy()[0]

print("Local embeddings shapes:", txt_emb_loc.shape, img_emb_loc.shape)

# (3) === Compare API vs. Local ===

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of two 1-D vectors.

    Args:
        a: First vector (any array-like accepted by ``np.asarray``).
        b: Second vector, same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b`` as a Python float
        in ``[-1.0, 1.0]``. If either vector has zero norm the direction is
        undefined and ``0.0`` is returned instead of NaN.
    """
    vec_a = np.asarray(a, dtype=np.float64)
    vec_b = np.asarray(b, dtype=np.float64)

    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if norm_product == 0.0:
        # Dividing by a zero norm would produce NaN and a RuntimeWarning.
        return 0.0

    # Floating-point rounding can push the ratio marginally past +/-1.
    return float(np.clip(np.dot(vec_a, vec_b) / norm_product, -1.0, 1.0))

# Directional agreement between each API vector and its local counterpart
# (text pair first, then image pair).
cos_text, cos_img = (
    cosine_similarity(_api_vec, _local_vec)
    for _api_vec, _local_vec in (
        (api_text_emb_api, txt_emb_loc),
        (api_img_emb_api, img_emb_loc),
    )
)

# Euclidean distance between the raw (un-rescaled) vectors of each pair.
l2_text, l2_img = (
    np.linalg.norm(_delta)
    for _delta in (
        api_text_emb_api - txt_emb_loc,
        api_img_emb_api - img_emb_loc,
    )
)

# Per-modality report; each entry is emitted with its own print() call.
for _line in (
    f"\nText embedding cosine (API vs Local): {cos_text:.6f}",
    f"Text embedding L2 difference     : {l2_text:.6f}",
    f"\nImage embedding cosine (API vs Local): {cos_img:.6f}",
    f"Image embedding L2 difference       : {l2_img:.6f}",
):
    print(_line)

# Final verdict: a pair "matches" when its L2 distance is below TOL.
TOL = 1e-4
print("\n== SUMMARY ==")
for _label, _distance in (
    ("Text embeddings match? ", l2_text),
    ("Image embeddings match?", l2_img),
):
    if _distance < TOL:
        _verdict = "YES"
    else:
        _verdict = "NO, Δ={:.6e}".format(_distance)
    print(_label, _verdict)
