In [None]:
import numpy as np
from PIL import Image
import requests
from transformers import AutoProcessor, AutoModel
import torch

In [None]:
# Select the compute device once; every later cell moves tensors to `device`.
# Falls back to CPU so the notebook still runs on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the MedSigLIP checkpoint and its matching processor from the Hugging Face Hub.
model = AutoModel.from_pretrained("google/medsiglip-448").to(device)
processor = AutoProcessor.from_pretrained("google/medsiglip-448")

In [None]:
# Load the two sample images from the local dataset folder and force RGB mode.
image_paths = (
    "dataset/images/3445096909671059178.png",
    "dataset/images/-5669089898008966381.png",
)
imgs = [Image.open(path).convert("RGB") for path in image_paths]

# Candidate captions the model scores each image against.
texts = [
    "a photo of an arm with no rash",
    "a photo of an arm with a rash",
    "a photo of a leg with no rash",
    "a photo of a leg with a rash",
]

# Tokenize the captions and preprocess the images in one batch, then move the
# whole batch to the same device as the model.
inputs = processor(
    text=texts,
    images=imgs,
    padding="max_length",
    return_tensors="pt",
)
inputs = inputs.to(device)

# Inference only: no gradients needed.
with torch.no_grad():
    outputs = model(**inputs)

# One row per image, one column per caption; softmax turns each row into a
# distribution over the captions.
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)

for img, img_probs in zip(imgs, probs):
    # `display` is injected by IPython, so this only works inside a notebook kernel.
    display(img)
    for label, prob in zip(texts, img_probs):
        print(f"{prob:.2%} that image is '{label}'")

# Show the raw embeddings returned alongside the logits.
print(f"image embeddings: {outputs.image_embeds}")
print(f"text embeddings: {outputs.text_embeds}")


In [None]:
# Load the second sample image on its own so that the embedding extracted below
# belongs to exactly the image that later cells work with.
image2 = Image.open("dataset/images/-5669089898008966381.png").convert("RGB")

# Build a batch containing only image2. Reusing `inputs` from the previous cell
# would embed BOTH sample images, and index 0 would then be the first image
# rather than image2.
image2_inputs = processor(
    text=texts,
    images=[image2],
    padding="max_length",
    return_tensors="pt",
).to(model.device)

with torch.no_grad():
    image2_outputs = model(**image2_inputs)

# Extract embeddings
image_embedding = image2_outputs.image_embeds  # Shape: [1, embedding_dim]
text_embeddings = image2_outputs.text_embeds   # Shape: [num_texts, embedding_dim]

# Guard against silently picking the wrong row: the code below indexes row 0.
assert image_embedding.shape[0] == 1, "expected exactly one image embedding"

print(f"Image embedding shape: {image_embedding.shape}")
print(f"Text embeddings shape: {text_embeddings.shape}")
print(f"\nImage embedding (first 10 values): {image_embedding[0][:10]}")

# Compute similarity: L2-normalize both sides so the dot product is the cosine
# similarity between image2 and each caption.
image_embedding_norm = torch.nn.functional.normalize(image_embedding, p=2, dim=-1)
# Plain Python list of floats for image2, used by later cells.
embedding_list = image_embedding_norm[0].cpu().tolist()

text_embeddings_norm = torch.nn.functional.normalize(text_embeddings, p=2, dim=-1)

similarities = torch.matmul(image_embedding_norm, text_embeddings_norm.T)
print(f"\nSimilarity scores:")
for i, text in enumerate(texts):
    print(f"  '{text}': {similarities[0][i]:.4f}")

# If you want JUST the image embedding for storage/comparison later,
# `image_embedding` (or `embedding_list`) is the value to persist.
print(f"\nImage embedding can be saved: {image_embedding.shape}")

**Call Gemini with the Image Embedding**

In [None]:
import os
from getpass import getpass

from google import genai

# Never hardcode credentials in a notebook: read the key from the environment,
# and fall back to an interactive prompt so it is not stored in the file.
# NOTE: the key that was previously committed in this cell must be treated as
# compromised and rotated.
client = genai.Client(
    api_key=os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
)

print(f"Image embedding shape: {image_embedding.shape}")
print(f"Embedding vector length: {len(embedding_list)}")

# Create prompt: summary statistics of the embedding plus its first values,
# sent together with the image itself.
prompt = f"""Analyze this image and its SigLIP embedding.

Embedding Statistics:
- Dimensions: {len(embedding_list)}
- Range: [{min(embedding_list):.4f}, {max(embedding_list):.4f}]
- Mean: {sum(embedding_list)/len(embedding_list):.4f}
- First 20 values: {embedding_list[:20]}

Please:
1. Describe what you see in the image in detail
2. Generate a natural language caption for this image
3. Explain what visual features these embedding values might represent
"""

# Call Gemini API with the text prompt and the PIL image as multimodal content.
try:
    response = client.models.generate_content(
        model='gemini-2.0-flash-exp',
        contents=[prompt, image2]
    )

    print("\n" + "=" * 80)
    print("GEMINI ANALYSIS")
    print("=" * 80)
    print(response.text)
    print("=" * 80)

except Exception as e:
    # Report the failure instead of aborting the notebook run; the type is
    # printed to help tell auth, quota, and network errors apart.
    print(f"Error calling Gemini: {e}")
    print(f"Error type: {type(e)}")

In [None]:
import ollama
import io
import base64
from PIL import Image

# Ollama model tag to query.
model_name = "puyangwang/medgemma-27b-it:q8"

ollama_client = ollama.Client()

# Verify the model exists and check its details. A failure here is reported
# but not fatal, so the chat attempts below still run.
try:
    model_info = ollama_client.show(model_name)
    print(f"Model info: {model_info}")
except Exception as e:
    print(f"Model info error: {e}")

# Encode image2 as JPEG exactly once; the base64 string and the raw bytes are
# two views of the same encoded payload.
buffered = io.BytesIO()
image2.save(buffered, format="JPEG")
image_bytes = buffered.getvalue()
image_base64 = base64.b64encode(image_bytes).decode('utf-8')

ollama_prompt = "Describe this image in detail and generate a natural language caption for it."


def _chat_with_image(image_payload):
    """Send `ollama_prompt` plus one image to `model_name` and return the chat response.

    `image_payload` is passed through unchanged as the single entry of the
    message's 'images' list (a base64 string or raw bytes in this cell).
    """
    return ollama_client.chat(
        model=model_name,
        messages=[
            {
                'role': 'user',
                'content': ollama_prompt,
                'images': [image_payload]
            },
        ]
    )


# Try method 1: with base64 string
try:
    print("Attempting with base64 string...")
    response = _chat_with_image(image_base64)
    print("Success!")
    print(response['message']['content'])
except ollama.ResponseError as e:
    print(f"Base64 method failed: {e}")

    # Try method 2: with raw bytes. Only reached when the server rejected the
    # base64 request; any failure here is reported rather than raised.
    try:
        print("\nAttempting with raw bytes...")
        response = _chat_with_image(image_bytes)
        print("Success!")
        print(response['message']['content'])
    except Exception as e2:
        print(f"Raw bytes method also failed: {e2}")