In [None]:
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoProcessor, AutoModel
from PIL import Image
import numpy as np

# Prefer the GPU when one is available, but fall back to the CPU so the
# notebook still runs end-to-end on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# ----------------------------
# Load Vision Encoder
# ----------------------------

vision_name = "google/siglip-base-patch16-224"
processor = AutoProcessor.from_pretrained(vision_name, use_fast=True)
# Only the `vision_model` sub-module of the checkpoint is kept and moved to `device`.
vision_model = AutoModel.from_pretrained(vision_name).vision_model.to(device)
vision_model.eval()  # switch the module to evaluation mode

# ----------------------------
# Load Image
# ----------------------------

image_path = "sample.png"
# Open inside a context manager so the underlying file handle is released
# immediately; convert() returns a loaded copy that stays valid afterwards.
with Image.open(image_path) as image_file:
    image = image_file.convert("RGB")

inputs = processor(images=image, return_tensors="pt").to(device)

# Gradients are not needed for feature extraction.
with torch.no_grad():
    outputs = vision_model(**inputs)
    patch_tokens = outputs.last_hidden_state  # (1, N, 768)

print("Patch tokens shape:", patch_tokens.shape)

# Remove batch dim (a single image was passed, so dim 0 has size 1)
patch_tokens = patch_tokens.squeeze(0)

print("Number of tokens:", patch_tokens.shape[0])
print("Embedding dim:", patch_tokens.shape[1])

Patch tokens shape: torch.Size([1, 196, 768])
Number of tokens: 196
Embedding dim: 768


In [11]:
# The encoder output is used as-is: every token is treated as a patch token.
# NOTE(review): 196 tokens were reported for this checkpoint, which matches a
# 14 x 14 grid of 16px patches on a 224px input, so presumably there is no
# extra class token to strip — re-check if the model is swapped.
patch_only = patch_tokens

print("Patch Token Shape:", patch_only.shape)
# This is the L2 norm of the *mean* token vector (average over tokens first,
# then take the norm) — not the average of the per-token norms.
# torch.linalg.vector_norm replaces the deprecated torch.norm; its default
# order is 2, so the printed value is unchanged.
print("Mean patch norm:", torch.linalg.vector_norm(patch_only.mean(dim=0)))

Patch Token Shape: torch.Size([196, 768])
Mean patch norm: tensor(35.8578, device='cuda:0')
