In [1]:
import torch
from transformers import CLIPProcessor, CLIPModel

device = "cuda" if torch.cuda.is_available() else "cpu"

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

model.eval()


  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [17]:
from PIL import Image
import numpy as np

def embed_image(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        image_features = model.get_image_features(**inputs)

    # Normalize (important for cosine similarity)
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    return image_features.cpu().numpy()[0]


In [18]:
import pandas as pd
import os
from tqdm import tqdm

CSV_PATH = "micrographs_metadata.csv"
IMAGE_ROOT = "./"

df = pd.read_csv(CSV_PATH)
df = df[["image_local_path", "categories", "technique"]].dropna()


In [19]:
print(len(df))
print(df.head())


867
                    image_local_path      categories  \
0  ./miclib_output/images/000001.jpg  Metal or alloy   
1  ./miclib_output/images/000002.jpg  Metal or alloy   
2  ./miclib_output/images/000003.jpg  Metal or alloy   
3  ./miclib_output/images/000004.jpg  Metal or alloy   
4  ./miclib_output/images/000005.jpg  Metal or alloy   

                            technique  
0          Reflected light microscopy  
1  Scanning electron microscopy (SEM)  
2          Reflected light microscopy  
3  Scanning electron microscopy (SEM)  
4          Reflected light microscopy  


In [20]:
missing = 0
for p in df["image_local_path"].head(10):
    full = os.path.join(IMAGE_ROOT, p)
    print(full, "->", os.path.exists(full))



././miclib_output/images/000001.jpg -> True
././miclib_output/images/000002.jpg -> True
././miclib_output/images/000003.jpg -> True
././miclib_output/images/000004.jpg -> True
././miclib_output/images/000005.jpg -> True
././miclib_output/images/000006.jpg -> True
././miclib_output/images/000007.jpg -> True
././miclib_output/images/000008.jpg -> True
././miclib_output/images/000009.jpg -> True
././miclib_output/images/000010.jpg -> True


In [21]:
embeddings = []
valid_rows = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    img_path = os.path.join(IMAGE_ROOT, row["image_local_path"])
    if not os.path.exists(img_path):
        continue

    try:
        emb = embed_image(img_path)
        embeddings.append(emb)
        valid_rows.append(row)
    except Exception as e:
        print(f"Skipping {img_path}: {e}")


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 867/867 [00:24<00:00, 35.96it/s]


In [22]:
import numpy as np

X_img = np.vstack(embeddings)  # shape: (N, 512)
df_valid = pd.DataFrame(valid_rows)


In [23]:
np.save("clip_image_embeddings.npy", X_img)
df_valid.to_csv("clip_metadata_clean.csv", index=False)


In [24]:
print(X_img.shape)   # should be (~700–800, 512)


(867, 512)


In [25]:
from sklearn.metrics.pairwise import cosine_similarity

sim = cosine_similarity(X_img[:5], X_img[:5])
print(sim)


[[0.99999964 0.8663573  0.86464655 0.8218291  0.83858883]
 [0.8663573  0.9999997  0.84768605 0.84433806 0.88632524]
 [0.86464655 0.84768605 1.0000002  0.8838994  0.9132266 ]
 [0.8218291  0.84433806 0.8838994  1.0000001  0.9137329 ]
 [0.83858883 0.88632524 0.9132266  0.9137329  1.0000004 ]]
