In [2]:
import json
import os
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import torch
import numpy as np
from tqdm.auto import tqdm

# --- Dataset metadata ---------------------------------------------------
# Category records; each one is expected to provide a 'name' entry.
categories_path = '/scratch/ssd004/scratch/junejory/categories.json'
with open(categories_path, 'r') as f:
    categories = json.loads(f.read())

# Build one text prompt per category, preserving the order of `categories`
# so that prompt i corresponds to categories[i].
prompt_template = "a photo of a {}"
clip_labels = [prompt_template.format(entry['name']) for entry in categories]

# Annotation document (val2019.json); later cells read its 'annotations' and
# 'images' entries.
annotations_path = '/scratch/ssd004/scratch/junejory/val2019.json'
with open(annotations_path, 'r') as f:
    annotations = json.loads(f.read())


In [3]:
# Display the generated text prompts (one per category) to check their wording.
clip_labels

['a photo of a Amanita vaginata',
 'a photo of a Amanita flavoconia',
 'a photo of a Amanita calyptroderma',
 'a photo of a Amanita jacksonii',
 'a photo of a Amanita phalloides',
 'a photo of a Amanita muscaria',
 'a photo of a Amanita bisporigera',
 'a photo of a Amanita gemmata',
 'a photo of a Amanita pantherina',
 'a photo of a Amanita velosa',
 'a photo of a Amanita augusta',
 'a photo of a Amanita rubescens',
 'a photo of a Cicindela aurulenta',
 'a photo of a Cicindela punctulata',
 'a photo of a Cicindela ocellata',
 'a photo of a Cicindela campestris',
 'a photo of a Cicindela oregona',
 'a photo of a Cicindela tranquebarica',
 'a photo of a Cicindela formosa',
 'a photo of a Cicindela scutellaris',
 'a photo of a Cicindela duodecimguttata',
 'a photo of a Cicindela sexguttata',
 'a photo of a Cicindela repanda',
 'a photo of a Argia sedula',
 'a photo of a Argia tibialis',
 'a photo of a Argia plana',
 'a photo of a Argia translata',
 'a photo of a Argia moesta',
 'a photo o

In [14]:
import open_clip

# Fetch the BioCLIP checkpoint from the Hugging Face hub through open_clip.
# create_model_and_transforms returns three values; the middle one is not
# needed here and is bound to `_`.
bioclip_hub_id = 'hf-hub:imageomics/bioclip'
model, _, preprocess = open_clip.create_model_and_transforms(bioclip_hub_id)
model.eval()  # switch the model to evaluation mode for inference

# The tokenizer is looked up under the same hub identifier as the model.
tokenizer = open_clip.get_tokenizer(bioclip_hub_id)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


In [9]:
import gc
# Free GPU memory between experiments.
# Order matters: run the garbage collector first so that tensors kept alive
# only by unreachable Python objects are released back to PyTorch's caching
# allocator, and only then ask the allocator to return its unused cached
# blocks. Emptying the cache before collecting would leave the memory freed
# by the collection sitting in the cache.
unreachable_count = gc.collect()
torch.cuda.empty_cache()
unreachable_count  # shown as the cell output: number of unreachable objects found

7051

In [12]:
# Report the CUDA caching-allocator statistics.
# memory_summary() returns a multi-line string; print it so the table is
# rendered line by line instead of as a single escaped repr, and skip the
# call on machines without a GPU.
if torch.cuda.is_available():
    print(torch.cuda.memory_summary(device=None, abbreviated=False))
else:
    print("CUDA is not available; no GPU memory statistics to report.")



In [19]:
# Move the model to the device chosen when the model was loaded (`device` is
# "cuda" only when a GPU is available) instead of hard-coding "cuda", so this
# cell also runs on CPU-only machines and stays consistent with the tensors
# that later cells send to `device`.
# The bare expression displays the module structure as the cell output.
model.to(device)

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [None]:
# from torchvision.transforms import Resize

# # Resize images before preprocessing
# resize = Resize((256, 256))  # Adjust size as necessary

# # Modify image loading and preprocessing
# images = [resize(Image.open(image_paths[j])) for j in range(i, i_end)]
# image_tensors = torch.stack([preprocess(image).unsqueeze(0) for image in images]).to(model.device)


In [21]:
# Build two parallel lists from the annotation document:
#   image_paths[k]  - file path of the image referenced by annotation k
#   image_labels[k] - that annotation's 'category_id'
image_dir = '/scratch/ssd004/scratch/junejory/val2019'
image_paths = []
image_labels = []

# Index the image records by id once, so each annotation is resolved with a
# dictionary lookup instead of a linear scan over every image record.
# setdefault keeps the first record for an id, which is the record the
# previous first-match scan would have returned.
images_by_id = {}
for image_info in annotations['images']:
    images_by_id.setdefault(image_info['id'], image_info)

for annotation in annotations['annotations']:
    image_id = annotation['image_id']
    if image_id not in images_by_id:
        # Fail with a descriptive error rather than a bare StopIteration.
        raise KeyError(f"annotation references unknown image id {image_id!r}")
    image_info = images_by_id[image_id]
    image_path = os.path.join(image_dir, image_info['file_name'])
    image_paths.append(image_path)
    image_labels.append(annotation['category_id'])

In [None]:
# Spot-check the last collected path. Reading it from the list avoids relying
# on the loop variable `image_path`, which is rebound by other cells.
image_paths[-1]

In [45]:
# Tokenize every prompt, then place the token tensor on the compute device.
tokenized_labels = tokenizer(clip_labels)
tokenized_labels = tokenized_labels.to(device)

# A 1-D result is promoted to 2-D by adding a leading axis, so the text
# encoder always receives a batch of sequences.
if tokenized_labels.dim() == 1:
    tokenized_labels = tokenized_labels[None, :]

# Encode the prompts without tracking gradients.
with torch.no_grad():
    label_emb = model.encode_text(tokenized_labels)

# Rescale each embedding in place to unit L2 norm along the last axis.
label_norms = label_emb.norm(dim=-1, keepdim=True)
label_emb.div_(label_norms)

print(label_emb.shape)

torch.Size([1010, 512])


In [22]:
# Pass every image through the preprocessing transform, one at a time, and
# give each result a leading axis of size 1. Nothing is accumulated: only the
# last result stays bound to `image` when the loop ends, and `label_id` is
# not used inside the body.
for image_path, label_id in zip(image_paths, image_labels):
    pil_image = Image.open(image_path)
    preprocessed = preprocess(pil_image)
    image = preprocessed.unsqueeze(0)

In [46]:
# Zero-shot classification: embed the images batch by batch, compare each
# image embedding with the (already normalised) prompt embeddings in
# `label_emb`, and keep the index of the best-scoring prompt per image.
preds = []
batch_size = 32

# Mixed precision follows the device actually in use rather than being
# hard-coded to CUDA: it is enabled on "cuda" and explicitly disabled
# elsewhere, so the cell runs cleanly on a CPU fallback.
device_type = torch.device(device).type
use_amp = device_type == "cuda"

for i in tqdm(range(0, len(image_paths), batch_size)):
    i_end = min(i + batch_size, len(image_paths))

    # Open each file in a context manager so its handle is released as soon
    # as the image has been preprocessed into a tensor.
    batch_tensors = []
    for path in image_paths[i:i_end]:
        with Image.open(path) as pil_image:
            batch_tensors.append(preprocess(pil_image))
    image_tensors = torch.stack(batch_tensors).to(device)

    with torch.no_grad(), torch.autocast(device_type=device_type, enabled=use_amp):
        img_emb = model.encode_image(image_tensors)
        img_emb /= img_emb.norm(dim=-1, keepdim=True)
        # Scaled similarities turned into a distribution over the prompts.
        scores = (100.0 * img_emb @ label_emb.T).softmax(dim=-1)

    batch_preds = torch.argmax(scores, dim=1).cpu().numpy()
    preds.extend(batch_preds)

# Calculate accuracy
# NOTE(review): each prediction is a position in `clip_labels`, compared
# directly with the annotation's 'category_id'. This assumes category ids
# coincide with positions in the category list - confirm against the data.
correct = sum(1 for pred, label in zip(preds, image_labels) if pred == label)
# Guard against an empty dataset instead of raising ZeroDivisionError.
accuracy = correct / len(image_labels) if image_labels else float("nan")
print(f"Accuracy: {accuracy:.4f}")

  0%|          | 0/95 [00:00<?, ?it/s]

Accuracy: 0.7063
