In [1]:
import json
import os
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import torch
import numpy as np
from tqdm.auto import tqdm


In [2]:
train_dir = '/datasets/inat_comp/2021/train_mini'
val_dir = '/datasets/inat_comp/2021/val'
train_json = '/datasets/inat_comp/2021/train_mini.json'
val_json = '/datasets/inat_comp/2021/val.json'
source = '/datasets/inat_comp/2021/'
test_json = '/datasets/inat_comp/2021/public_test.json'
test = '/datasets/inat_comp/2021/public_test'

In [3]:
import os

print(os.listdir(source)) # here '.' for current directory.

['train.json', 'train_mini.json.tar.gz', 'train.tar.gz', 'public_test.json', 'train_mini.json', 'train', 'val.json', 'val.tar.gz', 'train.json.tar.gz', 'public_test', 'train_mini', 'val', 'val.json.tar.gz', 'public_test.tar.gz', 'public_test.json.tar.gz', 'train_mini.tar.gz']


In [4]:
def load_json(json_path):
    with open(json_path, 'r') as f:
        return json.load(f)

In [5]:
train_data = load_json(train_json)
val_data = load_json(val_json)
test_data = load_json(test_json)

In [6]:
data = val_data
data.keys()

dict_keys(['info', 'images', 'categories', 'annotations', 'licenses'])

In [7]:
val_data['categories'][0]

{'id': 0,
 'name': 'Lumbricus terrestris',
 'common_name': 'Common Earthworm',
 'supercategory': 'Animalia',
 'kingdom': 'Animalia',
 'phylum': 'Annelida',
 'class': 'Clitellata',
 'order': 'Haplotaxida',
 'family': 'Lumbricidae',
 'genus': 'Lumbricus',
 'specific_epithet': 'terrestris',
 'image_dir_name': '00000_Animalia_Annelida_Clitellata_Haplotaxida_Lumbricidae_Lumbricus_terrestris'}

In [8]:
len((val_data['categories']))

10000

In [9]:
labels = []
for i in range(10000):
        img_info = val_data['categories'][i]
        a = img_info.get('kingdom') + ' ' + img_info.get('phylum') + ' ' + img_info.get('class') + ' ' + img_info.get('order') + ' ' + img_info.get('family') + ' ' + img_info.get('genus') + ' ' + img_info.get('specific_epithet') + ' with common name ' + img_info.get('common_name') 
        #print(a)
        if a not in labels:
            labels.append(a)
labels

['Animalia Annelida Clitellata Haplotaxida Lumbricidae Lumbricus terrestris with common name Common Earthworm',
 'Animalia Annelida Polychaeta Sabellida Sabellidae Sabella spallanzanii with common name Mediterranean Fanworm',
 'Animalia Annelida Polychaeta Sabellida Serpulidae Serpula columbiana with common name Serpula columbiana',
 'Animalia Annelida Polychaeta Sabellida Serpulidae Spirobranchus cariniferus with common name Blue Tube Worm',
 'Animalia Arthropoda Arachnida Araneae Agelenidae Eratigena duellica with common name Giant House Spider',
 'Animalia Arthropoda Arachnida Araneae Antrodiaetidae Atypoides riversi with common name California Turret Spider',
 'Animalia Arthropoda Arachnida Araneae Araneidae Aculepeira ceropegia with common name Oak Spider',
 'Animalia Arthropoda Arachnida Araneae Araneidae Agalenatea redii with common name Gorse Orbweaver',
 'Animalia Arthropoda Arachnida Araneae Araneidae Araneus bicentenarius with common name Giant Lichen Orbweaver',
 'Animalia 

In [10]:
len(labels)

10000

In [11]:
val_data['images']

[{'id': 2686843,
  'width': 284,
  'height': 222,
  'file_name': 'val/03938_Animalia_Chordata_Aves_Passeriformes_Meliphagidae_Ptilotula_penicillata/df8edd4c-fbb4-4886-8600-a429e5efac23.jpg',
  'license': 2,
  'rights_holder': 'megatherium',
  'date': '2007-10-31 00:00:00+00:00',
  'latitude': -21.93073,
  'longitude': 114.12239,
  'location_uncertainty': None},
 {'id': 2686844,
  'width': 500,
  'height': 375,
  'file_name': 'val/03583_Animalia_Chordata_Aves_Cuculiformes_Cuculidae_Coccyzus_erythropthalmus/fc35080c-5ace-4485-a21f-b1447f27efc7.jpg',
  'license': 1,
  'rights_holder': 'rpayne',
  'date': '2011-07-15 00:00:00+00:00',
  'latitude': 44.02901,
  'longitude': -73.17711,
  'location_uncertainty': None},
 {'id': 2686845,
  'width': 500,
  'height': 380,
  'file_name': 'val/05585_Fungi_Basidiomycota_Agaricomycetes_Agaricales_Strophariaceae_Pholiota_squarrosa/4a1df4d1-4cd8-469d-8496-7b025a5b73cd.jpg',
  'license': 2,
  'rights_holder': 'megatherium',
  'date': '2011-09-14 00:00:00

In [12]:
for i in range(5):
    print(val_data['annotations'][i])

{'id': 2686843, 'image_id': 2686843, 'category_id': 3938}
{'id': 2686844, 'image_id': 2686844, 'category_id': 3583}
{'id': 2686845, 'image_id': 2686845, 'category_id': 5585}
{'id': 2686846, 'image_id': 2686846, 'category_id': 4487}
{'id': 2686847, 'image_id': 2686847, 'category_id': 5282}


In [13]:
for i in range(5):
    print(val_data['categories'][i])

{'id': 0, 'name': 'Lumbricus terrestris', 'common_name': 'Common Earthworm', 'supercategory': 'Animalia', 'kingdom': 'Animalia', 'phylum': 'Annelida', 'class': 'Clitellata', 'order': 'Haplotaxida', 'family': 'Lumbricidae', 'genus': 'Lumbricus', 'specific_epithet': 'terrestris', 'image_dir_name': '00000_Animalia_Annelida_Clitellata_Haplotaxida_Lumbricidae_Lumbricus_terrestris'}
{'id': 1, 'name': 'Sabella spallanzanii', 'common_name': 'Mediterranean Fanworm', 'supercategory': 'Animalia', 'kingdom': 'Animalia', 'phylum': 'Annelida', 'class': 'Polychaeta', 'order': 'Sabellida', 'family': 'Sabellidae', 'genus': 'Sabella', 'specific_epithet': 'spallanzanii', 'image_dir_name': '00001_Animalia_Annelida_Polychaeta_Sabellida_Sabellidae_Sabella_spallanzanii'}
{'id': 2, 'name': 'Serpula columbiana', 'common_name': 'Serpula columbiana', 'supercategory': 'Animalia', 'kingdom': 'Animalia', 'phylum': 'Annelida', 'class': 'Polychaeta', 'order': 'Sabellida', 'family': 'Serpulidae', 'genus': 'Serpula', '

In [14]:
for i in range(5):
    print(val_data['images'][i])

{'id': 2686843, 'width': 284, 'height': 222, 'file_name': 'val/03938_Animalia_Chordata_Aves_Passeriformes_Meliphagidae_Ptilotula_penicillata/df8edd4c-fbb4-4886-8600-a429e5efac23.jpg', 'license': 2, 'rights_holder': 'megatherium', 'date': '2007-10-31 00:00:00+00:00', 'latitude': -21.93073, 'longitude': 114.12239, 'location_uncertainty': None}
{'id': 2686844, 'width': 500, 'height': 375, 'file_name': 'val/03583_Animalia_Chordata_Aves_Cuculiformes_Cuculidae_Coccyzus_erythropthalmus/fc35080c-5ace-4485-a21f-b1447f27efc7.jpg', 'license': 1, 'rights_holder': 'rpayne', 'date': '2011-07-15 00:00:00+00:00', 'latitude': 44.02901, 'longitude': -73.17711, 'location_uncertainty': None}
{'id': 2686845, 'width': 500, 'height': 380, 'file_name': 'val/05585_Fungi_Basidiomycota_Agaricomycetes_Agaricales_Strophariaceae_Pholiota_squarrosa/4a1df4d1-4cd8-469d-8496-7b025a5b73cd.jpg', 'license': 2, 'rights_holder': 'megatherium', 'date': '2011-09-14 00:00:00+00:00', 'latitude': 59.32726, 'longitude': 18.13697,

In [15]:
import os
import json


# Extract image paths and corresponding labels
image_paths = []
image_labels = []

# Create a mapping from image_id to category_id (labels)
image_id_to_label = {ann['image_id']: ann['category_id'] for ann in val_data['annotations']}

# Iterate over all images in the validation set
for img_info in val_data['images']:
    image_id = img_info['id']
    image_file = img_info['file_name']
    image_path = os.path.join(source, image_file)
    
    # Add the image path
    image_paths.append(image_path)
    
    # Add the corresponding label, if available
    label = image_id_to_label.get(image_id, None)  # Use get to handle missing keys gracefully
    if label is not None:
        image_labels.append(label)
    else:
        print(f"Warning: No label found for image_id {image_id}")

# Check for consistency
assert len(image_paths) == len(image_labels), "Mismatch between number of images and labels"

print(f"Total images: {len(image_paths)}")
print(f"Total labels: {len(image_labels)}")

# Optionally, print first few image paths and labels to verify
for i in range(min(10, len(image_paths))):
    print(f"Image path: {image_paths[i]}")
    print(f"Label: {image_labels[i]}")

Total images: 100000
Total labels: 100000
Image path: /datasets/inat_comp/2021/val/03938_Animalia_Chordata_Aves_Passeriformes_Meliphagidae_Ptilotula_penicillata/df8edd4c-fbb4-4886-8600-a429e5efac23.jpg
Label: 3938
Image path: /datasets/inat_comp/2021/val/03583_Animalia_Chordata_Aves_Cuculiformes_Cuculidae_Coccyzus_erythropthalmus/fc35080c-5ace-4485-a21f-b1447f27efc7.jpg
Label: 3583
Image path: /datasets/inat_comp/2021/val/05585_Fungi_Basidiomycota_Agaricomycetes_Agaricales_Strophariaceae_Pholiota_squarrosa/4a1df4d1-4cd8-469d-8496-7b025a5b73cd.jpg
Label: 5585
Image path: /datasets/inat_comp/2021/val/04487_Animalia_Chordata_Aves_Procellariiformes_Diomedeidae_Phoebastria_nigripes/5a89f160-b085-4990-a66a-aac18059b7da.jpg
Label: 4487
Image path: /datasets/inat_comp/2021/val/05282_Animalia_Mollusca_Gastropoda_Neogastropoda_Buccinidae_Sinistrofulgur_sinistrum/3b188349-0528-4aff-bc15-6218e16f8e10.jpg
Label: 5282
Image path: /datasets/inat_comp/2021/val/03253_Animalia_Chordata_Aves_Anseriformes

In [16]:
import open_clip

# Load the BioCLIP model and its preprocessors
model, _, preprocess = open_clip.create_model_and_transforms('hf-hub:imageomics/bioclip')
model.eval()  # Ensure model is in evaluation mode = open_clip.create_model_and_transforms('hf-hub:imageomics/bioclip')
tokenizer = open_clip.get_tokenizer('hf-hub:imageomics/bioclip')
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [21]:
tokenized_labels = tokenizer(labels).to(device)

# Ensure tokenized labels are in the right format
# Note: This assumes the tokenizer returns a tensor of shape [num_labels, max_seq_length]


In [23]:
if tokenized_labels.ndimension() == 1:
    tokenized_labels = tokenized_labels.unsqueeze(0)

# Get the label embeddings
label_emb_list = []
for label in tokenized_labels:
    with torch.no_grad():
        emb = model.encode_text(label.unsqueeze(0)).cpu()
    label_emb_list.append(emb)
label_emb = torch.cat(label_emb_list, dim=0).to(device)


# label_embeddings now contains the embeddings for your labels
#label_emb /= label_emb.norm(dim=-1, keepdim=True)

print(label_emb.shape) 

torch.Size([10000, 512])


In [18]:
import gc
torch.cuda.empty_cache()
gc.collect()

1305

In [19]:
torch.cuda.memory_summary(device=None, abbreviated=False)



In [18]:
image_paths

['/datasets/inat_comp/2021/val/03938_Animalia_Chordata_Aves_Passeriformes_Meliphagidae_Ptilotula_penicillata/df8edd4c-fbb4-4886-8600-a429e5efac23.jpg',
 '/datasets/inat_comp/2021/val/03583_Animalia_Chordata_Aves_Cuculiformes_Cuculidae_Coccyzus_erythropthalmus/fc35080c-5ace-4485-a21f-b1447f27efc7.jpg',
 '/datasets/inat_comp/2021/val/05585_Fungi_Basidiomycota_Agaricomycetes_Agaricales_Strophariaceae_Pholiota_squarrosa/4a1df4d1-4cd8-469d-8496-7b025a5b73cd.jpg',
 '/datasets/inat_comp/2021/val/04487_Animalia_Chordata_Aves_Procellariiformes_Diomedeidae_Phoebastria_nigripes/5a89f160-b085-4990-a66a-aac18059b7da.jpg',
 '/datasets/inat_comp/2021/val/05282_Animalia_Mollusca_Gastropoda_Neogastropoda_Buccinidae_Sinistrofulgur_sinistrum/3b188349-0528-4aff-bc15-6218e16f8e10.jpg',
 '/datasets/inat_comp/2021/val/03253_Animalia_Chordata_Aves_Anseriformes_Anatidae_Mareca_americana/d5359888-d956-46a0-9dfb-87099ee1e6dc.jpg',
 '/datasets/inat_comp/2021/val/05813_Plantae_Tracheophyta_Liliopsida_Alismatales_H

In [24]:
import os
from PIL import Image
from tqdm import tqdm

# val_dir = '/datasets/inat_comp/2021/val'
# image_paths = []

# # Collect all image paths from the directory
# for root, _, files in os.walk(val_dir):
#     for file in files:
#         if file.lower().endswith(('.jpg', '.jpeg', '.png')):
#             image_paths.append(os.path.join(root, file))

# # Ensure image_paths contains correct paths
# print(f"Number of images found: {len(image_paths)}")

# Assuming image_labels are correctly set up, e.g., [0, 1, 2, ..., num_images-1]
# image_labels = ...

batch_size = 32
preds = []

for i in tqdm(range(0, len(image_paths), batch_size)):
    i_end = min(i + batch_size, len(image_paths))
    images = [Image.open(image_paths[j]) for j in range(i, i_end)]
    
    # Preprocess images and move them to the device (GPU)
    image_tensors = torch.stack([preprocess(image) for image in images]).to(device)

    # Inference with mixed precision
    with torch.no_grad(), torch.cuda.amp.autocast():
        img_emb = model.encode_image(image_tensors)
        img_emb /= img_emb.norm(dim=-1, keepdim=True)
        
        # Compute the similarity scores between image embeddings and label embeddings
        scores = (100.0 * img_emb @ label_emb.T).softmax(dim=-1)

    # Get the predicted class for each image
    batch_preds = torch.argmax(scores, dim=1).cpu().numpy()
    preds.extend(batch_preds)

# Calculate accuracy
correct = sum(1 for pred, label in zip(preds, image_labels) if pred == label)
accuracy = correct / len(image_labels)

print(f"Accuracy: {accuracy:.4f}")


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3125/3125 [19:53<00:00,  2.62it/s]

Accuracy: 0.6398





In [None]:
#NOW to evaluate on just the iNAT clip

In [25]:
import open_clip

model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms('hf-hub:imageomics/bioclip-vit-b-16-inat-only')
tokenizer = open_clip.get_tokenizer('hf-hub:imageomics/bioclip-vit-b-16-inat-only')
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [26]:
tokenized_labels = tokenizer(labels).to(device)
if tokenized_labels.ndimension() == 1:
    tokenized_labels = tokenized_labels.unsqueeze(0)

# Get the label embeddings
label_emb_list = []
for label in tokenized_labels:
    with torch.no_grad():
        emb = model.encode_text(label.unsqueeze(0)).cpu()
    label_emb_list.append(emb)
label_emb = torch.cat(label_emb_list, dim=0).to(device)


# label_embeddings now contains the embeddings for your labels
#label_emb /= label_emb.norm(dim=-1, keepdim=True)

print(label_emb.shape) 
# Ensure tokenized labels are in the right format
# Note: This assumes the tokenizer returns a tensor of shape [num_labels, max_seq_length]


torch.Size([10000, 512])


In [27]:
import os
from PIL import Image
from tqdm import tqdm

# val_dir = '/datasets/inat_comp/2021/val'
# image_paths = []

# # Collect all image paths from the directory
# for root, _, files in os.walk(val_dir):
#     for file in files:
#         if file.lower().endswith(('.jpg', '.jpeg', '.png')):
#             image_paths.append(os.path.join(root, file))

# # Ensure image_paths contains correct paths
# print(f"Number of images found: {len(image_paths)}")

# Assuming image_labels are correctly set up, e.g., [0, 1, 2, ..., num_images-1]
# image_labels = ...

batch_size = 32
preds = []

for i in tqdm(range(0, len(image_paths), batch_size)):
    i_end = min(i + batch_size, len(image_paths))
    images = [Image.open(image_paths[j]) for j in range(i, i_end)]
    
    # Preprocess images and move them to the device (GPU)
    image_tensors = torch.stack([preprocess(image) for image in images]).to(device)

    # Inference with mixed precision
    with torch.no_grad(), torch.cuda.amp.autocast():
        img_emb = model.encode_image(image_tensors)
        img_emb /= img_emb.norm(dim=-1, keepdim=True)
        
        # Compute the similarity scores between image embeddings and label embeddings
        scores = (100.0 * img_emb @ label_emb.T).softmax(dim=-1)

    # Get the predicted class for each image
    batch_preds = torch.argmax(scores, dim=1).cpu().numpy()
    preds.extend(batch_preds)

# Calculate accuracy
correct = sum(1 for pred, label in zip(preds, image_labels) if pred == label)
accuracy = correct / len(image_labels)

print(f"Accuracy: {accuracy:.4f}")


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3125/3125 [16:48<00:00,  3.10it/s]

Accuracy: 0.7195



