In [1]:
import os
import timm
import torch
import open_clip
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
import torchvision.transforms as T
from torchvision import transforms
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
file_path = "/fs/ess/PAS2136/Hawaii-2025/beetles_intake/BeetlePUUM/1. Completed_Data/CanonBeetles.csv"
canon_df = pd.read_csv(file_path)

In [3]:
canon_df["ImageFilePath"] = canon_df["cropped_image_path"].apply(lambda x: f"/fs/ess/PAS2136/Hawaii-2025/beetles_intake/BeetlePUUM/CANON/{x}")
# canon_df.rename(columns={"cropped_image_path": "ImageFilePath"}, inplace=True)

In [4]:
cols = ['ImageFilePath', 'Genus', 'ScientificName']
df = canon_df[cols]
df.head(2)

Unnamed: 0,ImageFilePath,Genus,ScientificName
0,/fs/ess/PAS2136/Hawaii-2025/beetles_intake/Bee...,Mecyclothorax,Mecyclothorax konanus
1,/fs/ess/PAS2136/Hawaii-2025/beetles_intake/Bee...,Mecyclothorax,Mecyclothorax konanus


In [5]:
model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms("hf-hub:imageomics/bioclip")
tokenizer = open_clip.get_tokenizer("hf-hub:imageomics/bioclip")


device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


for param in model.parameters() :
    param.requires_grad = False


species_list = df["ScientificName"].unique().tolist() 

text_inputs = tokenizer(species_list).to(device)

correct_predictions = total_images = 0

In [7]:
df["PredSpecies"] = None


for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing Images") : 
    
    image_path = row["ImageFilePath"]

    if not os.path.exists(image_path):
        print(f"File not found: {image_path}")
        continue

    image = Image.open(image_path).convert("RGB")
    image_input = preprocess_val(image).unsqueeze(0).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image_input)
        text_features = model.encode_text(text_inputs)

    
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    
    similarity = (image_features @ text_features.T).squeeze(0)
    best_match_idx = similarity.argmax().item()
    predicted_species = species_list[best_match_idx]

    
    df.at[idx, "PredSpecies"] = predicted_species



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["PredSpecies"] = None
Processing Images: 100%|██████████| 1622/1622 [41:15<00:00,  1.53s/it]


In [11]:
correct_predictions = (df["PredSpecies"] == df["ScientificName"]).sum()

total_images = len(df)

accuracy = correct_predictions / total_images if total_images > 0 else 0

print(f"BioCLIP Zero-shot Classification accuracy: {accuracy*100:.4f}% ({correct_predictions}/{total_images})")


BioCLIP Zero-shot Classification accuracy: 23.4895% (381/1622)


In [9]:
df.head(2)

Unnamed: 0,ImageFilePath,Genus,ScientificName,PredSpecies
0,/fs/ess/PAS2136/Hawaii-2025/beetles_intake/Bee...,Mecyclothorax,Mecyclothorax konanus,Mecyclothorax gagnei
1,/fs/ess/PAS2136/Hawaii-2025/beetles_intake/Bee...,Mecyclothorax,Mecyclothorax konanus,Mecyclothorax gagnei


In [13]:
df.to_csv('/users/PAS2136/rayees/3. Benchmarking/Beetle-PUUM/1.BioCLIP-zero-shot-species.csv',index=False)