In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import torch
import open_clip
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
from torchvision import transforms
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
# Absolute location of the CanonBeetles metadata table on the shared filesystem.
file_path = "/fs/ess/PAS2136/Hawaii-2025/beetles_intake/BeetlePUUM/1. Completed_Data/CanonBeetles.csv"

# Read the whole CSV into a DataFrame with pandas' default parsing options.
canon_df = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
# Build an absolute image path for every row by prefixing the value in
# `cropped_image_path` with the CANON image directory.
canon_df["ImageFilePath"] = canon_df["cropped_image_path"].map(
    lambda rel_path: f"/fs/ess/PAS2136/Hawaii-2025/beetles_intake/BeetlePUUM/CANON/{rel_path}"
)

In [5]:
# Columns carried through the benchmark: the image location plus the
# genus and species labels.
cols = ['ImageFilePath', 'Genus', 'ScientificName']

# Take an explicit copy so that columns added to `df` in later cells are written
# to an independent frame. Without it `df` is a selection derived from `canon_df`,
# and assigning new columns to it triggers pandas' SettingWithCopyWarning.
df = canon_df[cols].copy()
df.head(2)

Unnamed: 0,ImageFilePath,Genus,ScientificName
0,/fs/ess/PAS2136/Hawaii-2025/beetles_intake/Bee...,Mecyclothorax,Mecyclothorax konanus
1,/fs/ess/PAS2136/Hawaii-2025/beetles_intake/Bee...,Mecyclothorax,Mecyclothorax konanus


In [6]:
# Load the BioCLIP checkpoint from the Hugging Face hub; the call returns the
# model plus separate train and validation preprocessing transforms.
model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms(
    "hf-hub:imageomics/bioclip"
)
tokenizer = open_clip.get_tokenizer("hf-hub:imageomics/bioclip")

# Prefer the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

# Turn off gradient tracking on every model parameter.
model.requires_grad_(False)

# Distinct species labels found in the data, tokenized and moved to `device`.
species_list = df["ScientificName"].unique().tolist()
text_inputs = tokenizer(species_list).to(device)

# Counters start at zero.
correct_predictions = 0
total_images = 0

In [8]:
# Make the pybioclip directory the process working directory.
# NOTE(review): presumably this is what lets the `bioclip` import in the next cell
# resolve — confirm. Any relative path used after this line resolves against
# this directory.
os.chdir("/users/PAS2136/rayees/pybioclip")

In [9]:
from bioclip.predict import TreeOfLifeClassifier, Rank

In [10]:
# Classifier used for open-ended prediction (no candidate label list is supplied).
bioClip = TreeOfLifeClassifier()

# Collect the highest-scoring species-level prediction for each image. Results
# are gathered in a plain list and attached to the frame in one step, which
# avoids per-row `.at` writes into a frame that may be a slice and does not
# depend on the index labels being unique.
top_predictions = []
for im_path in tqdm(df["ImageFilePath"], total=len(df), desc="Processing Images"):
    # `predict` returns a sequence of candidate dicts, each carrying a 'score'.
    predictions = bioClip.predict(str(im_path), Rank.SPECIES)

    # `default=None` keeps the run alive if an image yields no candidates; the
    # next cell already maps non-dict entries to a missing species.
    top_pred = max(predictions, key=lambda pred: pred["score"], default=None)
    top_predictions.append(top_pred)

# `assign` returns a new frame, so the column is added without mutating a slice.
df = df.assign(Predictions=top_predictions)
    

Processing Images: 100%|██████████| 1622/1622 [03:38<00:00,  7.41it/s]


In [11]:
# Extract the 'species' entry from each stored prediction dict; rows whose
# prediction is not a dict get None instead.
df['PredSpecies'] = df['Predictions'].map(
    lambda pred: pred['species'] if isinstance(pred, dict) else None
)
df.head(2)

Unnamed: 0,ImageFilePath,Genus,ScientificName,Predictions,PredSpecies
0,/fs/ess/PAS2136/Hawaii-2025/beetles_intake/Bee...,Mecyclothorax,Mecyclothorax konanus,{'file_name': '/fs/ess/PAS2136/Hawaii-2025/bee...,Pterostichus riparius
1,/fs/ess/PAS2136/Hawaii-2025/beetles_intake/Bee...,Mecyclothorax,Mecyclothorax konanus,{'file_name': '/fs/ess/PAS2136/Hawaii-2025/bee...,Masoreus wetterhallii


In [12]:
# Number of evaluated rows and how many of them have a predicted species that
# exactly equals the labelled scientific name.
total_images = len(df)
correct_predictions = df["PredSpecies"].eq(df["ScientificName"]).sum()

# Guard against an empty table so the ratio never divides by zero.
if total_images > 0:
    accuracy = correct_predictions / total_images
else:
    accuracy = 0

print(f"BioCLIP Open-Ended SPECIES Classification Accuracy: {accuracy*100:.4f}% ({correct_predictions}/{total_images})")


BioCLIP Open-Ended SPECIES Classification Accuracy: 1.2947% (21/1622)


In [13]:
# Write the results table to CSV, omitting the DataFrame index column.
df.to_csv(
    path_or_buf='/users/PAS2136/rayees/3. Benchmarking/Beetle-PUUM/3.BioCLIP-open-ended-species.csv',
    index=False,
)