In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import torch
import tempfile
import open_clip
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.notebook import tqdm
from torchvision import transforms
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:

# Hugging Face Hub identifier shared by the model weights and the tokenizer,
# kept in one place so the two can never drift apart.
BIOCLIP_HUB_ID = "hf-hub:imageomics/bioclip"

# Load BioCLIP together with its train/val preprocessing pipelines.
# Only `preprocess_val` is used for zero-shot inference below.
model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms(BIOCLIP_HUB_ID)
tokenizer = open_clip.get_tokenizer(BIOCLIP_HUB_ID)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# open_clip's README notes that models come back in train mode by default;
# switch to eval mode so dropout / stochastic-depth / BatchNorm layers (if any)
# behave deterministically during inference.
model.eval()

# Zero-shot evaluation only: freeze every weight so no gradients are tracked.
for param in model.parameters():
    param.requires_grad = False
    

In [4]:
# Root directory of the dataset; image paths in the CSV are relative to it.
DATA_ROOT = "/fs/ess/PAS2136/Rayeed/Carabids-100K-V2/"
# Number of leading rows to evaluate (set to None to run on the full dataset).
N_SAMPLES = 100

# The metadata file is tab-separated despite its .csv extension.
file_path = DATA_ROOT + "AllCarabids.csv"
insect_df = pd.read_csv(file_path, sep="\t").rename(
    columns={"filepath": "ImageFilePath", "class": "ScientificName"}
)
insect_df["ImageFilePath"] = DATA_ROOT + insect_df["ImageFilePath"]

# Keep only the columns needed here; missing values become the "Unknown" placeholder.
df = insect_df[["ImageFilePath", "ScientificName"]].fillna("Unknown")

# Candidate class names for zero-shot classification. The comparison is
# case-insensitive so that both the "Unknown" fill value above and any
# lower-case "unknown" labels in the data are excluded from the candidates
# (the original `!= "unknown"` test let the "Unknown" placeholder through).
is_labelled = df["ScientificName"].str.lower() != "unknown"
species_list = df.loc[is_labelled, "ScientificName"].unique().tolist()
text_inputs = tokenizer(species_list).to(device)

# Evaluate on a small subset. `.copy()` makes it an independent frame so that
# columns can be added later without chained-assignment problems.
df = df.iloc[:N_SAMPLES].copy()

df.head(2)

Unnamed: 0,ImageFilePath,ScientificName
0,/fs/ess/PAS2136/Rayeed/Carabids-100K-V2/Insect...,Cicindela sexguttata
1,/fs/ess/PAS2136/Rayeed/Carabids-100K-V2/Insect...,Cicindela sexguttata


In [None]:



# Zero-shot species classification: each image is assigned the candidate name
# in `species_list` whose text embedding has the highest cosine similarity to
# the image embedding.

# `assign` returns a new frame, so the prediction column is never written onto
# a slice of another DataFrame; re-running the cell simply resets the column.
df = df.assign(PredSpecies=None)

# The text embeddings do not depend on the image, so encode and L2-normalise
# them once instead of once per image.
with torch.no_grad():
    text_features = model.encode_text(text_inputs)
text_features /= text_features.norm(dim=-1, keepdim=True)

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing Images"):

    image_path = row["ImageFilePath"]

    # Missing files keep PredSpecies = None and therefore count as incorrect
    # in the accuracy computed below.
    if not os.path.exists(image_path):
        print(f"File not found: {image_path}")
        continue

    # The context manager closes the file handle as soon as pixels are decoded.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    image_input = preprocess_val(image).unsqueeze(0).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image_input)
    image_features /= image_features.norm(dim=-1, keepdim=True)

    # Both sides are unit-normalised, so the dot product is the cosine similarity.
    similarity = (image_features @ text_features.T).squeeze(0)
    best_match_idx = similarity.argmax().item()
    predicted_species = species_list[best_match_idx]

    df.at[idx, "PredSpecies"] = predicted_species


# Score only rows with a real label. The match is case-insensitive so the
# "Unknown" placeholder used for missing labels is excluded as well as any
# lower-case "unknown" entries.
df_filtered = df[df["ScientificName"].str.lower() != "unknown"]

correct_predictions = (df_filtered["PredSpecies"] == df_filtered["ScientificName"]).sum()

total_images = len(df_filtered)

# Guard against an empty evaluation set.
accuracy = correct_predictions / total_images if total_images > 0 else 0

print(f"BioCLIP Zero-Shot SPECIES Classification Accuracy: {accuracy*100:.4f}% ({correct_predictions}/{total_images})")



Processing Images:   0%|          | 0/100 [00:00<?, ?it/s]

In [None]:

# df.to_csv('/users/PAS2136/rayees/3. Benchmarking/AllCarabids/0.BioCLIP-zero-shot-species_overall.csv', index=False)
