In [45]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import torch
import open_clip
from PIL import Image
from tqdm import tqdm
import numpy as np

In [46]:
train_image_path = '/local/scratch1/siam/dataset/plant_clef/train/images_max_side_800'
train_label_path = '/local/scratch1/siam/dataset/plant_clef/train/PlantCLEF2024singleplanttrainingdata.csv'

test_image_path = '/local/scratch1/siam/dataset/plant_clef/test/data/PlantCLEF/PlantCLEF2025/DataOut/test/package/images/'
test_label_path = '/local/scratch1/siam/dataset/plant_clef/test/data/PlantCLEF/PlantCLEF2025/DataOut/test/package/PlantCLEF2025_test.csv'

In [47]:
train_data = pd.read_csv(train_label_path, sep=';', dtype={'partner': str})
train_data.head()

Unnamed: 0,image_name,organ,species_id,obs_id,license,partner,author,altitude,latitude,longitude,gbif_species_id,species,genus,family,dataset,publisher,references,url,learn_tag,image_backup_url
0,59feabe1c98f06e7f819f73c8246bd8f1a89556b.jpg,leaf,1396710,1008726402,cc-by-sa,,Gulyás Bálint,205.9261,47.59216,19.362895,5284517.0,Taxus baccata L.,Taxus,Taxaceae,plantnet,plantnet,https://identify.plantnet.org/fr/k-southwester...,https://bs.plantnet.org/image/o/59feabe1c98f06...,train,https://lab.plantnet.org/LifeCLEF/PlantCLEF202...
1,dc273995a89827437d447f29a52ccac86f65476e.jpg,leaf,1396710,1008724195,cc-by-sa,,vadim sigaud,323.752,47.906703,7.201746,5284517.0,Taxus baccata L.,Taxus,Taxaceae,plantnet,plantnet,https://identify.plantnet.org/fr/k-southwester...,https://bs.plantnet.org/image/o/dc273995a89827...,train,https://lab.plantnet.org/LifeCLEF/PlantCLEF202...
2,416235e7023a4bd1513edf036b6097efc693a304.jpg,leaf,1396710,1008721908,cc-by-sa,,fil escande,101.316,48.826774,2.352774,5284517.0,Taxus baccata L.,Taxus,Taxaceae,plantnet,plantnet,https://identify.plantnet.org/fr/k-southwester...,https://bs.plantnet.org/image/o/416235e7023a4b...,train,https://lab.plantnet.org/LifeCLEF/PlantCLEF202...
3,cbd18fade82c46a5c725f1f3d982174895158afc.jpg,leaf,1396710,1008699177,cc-by-sa,,Desiree Verver,5.107,52.190427,6.009677,5284517.0,Taxus baccata L.,Taxus,Taxaceae,plantnet,plantnet,https://identify.plantnet.org/fr/k-southwester...,https://bs.plantnet.org/image/o/cbd18fade82c46...,train,https://lab.plantnet.org/LifeCLEF/PlantCLEF202...
4,f82c8c6d570287ebed8407cefcfcb2a51eaaf56e.jpg,leaf,1396710,1008683100,cc-by-sa,,branebrane,165.339,45.794739,15.965862,5284517.0,Taxus baccata L.,Taxus,Taxaceae,plantnet,plantnet,https://identify.plantnet.org/fr/k-southwester...,https://bs.plantnet.org/image/o/f82c8c6d570287...,train,https://lab.plantnet.org/LifeCLEF/PlantCLEF202...


In [49]:
train_data.columns

Index(['image_name', 'organ', 'species_id', 'obs_id', 'license', 'partner',
       'author', 'altitude', 'latitude', 'longitude', 'gbif_species_id',
       'species', 'genus', 'family', 'dataset', 'publisher', 'references',
       'url', 'learn_tag', 'image_backup_url'],
      dtype='object')

In [51]:
train_data.latitude.nunique(), train_data.longitude.nunique()

(573376, 577352)

In [41]:

class BioclipFeatureExtractor:
    def __init__(self):
        self.model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms('hf-hub:imageomics/bioclip-2')
        self.model.eval()
        self.tokenizer = open_clip.get_tokenizer('hf-hub:imageomics/bioclip-2')
        self.preprocess = preprocess_train
        self.device=torch.device('cuda')
        self.model.to(self.device)
        print('Bioclip-2 Model Loaded...')

    def extract_features(self, image_path: str) -> torch.Tensor or None:
        try:
            image = Image.open(image_path).convert("RGB")
            image_input = self.preprocess(image).unsqueeze(0).to(self.device)

            with torch.no_grad():
                image_features = self.model.encode_image(image_input)
                image_features /= image_features.norm(dim=-1, keepdim=True)
                return image_features.squeeze(0).cpu().numpy() # Returns a (D,) tensor

        except FileNotFoundError:
            print(f"Warning: Image file not found at {image_path}. Skipping.")
            return None
        except Exception as e:
            print(f"Error processing image {image_path}: {e}. Skipping.")
            return None

In [42]:
e = BioclipFeatureExtractor()
results = []

for index, row in tqdm(train_data.iterrows(), total=train_data.shape[0], desc="Extracting Features"):
    image_name = row['image_name']
    species_id = str(row['species_id']) # Ensure species_id is treated as a string for pathing

    image_path = os.path.join(train_image_path, species_id, image_name)

    # Extract the features
    embedding = e.extract_features(image_path)

    if embedding is not None:
        # Append the results to the list
        results.append({
            'image_name': image_name,
            'species_id': species_id,
            # The embedding is stored as a PyTorch tensor (or you can convert to numpy: .numpy())
            'embedding': embedding
        })

Bioclip-2 Model Loaded...


Extracting Features: 100%|██████████| 10/10 [00:00<00:00, 40.70it/s]


In [43]:
output_file = "/local/scratch1/siam/dataset/plant_clef/train/image_embeddings_bioclip2.pkl"

embeddings_df = pd.DataFrame(results)
embeddings_df.to_pickle(output_file)
print(f"\nResults saved to {output_file}")


Results saved to /local/scratch1/siam/dataset/plant_clef/train/image_embeddings_bioclip2.pkl


In [44]:
results[0].get('embedding')

array([ 7.03643709e-02,  2.74434369e-02,  7.84270000e-03, -5.97115457e-02,
        3.65341008e-02, -4.85910103e-02, -5.45230992e-02,  4.99459393e-02,
       -3.35297398e-02, -6.73415810e-02,  1.94030628e-02,  1.95874851e-02,
       -9.03022885e-02, -3.74756232e-02, -5.81429899e-03, -5.17720804e-02,
        1.45088192e-02,  3.53175849e-02, -5.57828089e-03,  4.42939214e-02,
       -1.24318823e-02, -5.14190011e-02, -1.95386242e-02, -5.82501702e-02,
        1.64649449e-02, -5.02707697e-02,  5.25078699e-02,  6.42766478e-03,
       -4.65457048e-03,  3.02999020e-02, -1.38870049e-02,  4.38496359e-02,
       -1.20341796e-02, -4.60539535e-02,  8.46351311e-03, -4.23313258e-03,
        2.78633256e-02, -5.18191047e-02,  4.65981215e-02,  3.62986652e-03,
       -1.03324866e-02,  6.72764033e-02, -4.22265381e-02,  1.85070187e-02,
        2.09582318e-02, -7.57100806e-02,  4.11311351e-02, -6.04162971e-03,
        8.46059714e-03, -2.05094088e-02,  5.31934798e-02, -2.27000546e-02,
        5.84642403e-03, -