In [1]:
import sys
sys.path.insert(0, '../../wildlife-tools')
sys.path.insert(0, '..')

import pandas as pd
from prepare_wildlife_reid_10k import *
from wildlife_datasets import splits

In [2]:
# Registry of the datasets to process: dataset name -> function that prepares it.
# The name doubles as the sub-folder name used by the cells below, and the
# insertion order here is the order in which the datasets are processed.
prepare_functions = dict(
    AAUZebraFish=prepare_aau_zebrafish,
    AerialCattle2017=prepare_aerial_cattle_2017,
    ATRW=prepare_atrw,
    BelugaID=prepare_beluga_id,
    BirdIndividualID=prepare_bird_individual_id,
    CatIndividualImages=prepare_cat_individual_images,
    CowDataset=prepare_cow_dataset,
    Cows2021=prepare_cows2021,
    CTai=prepare_ctai,
    CZoo=prepare_czoo,
    DogFaceNet=prepare_dog_facenet,
    FriesianCattle2015=prepare_friesian_cattle_2015,
    FriesianCattle2017=prepare_friesian_cattle_2017,
    Giraffes=prepare_giraffes,
    GiraffeZebraID=prepare_giraffe_zebra_id,
    HyenaID2022=prepare_hyena_id_2022,
    IPanda50=prepare_ipanda_50,
    LeopardID2022=prepare_leopard_id_2022,
    MPDD=prepare_mpdd,
    NDD20=prepare_ndd20,
    NyalaData=prepare_nyala_data,
    OpenCows2020=prepare_open_cows_2020,
    PolarBearVidID=prepare_polar_bear_vidid,
    SealID=prepare_seal_id,
    SeaStarReID2023=prepare_sea_star_reid_2023,
    SeaTurtleID2022=prepare_sea_turtle_id_2022,
    SMALST=prepare_smalst,
    StripeSpotter=prepare_stripe_spotter,
    WhaleSharkID=prepare_whaleshark_id,
    ZindiTurtleRecall=prepare_zindi_turtle_recall,
)

In [3]:
# Maps the species labels found in the individual datasets' metadata (keys) to
# the coarser, unified species names written to the combined metadata (values).
# Keys must match the source labels exactly; a label missing from this table
# raises a KeyError when the combined metadata is built.
species_conversion = {
    'Anthenea australiae': 'sea star',
    'Asteria rubens': 'sea star',
    'BND': 'dolphin',  # fixed typo: was 'doplhin'
    'Friesian cattle': 'cow',
    'WBD': 'dolphin',  # fixed typo: was 'doplhin'
    'amur tiger': 'tiger',
    'beluga whale': 'whale',
    'cat': 'cat',
    'chimpanzee': 'chimpanzee',
    'cow': 'cow',
    'dog': 'dog',
    'giraffe': 'giraffe',
    'giraffe_masai': 'giraffe',
    'great panda': 'panda',
    'great_tits': 'bird',
    'leopard': 'leopard',
    'loggerhead turtle': 'sea turtle',
    'nyala': 'nyala',
    'polar bear': 'polar bear',
    'ringed seal': 'seal',
    'sea turtle': 'sea turtle',
    'sociable_weavers': 'bird',
    'spotted hyena': 'hyena',
    'whale shark': 'shark',
    'zebra': 'zebra',
    'zebra_finch': 'bird',  # a zebra finch is a bird; was wrongly mapped to 'zebra'
    'zebra_plains': 'zebra',
    'zebrafish': 'fish',
}

In [4]:
# --- Configuration ---------------------------------------------------------
# Folder holding the individual source datasets, one sub-folder per dataset.
datasets_folder = '/data/wildlife_datasets/data'

# Output locations for the combined dataset: images and per-dataset metadata
# live in two sibling sub-folders of `new_root`.
new_root = os.path.join(datasets_folder, 'WildlifeReID-10k')
new_root_images, new_root_metadata = (
    os.path.join(new_root, subfolder) for subfolder in ('images', 'metadata')
)

# Options forwarded unchanged to every prepare function.
size = None
copy_files = False

# Prefix handed to the feature-based resplit as `save_clusters_prefix`.
save_clusters_prefix = 'clusters/cluster'

In [5]:
# Run every registered prepare function and store the metadata it returns as
# <new_root_metadata>/<dataset name>/metadata.csv.
for name, prepare in prepare_functions.items():
    print(name)

    # The output folder must exist before the CSV is written below.
    os.makedirs(f'{new_root_metadata}/{name}/', exist_ok=True)

    source_root = f'{datasets_folder}/{name}'
    target_root = f'{new_root_images}/{name}'
    metadata = prepare(
        size=size,
        root=source_root,
        new_root=target_root,
        copy_files=copy_files,
    )

    metadata.to_csv(f'{new_root_metadata}/{name}/metadata.csv', index=False)

AAUZebraFish


100%|████████████████████████████████████████████████████████| 6672/6672 [00:00<00:00, 11448.16it/s]


AerialCattle2017


100%|████████████████████████████████████████████████████████| 4700/4700 [00:00<00:00, 12550.18it/s]


ATRW


100%|████████████████████████████████████████████████████████| 5415/5415 [00:00<00:00, 11703.71it/s]


BelugaID


100%|████████████████████████████████████████████████████████| 8559/8559 [00:00<00:00, 14166.81it/s]


BirdIndividualID


100%|██████████████████████████████████████████████████████| 52274/52274 [00:02<00:00, 20329.63it/s]


CatIndividualImages


100%|██████████████████████████████████████████████████████| 13021/13021 [00:01<00:00, 12426.98it/s]


CowDataset


100%|████████████████████████████████████████████████████████| 1485/1485 [00:00<00:00, 12244.23it/s]


Cows2021


100%|████████████████████████████████████████████████████████| 8670/8670 [00:00<00:00, 12102.48it/s]


CTai


100%|████████████████████████████████████████████████████████| 4662/4662 [00:00<00:00, 11679.32it/s]


CZoo


100%|████████████████████████████████████████████████████████| 2109/2109 [00:00<00:00, 12297.89it/s]


DogFaceNet


100%|████████████████████████████████████████████████████████| 8363/8363 [00:00<00:00, 12497.86it/s]


FriesianCattle2015


100%|██████████████████████████████████████████████████████████| 193/193 [00:00<00:00, 12229.22it/s]


FriesianCattle2017


100%|██████████████████████████████████████████████████████████| 940/940 [00:00<00:00, 12245.53it/s]


Giraffes


100%|████████████████████████████████████████████████████████| 1393/1393 [00:00<00:00, 20241.98it/s]


GiraffeZebraID


100%|████████████████████████████████████████████████████████| 6925/6925 [00:00<00:00, 14333.04it/s]


HyenaID2022


100%|████████████████████████████████████████████████████████| 3129/3129 [00:00<00:00, 13437.65it/s]


IPanda50


100%|████████████████████████████████████████████████████████| 6874/6874 [00:00<00:00, 20419.69it/s]


LeopardID2022


100%|████████████████████████████████████████████████████████| 6806/6806 [00:00<00:00, 13534.48it/s]


MPDD


100%|████████████████████████████████████████████████████████| 1657/1657 [00:00<00:00, 11913.23it/s]


NDD20


100%|████████████████████████████████████████████████████████| 2657/2657 [00:00<00:00, 13279.85it/s]


NyalaData


100%|████████████████████████████████████████████████████████| 1942/1942 [00:00<00:00, 11994.74it/s]


OpenCows2020


100%|████████████████████████████████████████████████████████| 4736/4736 [00:00<00:00, 20419.43it/s]


PolarBearVidID


100%|██████████████████████████████████████████████████████| 13918/13918 [00:01<00:00, 12607.13it/s]


SealID


100%|████████████████████████████████████████████████████████| 2080/2080 [00:00<00:00, 11898.98it/s]

SeaStarReID2023



100%|████████████████████████████████████████████████████████| 2187/2187 [00:00<00:00, 20241.10it/s]


SeaTurtleID2022


100%|████████████████████████████████████████████████████████| 8729/8729 [00:00<00:00, 13891.41it/s]


SMALST


100%|██████████| 12850/12850 [00:00<00:00, 26450.90it/s]


StripeSpotter


100%|██████████████████████████████████████████████████████████| 820/820 [00:00<00:00, 22207.70it/s]


WhaleSharkID


100%|████████████████████████████████████████████████████████| 7693/7693 [00:00<00:00, 14318.21it/s]


ZindiTurtleRecall


100%|██████████████████████████████████████████████████████| 12803/12803 [00:00<00:00, 21016.41it/s]


In [6]:
# Load the precomputed feature array and rescale each row to unit L2 norm.
# NOTE(review): assumes `features` is 2-D (one row per image) - confirm.
features = np.load('features/features_dino.npy')
for feature_row in features:
    # Iterating a multi-dimensional array yields views, so the in-place
    # division writes straight back into `features`.
    feature_row /= np.linalg.norm(feature_row)

In [7]:
# Create a train/test split for every dataset and store it as metadata2.csv.
#
# `features` is a single array loaded once for all datasets, whereas each
# metadata table is read from its own CSV and therefore gets a fresh 0-based
# index. Indexing `features[metadata.index]` directly would pick the first
# len(metadata) feature rows for every dataset, i.e. rows that belong to the
# first dataset. Each dataset's rows are therefore selected by shifting its
# index by the number of rows consumed by the datasets before it.
# NOTE(review): this assumes the feature rows are stored dataset by dataset in
# the order of `prepare_functions`, matching the row order of metadata.csv -
# confirm against the code that produced the feature file. The row-count check
# below fails loudly when features and metadata cannot be aligned this way.
metadata_per_dataset = {
    name: pd.read_csv(f'{new_root_metadata}/{name}/metadata.csv')
    for name in prepare_functions
}
n_images_total = sum(len(metadata) for metadata in metadata_per_dataset.values())
if n_images_total != len(features):
    raise ValueError(
        f'features has {len(features)} rows but the metadata files list '
        f'{n_images_total} images; features cannot be aligned with the datasets.'
    )

offset = 0  # number of feature rows belonging to the datasets already processed
for name, metadata in metadata_per_dataset.items():
    print(name)
    features_dataset = features[metadata.index + offset]
    offset += len(metadata)

    # Initial split, then a resplit that additionally uses the image features;
    # the clusters it finds are saved under the given prefix.
    splitter = splits.OpenSetSplit(0.8, 0.1, seed=666)
    idx_train0, idx_test0 = splitter.split(metadata)[0]
    idx_train, idx_test = splitter.resplit_by_features(metadata, features_dataset, idx_train0, save_clusters_prefix=f'{save_clusters_prefix}_{name}')

    # The splitter returns positions; translate them to index labels for .loc.
    metadata.loc[metadata.index[idx_train], 'split'] = 'train'
    metadata.loc[metadata.index[idx_test], 'split'] = 'test'

    metadata.to_csv(f'{new_root_metadata}/{name}/metadata2.csv', index=False)

AAUZebraFish


100%|██████████| 6/6 [00:01<00:00,  3.24it/s]


AerialCattle2017


100%|██████████| 23/23 [00:04<00:00,  5.74it/s]


ATRW


100%|██████████| 182/182 [00:23<00:00,  7.64it/s]


BelugaID


100%|██████████| 788/788 [00:47<00:00, 16.60it/s]


BirdIndividualID


100%|██████████| 50/50 [00:24<00:00,  2.07it/s]


CatIndividualImages


100%|██████████| 509/509 [01:28<00:00,  5.76it/s]


CowDataset


100%|██████████| 13/13 [00:01<00:00,  6.87it/s]


Cows2021


100%|██████████| 179/179 [00:29<00:00,  6.05it/s]


CTai


100%|██████████| 71/71 [00:09<00:00,  7.49it/s]


CZoo


100%|██████████| 24/24 [00:03<00:00,  6.75it/s]


DogFaceNet


100%|██████████| 1393/1393 [00:44<00:00, 31.09it/s]


FriesianCattle2015


100%|██████████| 25/25 [00:00<00:00, 33.77it/s]


FriesianCattle2017


100%|██████████| 89/89 [00:04<00:00, 20.03it/s]


Giraffes


100%|██████████| 178/178 [00:06<00:00, 28.27it/s]


GiraffeZebraID


100%|██████████| 2056/2056 [00:39<00:00, 51.49it/s] 


HyenaID2022


100%|██████████| 256/256 [00:13<00:00, 19.54it/s]


IPanda50


100%|██████████| 50/50 [00:07<00:00,  6.33it/s]


LeopardID2022


100%|██████████| 430/430 [00:17<00:00, 24.80it/s]


MPDD


100%|██████████| 191/191 [00:07<00:00, 26.78it/s]


NDD20


100%|██████████| 82/82 [00:12<00:00,  6.75it/s]


NyalaData


100%|██████████| 237/237 [00:10<00:00, 22.43it/s]


OpenCows2020


100%|██████████| 46/46 [00:07<00:00,  6.02it/s]


PolarBearVidID


100%|██████████| 13/13 [00:05<00:00,  2.56it/s]


SealID


100%|██████████| 57/57 [00:08<00:00,  6.70it/s]


SeaStarReID2023


100%|██████████| 95/95 [00:17<00:00,  5.56it/s]


SeaTurtleID2022


100%|██████████| 438/438 [00:47<00:00,  9.20it/s]


SMALST


100%|██████████| 10/10 [00:04<00:00,  2.36it/s]


StripeSpotter


100%|██████████| 45/45 [00:03<00:00, 14.33it/s]


WhaleSharkID


100%|██████████| 543/543 [00:36<00:00, 15.08it/s]


ZindiTurtleRecall


100%|██████████| 2265/2265 [01:15<00:00, 29.84it/s]


In [8]:
# Merge the per-dataset metadata (with splits) into one catalogue.
results = []
for name in prepare_functions:
    metadata = pd.read_csv(f'{new_root_metadata}/{name}/metadata2.csv')
    n_images = len(metadata)

    # Prefix ids, identities and paths with the dataset name so they stay
    # distinct across datasets, and translate species to the unified names.
    df = metadata.assign(
        image_id=[f'{name}_{str(i).zfill(6)}' for i in range(n_images)],
        dataset=name,
        identity=name + '_' + metadata['identity'].astype(str),
        path=name + '/' + metadata['path'],
        species=metadata['species'].apply(lambda x: species_conversion[x]),
    )
    results.append(df)

combined_all = pd.concat(results).reset_index(drop=True)

# Parse the non-missing dates, keeping only their first ten characters
# (the YYYY-MM-DD part); missing dates are left untouched.
idx = combined_all.index[combined_all['date'].notnull()]
day_strings = combined_all.loc[idx, 'date'].astype(str).str[:10]
combined_all.loc[idx, 'date'] = pd.to_datetime(day_strings, format='%Y-%m-%d')

# Collapse synonymous orientation labels onto a single name each.
orientation_aliases = {'below': 'down', 'up': 'top', 'above': 'top'}
combined_all['orientation'] = combined_all['orientation'].replace(orientation_aliases)

# Let the dataset factory finalize the catalogue, then write the result.
d = datasets.DatasetFactory(new_root_images, df=combined_all)
combined_all = d.finalize_catalogue(d.df)
combined_all.to_csv(f'{new_root}/metadata.csv', index=False)