In [1]:
import os
import sys

sys.path.insert(0, '../../wildlife-tools')
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
from IPython.display import display
from prepare_wildlife_reid_10k import *
from wildlife_datasets import datasets, splits

In [2]:
# Registry of the per-dataset preparation routines, keyed by dataset name.
# The name doubles as the dataset's sub-folder, and the insertion order is the
# order in which the loops in this notebook process the datasets.
prepare_functions = dict(
    AAUZebraFish=prepare_aau_zebrafish,
    AerialCattle2017=prepare_aerial_cattle_2017,
    ATRW=prepare_atrw,
    BelugaID=prepare_beluga_id,
    BirdIndividualID=prepare_bird_individual_id,
    CatIndividualImages=prepare_cat_individual_images,
    CowDataset=prepare_cow_dataset,
    Cows2021=prepare_cows2021,
    CTai=prepare_ctai,
    CZoo=prepare_czoo,
    DogFaceNet=prepare_dog_facenet,
    FriesianCattle2015=prepare_friesian_cattle_2015,
    FriesianCattle2017=prepare_friesian_cattle_2017,
    Giraffes=prepare_giraffes,
    GiraffeZebraID=prepare_giraffe_zebra_id,
    HyenaID2022=prepare_hyena_id_2022,
    IPanda50=prepare_ipanda_50,
    LeopardID2022=prepare_leopard_id_2022,
    MPDD=prepare_mpdd,
    NDD20=prepare_ndd20,
    NyalaData=prepare_nyala_data,
    OpenCows2020=prepare_open_cows_2020,
    PolarBearVidID=prepare_polar_bear_vidid,
    SealID=prepare_seal_id,
    SeaStarReID2023=prepare_sea_star_reid_2023,
    SeaTurtleID2022=prepare_sea_turtle_id_2022,
    SMALST=prepare_smalst,
    StripeSpotter=prepare_stripe_spotter,
    WhaleSharkID=prepare_whaleshark_id,
    ZindiTurtleRecall=prepare_zindi_turtle_recall,
)

In [3]:
# Maps the species label found in each source dataset's metadata (keys) to the
# coarse species name written to the combined metadata (values).
# Keys must match the source labels exactly, so they are left as they appear
# in the datasets; only the output names on the right are normalised.
species_conversion = {
    'Anthenea australiae': 'sea star',
    'Asteria rubens': 'sea star',
    'BND': 'dolphin',
    'Friesian cattle': 'cow',
    'WBD': 'dolphin',
    'amur tiger': 'tiger',
    'beluga whale': 'whale',
    'cat': 'cat',
    'chimpanzee': 'chimpanzee',
    'cow': 'cow',
    'dog': 'dog',
    'giraffe': 'giraffe',
    'giraffe_masai': 'giraffe',
    'great panda': 'panda',
    'great_tits': 'bird',
    'leopard': 'leopard',
    'loggerhead turtle': 'sea turtle',
    'nyala': 'nyala',
    'polar bear': 'polar bear',
    'ringed seal': 'seal',
    'sea turtle': 'sea turtle',
    'sociable_weavers': 'bird',
    'spotted hyena': 'hyena',
    'whale shark': 'shark',
    'zebra': 'zebra',
    # The zebra finch is a bird, grouped with the other bird labels.
    'zebra_finch': 'bird',
    'zebra_plains': 'zebra',
    'zebrafish': 'fish',
}

In [4]:
# Root folder that holds one sub-folder per source dataset. Set the
# WILDLIFE_DATASETS_ROOT environment variable (e.g. to '../data') to point the
# notebook at another location without editing it; the default is unchanged.
datasets_folder = os.environ.get('WILDLIFE_DATASETS_ROOT', '/data/wildlife_datasets/data')

# Output layout of the combined dataset: <root>/WildlifeReID-10k/{images,metadata}.
new_root = os.path.join(datasets_folder, 'WildlifeReID-10k')
new_root_images = os.path.join(new_root, 'images')
new_root_metadata = os.path.join(new_root, 'metadata')

# Both values are forwarded unchanged to every prepare_* function.
# NOTE(review): presumably `size` is a resize target (None = keep original) and
# `copy_files` toggles writing the images to new_root_images; confirm in
# prepare_wildlife_reid_10k.
size = None
copy_files = False

In [5]:
# Run every registered preparation routine and store the metadata table it
# returns as <new_root_metadata>/<dataset name>/metadata.csv.
for name, prepare in prepare_functions.items():
    print(name)
    metadata_dir = f'{new_root_metadata}/{name}'
    os.makedirs(f'{metadata_dir}/', exist_ok=True)
    metadata = prepare(
        size=size,
        root=f'{datasets_folder}/{name}',
        new_root=f'{new_root_images}/{name}',
        copy_files=copy_files,
    )
    metadata.to_csv(f'{metadata_dir}/metadata.csv', index=False)

AAUZebraFish


100%|████████████████████████████████████████████████████████| 6672/6672 [00:00<00:00, 13217.33it/s]


AerialCattle2017


100%|██████████████████████████████████████████████████████| 46340/46340 [00:03<00:00, 14615.25it/s]


ATRW


100%|████████████████████████████████████████████████████████| 5415/5415 [00:00<00:00, 13406.79it/s]


BelugaID


100%|████████████████████████████████████████████████████████| 8559/8559 [00:00<00:00, 14936.32it/s]


BirdIndividualID


100%|██████████████████████████████████████████████████████| 52274/52274 [00:02<00:00, 25025.32it/s]


CatIndividualImages


100%|██████████████████████████████████████████████████████| 13021/13021 [00:00<00:00, 14761.15it/s]


CowDataset


100%|████████████████████████████████████████████████████████| 1485/1485 [00:00<00:00, 14408.18it/s]


Cows2021


100%|████████████████████████████████████████████████████████| 8670/8670 [00:00<00:00, 13750.18it/s]


CTai


100%|████████████████████████████████████████████████████████| 4662/4662 [00:00<00:00, 13662.37it/s]


CZoo


100%|████████████████████████████████████████████████████████| 2109/2109 [00:00<00:00, 14317.21it/s]


DogFaceNet


100%|████████████████████████████████████████████████████████| 8363/8363 [00:00<00:00, 14381.79it/s]


FriesianCattle2015


100%|██████████████████████████████████████████████████████████| 193/193 [00:00<00:00, 14076.31it/s]


FriesianCattle2017


100%|██████████████████████████████████████████████████████████| 940/940 [00:00<00:00, 14258.29it/s]


Giraffes


100%|████████████████████████████████████████████████████████| 1393/1393 [00:00<00:00, 24536.64it/s]

GiraffeZebraID



100%|████████████████████████████████████████████████████████| 6925/6925 [00:00<00:00, 15249.29it/s]


HyenaID2022


100%|████████████████████████████████████████████████████████| 3129/3129 [00:00<00:00, 15222.14it/s]


IPanda50


100%|████████████████████████████████████████████████████████| 6874/6874 [00:00<00:00, 25019.37it/s]


LeopardID2022


100%|████████████████████████████████████████████████████████| 6806/6806 [00:00<00:00, 15296.44it/s]


MPDD


100%|████████████████████████████████████████████████████████| 1657/1657 [00:00<00:00, 13550.17it/s]


NDD20


100%|████████████████████████████████████████████████████████| 2657/2657 [00:00<00:00, 15594.37it/s]


NyalaData


100%|████████████████████████████████████████████████████████| 1942/1942 [00:00<00:00, 13682.58it/s]


OpenCows2020


100%|████████████████████████████████████████████████████████| 4736/4736 [00:00<00:00, 25251.92it/s]


PolarBearVidID


100%|██████████████████████████████████████████████████████| 13918/13918 [00:00<00:00, 14677.46it/s]


SealID


100%|████████████████████████████████████████████████████████| 2080/2080 [00:00<00:00, 13405.36it/s]


SeaStarReID2023


100%|████████████████████████████████████████████████████████| 2187/2187 [00:00<00:00, 24506.67it/s]


SeaTurtleID2022


100%|████████████████████████████████████████████████████████| 8729/8729 [00:00<00:00, 14824.32it/s]


SMALST


100%|██████████| 12850/12850 [00:00<00:00, 33110.84it/s]


StripeSpotter


100%|██████████████████████████████████████████████████████████| 820/820 [00:00<00:00, 24092.87it/s]


WhaleSharkID


100%|████████████████████████████████████████████████████████| 7693/7693 [00:00<00:00, 15063.85it/s]


ZindiTurtleRecall


100%|██████████████████████████████████████████████████████| 12803/12803 [00:00<00:00, 24104.21it/s]


In [6]:
# Add a 'split' column (train/test) to each dataset's metadata and save the
# result next to the input table as metadata2.csv.
for name in prepare_functions:
    dataset_dir = f'{new_root_metadata}/{name}'
    metadata = pd.read_csv(f'{dataset_dir}/metadata.csv')
    if name == 'NDD20':
        # NDD20 identities are cast to strings before splitting.
        metadata['identity'] = metadata['identity'].astype(str)

    # A fresh splitter with the same fixed seed is created for every dataset;
    # only the first split it returns is used.
    splitter = splits.ClosedSetSplit(0.8, seed=666)
    idx_train, idx_test = splitter.split(metadata)[0]

    for label, selected in (('train', idx_train), ('test', idx_test)):
        metadata.loc[metadata.index[selected], 'split'] = label

    metadata.to_csv(f'{dataset_dir}/metadata2.csv', index=False)

In [7]:
# Merge the per-dataset tables into one catalogue. Identities, paths and image
# ids are prefixed with the dataset name, and species labels are translated
# through species_conversion (an unknown label raises KeyError).
results = []
for name in prepare_functions:
    metadata = pd.read_csv(f'{new_root_metadata}/{name}/metadata2.csv')

    df = metadata.assign(
        image_id=[f'{name}_{i:06d}' for i in range(len(metadata))],
        dataset=name,
        identity=name + '_' + metadata['identity'].astype(str),
        path=name + '/' + metadata['path'],
        species=metadata['species'].apply(lambda label: species_conversion[label]),
    )
    results.append(df)

combined_all = pd.concat(results)
# The concatenated table is passed through DatasetFactory.finalize_catalogue
# before it is written out as the combined metadata file.
d = datasets.DatasetFactory(new_root_images, df=combined_all)
combined_all = d.finalize_catalogue(d.df)
combined_all.to_csv(f'{new_root}/metadata.csv', index=False)

In [8]:
def compare_series(ser1, ser2, name=''):
    """Report whether two series hold the same values in the same order.

    Only the values are compared; the indices of the two series are ignored.
    Positions where both series hold a missing value count as equal, because
    a plain ``!=`` would flag them (NaN never equals NaN).

    Args:
        ser1: First pandas Series.
        ser2: Second pandas Series.
        name: Label used in the printed message.

    Returns:
        True if both series have the same length and the same values,
        False otherwise. A message is printed for every False result.
    """
    values1 = ser1.to_numpy()
    values2 = ser2.to_numpy()
    if len(values1) != len(values2):
        print(f'Series {name} have different lengths')
        return False

    # Missing-in-both is not a real difference.
    both_missing = pd.isna(values1) & pd.isna(values2)
    if np.any((values1 != values2) & ~both_missing):
        print(f'Series {name} differ at some values')
        return False
    return True

# Compare the freshly written combined metadata with the previously stored
# table in megadescriptor_split.csv, one dataset at a time.
df_old = pd.read_csv('megadescriptor_split.csv')
df_new = pd.read_csv(f'{new_root}/metadata.csv')

compared_columns = ['identity', 'path', 'split']
for dataset, df_old_part in df_old.groupby('dataset'):
    if dataset not in prepare_functions:
        continue
    print('    ' + dataset)
    # Only the new training rows are compared against the old table.
    # NOTE(review): presumably the old table contains training rows only; confirm.
    is_train_of_dataset = (df_new['dataset'] == dataset) & (df_new['split'] == 'train')
    df_new_part = df_new[is_train_of_dataset]

    for col in compared_columns:
        compare_series(df_old_part[col], df_new_part[col], name=col)

    AAUZebraFish
    ATRW
    AerialCattle2017
    BelugaID
Series identity have different lengths
Series path have different lengths
Series split have different lengths
    BirdIndividualID
    CTai
    CZoo
    Cows2021
Series identity have different lengths
Series path have different lengths
Series split have different lengths
    FriesianCattle2015
Series identity have different lengths
Series path have different lengths
Series split have different lengths
    FriesianCattle2017
    GiraffeZebraID
    Giraffes
    HyenaID2022
    IPanda50
Series path differ at some values
    LeopardID2022
    NDD20
    NyalaData
    OpenCows2020
    SMALST
    SealID
    StripeSpotter
    WhaleSharkID
    ZindiTurtleRecall


In [9]:
# For the two datasets handled here, list the rows whose paths differ between
# the old and the new table. Before comparing, each path is reduced to a
# suffix: its last three components for Giraffes, everything after the first
# two components for IPanda50.
path_suffix = {
    'Giraffes': lambda p: ('/').join(p.split('/')[-3:]),
    'IPanda50': lambda p: ('/').join(p.split('/')[2:]),
}

for dataset, df_old_part in df_old.groupby('dataset'):
    if dataset not in path_suffix:
        continue
    shorten = path_suffix[dataset]
    df_new_part = df_new[(df_new['dataset'] == dataset) & (df_new['split'] == 'train')]
    old_path = df_old_part['path'].apply(shorten)
    new_path = df_new_part['path'].apply(shorten)
    # Position-wise comparison; both tables must have the same number of rows.
    idx = old_path.to_numpy() != new_path.to_numpy()
    display(pd.DataFrame({'path_old': old_path.iloc[idx].values, 'path_new': new_path.iloc[idx].values}))

Unnamed: 0,path_old,path_new


Unnamed: 0,path_old,path_new
0,49_yuanrun/49_20150303-110300_س×ؤê_ش°بَ±كؤس...,49_yuanrun/49_20150303-110300_e__1655_00a704e4...
1,49_yuanrun/49_20150403-094000_幼年园_园润_找竹笋吃找到了奥莉...,49_yuanrun/49_20150403-094000____1890_0167f91b...
2,05_chengjiu/05_20160730-075820_1号别墅_成就屁股挂个竹子下水...,05_chengjiu/05_20160730-075820_1__125_094a0eef...
3,49_yuanrun/49_20150403-094000_幼年园_园润_找竹笋吃找到了奥莉...,49_yuanrun/49_20150403-094000____190_0a030ca1f...
4,05_chengjiu/05_20160730-075820_1号别墅_成就屁股挂个竹子下水...,05_chengjiu/05_20160730-075820_1__185_0d722b90...
...,...,...
91,05_chengjiu/05_20160728-112100_一号别墅_成就抱着冰玩_432...,05_chengjiu/05_20160728-112100___4320_ed957d9a...
92,12_jingjing/12_20180208-133750_幼儿园_晶亮走过来和晶晶一起抢...,12_jingjing/12_20180208-133750___1_ee86adf7c25...
93,24_qixi/24_20160704-151500_幼年园_七喜抬脚挠痒_1835_f3c...,24_qixi/24_20160704-151500___1835_f3caa9c0051c...
94,05_chengjiu/05_20160728-111100_一号别墅_成就玩冰超开心_26...,05_chengjiu/05_20160728-111100___2625_f5cb2f38...
