In [1]:
import sys
sys.path.insert(0, '..')

import os
import pandas as pd
import torchvision.transforms as T
from wildlife_datasets.datasets import WildlifeReID10k
from wildlife_datasets.preparation import prepare_functions, species_conversion
from wildlife_datasets.splits import OpenSetSplit, DisjointSetSplit, extract_data_split

In [None]:
# ---------------------------------------------------------------------------
# Configuration shared by the cells below.
# ---------------------------------------------------------------------------

# Size handed to the resize transform; it is also part of the output folder name.
img_size = 384

# Folder with the raw datasets; the prepare loop reads `<root_datasets>/<name>`.
root_datasets = '/data/wildlife_datasets/data'

# Output layout: <root_datasets>/Datasets_<img_size>/{images,metadata}
root = os.path.join(root_datasets, f'Datasets_{img_size}')
root_images, root_metadata = (
    os.path.join(root, 'images'),
    os.path.join(root, 'metadata'),
)

transform = T.Resize(size=img_size) # Change to None to remove resizing
copy_files = True # Change to False for speed-up once the data have been copied

# Datasets to process. The loops over `prepare_functions` only handle the
# entries listed here.
names = [
    'ATRW', 'BelugaID', 'CTai', 'ELPephants', 'Giraffes', 'GiraffeZebraID',
    'HumpbackWhaleID', 'HyenaID2022', 'IPanda50', 'LeopardID2022',
    'MacaqueFaces', 'NyalaData', 'OpenCows2020', 'SealID', 'SeaTurtleID2022',
    'StripeSpotter', 'WhaleSharkID', 'ZindiTurtleRecall',
]

# Forwarded unchanged to every prepare function as `remove_str`.
# NOTE(review): presumably substrings stripped from file names - confirm
# against the prepare functions.
remove_str = ['[', ']']

In [3]:
# Run the prepare function of every selected dataset and store the metadata
# it returns as `<root_metadata>/<name>/metadata.csv`.
for name, prepare in prepare_functions.items():
    if name not in names:
        continue
    print(name)
    os.makedirs(f'{root_metadata}/{name}/', exist_ok=True)
    metadata_part = prepare(
        f'{root_datasets}/{name}',   # source folder of the raw dataset
        f'{root_images}/{name}',     # target folder for this dataset's images
        transform=transform,
        copy_files=copy_files,
        remove_str=remove_str,
    )
    metadata_part.to_csv(f'{root_metadata}/{name}/metadata.csv', index=False)

ATRW


  0%|                                                                      | 0/5415 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████| 5415/5415 [00:00<00:00, 19731.07it/s]


BelugaID


100%|████████████████████████████████████████████████████████| 8559/8559 [00:00<00:00, 23318.84it/s]


CTai


100%|████████████████████████████████████████████████████████| 5078/5078 [00:00<00:00, 19480.88it/s]


ELPephants


100%|████████████████████████████████████████████████████████| 2078/2078 [00:00<00:00, 22039.56it/s]


Giraffes


100%|████████████████████████████████████████████████████████| 1393/1393 [00:00<00:00, 29651.88it/s]


GiraffeZebraID


100%|████████████████████████████████████████████████████████| 6925/6925 [00:00<00:00, 23483.51it/s]


HumpbackWhaleID


100%|██████████████████████████████████████████████████████| 15697/15697 [00:00<00:00, 30043.78it/s]


HyenaID2022


100%|████████████████████████████████████████████████████████| 3129/3129 [00:00<00:00, 21701.17it/s]


IPanda50


100%|████████████████████████████████████████████████████████| 6874/6874 [00:00<00:00, 29115.32it/s]


LeopardID2022


100%|████████████████████████████████████████████████████████| 6806/6806 [00:00<00:00, 21673.18it/s]


MacaqueFaces


100%|████████████████████████████████████████████████████████| 6280/6280 [00:00<00:00, 21287.75it/s]


NyalaData


100%|████████████████████████████████████████████████████████| 1942/1942 [00:00<00:00, 20811.26it/s]


OpenCows2020


100%|████████████████████████████████████████████████████████| 4736/4736 [00:00<00:00, 29373.61it/s]


SealID


100%|████████████████████████████████████████████████████████| 2080/2080 [00:00<00:00, 19750.95it/s]


SeaTurtleID2022


100%|████████████████████████████████████████████████████████| 8729/8729 [00:00<00:00, 22971.59it/s]


StripeSpotter


100%|██████████████████████████████████████████████████████████| 820/820 [00:00<00:00, 33633.18it/s]


WhaleSharkID


100%|████████████████████████████████████████████████████████| 7693/7693 [00:00<00:00, 23072.73it/s]


ZindiTurtleRecall


100%|██████████████████████████████████████████████████████| 12803/12803 [00:00<00:00, 31213.30it/s]


In [4]:
# Merge the per-dataset metadata files into one table, assign every image to
# the 'train', 'database' or 'query' split, and write the result to
# `<root>/metadata.csv`.
metadata_parts = []
for name in prepare_functions:
    if name not in names:
        continue
    metadata_part = pd.read_csv(f'{root_metadata}/{name}/metadata.csv')
    metadata_part['dataset'] = name
    # Prefix identities and image paths with the dataset name.
    metadata_part['identity'] = name + '_' + metadata_part['identity'].astype(str)
    metadata_part['path'] = 'images/' + name + '/' + metadata_part['path']
    # Direct dictionary lookup: a species missing from `species_conversion`
    # raises KeyError instead of silently becoming NaN.
    metadata_part['species'] = metadata_part['species'].apply(lambda x: species_conversion[x])

    # Compute split in two stages, both with a fixed seed:
    #   1) all images     -> (database_full, query)
    #   2) database_full  -> (train, database)
    splitter1 = OpenSetSplit(ratio_train=0.8, ratio_class_test=0.1, seed=666, open_in_test=False)
    idx_database_full, idx_query = splitter1.split(metadata_part)[0]
    splitter2 = DisjointSetSplit(ratio_class_test=0.2, seed=666)
    # The returned indices are used as index labels below (`.loc[...]`), so the
    # subset passed to the second splitter is selected by label as well.
    idx_train, idx_database = splitter2.split(metadata_part.loc[idx_database_full])[0]

    # Sanity checks: the three parts must cover every row and be pairwise disjoint.
    set_train, set_database, set_query = set(idx_train), set(idx_database), set(idx_query)
    if set_train | set_database | set_query != set(metadata_part.index):
        raise Exception('The split does not cover all images')
    for part_a, part_b in (
        (set_train, set_database),
        (set_database, set_query),
        (set_train, set_query),
    ):
        if part_a & part_b:
            raise Exception('Intersection is non-empty')

    metadata_part.loc[idx_train, 'split'] = 'train'
    metadata_part.loc[idx_database, 'split'] = 'database'
    metadata_part.loc[idx_query, 'split'] = 'query'
    metadata_parts.append(metadata_part)

metadata = pd.concat(metadata_parts).reset_index(drop=True)

# Replace the per-dataset image ids with a running id over the merged table.
# Dropping first and re-adding keeps 'image_id' as the last column.
metadata = metadata.drop('image_id', axis=1)
metadata['image_id'] = range(len(metadata))

# Rewrite the non-missing dates: only the first 10 characters are parsed
# (as '%Y-%m-%d') and the result is stored back as a string.
idx = metadata.index[~metadata['date'].isnull()]
metadata.loc[idx, 'date'] = pd.to_datetime(metadata.loc[idx, 'date'].astype(str).str[:10], format='%Y-%m-%d').astype(str)

# Map orientation labels onto a common vocabulary.
metadata['orientation'] = metadata['orientation'].replace({'below': 'down', 'up': 'top', 'above': 'top'})
metadata.to_csv(f'{root}/metadata.csv', index=False)

In [5]:
# Load the merged dataset from `root`, remove the 'date' column from its
# dataframe and display the dataset length.
dataset = WildlifeReID10k(root)
dataset.df = dataset.df.drop(columns=['date'])
len(dataset)

107037

In [12]:
# Per-dataset summary of the stored splits: sizes, number of individuals and
# the ratios reported by `extract_data_split`.
summary = {}
for name, df_dataset in dataset.df.groupby('dataset'):
    # Boolean masks over the rows of this dataset.
    is_train = df_dataset['split'] == 'train'
    is_database = df_dataset['split'] == 'database'
    is_query = df_dataset['split'] == 'query'

    # Index labels of the groups that are compared below.
    idx_train = df_dataset.index[is_train]
    idx_non_train = df_dataset.index[~is_train]
    idx_database_full = df_dataset.index[is_train | is_database]  # train + database
    idx_query = df_dataset.index[is_query]

    # split_data1: (train + database) against query
    # split_data2: train against everything else
    split_data1 = extract_data_split(df_dataset, idx_database_full, idx_query)
    split_data2 = extract_data_split(df_dataset, idx_train, idx_non_train)

    # The (train + database) / query division is required to be closed-set.
    if split_data1['id_split'] != 'closed-set':
        raise Exception('split is not closed set')
    summary[name] = {
        'n': split_data1['n'],
        'n_individuals': split_data1['n_ids'],
        'ratio_train': split_data1['n_train'] / split_data1['n'],
        'ratio_test': split_data1['n_test'] / split_data1['n'],
        'ratio_train_only': split_data2['n_train_only'] / split_data2['n'],
    }

# One row per dataset. The transpose leaves the count columns as floats, so
# they are cast back to integers.
summary = pd.DataFrame(summary).T
summary[['n', 'n_individuals']] = summary[['n', 'n_individuals']].astype(int)
summary

Unnamed: 0,n,n_individuals,ratio_train,ratio_test,ratio_train_only
ATRW,5415,182,0.801847,0.198153,0.07313
BelugaID,8559,788,0.787709,0.212291,0.077813
CTai,5078,78,0.800118,0.199882,0.073257
ELPephants,2078,274,0.792108,0.207892,0.082772
GiraffeZebraID,6925,2056,0.792491,0.207509,0.175884
Giraffes,1393,178,0.806174,0.193826,0.07107
HumpbackWhaleID,15697,5004,0.774097,0.225903,0.17583
HyenaID2022,3129,256,0.799936,0.200064,0.060403
IPanda50,6874,50,0.800262,0.199738,0.09165
LeopardID2022,6806,430,0.793418,0.206582,0.112989
