In [None]:
import sys
sys.path.insert(0, '../..')

import os
import shutil
import numpy as np
import pandas as pd

from wildlife_datasets.datasets import AnimalCLEF2025, AnimalCLEF2026, AnimalCLEF2026_TexasHornedLizards
from wildlife_datasets.datasets.utils import create_id
from wildlife_datasets.splits import analyze_split

Specify folders.

In [None]:
# Source datasets live under `root_datasets`; the AnimalCLEF2026 tree is built under `root`.
root_datasets = '/data/wildlife_datasets/data'
root = os.path.join(root_datasets, 'AnimalCLEF2026')
root_images, root_metadata = (os.path.join(root, sub) for sub in ('images', 'metadata'))

# When False, images are not copied and only the metadata files are (re)written.
copy_files = True

# The 2026 edition reuses every AnimalCLEF2025 sub-dataset and adds TexasHornedLizards.
dataset_2025 = AnimalCLEF2025(os.path.join(root_datasets, 'AnimalCLEF2025'))
names_2025 = [*dataset_2025.metadata['dataset'].unique()]
names_2026 = [*names_2025, 'TexasHornedLizards']

# Create one metadata folder per sub-dataset up front.
for name in names_2026:
    os.makedirs(f'{root_metadata}/{name}/', exist_ok=True)

In [None]:
def create_metadata(dataset, name, copy_files, root):
    """Build the metadata table of one sub-dataset and optionally copy its images.

    The rows of ``dataset`` whose ``'dataset'`` column equals ``name`` are
    selected and transformed as follows:

    * ``'split'``: ``'database'`` becomes ``'train'``, every other value
      becomes ``'test'``.
    * ``'Usage'``: rows that were ``'query'`` get ``'Public'`` when their
      identity also occurs among the non-query rows and ``'Private'``
      otherwise; other rows are left without a value.
    * ``'image_id'``: renumbered ``0 .. n-1`` in row order.
    * ``'path'``: ``train/<suffix>/<basename>`` for train rows, where
      ``<suffix>`` is the part of the identity after its last underscore,
      and ``test/<basename>`` for test rows.

    Args:
        dataset: Object exposing ``df``, ``metadata``, ``root`` and
            ``get_subset(mask)``.
        name: Value of the ``'dataset'`` column to select; also the folder
            name used below ``<root>/images``.
        copy_files: If true, copy every image from ``dataset.root`` to
            ``<root>/images/<name>/<new path>``, creating folders as needed.
        root: Root folder of the output dataset.

    Returns:
        pandas.DataFrame: The transformed metadata with the rewritten paths.
    """
    dataset = dataset.get_subset(dataset.df['dataset'] == name)
    # Work on a private copy so the frame held by the dataset object is never
    # modified, whether or not get_subset returned a view.
    metadata = dataset.metadata.copy()

    # The masks must be computed before 'split' is rewritten below.
    idx_query = metadata['split'] == 'query'
    identity_database = metadata.loc[~idx_query, 'identity'].unique()
    idx_known = metadata['identity'].isin(identity_database)

    metadata['split'] = metadata['split'].apply(lambda x: 'train' if x == 'database' else 'test')
    # Logical AND of boolean masks (`&`) instead of arithmetic multiplication.
    metadata.loc[idx_query & ~idx_known, 'Usage'] = 'Private'
    metadata.loc[idx_query & idx_known, 'Usage'] = 'Public'
    metadata['image_id'] = range(len(metadata))

    # Collect the new paths positionally and assign them in one go. Writing
    # them back row by row via index labels would overwrite several rows at
    # once whenever the index contains duplicate labels.
    new_paths = []
    for old_path, split, identity in zip(metadata['path'], metadata['split'], metadata['identity']):
        basename = os.path.basename(old_path)
        if split == 'train':
            new_name = os.path.join('train', identity.split('_')[-1], basename)
        else:
            new_name = os.path.join('test', basename)
        if copy_files:
            path0 = os.path.join(dataset.root, old_path)
            path1 = os.path.join(root, 'images', name, new_name)
            os.makedirs(os.path.dirname(path1), exist_ok=True)
            shutil.copy(path0, path1)
        new_paths.append(new_name)
    metadata['path'] = new_paths
    return metadata

Prepare the datasets from the AnimalCLEF2025 competition.

In [None]:
# Build and store one metadata.csv per AnimalCLEF2025 sub-dataset
# (images are copied as well when `copy_files` is set).
for name in names_2025:
    metadata = create_metadata(dataset=dataset_2025, name=name, copy_files=copy_files, root=root)
    metadata.to_csv(f'{root_metadata}/{name}/metadata.csv', index=False)

Prepare the TexasHornedLizards dataset.

In [None]:
name = 'TexasHornedLizards'

dataset = AnimalCLEF2026_TexasHornedLizards(os.path.join(root_datasets, name), remove_unknown=True)
metadata = dataset.df.copy()

# Identities are ordered by value_counts(); every second one (positions 1, 3, ...)
# is marked Public, all remaining ones Private.
identities_public = metadata['identity'].value_counts().index[1::2]
idx_public = metadata['identity'].isin(identities_public)
metadata.loc[~idx_public, 'Usage'] = 'Private'
metadata.loc[idx_public, 'Usage'] = 'Public'

# Temporary ids derived from the file basenames; they become the new file
# names and define the final row order below.
metadata['basename'] = metadata['path'].map(os.path.basename)
metadata['image_id'] = create_id(metadata['basename'])
for i, row in metadata.iterrows():
    extension = os.path.splitext(row['path'])[1]
    name_new = os.path.join('test', row['image_id'] + extension)
    path0 = os.path.join(dataset.root, row['path'])
    path1 = os.path.join(root, 'images', name, name_new)
    if copy_files:
        os.makedirs(os.path.dirname(path1), exist_ok=True)
        shutil.copy(path0, path1)
    metadata.loc[i, 'path'] = name_new

# Order rows by the temporary id, then replace it with consecutive integers.
metadata = metadata.sort_values('image_id').reset_index(drop=True)
metadata['image_id'] = range(len(metadata))
metadata['identity'] = name + '_' + metadata['identity'].astype(str)
metadata['species'] = 'lizard'
# Every image of this dataset goes into the test split.
metadata['split'] = 'test'
metadata = metadata.drop(columns='basename')

metadata.to_csv(f'{root_metadata}/{name}/metadata.csv', index=False)

Merge all into one dataframe.

In [None]:
# Read the per-dataset metadata files back, tag each with its dataset name and
# make the image paths relative to the AnimalCLEF2026 root.
metadata_parts = []
for name in names_2026:
    metadata_part = pd.read_csv(f'{root_metadata}/{name}/metadata.csv')
    metadata_part['dataset'] = name
    metadata_part['path'] = 'images/' + name + '/' + metadata_part['path']
    metadata_parts.append(metadata_part)

# Stack all parts and give every image a globally unique, consecutive id.
metadata = pd.concat(metadata_parts, ignore_index=True)
metadata['image_id'] = range(len(metadata))
metadata.to_csv(f'{root}/metadata.csv', index=False)

Load to verify integrity and plot a sample.

In [None]:
# Reload the assembled dataset from disk and call plot_grid once per sub-dataset.
dataset = AnimalCLEF2026(root)
for name in dataset.metadata['dataset'].unique():
    idx_part = dataset.metadata['dataset'] == name
    dataset_part = dataset.get_subset(idx_part)
    dataset_part.plot_grid();

Print some analysis of splits.

In [None]:
# Per sub-dataset: analyse the train rows against the test rows.
for name, df_dataset in dataset.df.groupby('dataset'):
    idx_train = df_dataset[df_dataset['split'] == 'train'].index
    idx_test = df_dataset[df_dataset['split'] == 'test'].index
    print(name)
    analyze_split(df_dataset, idx_train, idx_test)
    print('\n##############################################################################\n')

In [None]:
# Per sub-dataset: analyse the Public rows against the Private rows.
for name, df_dataset in dataset.df.groupby('dataset'):
    idx_public = df_dataset[df_dataset['Usage'] == 'Public'].index
    idx_private = df_dataset[df_dataset['Usage'] == 'Private'].index
    print(name)
    analyze_split(df_dataset, idx_public, idx_private)
    print('\n##############################################################################\n')

Save the solution file.

In [None]:
# Solution file: ground-truth identities of all rows that carry a Usage label.
df_solution = dataset.df[['image_id', 'identity', 'dataset', 'Usage']].copy()
df_solution = df_solution[df_solution['Usage'].notnull()]
df_solution.to_csv(f'{root}/solution.csv', index=False)

# Hidden metadata: same table, but test identities are blanked and the
# Usage column is removed.
metadata_hidden = dataset.df.copy()
is_test = metadata_hidden['split'] == 'test'
metadata_hidden.loc[is_test, 'identity'] = np.nan
metadata_hidden = metadata_hidden.drop(columns='Usage')
metadata_hidden.to_csv(f'{root}/metadata_hidden.csv', index=False)