# Discrepancies

This notebook shows discrepancies between the metadata extracted from papers and the values computed from the downloaded data. A discrepancy indicates a potential problem.

In [1]:
import sys
sys.path.append('..')

from wildlife_datasets import datasets, loader

# Root folders handed to the loader, both relative to this notebook.
root_dataset = '../data'
root_dataframe = '../data/_dataframes'

# Load every dataset listed in datasets.names_all.
dataset_names = list(datasets.names_all)
ds = loader.load_datasets(dataset_names, root_dataset, root_dataframe)

In [2]:
def print_discrepancies(d):
    """Compare the reported metadata of one dataset with its dataframe.

    For each of the keys ``reported_n_total``, ``reported_n_identified``,
    ``reported_n_photos`` and ``reported_n_individuals`` that is present in
    ``d.metadata``, the matching quantity is computed from ``d.df`` and both
    values are passed to ``check_equality``, which prints any mismatch.

    Args:
        d: Dataset object exposing ``df`` (with ``identity`` and ``path``
            columns) and ``metadata`` (a mapping with a ``keys()`` method).
    """
    frame = d.df
    reported = d.metadata
    available = reported.keys()

    def count_identified():
        # Rows whose identity is anything but the 'unknown' placeholder.
        return len(frame[frame['identity'] != 'unknown'])

    def count_individuals():
        # Distinct identities, not counting the 'unknown' placeholder.
        names = frame['identity'].unique()
        total = len(names)
        if 'unknown' in list(names):
            total -= 1
        return total

    # (metadata key, message on mismatch, lazy computation of the observed value)
    checks = (
        ('reported_n_total',
         'Different number of entries',
         lambda: len(frame)),
        ('reported_n_identified',
         'Different number of identified entries',
         count_identified),
        ('reported_n_photos',
         'Different number of photos',
         lambda: len(frame['path'].unique())),
        ('reported_n_individuals',
         'Different number of identities',
         count_individuals),
    )

    for key, message, observe in checks:
        # The observed value is only computed when the paper reports the key.
        if key in available:
            check_equality(reported[key], observe(), message, d)

def check_equality(val1, val2, error_msg, d):
    """Print a one-line report when the reported and observed values differ.

    Nothing is printed when ``val1`` is ``None`` or when both values compare
    equal.

    Args:
        val1: Reported value, or ``None`` when it is not available.
        val2: Observed value.
        error_msg: Short description of the quantity being compared.
        d: Object whose class name is used as the prefix of the report.
    """
    if val1 is None:
        return
    if val1 != val2:
        dataset_name = d.__class__.__name__
        report = '%s. %s. Reported = %d. Observed = %d.' % (
            dataset_name,
            error_msg,
            val1,
            val2,
        )
        print(report)

In [3]:
# Run the discrepancy checks on every loaded dataset; each mismatch is printed as one line.
for d in ds:
    print_discrepancies(d)

ATRW. Different number of entries. Reported = 9496. Observed = 5415.
ATRW. Different number of identified entries. Reported = 3649. Observed = 5415.
ATRW. Different number of photos. Reported = 8076. Observed = 5302.
ATRW. Different number of identities. Reported = 92. Observed = 182.
BirdIndividualID. Different number of entries. Reported = 50643. Observed = 52434.
BirdIndividualID. Different number of identified entries. Reported = 50643. Observed = 51934.
BirdIndividualID. Different number of photos. Reported = 50643. Observed = 52434.
BirdIndividualIDSegmented. Different number of entries. Reported = 50643. Observed = 52774.
BirdIndividualIDSegmented. Different number of identified entries. Reported = 50643. Observed = 52274.
BirdIndividualIDSegmented. Different number of photos. Reported = 50643. Observed = 52774.
CTai. Different number of identified entries. Reported = 5078. Observed = 4662.
CTai. Different number of identities. Reported = 78. Observed = 71.
Cows2021. Different n

- ATRW - ???.
- BirdIndividualID - ???.
- CTai - OK. I believe that the 'Adult' identity is a mistake. CHECK!
- Cows2021 - OK.
- Drosophila - OK. Few images had different structure. We ignored them.
- LeopardID2022 - ???.
- LionData - OK.
- MacaqueFaces - OK. In the paper they write that it is slightly less (not enough photos for some individuals).
- MPDD - OK.
- NyalaData - OK.