In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.append('..')

from wildlife_datasets import datasets
from wildlife_datasets.utils.splits import *

# datasets.MacaqueFaces.download.get_data('data/MacaqueFaces')
dataset = datasets.MacaqueFaces('data/MacaqueFaces')
df = dataset.df.copy()

# For testing purposes only: drop the first four rows, then overwrite the
# 'date' value of the first 1000 remaining rows.
# .copy() detaches the slice from the frame it was cut from, and the single
# .iloc[rows, col] call writes directly into df. The chained form
# df['date'].iloc[:1000] = ... may write to a temporary object instead
# (SettingWithCopyWarning; a silent no-op under pandas Copy-on-Write).
df = df.iloc[4:].copy()
df.iloc[:1000, df.columns.get_loc('date')] = '2016-01-01'

# Seed passed to every splitter below.
seed = 100

In [None]:
def analyze_split(df, idx_train, idx_test):
    """Print summary statistics of a train/test split.

    Args:
        df: DataFrame with an 'identity' column whose index contains the
            labels listed in ``idx_train`` and ``idx_test``.
        idx_train: Index labels (as accepted by ``df.loc``) of the training rows.
        idx_test: Index labels (as accepted by ``df.loc``) of the test rows.

    Prints the number of identities in total, in both sets, only in the
    training set and only in the test set, followed by two percentages
    relative to ``len(idx_train) + len(idx_test)``: the share of training
    rows and the share of test rows whose identity never occurs in training.
    """
    identity_train = df.loc[idx_train, 'identity']
    identity_test = df.loc[idx_test, 'identity']

    ids_train = set(identity_train)
    ids_test = set(identity_test)
    ids_train_only = ids_train - ids_test
    ids_test_only = ids_test - ids_train

    n_train = len(idx_train)
    n = n_train + len(idx_test)
    # Count within the test rows only: rows of df that belong to neither split
    # must not inflate the fraction, since the denominator covers split rows only.
    n_test_only = int(identity_test.isin(list(ids_test_only)).sum())

    # An empty split has no meaningful fractions; report zero instead of raising.
    ratio_train = n_train / n if n else 0.0
    ratio_test_only = n_test_only / n if n else 0.0

    print('Total individuals = %d' % len(ids_train | ids_test))
    print('Joint individuals = %d' % len(ids_train & ids_test))
    print('Only in train     = %d' % len(ids_train_only))
    print('Only in test      = %d' % len(ids_test_only))
    print('')
    print('Fraction of train set = %1.2f%%' % (100*ratio_train))
    print('Fraction of test set only = %1.2f%%' % (100*ratio_test_only))

# Closed-set split

In [None]:
# Build a ClosedSetSplit over df with the fixed seed, split with argument 0.5
# (presumably the training fraction; confirm against the splitter's
# documentation) and print the summary statistics of the result.
splitter = ClosedSetSplit(df, seed)
idx_train, idx_test = splitter.split(0.5)
analyze_split(df, idx_train, idx_test)

# Open-set split

In [None]:
# Open-set split driven by two positional arguments, 0.5 and 0.1.
# NOTE(review): 0.5 is presumably the training fraction and 0.1 the share
# reserved for test-only identities; confirm against OpenSetSplit.split.
splitter = OpenSetSplit(df, seed)
idx_train, idx_test = splitter.split(0.5, 0.1)
analyze_split(df, idx_train, idx_test)

In [None]:
# Same splitter class, but the second quantity is given through the
# n_class_test keyword (5) instead of a second positional argument.
# NOTE(review): presumably the number of test-only identities; confirm.
splitter = OpenSetSplit(df, seed)
idx_train, idx_test = splitter.split(0.5, n_class_test=5)
analyze_split(df, idx_train, idx_test)

# Disjoint-set split

In [None]:
# Build a DisjointSetSplit over df with the fixed seed, split with the single
# positional argument 0.5 and print the summary statistics.
# NOTE(review): the meaning of 0.5 is defined by DisjointSetSplit.split; confirm.
splitter = DisjointSetSplit(df, seed)
idx_train, idx_test = splitter.split(0.5)
analyze_split(df, idx_train, idx_test)

In [None]:
# Disjoint-set split controlled only by the n_class_test keyword (10); no
# positional argument is passed here.
# NOTE(review): presumably the number of identities placed in the test set; confirm.
splitter = DisjointSetSplit(df, seed)
idx_train, idx_test = splitter.split(n_class_test=10)
analyze_split(df, idx_train, idx_test)

# Time-proportion split

In [None]:
# Build a TimeProportionSplit over df with the fixed seed and call split()
# with no arguments, then print the summary statistics.
# The names splitter, idx_train and idx_test are read again by the next cell.
splitter = TimeProportionSplit(df, seed)
idx_train, idx_test = splitter.split()
analyze_split(df, idx_train, idx_test)

In [None]:
# Feed the current split into splitter.resplit_random and summarise the result.
# Depends on splitter, idx_train and idx_test left in the kernel by the
# previous cell, and overwrites both index variables: re-running this cell on
# its own resplits the already-resplit indices.
idx_train, idx_test = splitter.resplit_random(idx_train, idx_test)
analyze_split(df, idx_train, idx_test)

# Time-cutoff split

In [None]:
# Build a TimeCutoffSplit over df with the fixed seed and split with argument
# 2015 (presumably the cutoff year; confirm against TimeCutoffSplit.split),
# then print the summary statistics.
# The names splitter, idx_train and idx_test are read again by the cells below.
splitter = TimeCutoffSplit(df, seed)
idx_train, idx_test = splitter.split(2015)
analyze_split(df, idx_train, idx_test)

In [None]:
# Feed the current split into splitter.resplit_random and summarise the result.
# Depends on splitter, idx_train and idx_test left in the kernel by the
# previous cell, and overwrites both index variables: re-running this cell on
# its own resplits the already-resplit indices.
idx_train, idx_test = splitter.resplit_random(idx_train, idx_test)
analyze_split(df, idx_train, idx_test)

In [None]:
# Take the first element of what splitter.splits_all() returns and treat it as
# an iterable of (idx_train, idx_test) pairs, printing a summary for each.
# Uses the splitter left in the kernel by an earlier cell; the loop rebinds
# idx_train and idx_test, which end up holding the last pair.
splits = splitter.splits_all()[0]
for (idx_train, idx_test) in splits:
    analyze_split(df, idx_train, idx_test)