# Splits

This notebook demonstrates the available ways of splitting a dataset into training and testing sets.

In [1]:
import sys
sys.path.append('..')

from wildlife_datasets import datasets, splits

# Uncomment the next line on the first run to download the dataset:
# datasets.MacaqueFaces.get_data('../data/MacaqueFaces')
dataset = datasets.MacaqueFaces('../data/MacaqueFaces')

# For testing purposes only: work on an independent copy of the dataframe with
# its first four rows dropped, then overwrite the date of the first 1000
# remaining rows.
df = dataset.df.iloc[4:].copy()
df.loc[df.index[:1000], 'date'] = '2016-01-01'

# Closed-set split

In [2]:
# 0.5 is presumably the training ratio (the analysis printed below reports a
# 50.00% train fraction) - TODO confirm against the ClosedSetSplit signature.
splitter = splits.ClosedSetSplit(0.5)
# Only the first element returned by split() is used; it unpacks into the
# train and test indices.
first_split = splitter.split(df)[0]
idx_train, idx_test = first_split
splits.analyze_split(df, idx_train, idx_test)

Split: time-unaware closed-set
Samples: train/test/unassigned/total = 3138/3138/0/6276
Classes: train/test/unassigned/total = 34/34/0/34
Samples: train only/test only        = 0/0
Classes: train only/test only/joint  = 0/0/34

Fraction of train set     = 50.00%
Fraction of test set only = 0.00%


# Open-set split

In [3]:
# Presumably 0.5 is the training ratio and 0.1 the target share of samples in
# test-only classes (the analysis below reports 49.94% and 12.11%) - TODO confirm.
splitter = splits.OpenSetSplit(0.5, 0.1)
# Only the first element returned by split() is used; it unpacks into the
# train and test indices.
first_split = splitter.split(df)[0]
idx_train, idx_test = first_split
splits.analyze_split(df, idx_train, idx_test)

Split: time-unaware open-set
Samples: train/test/unassigned/total = 3134/3142/0/6276
Classes: train/test/unassigned/total = 30/34/0/34
Samples: train only/test only        = 0/760
Classes: train only/test only/joint  = 0/4/30

Fraction of train set     = 49.94%
Fraction of test set only = 12.11%


In [4]:
# Same splitter, but the test-only part is requested through n_class_test
# (the analysis below reports 5 test-only classes).
splitter = splits.OpenSetSplit(0.5, n_class_test=5)
# Only the first element returned by split() is used; it unpacks into the
# train and test indices.
first_split = splitter.split(df)[0]
idx_train, idx_test = first_split
splits.analyze_split(df, idx_train, idx_test)

Split: time-unaware open-set
Samples: train/test/unassigned/total = 3143/3133/0/6276
Classes: train/test/unassigned/total = 29/34/0/34
Samples: train only/test only        = 0/940
Classes: train only/test only/joint  = 0/5/29

Fraction of train set     = 50.08%
Fraction of test set only = 14.98%


# Disjoint-set split

In [5]:
# The analysis below reports no class shared between train and test. The 0.5
# presumably controls how much of the data goes to the test classes - TODO confirm.
splitter = splits.DisjointSetSplit(0.5)
# Only the first element returned by split() is used; it unpacks into the
# train and test indices.
first_split = splitter.split(df)[0]
idx_train, idx_test = first_split
splits.analyze_split(df, idx_train, idx_test)

Split: time-unaware disjoint-set
Samples: train/test/unassigned/total = 3100/3176/0/6276
Classes: train/test/unassigned/total = 17/17/0/34
Samples: train only/test only        = 3100/3176
Classes: train only/test only/joint  = 17/17/0

Fraction of train set     = 49.39%
Fraction of test set only = 50.61%


In [6]:
# Same splitter, configured through n_class_test instead of a ratio
# (the analysis below reports 10 classes in the test set).
splitter = splits.DisjointSetSplit(n_class_test=10)
# Only the first element returned by split() is used; it unpacks into the
# train and test indices.
first_split = splitter.split(df)[0]
idx_train, idx_test = first_split
splits.analyze_split(df, idx_train, idx_test)

Split: time-unaware disjoint-set
Samples: train/test/unassigned/total = 4416/1860/0/6276
Classes: train/test/unassigned/total = 24/10/0/34
Samples: train only/test only        = 4416/1860
Classes: train only/test only/joint  = 24/10/0

Fraction of train set     = 70.36%
Fraction of test set only = 29.64%


# Time-proportion splits

In [7]:
# Time-proportion splitter with its default settings.
splitter = splits.TimeProportionSplit()
# Only the first element returned by split() is used; it unpacks into the
# train and test indices. The next cell re-splits this result.
first_split = splitter.split(df)[0]
idx_train, idx_test = first_split
splits.analyze_split(df, idx_train, idx_test)

Split: time-proportion closed-set
Samples: train/test/unassigned/total = 4161/2115/0/6276
Classes: train/test/unassigned/total = 34/29/0/34
Samples: train only/test only        = 926/0
Classes: train only/test only/joint  = 5/0/29

Fraction of train set     = 66.30%
Fraction of test set only = 0.00%


In [8]:
# Randomly re-split the time-proportion split created in the previous cell.
# The result is stored under new names so the input split is not overwritten:
# re-running this cell then always re-splits the same original split instead
# of re-splitting its own previous output.
idx_train_random, idx_test_random = splitter.resplit_random(df, idx_train, idx_test)
splits.analyze_split(df, idx_train_random, idx_test_random)

Split: time-unaware closed-set
Samples: train/test/unassigned/total = 4161/2115/0/6276
Classes: train/test/unassigned/total = 34/29/0/34
Samples: train only/test only        = 926/0
Classes: train only/test only/joint  = 5/0/29

Fraction of train set     = 66.30%
Fraction of test set only = 0.00%


In [9]:
# RandomProportion with its default settings; the analysis below shows the same
# sample and class counts as the time-proportion split above, but is labelled
# time-unaware.
splitter = splits.RandomProportion()
# Only the first element returned by split() is used; it unpacks into the
# train and test indices.
first_split = splitter.split(df)[0]
idx_train, idx_test = first_split
splits.analyze_split(df, idx_train, idx_test)

Split: time-unaware closed-set
Samples: train/test/unassigned/total = 4161/2115/0/6276
Classes: train/test/unassigned/total = 34/29/0/34
Samples: train only/test only        = 926/0
Classes: train only/test only/joint  = 5/0/29

Fraction of train set     = 66.30%
Fraction of test set only = 0.00%


# Time-cutoff split

In [10]:
# 2015 is presumably the cutoff year - TODO confirm. The analysis below leaves
# 1000 samples unassigned, the same number of rows whose date was set to
# '2016-01-01' in the setup cell.
splitter = splits.TimeCutoffSplit(2015)
# Only the first element returned by split() is used; it unpacks into the
# train and test indices. The next cell re-splits this result.
first_split = splitter.split(df)[0]
idx_train, idx_test = first_split
splits.analyze_split(df, idx_train, idx_test)

Split: time-cutoff closed-set
Samples: train/test/unassigned/total = 3137/2139/1000/6276
Classes: train/test/unassigned/total = 29/29/5/34
Samples: train only/test only        = 0/0
Classes: train only/test only/joint  = 0/0/29

Fraction of train set     = 49.98%
Fraction of test set only = 0.00%


In [11]:
# Randomly re-split the time-cutoff split created in the previous cell.
# The result is stored under new names so the input split is not overwritten:
# re-running this cell then always re-splits the same original split instead
# of re-splitting its own previous output. idx_train / idx_test keep holding
# the time-cutoff split.
idx_train_random, idx_test_random = splitter.resplit_random(df, idx_train, idx_test)
splits.analyze_split(df, idx_train_random, idx_test_random)

Split: time-unaware closed-set
Samples: train/test/unassigned/total = 3137/2139/1000/6276
Classes: train/test/unassigned/total = 29/29/5/34
Samples: train only/test only        = 0/0
Classes: train only/test only/joint  = 0/0/29

Fraction of train set     = 49.98%
Fraction of test set only = 0.00%
