# Create dev (train-test) and end-test datasets

In [None]:
import datetime
from pathlib import Path
import configparser

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn.utils

%matplotlib inline

# Seed NumPy's global random number generator so that anything drawing
# from it gives the same result on every run of the notebook.
np.random.seed(51)
# Separate seed, passed as `random_state` to train_test_split further down.
seed = 78

In [None]:
# User inputs
# When True, the generated datasets and their meta data are written to file.
do_save = True
# Position of the dog to select within the sorted list of unique dog names.
dog_index = 2
# Prefix of every output file name; suffixes such as '_TRAIN.txt' are appended.
filename_stem = 'private_dog2_correct_plus' # For saving to file
# Directory that the input data and meta data files are read from.
data_dir = '../data/private_data'

In [None]:
# The data file and its meta data file share a common path stem
events_stem = data_dir+'/private_events_dev2/private_events_all_TRAIN_update'

# Data: space-separated values with no header row
df = pd.read_csv(events_stem+'.txt', sep=' ', header=None)
# Meta data: comma-separated with a header row; 'date' is parsed to datetime
meta = pd.read_csv(events_stem+'_meta.txt', parse_dates=['date'], sep=',')
# Remember the meta data column names so they can be split back out later
meta_header = meta.columns.tolist()

# Choose the dog by its position in the sorted list of unique names
dog_names = np.sort(meta['dog'].unique())
this_dog = dog_names[dog_index]

# Summarise what was loaded
print(dog_names)
print('Selected:', this_dog)
print(df.shape, meta.shape)
print(meta.head())
print(df.head())

In [None]:
# Join meta data to dataset

In [None]:
# Place the meta data columns to the left of the data columns, row for row
frames = [meta, df]
data_meta = pd.concat(frames, axis='columns')
data_meta.head()

# Create datasets

In [None]:
def balanced_dataset(selection_0, selection_1, random_state=None):
    ''' Return a single dataset with equal data drawn from selection 0 and 1

    The first n rows of each selection are kept, where n is the number of
    rows in the smaller selection. The two halves are concatenated and the
    rows are then shuffled.

    selection_0, selection_1: DataFrames holding the rows of each class.
    random_state: optional seed or random state passed on to
        sklearn.utils.shuffle; the default of None keeps the previous
        behaviour of the function.

    Returns the balanced, shuffled DataFrame.
    '''
    # Use the row count directly. The previous count()[0] depended on a
    # column labelled 0 being present (or on positional fallback when it
    # was not) and counted only the non-missing values of that column.
    n = min(len(selection_0), len(selection_1))
    selection = pd.concat([selection_0.iloc[:n], selection_1.iloc[:n]])
    return sklearn.utils.shuffle(selection, random_state=random_state)

In [None]:
# Shuffle the rows so that the end / dev split below is not just file order.
# sklearn.utils.shuffle returns a shuffled copy and leaves its input
# untouched, so the result has to be assigned back to take effect.
data_meta = sklearn.utils.shuffle(data_meta)

# Make top-level selection of a subset of data to work with
subset = data_meta[(data_meta['dog']==this_dog)]
n_all = subset.shape[0]
# Put aside data for end test: the first quarter of the selected rows
n_end = round(n_all/4)
end = subset[:n_end]
dev = subset[n_end:]

# From dev set (train-test set), select data where dog was correct
dev_corr = dev[(dev['dog_result']=='TP') | (dev['dog_result']=='TN')]
selection_0 = dev_corr[(dev_corr['class']==0)]
selection_1 = dev_corr[(dev_corr['class']==1)]
dev_corr_bal = balanced_dataset(selection_0, selection_1)
# Split dev into train - test sets, keeping the class balance in both parts
test_split = 0.25
stratify = dev_corr_bal['class']
dev_train, dev_test = train_test_split(dev_corr_bal, test_size=test_split, stratify=stratify, random_state=seed)

# End set - create a balanced set
selection_0 = end[(end['class']==0)]
selection_1 = end[(end['class']==1)]
end_bal = balanced_dataset(selection_0, selection_1)
# End set - select where dog was incorrect and create a balanced set
end_inc = end[(end['dog_result']=='FP') | (end['dog_result']=='FN')]
selection_0 = end_inc[(end_inc['class']==0)]
selection_1 = end_inc[(end_inc['class']==1)]
end_inc_bal = balanced_dataset(selection_0, selection_1)

# Split the dataset back out into meta and dataset 

In [None]:
def to_dataset_and_meta(combo, meta_header):
    ''' Split combo data, a combination of meta data with data,
    back out into dataset and separate meta data

    combo: DataFrame holding both the meta data columns and the data columns.
    meta_header: list of the column names that make up the meta data.

    Prints the shape of both parts and returns them as (dataset, meta).
    '''
    meta = combo[meta_header]
    # sort=False keeps the data columns in the order they have in combo.
    # The default would sort the labels, which could reorder the columns
    # of a dataset that is later written to file without a header.
    data_columns = combo.columns.difference(meta_header, sort=False)
    dataset = combo[data_columns]
    print('meta', meta.shape)
    print('dataset', dataset.shape)
    return dataset, meta

# Each split is separated into data and meta data and, when saving is
# switched on, written to <filename_stem><suffix>.txt (space-separated
# data) and <filename_stem><suffix>_meta.txt (meta data as csv).
outputs = [
    ('_TRAIN', dev_train),                     # Train
    ('_TEST', dev_test),                       # Test
    ('_END_TEST', end_bal),                    # End test set
    ('_END_TEST_dog_incorrect', end_inc_bal),  # End test set, dog incorrect
]
for suffix, combo in outputs:
    dataset, meta = to_dataset_and_meta(combo, meta_header)
    if not do_save:
        continue
    np.savetxt(filename_stem+suffix+'.txt', dataset.to_numpy(), fmt='%f', delimiter=' ')
    meta.to_csv(filename_stem+suffix+'_meta.txt', index=False)

# Check data

In [None]:
# Read the saved end test set back in and print a few summary counts
name = filename_stem+'_END_TEST'
dataset_file = name+'.txt'
meta_file = name+'_meta.txt'
df_output = pd.read_csv(dataset_file, header=None, sep=' ')
meta_output = pd.read_csv(meta_file, sep=',', parse_dates=['date'])
print('Files named', name)
print(df_output.shape, meta_output.shape)
# Count rows by summing the boolean masks. The previous .count()[0] indexed
# a Series labelled by column name with the integer 0, which relies on
# positional fallback and only counts non-missing values of the first column.
print('class 0:', (meta_output['class']==0).sum())
print('class 1:', (meta_output['class']==1).sum())
print('TP:', (meta_output['dog_result']=='TP').sum())
print('TN:', (meta_output['dog_result']=='TN').sum())
print(meta_output.head())
print(df_output.head())