# Create dataset for development

In [None]:
import datetime
from pathlib import Path
import configparser

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn.utils

%matplotlib inline

# Seed handed explicitly to scikit-learn calls through their `random_state` argument.
seed = 78
# Seed NumPy's global generator for anything that draws from it implicitly.
np.random.seed(51)

In [None]:
# User inputs
# Folder that the input files are read from (used as a plain string prefix).
data_dir = '../data/private_data'
# When True, the "Save to file" cell writes the train/test files to disk.
do_save = True

In [None]:
# Both input files share one path stem: '<stem>.txt' holds the space-separated data
# rows (no header line) and '<stem>_meta.txt' holds the comma-separated metadata,
# whose 'date' column is parsed as dates.
events_stem = data_dir + '/private_events_dev2/private_events_all_TRAIN_update'
df = pd.read_csv(events_stem + '.txt', header=None, sep=' ')
meta = pd.read_csv(events_stem + '_meta.txt', sep=',', parse_dates=['date'])

# Quick look at what was loaded: shapes first, then the first rows of each table.
print(df.shape, meta.shape)
print(meta.head())
print(df.head())

# Plot data

In [None]:
# Row to inspect.
i = 7
# Show the metadata for that row, then plot the same row of the data table
# with its first column left out.
print(meta.iloc[i])
df.iloc[i, 1:].plot()

# Read config

In [None]:
# Load the public and private INI files into a single parser.
config = configparser.ConfigParser()
# Replace the option-name transform with `str` so option names keep their case;
# this is assigned before read() so it applies to the files parsed below.
config.optionxform = str
config_files = ['../data/public_config.ini', '../data/private_config.ini']
# NOTE(review): read() is understood to skip files it cannot open -- if so, a missing
# config file would first show up as a KeyError on the section lookup below; confirm.
config.read(config_files)
# NOTE(review): `_sections` is a private ConfigParser attribute, and the value stored
# here appears to be the section's mapping rather than a list, yet later cells index
# it as dog_names[0] -- confirm how the dog names are laid out in the INI file.
dog_names = config._sections['unique_dog_names']

# Examine data

In [None]:
# Dog whose results are examined in this section.
# NOTE(review): dog_names comes from the config section lookup; confirm that
# indexing it with 0 yields a dog name.
this_dog = dog_names[0]

# Row masks over `meta`. Summing a boolean mask counts the matching rows directly;
# the previous `.count()[0]` depended on positional integer indexing of a Series that
# is labelled by column names (deprecated in pandas) and counted the non-null values
# of a single column rather than rows.
is_this_dog = meta['dog']==this_dog
is_tp = meta['dog_result']=='TP'
is_tn = meta['dog_result']=='TN'

# Number of TP rows and TN rows for this dog.
print((is_this_dog & is_tp).sum())
print((is_this_dog & is_tn).sum())
# Summary statistics of 'Concentration' over this dog's TP rows.
print(meta[is_this_dog & is_tp]['Concentration'].describe())

In [None]:
# Row masks over `meta`: rows belonging to `this_dog`, and rows whose 'Concentration'
# is at least 1/5e6.
is_this_dog = meta['dog']==this_dog
is_strong = meta['Concentration']>=1/5e6

# Summing a boolean mask counts rows directly; the previous `.count()[0]` depended on
# positional integer indexing of a Series labelled by column names (deprecated in
# pandas) and counted the non-null values of a single column rather than rows.
# First line: rows for this dog at or above the threshold.
# Second line: the subset of those rows where 'dog_result' is 'TP'.
print((is_this_dog & is_strong).sum())
print((is_this_dog & is_strong & (meta['dog_result']=='TP')).sum())

# Join meta data to dataset

In [None]:
# Place the metadata columns and the data columns side by side in one frame
# (column-wise concatenation, aligned on the row index), then preview it.
frames_to_join = [meta, df]
data_meta = pd.concat(frames_to_join, axis='columns')
data_meta.head()

# Create balanced dataset

In [None]:
# sklearn.utils.shuffle returns a shuffled copy and leaves its input untouched, so the
# result has to be kept. Previously the return value was discarded, which meant the
# `.iloc[:n]` truncation further down always kept the first n rows in file order
# instead of a random subset. `random_state=seed` makes re-running this cell repeatable.
data_shuffled = sklearn.utils.shuffle(data_meta, random_state=seed)

# Enable exactly one of the selection options below by setting its `if` to True.
# Every mask is built from `data_shuffled` itself (which contains the metadata
# columns as well), so mask and frame always share the same row order.

# Select by class
if True:
    selection_0 = data_shuffled[(data_shuffled['class']==0)]
    selection_1 = data_shuffled[(data_shuffled['class']==1)]
# Select by class and dog
if False:
    selection_0 = data_shuffled[(data_shuffled['dog']==this_dog) & (data_shuffled['class']==0)]
    selection_1 = data_shuffled[(data_shuffled['dog']==this_dog) & (data_shuffled['class']==1)]
# Select where dog was correct
if False:
    selection_0 = data_shuffled[(data_shuffled['dog']==this_dog) & (data_shuffled['dog_result']=='TN')]
    selection_1 = data_shuffled[(data_shuffled['dog']==this_dog) & (data_shuffled['dog_result']=='TP')]
# Select stronger concentrations
if False:
    selection_0 = data_shuffled[(data_shuffled['dog']==this_dog) & (data_shuffled['class']==0)]
    selection_1 = data_shuffled[(data_shuffled['dog']==this_dog) & (data_shuffled['Concentration']>=1/5e6)]
# Select certain dates
if False:
    # Dates to leave out, written as YYYY-MM-DD. The last entry used to read
    # '2018-23-10', which is not a valid date (month 23); it is taken to mean
    # 23 October 2018 -- NOTE(review): confirm against the recording log.
    excluded_dates = ['2018-08-07', '2018-08-21', '2018-09-12', '2018-10-16', '2018-10-23']
    cond = data_shuffled['dog']==dog_names[0]
    for excluded_date in excluded_dates:
        cond = cond & (data_shuffled['date']!=excluded_date)
    selection_0 = data_shuffled[cond & (data_shuffled['class']==0)]
    selection_1 = data_shuffled[cond & (data_shuffled['class']==1)]

# Create balanced set: truncate both selections to the size of the smaller one.
# len() gives the row count directly; the previous `.count()[0]` returned the number
# of non-null values in a single column instead.
n0 = len(selection_0)
n1 = len(selection_1)
n = min(n0, n1)
print('Min selection set size:', n)
selection_0 = selection_0.iloc[:n]
selection_1 = selection_1.iloc[:n]
selection = pd.concat([selection_0, selection_1])
# Mix the two halves so the rows are no longer grouped by selection.
selection = sklearn.utils.shuffle(selection, random_state=seed)
selection.head()

# Split into training and test sets

In [None]:
# Fraction of the balanced selection held out as the test set.
test_split = 0.25
# Stratify on the class label so both splits keep the same class proportions.
stratify = selection['class']
selection_train, selection_test = train_test_split(
    selection,
    test_size=test_split,
    stratify=stratify,
    random_state=seed,
)

# Split the dataset back out into meta and dataset 

In [None]:
# Column names contributed by the metadata table; every other column of the joined
# frame belongs to the data table.
header = list(meta)

# drop(columns=header) keeps the remaining columns in their existing order, instead
# of relying on the implicit sorting done by Index.difference.
meta_train = selection_train[header]
dataset_train = selection_train.drop(columns=header)
meta_test = selection_test[header]
dataset_test = selection_test.drop(columns=header)

# Report the shape of each piece (a bare `meta_test.head()` used to sit here; as a
# mid-cell expression it displayed nothing, so it was removed).
print('meta_train', meta_train.shape)
print('dataset_train', dataset_train.shape)
print('meta_test', meta_test.shape)
print('dataset_test', dataset_test.shape)

In [None]:
# Preview the first five rows of the training metadata.
meta_train.head(n=5)

# Save to file

In [None]:
# Common prefix of the four output files; they are written relative to the
# notebook's working directory.
filename_stem = 'private_balanced'
if do_save:
    # For each split, write the data rows as space-separated '%f' text and the
    # matching metadata as CSV without the index column (TRAIN first, then TEST).
    for split_name, split_data, split_meta in (
        ('TRAIN', dataset_train, meta_train),
        ('TEST', dataset_test, meta_test),
    ):
        np.savetxt(filename_stem+'_'+split_name+'.txt', split_data.to_numpy(), fmt='%f', delimiter=' ')
        split_meta.to_csv(filename_stem+'_'+split_name+'_meta.txt', index=False)

# Check data

In [None]:
# Read the saved training files back in to verify what was written.
df_output = pd.read_csv(filename_stem+'_TRAIN.txt', header=None, sep=' ')
meta_output = pd.read_csv(filename_stem+'_TRAIN_meta.txt', sep=',', parse_dates=['date'])
print(df_output.shape, meta_output.shape)

# Rows per class. Summing the boolean mask counts rows directly; the previous
# `.count()[0]` depended on positional integer indexing of a Series labelled by
# column names (deprecated in pandas) and counted non-null values of one column.
print('class 0:', (meta_output['class']==0).sum())
print('class 1:', (meta_output['class']==1).sum())
print(meta_output.head())
print(df_output.head())

In [None]:
# Plot the first saved row (first column left out) and title the figure with the
# class recorded for that row in the saved metadata.
i = 0
df_output.iloc[i, 1:].plot()
plt.suptitle('class: {}'.format(str(meta_output.iloc[i]['class'])))

In [None]:
# Plot the second saved row (first column left out) and title the figure with the
# class recorded for that row in the saved metadata.
i = 1
df_output.iloc[i, 1:].plot()
plt.suptitle('class: {}'.format(str(meta_output.iloc[i]['class'])))

In [None]:
# Load a previously created dataset for comparison. The path is built from `data_dir`
# (as in the data-loading cell) instead of repeating the folder as a literal; with the
# default `data_dir` it resolves to the same files as before.
# Note: this overwrites `df_output` / `meta_output` from the check above.
reference_stem = data_dir+'/private_events_dev2/private_dog2_correct/private_dog2_correct_TRAIN'
df_output = pd.read_csv(reference_stem+'.txt', header=None, sep=' ')
meta_output = pd.read_csv(reference_stem+'_meta.txt', sep=',', parse_dates=['date'])
print(df_output.shape, meta_output.shape)

# Rows per class. Summing the boolean mask counts rows directly; the previous
# `.count()[0]` depended on positional integer indexing of a Series labelled by
# column names (deprecated in pandas) and counted non-null values of one column.
print('class 0:', (meta_output['class']==0).sum())
print('class 1:', (meta_output['class']==1).sum())
print(meta_output.head())
print(df_output.head())