# Create dataset for development

In [None]:
import datetime
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn.utils

%matplotlib inline

# Seed NumPy's global random generator so that calls which do not receive an
# explicit random_state repeat after a kernel restart + Run All.
np.random.seed(51)
# Separate seed that is passed explicitly as `random_state` further down
# (to train_test_split).
seed = 78

In [None]:
# User inputs
# do_save: when True, the "Save to file" cell writes the TRAIN/TEST files to disk.
do_save = True
# data_dir: folder that the input file paths are built from (relative path).
data_dir = '../data/private_data'

In [None]:
# Locate the event data and its companion metadata file.
events_path = data_dir + '/private_events_dev2/private_events_all_TRAIN_update.txt'
events_meta_path = data_dir + '/private_events_dev2/private_events_all_TRAIN_update_meta.txt'

# The data file is space-delimited with no header row; the metadata file is
# comma-separated and its 'date' column is parsed into datetimes.
df = pd.read_csv(events_path, sep=' ', header=None)
meta = pd.read_csv(events_meta_path, sep=',', parse_dates=['date'])

# Quick look: shapes first, then the first rows of metadata and data.
print(df.shape, meta.shape)
for frame in (meta, df):
    print(frame.head())

# Plot data

In [None]:
# Inspect one event: print its metadata row, then plot its values.
i = 7
print(meta.iloc[i])
# Column 0 is left out of the plot -- presumably it holds a label rather than
# a sample value; TODO confirm.
event_values = df.iloc[i, 1:]
event_values.plot()

# Examine data

In [None]:
# Count the 'TP' and 'TN' results recorded for one dog.
this_dog = 'Ozzy'
rows_for_dog = meta['dog'] == this_dog
# Summing a boolean mask counts the matching rows directly. The previous
# `.count()[0]` used positional `[]` access on a label-indexed Series (which
# pandas has deprecated) and counted non-null values of the first column
# rather than rows.
print((rows_for_dog & (meta['dog_result'] == 'TP')).sum())
print((rows_for_dog & (meta['dog_result'] == 'TN')).sum())

In [None]:
# Count events at or above the concentration threshold, and how many of those
# have a 'TP' result.
min_concentration = 1/25e6  # threshold used for both counts (4e-8)
at_or_above_threshold = meta['Concentration'] >= min_concentration
# Summing the boolean mask counts rows; this replaces `.count()[0]`, which
# depended on deprecated positional `[]` access to a label-indexed Series.
print(at_or_above_threshold.sum())
print((at_or_above_threshold & (meta['dog_result'] == 'TP')).sum())

# Join metadata to dataset

In [None]:
# Put the metadata columns and the data columns side by side, one row per event.
# pd.concat(axis=1) aligns on the row index, so files with different row counts
# would be padded with NaN without any error -- fail fast instead.
assert len(meta) == len(df), f'meta has {len(meta)} rows but df has {len(df)}'
data_meta = pd.concat([meta, df], axis=1)
data_meta.head()

# Create balanced dataset

In [None]:
n = 53  # maximum number of events to keep from each group

# sklearn.utils.shuffle returns a shuffled copy and leaves its input untouched,
# so the result must be kept; discarding it meant the first n rows in file
# order were always selected. Passing random_state makes re-running this cell
# reproduce the same draw.
data_meta_shuffled = sklearn.utils.shuffle(data_meta, random_state=seed)

# Build the masks from the shuffled frame itself (not from `meta`) so they
# share its row order and no index re-alignment is needed.
# Rows are picked by dog_result ('TN' / 'TP') rather than by 'class' --
# presumably to keep only events where the dog was correct; TODO confirm.
dog_rows = data_meta_shuffled['dog'] == this_dog
selection_0 = data_meta_shuffled[dog_rows & (data_meta_shuffled['dog_result'] == 'TN')]
selection_1 = data_meta_shuffled[dog_rows & (data_meta_shuffled['dog_result'] == 'TP')]

# Take the same number of rows from each group so the result stays balanced
# even if one group has fewer than n rows.
n_per_class = min(n, len(selection_0), len(selection_1))
selection_0 = selection_0.iloc[:n_per_class]
selection_1 = selection_1.iloc[:n_per_class]

# Combine the two groups and mix them so they are not stored back to back.
selection = pd.concat([selection_0, selection_1])
selection = sklearn.utils.shuffle(selection, random_state=seed)
selection.head()

# Split into training and test sets

In [None]:
# Hold out a fraction of the balanced selection for testing. Stratifying on
# the 'class' column keeps the class proportions the same in both parts, and
# the fixed random_state makes the split repeatable.
test_split = 0.25
selection_train, selection_test = train_test_split(
    selection,
    test_size=test_split,
    stratify=selection['class'],
    random_state=seed,
)

# Split the joined data back out into metadata and dataset

In [None]:
# Separate each split back into its metadata columns and its data columns.
header = list(meta)  # the metadata column names

# drop(columns=...) keeps the remaining data columns in their existing order;
# Index.difference() sorts its result by default, which could reorder them.
meta_train = selection_train[header]
dataset_train = selection_train.drop(columns=header)
meta_test = selection_test[header]
dataset_test = selection_test.drop(columns=header)

# Report the shape of every piece (a stray mid-cell `meta_test.head()` that
# displayed nothing was removed).
for label, frame in [
    ('meta_train', meta_train),
    ('dataset_train', dataset_train),
    ('meta_test', meta_test),
    ('dataset_test', dataset_test),
]:
    print(label, frame.shape)

In [None]:
# Preview the first five rows of the training metadata.
meta_train.head(5)

# Save to file

In [None]:
# Defined outside the `if` so the "Check data" cell can still locate files
# written by an earlier run when do_save is False (previously this name was
# only created when saving, giving a NameError on a fresh kernel).
filename_stem = 'private_dog1_correct'

if do_save:
    # Data go out as space-delimited text with fixed '%f' formatting;
    # metadata go out as CSV without the index column.
    np.savetxt(filename_stem+'_TRAIN.txt', dataset_train.to_numpy(), fmt='%f', delimiter=' ')
    meta_train.to_csv(filename_stem+'_TRAIN_meta.txt', index=False)
    np.savetxt(filename_stem+'_TEST.txt', dataset_test.to_numpy(), fmt='%f', delimiter=' ')
    meta_test.to_csv(filename_stem+'_TEST_meta.txt', index=False)

# Check data

In [None]:
# Read the saved training files back in and check shapes and class balance.
df_output = pd.read_csv(filename_stem+'_TRAIN.txt', header=None, sep=' ')
meta_output = pd.read_csv(filename_stem+'_TRAIN_meta.txt', sep=',', parse_dates=['date'])
print(df_output.shape, meta_output.shape)

saved_rows_for_dog = meta_output['dog'] == this_dog
# Summing the boolean mask counts rows; this replaces `.count()[0]`, which
# depended on deprecated positional `[]` access to a label-indexed Series.
print('class 0:', (saved_rows_for_dog & (meta_output['class'] == 0)).sum())
print('class 1:', (saved_rows_for_dog & (meta_output['class'] == 1)).sum())
print(meta_output.head())
print(df_output.head())

In [None]:
# Plot the first re-loaded row (column 0 excluded) and title it with the
# 'class' value from the matching metadata row.
i = 0
row_values = df_output.iloc[i, 1:]
row_class = meta_output.iloc[i]['class']
row_values.plot()
plt.suptitle('class: ' + str(row_class))

In [None]:
# Same check for the second re-loaded row: plot everything after column 0 and
# show the row's 'class' value from the metadata as the figure title.
i = 1
row_values = df_output.iloc[i, 1:]
row_class = meta_output.iloc[i]['class']
row_values.plot()
plt.suptitle('class: ' + str(row_class))

In [None]:
# Load the existing "mini" training set for comparison. The paths are built
# from data_dir so they follow the configured data location instead of
# repeating it as a literal.
# NOTE: this rebinds df_output / meta_output from the previous check cells.
df_output = pd.read_csv(data_dir+'/private_events_dev/private_mini/private_mini_TRAIN.txt', header=None, sep=' ')
meta_output = pd.read_csv(data_dir+'/private_events_dev/private_mini/private_mini_TRAIN_meta.txt', sep=',', parse_dates=['date'])
print(df_output.shape, meta_output.shape)
# Summing the boolean mask counts rows; this replaces `.count()[0]`, which
# depended on deprecated positional `[]` access to a label-indexed Series.
print('class 0:', (meta_output['class'] == 0).sum())
print('class 1:', (meta_output['class'] == 1).sum())
print(meta_output.head())
print(df_output.head())