# Add to meta data
The database spreadsheet contains additional information about each sensor sample, namely the positive scent 
sample's concentration and the dog's response. This notebook adds this additional metadata to the dataset
wherever a corresponding row can be found.


In [None]:
import itertools
from pathlib import Path
import datetime
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# --- User inputs -------------------------------------------------------------
# do_save   : write the updated meta/sensor files in the final cell when True.
# file      : CSV export of the behaviour database.
# name_root : common prefix of the sensor data file names.
# file_root : name_root prefixed with the folder holding the sensor data.
do_save = True
file = '../data/private_data/dog_behaviour_database_private_flat.csv'
name_root = 'private_events_dev_TRAIN_'
file_root = '../data/private_data/private_events_dev/'+name_root

# --- Load --------------------------------------------------------------------
# Behaviour database; its 'Date' column is parsed as datetimes.
database = pd.read_csv(file, parse_dates=['Date'])
# Sensor meta data (comma separated, 'date' parsed as datetimes) and the
# numeric sensor data, which np.loadtxt reads into an array.
meta = pd.read_csv(file_root+'_meta.txt', sep=',', parse_dates=['date'])
raw_readings = np.loadtxt(Path(file_root+'.txt'))
dataset = pd.DataFrame(raw_readings)

# Quick look at the first rows of both tables.
for frame in (database, meta):
    print(frame.head())

# --- Join meta and sensor data side by side ------------------------------------
# axis=1 places the dataset columns to the right of the meta columns; rows are
# aligned on the index.
meta_and_data = pd.concat([meta, dataset], axis=1)
print('meta:', meta.shape, 'dataset:', dataset.shape, 'meta_and_data:', meta_and_data.shape)
# NOTE(review): head() already limits the output to 5 rows, so the [:10] row
# slice has no effect -- confirm whether the first 10 columns were intended.
print(meta_and_data.head()[:10])

In [None]:
# Drop every database row whose key columns are not unique.
# Example from the original notes: 2018-10-03 had two sessions, so run numbers
# are re-used in the database. The sensor data lives in separate 'Session1' and
# 'Session2' folders, but that information is not available in the meta data,
# so such rows cannot be told apart.
# keep=False marks *all* members of a duplicate group, not just the repeats.
db_key_columns = ['Date', 'DogName', 'Run', 'Pass', 'SensorNumber', 'y_true']
db_is_ambiguous = database.duplicated(subset=db_key_columns, keep=False)
database_u = database[~db_is_ambiguous]
print('Database original shape:', database.shape)
print('Duplicates removed:', database_u.shape)

In [None]:
# Find sensor rows that share the same key columns (the original notes report
# some 30 rows that differ only by timestamp). Such rows cannot be matched
# unambiguously to a single database row.
# md_dups   : every row belonging to a duplicated key group (shown for inspection).
# md_unique : the remaining rows, i.e. all members of duplicated groups dropped.
md_key_columns = ['date', 'dog', 'run', 'pass', 'sensor_number', 'class']
md_is_ambiguous = meta_and_data.duplicated(subset=md_key_columns, keep=False)

md_dups = meta_and_data[md_is_ambiguous]
print('Duplicates shape:', md_dups.shape)
print(md_dups.sort_values(by=md_key_columns))

md_unique = meta_and_data[~md_is_ambiguous]
print('Meta original shape:', meta_and_data.shape)
print('Effective duplicates removed:', md_unique.shape)

In [None]:
# Attach the database columns to each sensor row with an inner join on the key
# columns, enforcing a one-to-one relationship.
#
# Both sides are the de-duplicated tables (database_u and md_unique), so every
# surviving sensor row maps to exactly one database row. The previous version
# merged the full meta_and_data table with validate='one_to_many', which let
# sensor rows with identical keys all pick up the same database row even though
# they cannot be told apart.
# validate='one_to_one' makes pandas raise a MergeError if either side still
# contains repeated keys.
#
# Rows without a partner on the other side are dropped by the inner join, e.g.
# sensor rows whose database entry was removed as a duplicate.
# The database columns come first in the result, followed by the meta and
# sensor columns.
md_all = database_u.merge(md_unique, how='inner',
                          left_on=['Date', 'DogName', 'Run', 'Pass', 'SensorNumber', 'y_true'],
                          right_on=['date', 'dog', 'run', 'pass', 'sensor_number', 'class'],
                          validate='one_to_one', indicator=False)
print(md_all.tail())
print('meta original shape:', meta_and_data.shape)
# Shape of the sensor table after its ambiguous rows were dropped.
print('meta_u original shape:', md_unique.shape)
print('md_all shape:', md_all.shape)
# First 30 column names of the merged table.
print(list(md_all)[:30])

In [None]:
# Separate the merged table back into meta data and sensor data by position.
# NOTE(review): the split assumes the first 21 columns of md_all are the
# database + meta columns and everything after is sensor data -- confirm this
# still holds if either input table gains or loses a column.
meta_column_count = 21
meta_new = md_all.iloc[:, :meta_column_count]
dataset_new = md_all.iloc[:, meta_column_count:]
print(list(meta_new))

# Keep the sensor meta columns plus the three columns taken from the database
# ('Concentration', 'IsLastPass', 'y_pred'), in this order.
kept_columns = [
    'filename', 'date', 'time', 'dog', 'run', 'pass', 'positive_position',
    'sensor_number', 'class', 'breakpoint_0', 'breakpoint_1',
    'Concentration', 'IsLastPass', 'y_pred',
]
meta_re = meta_new[kept_columns]
print(list(meta_re))

# Rename the database's 'y_pred' column to 'dog_pred'; index=str also converts
# the row labels to strings.
meta_re = meta_re.rename(index=str, columns={'y_pred': 'dog_pred'})
print(list(meta_re))

# Shapes before and after, to check how many rows survived the merge.
print('dataset:', dataset.shape)
print('md_all:', md_all.shape)
print('meta_re:', meta_re.shape)
print('dataset_new:', dataset_new.shape)
print(dataset_new.head())

In [None]:
# Write the updated meta data and sensor data, but only when do_save is set.
# Both file names are built from name_root alone, so the files are written
# relative to the current working directory.
if do_save:
    # Meta data as CSV without the row index.
    file = name_root + 'update_meta.txt'
    meta_re.to_csv(file, index=False)

    # Sensor data as space-separated text, each value formatted with '%f'.
    values_to_write = dataset_new.to_numpy()
    file = name_root + 'update.txt'
    np.savetxt(file, values_to_write, fmt='%f', delimiter=' ')