In [None]:
import sys
# Add ./libraries to the module search path. The path is relative, so it is
# resolved against the kernel's working directory.
# NOTE(review): the imports below use the `libraries.` package prefix, so this
# entry is presumably needed for imports made inside those modules - confirm.
sys.path.append('./libraries')

In [None]:
import pandas as pd
from libraries.windowing import return_windows
import pickle
from libraries.functions import calculate_magnitude, calculate_VeDBA, calculate_ODBA, calculate_pitch, calculate_roll

# File Paths

In [None]:
# Input: the annotated raw dataset read at the start of the notebook.
# NOTE: dehorning_annotated_original_dataset.csv is yet to be released.
dataset_path = './dataset/dehorning_annotated_original_dataset.csv'
# Output: the dataset after the derived time-series columns have been added.
complete_dataset_save_path = './dataset/dehorning_annotated_complete_dataset.csv'
# Output: pickle of the windowed data after conversion to six labels.
windowed_dataset_save_path = './dataset/windowed_data_six_labels_dataset.pkl'
# Output: CSV with the number of windows per calf and label.
data_amounts_per_calf_save_path = './dataset/information_datasets/data_amounts_per_calf_6_labels.csv'

# Constants

In [None]:
# Behaviours that keep their own label, in alphabetical order.
CONSIDERED_MAIN_LABELS = sorted(('drinking_milk', 'grooming', 'lying', 'running', 'walking'))
# Full label set: the main behaviours plus the catch-all 'other', re-sorted.
CONSIDERED_LABELS = sorted([*CONSIDERED_MAIN_LABELS, 'other'])

# Reading the Dataset

In [None]:
# Read the annotated dataset and convert the DateTime column to pandas
# datetimes in one chained expression.
dataset_df = pd.read_csv(dataset_path).assign(
    DateTime=lambda frame: pd.to_datetime(frame['DateTime'])
)
dataset_df.head()

# Deriving Additional Time-series

In [None]:
%%time

# Each derived column is computed row by row from the three accelerometer axes
# by the helper listed next to it. Dict order is the order in which the columns
# are appended to the frame.
derived_series = {
    'Amag': calculate_magnitude,
    'VeDBA': calculate_VeDBA,
    'ODBA': calculate_ODBA,
    'pitch': calculate_pitch,
    'roll': calculate_roll,
}

for column_name, compute in derived_series.items():
    dataset_df[column_name] = dataset_df.apply(
        lambda sample: compute(sample['Accx'], sample['Accy'], sample['Accz']),
        axis=1,
    )

In [None]:
# Keep only these columns, in this order: identifiers and timestamp, raw axes,
# derived series, then the annotation columns.
kept_columns = [
    'calf_id', 'DateTime',
    'Accx', 'Accy', 'Accz',
    'Amag', 'VeDBA', 'ODBA', 'pitch', 'roll',
    'behaviour', 'behaviour_seq_id',
]
dataset_df = dataset_df[kept_columns]
dataset_df.head()

# Preprocessing - Stage 01

In [None]:
# Does the dataset contain any NaN values?
# isna() marks every missing cell, the first any() reduces that to one flag per
# column, and the second any() reduces those flags to a single boolean.
# The result is only displayed; nothing below is conditioned on it.
dataset_df.isna().any().any()

In [None]:
# Save the dataset including the derived columns; index=False keeps the row
# index out of the CSV.
dataset_df.to_csv(complete_dataset_save_path, index=False)

# Windowing

In [None]:
# Names of the columns the windowing step reads.
datetime_column_name = 'DateTime'
calf_id_column_name = 'calf_id'
label_column_name = 'behaviour'
block_number_column_name = 'behaviour_seq_id'

# Settings forwarded to return_windows.
# NOTE(review): units are not visible in this notebook - presumably seconds for
# the duration, Hz for the frequency and fractions of a window for the last
# two; confirm against libraries/windowing.
window_duration = 3
data_frequency = 25
min_window_size = 0.95
overlap_size = 0.5

# Unique calf ids, in order of first appearance in the dataset.
calf_ids = dataset_df[calf_id_column_name].unique()

In [None]:
%%time

# Build {calf_id: {behaviour label: [window, ...]}} by windowing every block
# (rows sharing one behaviour_seq_id) of every calf separately.
#
# Errors are deliberately not caught here. If a failure were swallowed,
# `window_data` would stay half-filled and the cells below would save it as if
# it were complete. Letting the exception propagate stops "Run All" at this cell.
window_data = {}

for calf_id in calf_ids:
    window_data[calf_id] = {}
    calf_data_df = dataset_df[dataset_df[calf_id_column_name] == calf_id]
    blocks = calf_data_df[block_number_column_name].unique()

    for block in blocks:
        block_df = calf_data_df[calf_data_df[block_number_column_name] == block]

        # A block must carry exactly one behaviour label, which becomes the
        # label of every window cut from it.
        labels = block_df[label_column_name].unique()
        if len(labels) > 1:
            raise ValueError('More than 1 label found per block! ' + str(calf_id) + ' : ' + str(block))
        label = labels[0]

        # window the block data
        windows = return_windows(block_df,
                                 window_duration=window_duration,
                                 data_frequency=data_frequency,
                                 min_window_size=min_window_size,
                                 overlap=overlap_size,
                                 datetime_column_name=datetime_column_name
                                 )

        # Blocks that yield no window do not create an (empty) label entry.
        if len(windows) > 0:
            window_data[calf_id].setdefault(label, []).extend(windows)

# Preprocessing - Stage 02

## Converting to Six Labels

In [None]:
def change_keys(dictionary, CONSIDERED_MAIN_LABELS):
    """Merge every label outside ``CONSIDERED_MAIN_LABELS`` into 'other'.

    Args:
        dictionary: maps a label to a list of items.
        CONSIDERED_MAIN_LABELS: labels that keep their own key.

    Returns:
        A new dict. Kept labels map to the same list objects as in
        ``dictionary`` (no copy). The lists of all other labels are
        concatenated, in iteration order, under the key 'other'. That key is
        only present when at least one label was merged.
    """
    relabelled = {}
    for label, items in dictionary.items():
        if label not in CONSIDERED_MAIN_LABELS:
            relabelled.setdefault('other', []).extend(items)
            continue
        relabelled[label] = items
    return relabelled

# Apply the six-label conversion to each calf's label -> windows mapping.
window_data_six_labels = {
    calf: change_keys(calf_windows, CONSIDERED_MAIN_LABELS)
    for calf, calf_windows in window_data.items()
}

## Total data amounts

In [None]:
# Number of windows per label, summed over all calves.
total_data_amounts = {}

for calf_windows in window_data_six_labels.values():
    for label, windows in calf_windows.items():
        total_data_amounts[label] = total_data_amounts.get(label, 0) + len(windows)

total_data_amounts

## Data amounts for each calf

In [None]:
# One row per calf: its id followed by the number of windows it has for each
# label in CONSIDERED_LABELS (0 when the calf has no window for that label).
df_data = [
    [calf_id] + [len(calf_data.get(label, [])) for label in CONSIDERED_LABELS]
    for calf_id, calf_data in window_data_six_labels.items()
]

# The column names are taken from CONSIDERED_LABELS itself, the same list that
# fixed the order of the counts above, so headers and values cannot drift
# apart if the label set is ever edited.
df_data_amounts = (
    pd.DataFrame(df_data, columns=['calf_id', *CONSIDERED_LABELS])
    .sort_values(by='calf_id')
    .reset_index(drop=True)
)

df_data_amounts.head()

# Saving data and information

In [None]:
# Pickle the six-label window dictionary to disk.
with open(windowed_dataset_save_path, 'wb') as pickle_file:
    pickle.dump(window_data_six_labels, pickle_file)

In [None]:
# Save the per-calf window counts table; index=False keeps the row index out
# of the CSV.
df_data_amounts.to_csv(data_amounts_per_calf_save_path, index=False)