In [1]:
import sys
sys.path.append('./libraries')

In [2]:
import pandas as pd
from libraries.windowing import return_windows
import pickle
from libraries.functions import calculate_magnitude, calculate_VeDBA, calculate_ODBA, calculate_pitch, calculate_roll

# File Paths

In [30]:
dataset_path = './dataset/dehorning_annotated_original_dataset.csv'
complete_dataset_save_path = './dataset/dehorning_annotated_complete_dataset.csv'
windowed_dataset_save_path = './dataset/windowed_data_six_labels_dataset.pkl'
data_amounts_per_calf_save_path = './dataset/information_datasets/data_amounts_per_calf_6_labels.csv'

# Consts

In [20]:
CONSIDERED_MAIN_LABELS = sorted(['drinking_milk', 'grooming', 'lying', 'running', 'walking'])
CONSIDERED_LABELS = sorted(['drinking_milk', 'grooming', 'lying', 'running', 'walking', 'other'])

# Reading the Dataset

In [5]:
dataset_df = pd.read_csv(dataset_path)
dataset_df.DateTime = pd.to_datetime(dataset_df.DateTime)
dataset_df.head()

Unnamed: 0,calf_id,DateTime,Accx,Accy,Accz,behaviour,behaviour_seq_id
0,1306,2022-02-24 00:08:34.402385,0.8125,0.390625,0.28125,oral manipulation of pen,0
1,1306,2022-02-24 00:08:34.441385,0.6875,0.375,0.375,oral manipulation of pen,0
2,1306,2022-02-24 00:08:34.481386,0.625,0.296875,0.265625,oral manipulation of pen,0
3,1306,2022-02-24 00:08:34.521386,0.703125,0.21875,0.21875,oral manipulation of pen,0
4,1306,2022-02-24 00:08:34.560387,0.734375,0.21875,0.25,oral manipulation of pen,0


# Deriving Additional Time-series

In [6]:
%%time

dataset_df['Amag'] = dataset_df.apply(lambda row: calculate_magnitude(row['Accx'], row['Accy'], row['Accz']), axis=1)
dataset_df['VeDBA'] = dataset_df.apply(lambda row: calculate_VeDBA(row['Accx'], row['Accy'], row['Accz']), axis=1)
dataset_df['ODBA'] = dataset_df.apply(lambda row: calculate_ODBA(row['Accx'], row['Accy'], row['Accz']), axis=1)
dataset_df['pitch'] = dataset_df.apply(lambda row: calculate_pitch(row['Accx'], row['Accy'], row['Accz']), axis=1)
dataset_df['roll'] = dataset_df.apply(lambda row: calculate_roll(row['Accx'], row['Accy'], row['Accz']), axis=1)

CPU times: user 2min 19s, sys: 2.51 s, total: 2min 21s
Wall time: 2min 21s


In [7]:
dataset_df = dataset_df[['calf_id', 'DateTime', 'Accx', 'Accy', 'Accz', 'Amag', 'VeDBA', 'ODBA', 'pitch', 'roll', 
                         'behaviour', 'behaviour_seq_id']]
dataset_df.head()

Unnamed: 0,calf_id,DateTime,Accx,Accy,Accz,Amag,VeDBA,ODBA,pitch,roll,behaviour,behaviour_seq_id
0,1306,2022-02-24 00:08:34.402385,0.8125,0.390625,0.28125,0.944376,0.944376,1.484375,17.326459,24.433259,oral manipulation of pen,0
1,1306,2022-02-24 00:08:34.441385,0.6875,0.375,0.375,0.868278,0.868278,1.4375,25.587529,25.587529,oral manipulation of pen,0
2,1306,2022-02-24 00:08:34.481386,0.625,0.296875,0.265625,0.741159,0.741159,1.1875,21.001443,23.612889,oral manipulation of pen,0
3,1306,2022-02-24 00:08:34.521386,0.703125,0.21875,0.21875,0.768172,0.768172,1.140625,16.544924,16.544924,oral manipulation of pen,0
4,1306,2022-02-24 00:08:34.560387,0.734375,0.21875,0.25,0.806014,0.806014,1.203125,18.069379,15.747434,oral manipulation of pen,0


# Preprocessing - Stage 01

In [36]:
# contains any nan values?
dataset_df.isna().any().any()

False

In [8]:
dataset_df.to_csv(complete_dataset_save_path, index=False)

# Windowing

In [9]:
window_duration = 3 
data_frequency = 25
min_window_size =  0.95
overlap_size = 0.5
datetime_column_name = 'DateTime'

calf_id_column_name = 'calf_id'
label_column_name = 'behaviour'
block_number_column_name = 'behaviour_seq_id'

# getting the unique calf ids
calf_ids = dataset_df.calf_id.unique()

In [10]:
%%time

window_data = {}

try:
    for calf_id in calf_ids:
        window_data[calf_id] = {}
        calf_data_df = dataset_df[dataset_df[calf_id_column_name] == calf_id]
        blocks = calf_data_df[block_number_column_name].unique()

        for block in blocks:
            block_df = calf_data_df[calf_data_df[block_number_column_name] == block]
            
            labels = block_df[label_column_name].unique()
            if len(labels) > 1:
                raise Exception('More than 1 label found per block! ' + str(calf_id) + ' : ' + str(block))
            else:
                label = labels[0]
                        
            # window the block data
            windows = return_windows(block_df, 
                                     window_duration=window_duration, 
                                     data_frequency = data_frequency,
                                     min_window_size = min_window_size,
                                     overlap = overlap_size,
                                     datetime_column_name = datetime_column_name
                                     )

            if len(windows) > 0:
                if not label in window_data[calf_id]:
                    window_data[calf_id][label] = []
                window_data[calf_id][label].extend(windows)

except Exception as e:
    print(e)

CPU times: user 49.6 s, sys: 244 ms, total: 49.8 s
Wall time: 49.7 s


# Preprocessing - Stage 02

## Converting to Six Labels

In [18]:
def change_keys(dictionary, CONSIDERED_MAIN_LABELS):
    new_dict = {}
    for key, value in dictionary.items():
        if key in CONSIDERED_MAIN_LABELS:
            new_dict[key] = value
        else:
            if not 'other' in new_dict:
                new_dict['other'] = []
            new_dict['other'].extend(value)
    return new_dict

window_data_six_labels = {}
for calf in window_data:
    window_data_six_labels[calf] = change_keys(window_data[calf], CONSIDERED_MAIN_LABELS)

## Total data amounts

In [19]:
total_data_amounts = {}

for calf in window_data_six_labels:
    for key in window_data_six_labels[calf]:
        if key not in total_data_amounts:
            total_data_amounts[key] = 0
        total_data_amounts[key] += len(window_data_six_labels[calf][key])
        
total_data_amounts

{'other': 23972,
 'grooming': 2547,
 'running': 1329,
 'walking': 1015,
 'lying': 24967,
 'drinking_milk': 5472}

## Data amounts for each calf

In [28]:
df_data = []
for calf_id, calf_data in window_data_six_labels.items():
    df_sub_data = []
    df_sub_data.append(calf_id)
    for key in CONSIDERED_LABELS:
        if key in calf_data.keys():
            df_sub_data.append(len(calf_data[key]))
        else:
            df_sub_data.append(0)
    df_data.append(df_sub_data)

df_data_amounts = pd.DataFrame(df_data, columns=['calf_id', 'drinking_milk', 'grooming', 'lying', 
                                                 'other', 'running', 'walking']
                              ).sort_values(by='calf_id').reset_index(drop=True)

df_data_amounts.head()

Unnamed: 0,calf_id,drinking_milk,grooming,lying,other,running,walking
0,1302,51,154,1930,1229,106,38
1,1303,333,32,1093,1933,35,38
2,1306,45,95,794,1582,73,29
3,1308,281,135,1859,1455,161,60
4,1312,36,120,195,2025,50,6


# Saving data and information

In [29]:
# saving window data
with open(windowed_dataset_save_path, 'wb') as f:
    pickle.dump(window_data_six_labels, f)

In [31]:
# saving data amounts information
df_data_amounts.to_csv(data_amounts_per_calf_save_path, index=False)