In [53]:
import pandas as pd
import os
import matplotlib.pyplot as plt

# for importing apple data 
import xml.etree.ElementTree as ET

# Re-import `functions` on every run of this cell so that edits to that
# module are picked up without restarting the kernel.
import functions
from importlib import reload
reload(functions)

# NOTE(review): with no category or module given, this filter hides every
# warning for the rest of the session (pandas deprecation and
# chained-assignment warnings included). Consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

### Load structured data 

In [54]:
# Behaviour log (Excel workbook).
behaviour_tracking_data = pd.read_excel('data/Behavioural data app.xlsx')

# Wearable exports. The "x_" sleep frame is later handed to the Mi Band
# helper and the "a_" sleep frame to the Apple Watch helper.
x_heart_rate_df, x_sleep_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/x_heart_rate.csv', 'data/x_sleep.csv')
)
a_heart_rate_df, a_sleep_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/a_heart_rate.csv', 'data/a_sleep.csv')
)


### Merge sleep data and behavioral data into "aggregated_df"

In [55]:
# Work on an independent copy so the loaded behaviour log stays untouched.
aggregated_df = behaviour_tracking_data.copy(deep=True)

In [56]:
# Calendar date on which each behaviour entry was created.
created_at = pd.to_datetime(aggregated_df['Date created'])
aggregated_df['Date'] = created_at.dt.date

# Person 5 is handled by the Apple Watch helper, everyone else by the
# Mi Band helper.
is_person_5 = aggregated_df['Person ID'] == 5

# for apple watch
aggregated_df_person_5 = functions.get_wake_up_info_applewatch(
    a_sleep_df,
    aggregated_df[is_person_5],
)

# for mi band
aggregated_df_rest = functions.get_wake_up_info_miband(
    x_sleep_df,
    aggregated_df[~is_person_5],
)

# aggregate: stack both groups back into one frame with a fresh index
aggregated_df = pd.concat(
    [aggregated_df_person_5, aggregated_df_rest],
    ignore_index=True,
)

In [57]:
# Row count before filtering.
print(len(aggregated_df))
# Keep only the rows for which a time_of_awakening is present.
aggregated_df = (
    aggregated_df
    .dropna(subset=['time_of_awakening'])
    .reset_index(drop=True)
)
# Row count after filtering.
print(len(aggregated_df))

119
111


Here, 8 observations could not be linked because the corresponding sleep data is missing, caused by a failed Xiaomi export.

### Aggregate the heart rate for both types of devices

In [58]:
# Only keep the relevant columns of heart rate data
x_heart_rate_df = x_heart_rate_df[['Person ID', 'Time', 'bpm']]

# Prepare the "a_" heart rate export for merging: same column names as the
# "x_" export, with every row attributed to person 5 (the participant that is
# routed to the Apple Watch helper when the sleep data is linked).
apple_data_prepared = (
    a_heart_rate_df[['creationDate', 'value']]
    .rename(columns={'value': 'bpm'})
)
apple_data_prepared['Person ID'] = 5

# Convert 'creationDate' to Unix epoch time (whole seconds since 1970-01-01 UTC).
# Parsing with utc=True gives a single UTC-based datetime column even when the
# strings carry different UTC offsets (e.g. across a DST change). Dividing the
# offset from the epoch by a one-second Timedelta does not depend on the
# column's internal time resolution, unlike `astype('int64') // 10**9`.
# A missing creationDate results in a missing 'Time'.
creation_ts = pd.to_datetime(apple_data_prepared['creationDate'], utc=True)
unix_epoch = pd.Timestamp('1970-01-01', tz='UTC')
apple_data_prepared['Time'] = (creation_ts - unix_epoch) // pd.Timedelta(seconds=1)

# Drop the original 'creationDate' column
apple_data_prepared = apple_data_prepared.drop(columns='creationDate')

# Stack both sources into one frame and persist it for later use.
heart_rate_df = pd.concat([x_heart_rate_df, apple_data_prepared], ignore_index=True)
heart_rate_df.to_csv('data/heart_rate.csv', index=False)

### Add measures and combine heart rate data into "aggregated_df"

In [59]:
time_interval = 60  # interval length handed to the helper; assumed to be minutes

# Statistic columns added to aggregated_df, in the order in which
# functions.calculate_test_statistics_heartrate returns its values.
heart_rate_stat_columns = (
    'Number of Measurements',
    'Average Heart Rate',
    'Average Lowest Three obs',
    'Average First Ten min',
    'Average First Thirty min',
)

# Start every statistic column out as missing.
for stat_column in heart_rate_stat_columns:
    aggregated_df[stat_column] = pd.NA

# NOTE(review): row['Date'] is still the creation date at this point; rows
# flagged "Yesterday" are only shifted back one day in a later cell. Confirm
# that the helper below selects the intended day for those rows.
for index, row in aggregated_df.iterrows():
    # Heart rate samples for this person around the recorded awakening
    heart_rate_data = functions.get_heartrate_data_for_interval(
        heart_rate_df,
        row['Person ID'],
        row['Date'],
        row['time_of_awakening'],
        time_interval,
    )

    # Summary statistics (exactly five values are expected)
    n_obs, mean_hr, mean_lowest_three, mean_first_ten, mean_first_thirty = (
        functions.calculate_test_statistics_heartrate(heart_rate_data)
    )
    row_stats = (n_obs, mean_hr, mean_lowest_three, mean_first_ten, mean_first_thirty)

    # Write the statistics into this row of aggregated_df
    for stat_column, stat_value in zip(heart_rate_stat_columns, row_stats):
        aggregated_df.at[index, stat_column] = stat_value

### In the old code the outliers were removed at this point, but it is unclear whether that is justified
The reason given for removing them was: sports activity or too few measurements.

### Small final changes on the dataframe to make further analysis easier

In [60]:
# Encode booleans numerically across aggregated_df: False -> 0, then True -> 1.
aggregated_df = aggregated_df.replace(False, 0).replace(True, 1)

In [61]:
# Rows flagged "Yesterday", shown before their Date is shifted.
aggregated_df.loc[aggregated_df["Yesterday"].eq(1)]

Unnamed: 0,Person ID,Drinks,Fastfood,Sports,Food 23 before sleep,Medication,Date created,Woke up by (smart) alarm,Woke up by external factors,Yesterday,Slept again after alarm,Smart alarm,Date,time_of_awakening,state_before_awakening,Number of Measurements,Average Heart Rate,Average Lowest Three obs,Average First Ten min,Average First Thirty min
22,5,0,0,0,0,1,2023-10-26T06:43:53.000Z,1,0,1,0,1,2023-10-26,07:43:07,2.0,14,74.250721,68.0,73.0,72.333333
65,3,0,0,0,0,0,2023-10-20T19:02:52.000Z,1,0,1,0,1,2023-10-20,05:21:00,3.0,60,69.166667,58.666667,64.7,67.5
72,4,0,0,0,1,0,2023-10-06T06:51:13.000Z,1,0,1,0,0,2023-10-06,06:33:00,3.0,21,85.285714,69.666667,81.8,82.0
77,4,0,0,1,0,0,2023-10-12T05:32:44.000Z,1,0,1,0,0,2023-10-12,04:56:00,3.0,24,91.125,70.333333,89.4,89.333333
105,2,1,1,1,1,0,2023-10-23T06:48:41.000Z,1,0,1,0,0,2023-10-23,03:18:00,3.0,60,53.483333,43.333333,56.6,52.8


In [62]:
# Move the Date of every entry flagged "Yesterday" back by one day.
# Not idempotent: running this cell again shifts the same rows once more.
flagged_yesterday = aggregated_df['Yesterday'] == 1
one_day = pd.to_timedelta(1, unit='d')
aggregated_df.loc[flagged_yesterday, 'Date'] = (
    aggregated_df.loc[flagged_yesterday, 'Date'] - one_day
)

In [63]:
# Same rows again, to verify that their Date moved back by one day.
aggregated_df.loc[aggregated_df["Yesterday"].eq(1)]

Unnamed: 0,Person ID,Drinks,Fastfood,Sports,Food 23 before sleep,Medication,Date created,Woke up by (smart) alarm,Woke up by external factors,Yesterday,Slept again after alarm,Smart alarm,Date,time_of_awakening,state_before_awakening,Number of Measurements,Average Heart Rate,Average Lowest Three obs,Average First Ten min,Average First Thirty min
22,5,0,0,0,0,1,2023-10-26T06:43:53.000Z,1,0,1,0,1,2023-10-25,07:43:07,2.0,14,74.250721,68.0,73.0,72.333333
65,3,0,0,0,0,0,2023-10-20T19:02:52.000Z,1,0,1,0,1,2023-10-19,05:21:00,3.0,60,69.166667,58.666667,64.7,67.5
72,4,0,0,0,1,0,2023-10-06T06:51:13.000Z,1,0,1,0,0,2023-10-05,06:33:00,3.0,21,85.285714,69.666667,81.8,82.0
77,4,0,0,1,0,0,2023-10-12T05:32:44.000Z,1,0,1,0,0,2023-10-11,04:56:00,3.0,24,91.125,70.333333,89.4,89.333333
105,2,1,1,1,1,0,2023-10-23T06:48:41.000Z,1,0,1,0,0,2023-10-22,03:18:00,3.0,60,53.483333,43.333333,56.6,52.8


In [46]:
# The flag has been folded into 'Date', so the "Yesterday" column can go.
aggregated_df = aggregated_df.drop('Yesterday', axis=1)


In [49]:
# Persist the final aggregated_df as CSV, without the index column.
aggregated_df.to_csv(path_or_buf='data/aggregated_df.csv', index=False)