## Imports

In [128]:
import pandas as pd
import os
import matplotlib.pyplot as plt

# for importing apple data 
import xml.etree.ElementTree as ET

# Project-local helper module; it is reloaded right after import so that edits
# made to it are picked up without restarting the kernel.
import functions
from importlib import reload
reload(functions)

# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas warnings such as SettingWithCopyWarning and FutureWarning.
# Consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')

## Preparing the data 

#### Load and filter Xiaomi data

In [129]:
# order of people: Andy, Tom, Huub, Oumaima

# One Mi Fitness export per participant, listed in the order given above.
mi_fitness_exports = [
    'data/20231030_8210796956_MiFitness_hlth_center_fitness_data.csv',
    'data/20231030_8211531339_MiFitness_hlth_center_fitness_data.csv',
    'data/20231031_8210564343_MiFitness_hlth_center_fitness_data.csv',
    'data/20231110_8210586841_MiFitness_hlth_center_fitness_data.csv',
]

# Read each export into its own dataframe (person1 .. person4).
person1, person2, person3, person4 = map(pd.read_csv, mi_fitness_exports)

In [130]:
# Tag every row with the participant it belongs to (1-based, same order as the
# files were loaded) so the rows stay attributable once they are stacked.
for person_id, frame in enumerate((person1, person2, person3, person4), start=1):
    frame['Person ID'] = person_id

# Stack the four per-person frames into one frame with a fresh 0..n-1 index.
master_frame = pd.concat([person1, person2, person3, person4], ignore_index=True)


In [131]:
# Split the combined export into one dataframe per key via the project helper.
key_dataframes = functions.format_mi_band_data(master_frame)

# Only the heart-rate and night-sleep frames are used in this notebook.
x_heart_rate_df, x_sleep_df = (
    key_dataframes[key] for key in ('heart_rate', 'watch_night_sleep')
)

In [132]:
# Restrict the Xiaomi data to the study period.
# The bounds are compared directly against the 'Time' column
# (assumes 'Time' holds Unix epoch seconds - TODO confirm).

# 1696118399 = Saturday 30 September 2023 23:59:59 - the night before the first time we wake up
# 1697371200 = Sunday 15 October 2023 12:00:00 - the last time we wake up with a normal alarm (or smart wake for Tom)
# 1698537599 = Saturday 28 October 2023 23:59:59 - the last day we wake up

start_time = 1696118399
end_time = 1698537599

# Both bounds are inclusive.
heart_rate_in_study = x_heart_rate_df['Time'].between(start_time, end_time)
sleep_in_study = x_sleep_df['Time'].between(start_time, end_time)

filtered_heart_rate_df = x_heart_rate_df[heart_rate_in_study]
filtered_watch_night_sleep_df = x_sleep_df[sleep_in_study]

#### Load and filter Apple data

In [133]:
# Load and parse the Apple Health export (data/export.xml under the current
# working directory).
current_path = os.getcwd()
tree = ET.parse(os.path.join(current_path, 'data', 'export.xml'))
root = tree.getroot()

# Flatten every <Record> into one dictionary: its XML attributes plus one
# entry per nested <MetadataEntry> (key -> value).
data = []
for record in root.findall('Record'):
    # Copy the attributes: record.attrib is the element's own dictionary, so
    # adding the metadata keys to it directly would modify the parsed tree.
    record_data = dict(record.attrib)
    record_data.update(
        (metadata.get('key'), metadata.get('value'))
        for metadata in record.findall('MetadataEntry')
    )
    data.append(record_data)

# Convert the list of dictionaries into a DataFrame (one row per record)
apple_df = pd.DataFrame(data)

In [134]:
# Strip the HealthKit identifier prefixes so that only the short type name
# (e.g. 'HeartRate', 'SleepAnalysis') remains in the 'type' column.
apple_df['type'] = (
    apple_df['type']
    .str.replace('HKQuantityTypeIdentifier', '')
    .str.replace('HKCategoryTypeIdentifier', '')
)

# Keep only the record types relevant for our study
relevant_types = ['SleepAnalysis', 'HeartRate']
is_relevant = apple_df['type'].isin(relevant_types)
apple_df_filtered = apple_df[is_relevant]

In [135]:
# NOTE: these conversions act on apple_df only. apple_df_filtered was sliced
# out of apple_df in an earlier cell, so its date columns are still the raw
# export strings (e.g. '2023-09-30 00:00:46 +0100').
apple_df['creationDate'] = pd.to_datetime(apple_df['creationDate'])
apple_df['startDate'] = pd.to_datetime(apple_df['startDate'])
apple_df['endDate'] = pd.to_datetime(apple_df['endDate'])

# Filter for the time period; both bounds are inclusive calendar days.
start_date = '2023-09-30'
end_date = '2023-10-28'

# Compare only the leading 'YYYY-MM-DD' part of creationDate. Comparing the
# full value against end_date dropped every record created on the last study
# day, because '2023-10-28 07:00:00 +0100' sorts after '2023-10-28'.
# astype(str) keeps this working whether the column holds raw strings or
# already-parsed datetimes (e.g. when the cell is re-run).
creation_day = apple_df_filtered['creationDate'].astype(str).str[:10]
apple_df_filtered = apple_df_filtered[creation_day.between(start_date, end_date)]

##### Get apple heart rate dataframe

In [136]:
# Heart-rate records only, reduced to the timestamp and value columns.
heart_rate_columns = ['creationDate', 'startDate', 'endDate', 'value']
a_heart_rate_df = apple_df_filtered.loc[
    apple_df_filtered['type'] == 'HeartRate', heart_rate_columns
]
a_heart_rate_df.head()

Unnamed: 0,creationDate,startDate,endDate,value
121528,2023-09-30 00:00:46 +0100,2023-09-29 23:49:49 +0100,2023-09-29 23:49:49 +0100,72
121529,2023-09-30 00:00:46 +0100,2023-09-29 23:54:35 +0100,2023-09-29 23:54:35 +0100,69
121530,2023-09-30 00:12:15 +0100,2023-09-29 23:59:35 +0100,2023-09-29 23:59:35 +0100,70
121531,2023-09-30 00:12:15 +0100,2023-09-30 00:04:34 +0100,2023-09-30 00:04:34 +0100,70
121532,2023-09-30 00:14:57 +0100,2023-09-30 00:09:45 +0100,2023-09-30 00:09:45 +0100,69


##### Get Apple sleep dataframe

In [137]:
# Sleep-analysis records only (all columns kept).
is_sleep_record = apple_df_filtered['type'] == 'SleepAnalysis'
a_sleep_df = apple_df_filtered[is_sleep_record]
# a_sleep_df.head()

In [138]:
# a_sleep_df was produced by boolean indexing, so take an explicit copy before
# assigning columns; writing to the slice directly triggers pandas'
# SettingWithCopyWarning (currently hidden by the global warning filter).
a_sleep_df = a_sleep_df.copy()

# Convert startDate, endDate, and creationDate to datetime if not already done
for date_column in ('startDate', 'endDate', 'creationDate'):
    a_sleep_df[date_column] = pd.to_datetime(a_sleep_df[date_column])

# Filter out 'HKCategoryValueSleepAnalysisInBed'; copy again so that later
# cells can add columns to the result without touching a slice.
a_sleep_df = a_sleep_df[a_sleep_df['value'] != 'HKCategoryValueSleepAnalysisInBed'].copy()

# Define a function to categorize sleep states
def categorize_sleep_state(value):
    """Map an Apple sleep-analysis value to a numeric sleep-state code.

    The first keyword found in ``value`` decides the result, checked in this
    order: 'Deep' -> 4, 'REM' -> 2, 'Core' -> 3 (assumed to be light sleep),
    'Awake' -> 1.

    Args:
        value: The record's 'value' entry; must support the ``in`` operator
            (normally a string).

    Returns:
        The integer code of the first matching keyword, or the string
        'unknown' when none of the keywords occurs in ``value``.
    """
    # Order matters: a value containing several keywords takes the first hit.
    keyword_codes = (
        ('Deep', 4),
        ('REM', 2),
        ('Core', 3),  # Assuming 'Core' is light sleep
        ('Awake', 1),
    )
    for keyword, code in keyword_codes:
        if keyword in value:
            return code
    return 'unknown'

# Add two derived columns per record:
#   sleep_state - numeric code from categorize_sleep_state (or 'unknown')
#   duration    - length of the record in minutes (endDate - startDate)
a_sleep_df = a_sleep_df.assign(
    sleep_state=a_sleep_df['value'].apply(categorize_sleep_state),
    duration=(a_sleep_df['endDate'] - a_sleep_df['startDate']).dt.total_seconds() / 60,
)

In [139]:
def _summarise_sleep_group(creation_date, group_records):
    """Build one summary row for all sleep records sharing a creationDate.

    Args:
        creation_date: The group's creationDate value.
        group_records: Rows of the per-record sleep frame for that group; needs
            the columns 'startDate', 'creationDate', 'sleep_state', 'duration'.

    Returns:
        dict with the wake-up date/time (taken from the creationDate of the
        record with the latest startDate), that record's sleep state, the
        summed duration per sleep state and the sum over all states.
    """
    # Sort by startDate so that the last row is the latest record of the group.
    ordered = group_records.sort_values(by='startDate')
    last_created = ordered['creationDate'].iloc[-1]

    # Total duration per sleep-state code.
    minutes_per_state = ordered.groupby('sleep_state')['duration'].sum()

    return {
        'creation_date': creation_date,
        'wake_up_date': last_created.date(),
        'wake_up_time': last_created.time(),
        'last_sleep_state': ordered['sleep_state'].iloc[-1],
        'sleep_deep_duration': minutes_per_state.get(4, 0),
        'sleep_light_duration': minutes_per_state.get(3, 0),
        'sleep_rem_duration': minutes_per_state.get(2, 0),
        'sleep_awake_duration': minutes_per_state.get(1, 0),
        # Sum over every state present, i.e. awake and 'unknown' minutes are
        # included as well.
        'total_sleep_duration': sum(minutes_per_state),
    }


# Records are grouped by their exact creationDate value
# (presumably all records of one night share it - TODO confirm).
aggregated_data = [
    _summarise_sleep_group(creation_date, day_data)
    for creation_date, day_data in a_sleep_df.groupby('creationDate')
]

# Replace the per-record frame by the per-group summary frame.
a_sleep_df = pd.DataFrame(aggregated_data)


#### Load behaviour tracking data 

In [140]:
# Behaviour tracking answers, read from the Excel export.
behaviour_tracking_path = 'data/Behavioural data app.xlsx'
behaviour_tracking_data = pd.read_excel(behaviour_tracking_path)

#### Save all data for further preprocessing 

In [110]:
# Heart xiaomi, sleep xiaomi, heart apple, sleep apple - written in this order.
# NOTE(review): the Xiaomi frames saved here are the unfiltered x_* frames, not
# the filtered_* frames restricted to the study period, whereas the Apple
# frames were restricted to the study period - confirm this is intended.
csv_exports = {
    'data/x_heart_rate.csv': x_heart_rate_df,
    'data/x_sleep.csv': x_sleep_df,
    'data/a_heart_rate.csv': a_heart_rate_df,
    'data/a_sleep.csv': a_sleep_df,
}

for csv_path, frame in csv_exports.items():
    frame.to_csv(csv_path, index=False)

## Exploratory Data Analysis

Keep this section simple; the Analysis section goes into more depth:
- five-number summary per person for the heart and sleep data
- heart rate scatter plots, in general and during sleep
- heart rate plots over time, per person
- distribution of values for the binary variables

In [111]:
# Numbers: data points per person, five number summary, find extreme values etc 
# do the same for all people together - merged dataset 

In [124]:
# x_heart_rate_df.head()

In [125]:
# x_sleep_df.head()

In [126]:
# a_heart_rate_df.head()

In [127]:
# a_sleep_df.head()