**[Log 31.10.2023]**  
This code does not include the important step of checking and cleaning the physical activity data provided by Fitbit (for instance, MET values should satisfy 0 < MET < 18, dates should contain no duplicate or incorrect values, etc.).

In [4]:
import numpy as np
import pandas as pd
import os
from datetime import date

########## Definition of threshold values
# Every threshold is available under a descriptive name and a one-letter alias.

# Minimum number of steps for a day to be considered
numberOfSteps = 1
a = numberOfSteps

# Tolerated relative difference between "min-per-min data resampled by day"
# and "daily data" for a day to be considered
percentageDiffMinDay = 0.1
b = percentageDiffMinDay

# Number of minutes per day above RMR
duration_of_PA_min = 600
c = duration_of_PA_min

# Number of hours per day containing at least `number_active_minutes` minutes above RMR
duration_of_PA_hour = 10
d = duration_of_PA_hour

# Minutes above RMR required for an hour to count as active
number_active_minutes = 1
e = number_active_minutes

# Number of steps in a day
number_of_steps = 3000
f = number_of_steps

# Number of hours with HR data
number_of_HR_hour = 10
g = number_of_HR_hour

# Julien's add: when True, the minute data are restricted to waking hours
wakingHours = True
h = wakingHours

########## Step one - Time serie cleaning procedure

# 1- Import MINUTE data
# Define the path where you have stored the Fitbit raw minute data (Steps, MET, Calories, Intensity)
path_min = os.path.join(os.getcwd(), 'samples/min')
# We are only interested in Steps and Calories here; ActivityMinute becomes the time index
columns_to_keep = ['Steps', 'Calories', 'ActivityMinute', 'ID']
dict_min = {}
# The listing is sorted so that participants are always processed in the same
# order, whatever order the file system returns them in.
for file_name in sorted(os.listdir(path_min)):
    if not file_name.endswith(".csv"):  # we only import the .csv files
        continue
    file_path = os.path.join(path_min, file_name)
    df = pd.read_csv(file_path, usecols=columns_to_keep)
    df = df.set_index('ActivityMinute')
    df.index = pd.to_datetime(df.index)

    # Julien's add: keep only the minutes between 05:00 and 22:59.
    # .copy() gives an independent frame, so adding columns below does not
    # write into a slice of the frame that was read from disk.
    if h:
        df = df.between_time('5:00', '22:59').copy()

    # Resting metabolic rate (RMR) = lowest per-minute Calories value of each day,
    # broadcast back onto every minute of that day
    df['RMR'] = df.resample('D')['Calories'].transform('min')
    # The dictionary key (and the ID column) is the first 15 characters of the file name
    key = file_name[:15]
    df['ID'] = key
    dict_min[key] = df

# 2- Resample the minute data as daily
# Per-day aggregation: totals for steps and calories, the mean of the
# per-minute RMR column, and the first ID of the day
daily_aggregations = {
    'Steps': 'sum',
    'Calories': 'sum',
    'RMR': 'mean',
    'ID': 'first',
}
dict_min_d = {}
for participant_key, minute_df in dict_min.items():
    daily_totals = minute_df.resample('D').agg(daily_aggregations)
    # Daily RMR = 1440 minutes * per-minute RMR
    dict_min_d[participant_key] = daily_totals.assign(dailyRMR=daily_totals['RMR'] * 1440)

# 3- Import DAILY data
# Define the path where you have stored the Fitbit raw daily data (Steps, MET, Calories, Intensity)
path_day = os.path.join(os.getcwd(), 'samples/day')
# Only the daily step total and calories are used; ActivityDay becomes the date index
columns_to_keep = ['StepTotal', 'Calories', 'ActivityDay', 'ID']
dict_day = {}
# The listing is sorted so that participants are always processed in the same
# order, whatever order the file system returns them in.
for file_name in sorted(os.listdir(path_day)):
    if not file_name.endswith(".csv"):  # we only import the .csv files
        continue
    file_path = os.path.join(path_day, file_name)
    df = pd.read_csv(file_path, usecols=columns_to_keep)

    df = df.set_index('ActivityDay')
    df.index = pd.to_datetime(df.index)
    # The dictionary key (and the ID column) is the first 15 characters of the file name
    key = file_name[:15]
    df['ID'] = key
    dict_day[key] = df

# 4- Estimate the record of at least one step during the day (or 1 calorie, not used in the manuscript)
# The flags are added in place to the daily frames held in dict_min_d.
for daily_df in dict_min_d.values():
    daily_df['oneStep'] = daily_df['Steps'] >= numberOfSteps
    # not used in the manuscript
    # NOTE(review): 'Calories' is the daily sum while 'RMR' is a per-minute
    # value, so this is true for almost any day with data - confirm whether
    # 'dailyRMR' was the intended comparison.
    daily_df['oneCalorie'] = daily_df['Calories'] > daily_df['RMR']

# 5- Estimate the difference between minute data resampled by day
columns_to_keep = ['Steps', 'Calories', 'ID', 'Date', 'oneStep', 'oneCalorie', 'diffMinDay']
merged_dict = {}
# Only participants present in both sources are compared; sorting makes the
# processing order deterministic.
for key in sorted(dict_min_d.keys() & dict_day.keys()):
    df_min_d = dict_min_d[key]
    df_day = dict_day[key]
    # Merge minute data resampled by day (suffix _m) with daily data (suffix _d).
    # The frames are aligned on their date index (outer join).
    merged_df = pd.concat([df_min_d.add_suffix('_m'), df_day.add_suffix('_d')], axis=1)
    # Calculate the difference in steps and calories between minute data resampled by day with daily data
    merged_df['diff_step'] = np.abs(merged_df['StepTotal_d'] - merged_df['Steps_m'])
    merged_df['diff_cal'] = np.abs(merged_df['Calories_d'] - merged_df['Calories_m'])
    # The tolerance is a fraction of the participant's mean daily total
    threshold_step = percentageDiffMinDay * merged_df['StepTotal_d'].mean()
    threshold_cal = percentageDiffMinDay * merged_df['Calories_d'].mean()
    merged_df['diff_step_ok'] = merged_df['diff_step'] < threshold_step
    merged_df['diff_cal_ok'] = merged_df['diff_cal'] < threshold_cal
    # diffMinDay is True when either the steps or the calories differ too much
    # (a day missing from one source compares as NaN and is therefore flagged)
    merged_df['diffMinDay'] = ~(merged_df['diff_step_ok'] & merged_df['diff_cal_ok'])
    # Format the index as date strings and add the Date column
    merged_df.index = pd.to_datetime(merged_df.index).strftime('%Y-%m-%d')
    merged_df['Date'] = merged_df.index
    # oneStep / oneCalorie are taken from the columns already merged in above
    # ('oneStep_m', 'oneCalorie_m'). Re-assigning them from df_min_d at this point
    # would require aligning a datetime index onto the string index created above.
    final_df = merged_df[['StepTotal_d', 'Calories_d', 'ID_d', 'Date',
                          'oneStep_m', 'oneCalorie_m', 'diffMinDay']].copy()
    # Keep the necessary columns only, and rename them
    final_df.columns = columns_to_keep
    # Store the final DataFrame in the dictionary
    merged_dict[key] = final_df

########## Step two - estimation of valid wear

#1- 3 different methods based on accelerometer data
result_dict = {}
for key, df in dict_min.items():
    # Minute-level flag: True when the calories of that minute exceed the day's RMR.
    # It is also stored on the minute frame as 0/1 in 'minAboveRMR'.
    above_rmr = df['Calories'] > df['RMR']
    df['minAboveRMR'] = above_rmr.astype(int)

    # Method 1: Days with valid wear required a minimum of 600 minutes above the estimated RMR
    n_min_above_rmr = above_rmr.resample('D').sum()
    method1 = (n_min_above_rmr >= duration_of_PA_min).rename('Method1')

    # Method 2: Number of hours per days containing at least X active minutes.
    # The hourly flags are resampled to days directly instead of being written
    # back onto the minute-level frame: aligning hourly timestamps onto minute
    # rows only keeps the hours whose HH:00 minute is present in the data.
    active_hours = df['minAboveRMR'].resample('h').sum() >= number_active_minutes
    n_active_hours = active_hours.resample('D').sum()
    method2 = (n_active_hours >= duration_of_PA_hour).rename('Method2')

    # Method 3: Number of steps per day (not used in the manuscript).
    # Only the Steps column is summed, so no non-numeric column is aggregated.
    method3 = (df['Steps'].resample('D').sum() >= number_of_steps).rename('Method3')

    # Concat all methods (one boolean column each, one row per day), and add them to the results
    df_final = pd.concat([method1, method2, method3], axis=1)
    df_final = df_final.rename_axis('Date')
    result_dict[key] = df_final

# Append the three method columns to the daily frame of each participant
for key, df in dict_min_d.items():
    if key in result_dict:
        dict_min_d[key] = pd.concat([df, result_dict[key]], axis=1)

#2- One method based on HR data
# (plus synchronisation and Device name informations from Fitbit)
path_HR = os.path.join(os.getcwd(),'samples/HR')
weartime_dict = {}
syncevents_dict = {}
syncevents_dict2 = {}
deviceName_dict = {}
diff_synch = {}
for file in sorted(os.listdir(path_HR)):
    if not file.endswith(".csv"):
        continue

    # Number of minutes with HR values, indexed by day
    if "WearTime" in file:
        key = file[:3]  # the first three characters of the file name identify the participant
        weartime_dict.setdefault(key, [])
        wear_df = pd.read_csv(os.path.join(path_HR, file))
        wear_df["Day"] = pd.to_datetime(wear_df["Day"])
        weartime_dict[key] = wear_df.set_index("Day")

    # Number of synchronisations per day (not used in the manuscript) and device name
    if "syncEvents" in file:
        key = file[:3]
        syncevents_dict.setdefault(key, [])
        sync_df = pd.read_csv(os.path.join(path_HR, file))
        sync_df["DateTime"] = pd.to_datetime(sync_df["DateTime"])
        sync_df = sync_df.set_index("DateTime")
        # Raw synchronisation events, kept as read
        syncevents_dict2[key] = sync_df
        # Device name of the first event of each calendar day (it can change over time)
        first_event_per_day = sync_df.groupby(sync_df.index.date).first()
        deviceName_dict[key] = first_event_per_day['DeviceName']
        # Count of synchronisation events per day
        syncevents_dict[key] = sync_df['SyncDateUTC'].resample('D').count().rename("synchPerDay")
# Combine wear time, synchronisation counts and device name per participant
concat_dict = {}
for key in syncevents_dict:
    # Only participants with wear time, synchronisation events and a device name are processed
    if key not in weartime_dict or key not in deviceName_dict:
        continue
    weartime_df = weartime_dict[key]
    # Attach the per-day device name (aligned on the date index); this also
    # adds the column to the frame stored in weartime_dict
    weartime_df['DeviceName'] = deviceName_dict[key]
    # Keep only the days present in both the wear-time and the synchronisation data
    concat_df = pd.concat([weartime_df, syncevents_dict[key]], axis=1, join='inner')
    concat_df = concat_df.assign(ID=key)
    # DeviceName needs to be filled for missing dates
    # (.ffill() replaces the deprecated fillna(method='ffill'))
    concat_df['DeviceName'] = concat_df['DeviceName'].ffill()
    # Memory length based on the model, stored as a negative number of days:
    # Alta = 5 days of memory, others = 7 days
    concat_df['X'] = np.where(concat_df['DeviceName'] == 'Alta', -5, -7)
    # MethodHR is TRUE if >= 10 hours of wearing time over the day (600 minutes with HR data)
    concat_df['MethodHR'] = concat_df['TotalMinutesWearTime'] / 60 >= number_of_HR_hour
    # measuredHR is TRUE if any MethodHR is true for the subject (therefore, the watch was able to measure HR)
    concat_df['measuredHR'] = concat_df['MethodHR'].any()

    # not used in the manuscript
    # SYNCH_FITBIT is TRUE if any synchronisation occured during the day
    concat_df["SYNCH_FITBIT"] = concat_df["synchPerDay"] != 0
    # LASTSYNCH_D = (date of the most recent day with SYNCH_FITBIT = TRUE) - (current date), in days:
    # 0 on a synchronised day, negative on the days that follow it.
    days = concat_df.index.to_series()
    last_sync_day = days.where(concat_df["SYNCH_FITBIT"].to_numpy()).ffill()
    if len(days):
        # Days preceding the first synchronisation are measured against the
        # last day of the record, which yields a non-negative value
        last_sync_day = last_sync_day.fillna(days.iloc[-1])
    concat_df['LASTSYNCH_D'] = (last_sync_day - days).dt.days.astype(int)
    # dataLostSynch is TRUE when the last synchronisation is at least as old as the device memory
    concat_df['dataLostSynch'] = concat_df['LASTSYNCH_D'] <= concat_df['X']
    # Drop the helper column and move ID to the last position
    concat_df = concat_df.drop(columns=['ID', 'X'])
    concat_df['ID'] = key
    concat_dict[key] = concat_df

# merging all methods together
# The HR dictionaries are keyed by the participant id alone, so the daily frames
# are looked up by the part of their key that precedes the first "."
dict_min_d_updated_keys = {key.split(".")[0]: value for key, value in dict_min_d.items()}
# Remember which dict_min_d key each short id came from, so that the merged
# frame replaces that exact entry instead of being stored under a rebuilt name
original_keys = {key.split(".")[0]: key for key in dict_min_d}
for key in dict_min_d_updated_keys.keys() & concat_dict.keys():
    df_min_d = dict_min_d_updated_keys[key]
    # here we are only interested in MethodHR (plus device and synchronisation information)
    df_concat = concat_dict[key][['DeviceName', 'MethodHR', 'measuredHR', 'dataLostSynch']]
    # Convert indices to datetime if they aren't already
    if not pd.api.types.is_datetime64_any_dtype(df_min_d.index):
        df_min_d.index = pd.to_datetime(df_min_d.index)
    if not pd.api.types.is_datetime64_any_dtype(df_concat.index):
        df_concat.index = pd.to_datetime(df_concat.index)
    # Ensure both DataFrames have the same index by taking the intersection of their indices
    common_index = df_min_d.index.intersection(df_concat.index)
    df_min_d = df_min_d.loc[common_index]
    df_concat = df_concat.loc[common_index]
    # Merge the DataFrames on their indices using concat with axis=1
    merged_df = pd.concat([df_min_d, df_concat], axis=1)
    # Only keep columns of interest for the manuscript
    columns_to_keep = ['ID', 'oneStep', 'oneCalorie', 'DeviceName', 'measuredHR', 'dataLostSynch',
                       'Steps', 'Calories',
                       'Method1', 'Method2', 'Method3', 'MethodHR']
    merged_df = merged_df[columns_to_keep]
    # Update dict_min_d with the merged DataFrame using the original key
    dict_min_d[original_keys[key]] = merged_df

##### Saving results (independently and all together)
save_path = os.path.join(os.getcwd(), 'results')  # define here your path
results_dir_missing = not os.path.exists(save_path)
if results_dir_missing:
    os.makedirs(save_path)

all_dfs = []
for participant_key, participant_df in dict_min_d.items():
    # Turn the date index into a regular column; an unnamed index comes out of
    # reset_index() as 'index' and is renamed to 'Date'
    flat_df = participant_df.reset_index().rename(columns={'index': 'Date'})
    all_dfs.append(flat_df)
    # One file per participant.
    # NOTE(review): the keys appear to already end in ".csv" (e.g. '093.csv'),
    # so the files are presumably written as "<id>.csv.csv" - confirm this is intended.
    flat_df.to_csv(os.path.join(save_path, f"{participant_key}.csv"), index=True)

# One file with all participants stacked
concatenated_df = pd.concat(all_dfs)
concatenated_file_path = os.path.join(save_path, "concatenated.csv")
concatenated_df.to_csv(concatenated_file_path, index=False)


# Show participant 093 as example
dict_min_d['093.csv'].head()


Unnamed: 0,ID,oneStep,oneCalorie,DeviceName,measuredHR,dataLostSynch,Steps,Calories,Method1,Method2,Method3,MethodHR
2023-02-24,093.csv,True,True,Alta HR,True,False,6852,2578.959183,True,True,True,True
2023-02-25,093.csv,True,True,Alta HR,True,False,7268,3022.053579,True,True,True,True
2023-02-26,093.csv,True,True,Alta HR,True,False,8966,2982.914397,True,True,True,True
2023-02-27,093.csv,True,True,Alta HR,True,False,12893,3411.02959,True,True,True,True
2023-02-28,093.csv,True,True,Alta HR,True,False,12964,3128.599201,True,True,True,True


From there, we can easily select days:
- with at least one step recorded, 
- with at least one calorie recorded, 
- with the correct device name used, recording HR,
- with no data loss due to too sparse synchronisation (not used in the manuscript).  
  
After that, we can investigate, for each individual and each day, the step count and the number of calories spent, depending on the method applied:
- method1: Days with valid wear required a minimum of 600 minutes above the estimated RMR
- method2: Days with valid wear required a minimum of 10 hours containing at least 1 minute above the RMR
- method3: Days with valid wear required a minimum of 3000 steps (not used in the manuscript)
- methodHR: Days with valid wear required a minimum of 600 minutes with HR data