In [1]:
import pandas as pd
import json
import datetime as dt
import os
# load_dotenv()
# from dotenv import load_dotenv, dotenv_values

# File Reading / List population

In [2]:
# Directory that holds the raw export files; the loading cell reads from here.
directory = '../data/raw_data'
# os.listdir returns names in arbitrary (platform-dependent) order. Sorting the
# listing makes the order in which files are loaded - and therefore the row
# order after the per-source frames are concatenated - reproducible.
raw_data_files = sorted(os.listdir(directory))

# One (initially empty) list per JSON data source.
activity_df_list = []
sleep_df_list = []
skin_df_list = []
hrv_df_list = []
brv_df_list = []

# Three parallel lists, all indexed by data source:
#   list_of_dfs[i]  - container for the frames of source i
#   list_of_str[i]  - filename prefix that identifies the files of source i
#   list_of_norm[i] - top-level JSON key whose records are normalized for source i
list_of_dfs = [activity_df_list, sleep_df_list, skin_df_list, hrv_df_list, brv_df_list]
list_of_str = ['get_activities', 'get_sleep', 'get_skin', 'get_hrv', 'get_br']
list_of_norm = ['activities', 'sleep', 'tempSkin', 'hrv', 'br']

# Placeholders for the excel data dfs; set to None until they are read.
sleep_xsl_df = None
stress_xsl_df = None
mfp_df = None
gf_df = None



In [3]:
# Load every raw JSON file and turn it into a DataFrame, grouped by data source.
# After this cell, list_of_dfs[i] is a list holding one DataFrame per file whose
# name starts with list_of_str[i] and ends with '.json'.

for index in range(len(list_of_dfs)):
    file_prefix = list_of_str[index]
    record_key = list_of_norm[index]

    source_frames = []
    for file_name in raw_data_files:
        if not (file_name.startswith(file_prefix) and file_name.endswith('.json')):
            continue

        # The context manager closes each handle as soon as the file is parsed
        # (the bare open() call left every file open until garbage collection).
        # The encoding is explicit so parsing does not depend on the platform default.
        with open(os.path.join(directory, file_name), encoding='utf-8') as raw_file:
            payload = json.load(raw_file)

        # json_normalize flattens nested keys into dotted column names and
        # already returns a DataFrame, so no further conversion pass is needed.
        source_frames.append(pd.json_normalize(payload[record_key]))

    # Rebind the slot (rather than mutating the old list) so the cell can be
    # re-run even after a later cell has replaced the slot with a DataFrame.
    list_of_dfs[index] = source_frames

# Activity Data Cleaning

In [4]:
# Plan: loop through the activity files, normalize each one into a DataFrame, drop the unneeded columns
# (may need a guard for columns that are missing), concat all dfs, then transform the data, create the new columns, and change dtypes

In [5]:
# Strip the activity columns that were judged unnecessary for the analysis.

columns_remove = [
    'logId',
    'activityLevel',
    'logType',
    'caloriesLink',
    'heartRateLink',
    'tcxLink',
    'lastModified',
    'hasGps',
    'manualValuesSpecified.calories',
    'manualValuesSpecified.distance',
    'manualValuesSpecified.steps',
    'activeZoneMinutes.totalMinutes',
    'activeZoneMinutes.minutesInHeartRateZones',
    'distance',
    'speed',
    'pace',
    'distanceUnit',
    'source.type',
    'source.id',
    'source.url',
    'source.trackerFeatures',
    'source.name',
    'inProgress',
    'customHeartRateZones',
]

# Not every per-file frame carries every listed column; errors='ignore' drops
# the ones that are present and silently skips the rest.
# Slice assignment keeps the same list object in list_of_dfs[0].
list_of_dfs[0][:] = [
    activity_frame.drop(columns=columns_remove, errors='ignore')
    for activity_frame in list_of_dfs[0]
]

In [6]:
# Stack the per-file activity frames row-wise into a single DataFrame.
# ignore_index=True discards the per-file indexes and renumbers the rows.
# Note: this replaces the list in slot 0 with a DataFrame, so the cell is not re-runnable as-is.

list_of_dfs[0] = pd.concat(objs=list_of_dfs[0], axis=0, ignore_index=True)

In [7]:
# Expand the nested 'heartRateZones' column into flat per-zone columns.
# Each cell of 'heartRateZones' is expected to be a list of dicts with at least
# the keys 'name', 'caloriesOut' and 'minutes'. For every zone, two columns are
# produced: hrz_<Zone>_calories and hrz_<Zone>_minutes.

# Give list_of_dfs[0] an alias for simplicity
activity_df = list_of_dfs[0]

base_column_string = 'hrz'
# Zone names as they appear once spaces are removed, and the mapping from the
# column suffix to the key read from each zone dict.
hrz_zone_names = ['OutofRange', 'FatBurn', 'Cardio', 'Peak']
hrz_fields = {'calories': 'caloriesOut', 'minutes': 'minutes'}

# Collect the values in plain Python lists first and assign each column once at
# the end; this avoids one scalar DataFrame write per zone per row.
# Column order: the four *_calories columns, then the four *_minutes columns.
hrz_values = {
    base_column_string + '_' + zone_name + '_' + suffix: [None] * len(activity_df)
    for suffix in hrz_fields
    for zone_name in hrz_zone_names
}

for row_pos, zone_list in enumerate(activity_df['heartRateZones']):
    # A row without heart-rate-zone data may hold a scalar missing value (e.g.
    # NaN) instead of a list; leave its expanded columns as None rather than
    # failing while trying to iterate over it.
    if not isinstance(zone_list, (list, tuple)):
        continue

    for zone in zone_list:
        # Build the column prefix from the zone name, e.g. 'Fat Burn' -> 'hrz_FatBurn'
        zone_prefix = (base_column_string + '_' + zone['name']).replace(" ", "")
        for suffix, source_key in hrz_fields.items():
            target_col = zone_prefix + '_' + suffix
            if target_col not in hrz_values:
                # Same exception type as an unknown column lookup, but with a
                # message that points at the offending zone.
                raise KeyError(
                    "Unexpected heart rate zone name " + repr(zone['name'])
                    + " in row " + str(row_pos)
                )
            hrz_values[target_col][row_pos] = zone[source_key]

# dtype='object' keeps the columns untyped (None for missing rows); the numeric
# cast is done separately in a later cell.
for col_name, col_values in hrz_values.items():
    activity_df[col_name] = pd.Series(col_values, index=activity_df.index, dtype='object')

# Dropped in place on purpose: activity_df and list_of_dfs[0] are the same object
# and both must lose the nested column.
activity_df.drop(columns=['heartRateZones'], inplace=True)

In [8]:
# Shorten the column names as outlined in the document.
# NOTE(review): 'customHeartRateZones' is also listed in columns_remove, so if
# that column was already dropped this entry has no effect - confirm which is intended.
rename_act_cols = {
    "customHeartRateZones": "custom_hrz",
    "intervalWorkoutData.intervalSummaries": "iwd_intervalSummaries",
    "intervalWorkoutData.numRepeats": "iwd_numRepeats",
}

# Renamed in place so every existing reference to this frame sees the new names;
# keys that are not present in the frame are ignored.
list_of_dfs[0].rename(rename_act_cols, axis="columns", inplace=True)

In [9]:
# Cast the expanded heart-rate-zone columns to numeric dtypes:
# calories -> float64, minutes -> int64.
hrz_zone_labels = ('OutofRange', 'FatBurn', 'Cardio', 'Peak')
data_type_change = {
    **{'hrz_' + zone_label + '_calories': 'float64' for zone_label in hrz_zone_labels},
    **{'hrz_' + zone_label + '_minutes': 'int64' for zone_label in hrz_zone_labels},
}

# astype returns a new frame, so the slot in list_of_dfs is rebound to it
list_of_dfs[0] = list_of_dfs[0].astype(data_type_change)

In [10]:
# Sanity check: print the column names, non-null counts and dtypes of the activity frame
list_of_dfs[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   activityTypeId           294 non-null    int64  
 1   activityName             294 non-null    object 
 2   calories                 294 non-null    int64  
 3   steps                    293 non-null    float64
 4   averageHeartRate         294 non-null    int64  
 5   duration                 294 non-null    int64  
 6   activeDuration           294 non-null    int64  
 7   startTime                294 non-null    object 
 8   originalStartTime        294 non-null    object 
 9   originalDuration         294 non-null    int64  
 10  elevationGain            294 non-null    float64
 11  hasActiveZoneMinutes     294 non-null    bool   
 12  iwd_intervalSummaries    294 non-null    object 
 13  iwd_numRepeats           294 non-null    int64  
 14  hrz_OutofRange_calories  2

# Sleep Data Cleaning

In [11]:
# Loop through sleep logs, keep only certain columns per each, concat all dfs together

In [12]:
# Strip the sleep-log columns that are not needed downstream.
remove_sleep_cols = [
    'infoCode',
    'logId',
    'logType',
    'minutesAfterWakeup',
    'minutesToFallAsleep',
]

# errors='ignore' drops the listed columns that exist in a frame and skips the
# ones that do not. Slice assignment keeps the same list object in list_of_dfs[1].
list_of_dfs[1][:] = [
    sleep_frame.drop(columns=remove_sleep_cols, errors='ignore')
    for sleep_frame in list_of_dfs[1]
]

In [13]:
# Stack the per-file sleep frames row-wise into a single DataFrame.
# ignore_index=True discards the per-file indexes and renumbers the rows.
# Note: this replaces the list in slot 1 with a DataFrame, so the cell is not re-runnable as-is.

list_of_dfs[1] = pd.concat(objs=list_of_dfs[1], axis=0, ignore_index=True)

In [14]:
# Sanity check: print the column names, non-null counts and dtypes of the sleep frame
list_of_dfs[1].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292 entries, 0 to 291
Data columns (total 30 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   dateOfSleep                               292 non-null    object 
 1   duration                                  292 non-null    int64  
 2   efficiency                                292 non-null    int64  
 3   endTime                                   292 non-null    object 
 4   isMainSleep                               292 non-null    bool   
 5   minutesAsleep                             292 non-null    int64  
 6   minutesAwake                              292 non-null    int64  
 7   startTime                                 292 non-null    object 
 8   timeInBed                                 292 non-null    int64  
 9   type                                      292 non-null    object 
 10  levels.data                           

In [15]:
# Read sleep csv, (go through date conversion stuff and only keep necessary things), 
# (dont drop first row, idk why that is there), create datetimekey col and convert to datetime..., perform join, b

In [16]:
# Perform any necessary data cleaning as was done on sleep data csv...

# Skin / HRV / BRV Data Cleaning

In [17]:
# Copy the work done in the previous notebook; try to find a way to auto-detect the number of files with skin temp...

# Stress Data Cleaning