In [2]:
"""
FUNCTIONS
"""
import pandas as pd
import numpy as np

def create_samples_cell(exercises):
    """Compute summary statistics of the RR samples stored with the exercises.

    Args:
        exercises: DataFrame of normalised exercise records. When it has a
            ``samples.rr`` column, each cell is expected to hold a list of
            dicts whose ``'duration'`` entry is a string such as
            ``'PT0.812S'`` (presumably an ISO-8601 duration expressed in
            seconds -- confirm against the export format).

    Returns:
        Tuple ``(avg_rr, sd_rr, min_rr, max_rr)`` computed over every RR
        sample of every row. All four values are NaN when the column is
        missing or contains no samples, so callers can always unpack the
        result.
    """
    nan = float('nan')
    if 'samples.rr' not in exercises.columns:
        return nan, nan, nan, nan

    # Walk the cells directly instead of expanding them into a padded
    # DataFrame: rows with differing sample counts (or no samples at all,
    # which show up as NaN cells) would otherwise inject None placeholders.
    rr = [
        float(sample['duration'].split('T')[1].split('S')[0])
        for cell in exercises['samples.rr']
        if isinstance(cell, (list, tuple))
        for sample in cell
    ]
    if not rr:
        # np.min / np.max raise on an empty sequence.
        return nan, nan, nan, nan

    return np.mean(rr), np.std(rr), np.min(rr), np.max(rr)

def read_training(training_data):
    """Flatten one parsed training-session JSON document into a summary row.

    Three optional top-level sections are read when present: ``exercises``
    (only the first exercise record is used), ``loadInformation`` and
    ``periodData`` (searched for a ``RUNNING_TEST`` sub-period).

    Args:
        training_data: dict obtained from ``json.load`` on a training file.

    Returns:
        Tuple ``(d, dateTime)``. ``d`` is a ``pd.Series`` whose exercise and
        load entries are prefixed with the sport name (``"<sport>_duration"``
        ...) and whose running-test entries are prefixed with ``"CONTROL_"``.
        ``dateTime`` is the date part of the exercise start time, or the
        string ``'nan'`` when it cannot be determined. Missing values are
        ``float('nan')`` or the string ``'nan'``, depending on the field.

    Side effects:
        Prints a short message for some fields that cannot be read.
    """
    nan = float('nan')

    # Exercise variables. dateTime gets a default as well, so that a document
    # without an 'exercises' section can still be returned.
    dateTime = str('nan')
    duration = nan
    distance = nan
    sport = str('nan')
    ascent = nan
    descent = nan
    kiloCalories = str('nan')
    heartRate_min = str('nan')
    heartRate_avg = str('nan')
    heartRate_max = str('nan')
    speed_avg = nan
    speed_max = nan
    cadence_avg = str('nan')
    cadence_max = str('nan')

    avg_rr = nan
    sd_rr = nan
    min_rr = nan
    max_rr = nan

    if 'exercises' in training_data:
        exercises = pd.json_normalize(training_data, record_path=['exercises'])
        try:
            dateTime = str(exercises['startTime'][0]).split('T')[0]
        except Exception:
            print("dateTime not found")
        try:
            # Duration strings look like 'PT<seconds>S'.
            duration = float(exercises['duration'][0].split('T')[1].split('S')[0])
        except Exception:
            print("duration not found")
        try:
            sport = str(exercises['sport'][0])
        except Exception:
            print("sport not found")

        if 'distance' in exercises.columns:
            distance = float(exercises['distance'][0])
        if 'ascent' in exercises.columns:
            ascent = float(exercises['ascent'][0])
        if 'descent' in exercises.columns:
            descent = float(exercises['descent'][0])

        try:
            kiloCalories = int(exercises['kiloCalories'][0])
        except Exception:
            print("kiloCalories not found")

        if 'heartRate.min' in exercises.columns:
            heartRate_min = int(exercises['heartRate.min'][0])
        if 'heartRate.avg' in exercises.columns:
            heartRate_avg = int(exercises['heartRate.avg'][0])
        if 'heartRate.max' in exercises.columns:
            heartRate_max = int(exercises['heartRate.max'][0])
        # Each value goes to its own variable (previously all four of the
        # following were written into speed_avg).
        if 'speed.avg' in exercises.columns:
            speed_avg = float(exercises['speed.avg'][0])
        if 'speed.max' in exercises.columns:
            speed_max = float(exercises['speed.max'][0])
        if 'cadence.avg' in exercises.columns:
            cadence_avg = float(exercises['cadence.avg'][0])
        if 'cadence.max' in exercises.columns:
            cadence_max = float(exercises['cadence.max'][0])

        # Column names are matched literally, so test for the real column
        # rather than a wildcard pattern.
        if 'samples.rr' in exercises.columns:
            try:
                avg_rr, sd_rr, min_rr, max_rr = create_samples_cell(exercises)
            except Exception:
                print("rr samples not found")

    # loadInformation (including RPE when available)
    load_cardio = float(np.nan)
    load_muscle = float(np.nan)
    load_cardio_interp = str('nan')
    load_muscle_interp = str('nan')
    load_percieved = float(np.nan)
    load_percieved_interp = str('nan')
    load_sessionrpe = str('nan')

    load_raw = training_data['loadInformation'] if 'loadInformation' in training_data else None
    if isinstance(load_raw, (dict, list)) and load_raw:
        # json_normalize accepts both a single dict and a list of dicts.
        loadInformation = pd.json_normalize(load_raw)
        if len(loadInformation) > 0:
            # Work on the first row so that scalars (not whole columns) end
            # up in the result and the comparisons below are plain booleans.
            load_row = loadInformation.iloc[0]
            if 'cardioLoad' in load_row:
                load_cardio = load_row['cardioLoad']
            if 'muscleLoad' in load_row:
                load_muscle = load_row['muscleLoad']
            if 'cardioLoadInterpretation' in load_row:
                load_cardio_interp = load_row['cardioLoadInterpretation']
            if 'muscleLoadInterpretation' in load_row:
                load_muscle_interp = load_row['muscleLoadInterpretation']
            if 'sessionRPE' in load_row and load_row['sessionRPE'] != "UNKNOWN":
                load_sessionrpe = load_row['sessionRPE']
            if 'perceivedLoad' in load_row:
                load_percieved = load_row['perceivedLoad']
            if ('perceivedLoadInterpretation' in load_row
                    and load_row['perceivedLoadInterpretation'] != "NOT_AVAILABLE"):
                load_percieved_interp = load_row['perceivedLoadInterpretation']

    # periodData variables (RUNNING_TEST), in output column order.
    control = {
        "CONTROL_category": nan, "CONTROL_max_aerobic_speed": nan,
        "CONTROL_max_aerobic_power": nan, "CONTROL_vo2max": nan,
        "CONTROL_initial_speed": nan, "CONTROL_speed_increase_rate": nan,
        "CONTROL_quality_rate": nan, "CONTROL_duration": nan,
        "CONTROL_avg_hr": nan, "CONTROL_max_hr": nan,
        "CONTROL_avg_speed": nan, "CONTROL_max_speed": nan,
        "CONTROL_avg_cadence": nan, "CONTROL_max_cadence": nan,
        "CONTROL_avg_power": nan, "CONTROL_max_power": nan,
    }
    # Attribute 'key' of the running test itself -> output column.
    test_attribute_columns = {
        'RUNNING_TEST_CATEGORY': "CONTROL_category",
        'MAX_AEROBIC_SPEED': "CONTROL_max_aerobic_speed",
        'MAX_AEROBIC_POWER': "CONTROL_max_aerobic_power",
        'VO2MAX': "CONTROL_vo2max",
        'INITIAL_SPEED': "CONTROL_initial_speed",
        'SPEED_INCREASE_RATE': "CONTROL_speed_increase_rate",
        'QUALITY_RATE': "CONTROL_quality_rate",
    }
    # Attribute 'key' of the test's result sub-period -> output column.
    result_attribute_columns = {
        'DURATION': "CONTROL_duration",
        'AVG_HEART_RATE': "CONTROL_avg_hr",
        'MAX_HEART_RATE': "CONTROL_max_hr",
        'MAX_SPEED': "CONTROL_max_speed",
        'AVG_SPEED': "CONTROL_avg_speed",
        'AVG_CADENCE': "CONTROL_avg_cadence",
        'MAX_CADENCE': "CONTROL_max_cadence",
        'AVG_POWER': "CONTROL_avg_power",
        'MAX_POWER': "CONTROL_max_power",
    }

    if 'periodData' in training_data:
        # Best effort: any failure while walking the nested sub-periods is
        # reported once and the values collected so far are kept.
        # NOTE(review): the integer column access below assumes that
        # json_normalize turns a Series of lists into integer-labelled
        # columns -- confirm with the installed pandas version.
        try:
            subPeriods = pd.json_normalize(training_data['periodData']['subPeriods'])
            subPeriods_lower = pd.json_normalize(subPeriods['subPeriods'])

            for period_col in range(len(subPeriods_lower.columns)):
                sub_df = pd.json_normalize(subPeriods_lower[period_col])
                # Only row 0 (the first top-level sub-period) is inspected.
                if sub_df['type'][0] != 'RUNNING_TEST':
                    continue
                running_test = sub_df

                attributes = pd.json_normalize(running_test['attributes'])
                for attr_col in range(len(attributes.columns)):
                    cell = attributes[attr_col][0]
                    column_name = test_attribute_columns.get(cell['key'])
                    if column_name is not None:
                        control[column_name] = cell['numericValue']

                # The results are read from the test's second nested
                # sub-period (index 1).
                subPeriods_results = pd.json_normalize(running_test['subPeriods'])
                subPeriods_results = pd.json_normalize(subPeriods_results[1])
                subPeriods_results = pd.json_normalize(subPeriods_results['attributes'])
                for result_col in range(len(subPeriods_results.columns)):
                    cell = subPeriods_results[result_col][0]
                    column_name = result_attribute_columns.get(cell['key'])
                    if column_name is not None:
                        control[column_name] = cell['numericValue']
        except Exception:
            print("Harjoitustiedostossa ei subPeriod-dataa")

    summary = {
        sport + "_duration": duration,
        sport + "_distance": distance,
        sport + "_ascent": ascent,
        sport + "_descent": descent,
        sport + "_kiloCalories": kiloCalories,
        sport + "_heartRate_min": heartRate_min,
        sport + "_heartRate_avg": heartRate_avg,
        sport + "_heartRate_max": heartRate_max,
        sport + "_speed_avg": speed_avg,
        sport + "_speed_max": speed_max,
        sport + "_cadence_avg": cadence_avg,
        sport + "_cadence_max": cadence_max,
        sport + "_avg_rr": avg_rr,
        sport + "_sd_rr": sd_rr,
        sport + "_min_rr": min_rr,
        sport + "_max_rr": max_rr,
        sport + "_cardioLoad": load_cardio,
        sport + "_muscleLoad": load_muscle,
        sport + "_cardioLoadInterp": load_cardio_interp,
        sport + "_muscleLoadInterp": load_muscle_interp,
        sport + "_sessionRPE": load_sessionrpe,
        sport + "_percievedLoad": load_percieved,
        sport + "_percievedLoadInterp": load_percieved_interp,
    }
    summary.update(control)
    d = pd.Series(summary)

    return d, dateTime
    
def numOfDays(date1, date2):
    """Return the non-negative number of whole days between two dates.

    The arguments may be passed in either order; the sign of the difference
    is discarded before the day count is taken.
    """
    gap = abs(date2 - date1)
    return gap.days


def read_activity(activity_data):
    """Extract the daily activity summary from one parsed activity document.

    Args:
        activity_data: dict obtained from ``json.load`` on an activity file.

    Returns:
        Tuple ``(d, dateTime)``. ``d`` is a ``pd.Series`` with the entries
        stepCount, stepsDistance, calories, sleepQuality, sleepDuration,
        inactivityAlertCount and dailyMetMinutes; ``dateTime`` is the
        document's ``date`` value as a string. Fields that are absent are
        reported as ``float('nan')`` or as the string ``'nan'``.
    """
    missing_text = str('nan')
    missing_number = float('nan')

    table = pd.json_normalize(activity_data)
    available = table.columns

    def first(column):
        # Value of `column` in the first (row label 0) record.
        return table[column][0]

    day = str(first('date')) if 'date' in activity_data else missing_text

    # The fields that default to the string 'nan' still need to be converted
    # to numeric columns once all rows have been collected.
    steps = int(first('summary.stepCount')) if 'summary.stepCount' in available else missing_text
    steps_distance = (float(first('summary.stepsDistance'))
                      if 'summary.stepsDistance' in available else missing_number)
    burned = int(first('summary.calories')) if 'summary.calories' in available else missing_text
    sleep_quality = (float(first('summary.sleepQuality'))
                     if 'summary.sleepQuality' in available else missing_number)
    if 'summary.sleepDuration' in available:
        # Duration strings look like 'PT<seconds>S'.
        seconds_text = first('summary.sleepDuration').split('T')[1].split('S')[0]
        sleep_seconds = int(seconds_text)
    else:
        sleep_seconds = missing_text
    alerts = (int(first('summary.inactivityAlertCount'))
              if 'summary.inactivityAlertCount' in available else missing_text)
    met_minutes = (float(first('summary.dailyMetMinutes'))
                   if 'summary.dailyMetMinutes' in available else missing_number)

    d = pd.Series({
        "stepCount": steps,
        "stepsDistance": steps_distance,
        "calories": burned,
        "sleepQuality": sleep_quality,
        "sleepDuration": sleep_seconds,
        "inactivityAlertCount": alerts,
        "dailyMetMinutes": met_minutes,
    })

    return d, day

In [7]:
import os

import pandas as pd
import numpy as np
import glob
import json
from datetime import date
from datetime import datetime
from dateutil import relativedelta

# Study period: files before begin_date's month are skipped, and the calendar
# index below spans begin_date..end_date inclusive.
begin_date = datetime(2023, 9, 1)
end_date = datetime(2024, 10, 31)

files = glob.glob("../polar_user_data/*/*.json")

# Subject id = name of the directory that contains the file. os.path is used
# so that this works with both "/" and "\\" path separators.
subjects = []
for file in files:
    subject = os.path.basename(os.path.dirname(file))
    if subject not in subjects:
        subjects.append(subject)

# File-name prefixes found in an export; only 'training' and 'activity' are
# parsed below.
main_files = ['247ohr', 'activity', 'nightly', 'sleep', 'training']

# Go through every subject's data: one Series per (subject, date) key.
# NOTE(review): a second file with the same subject and date replaces the
# earlier entry, so only one training session per day is kept -- confirm this
# is intended.
training = {}
activity = {}

first_month = (begin_date.year, begin_date.month)

for file in files:
    # Take the subject from the path component itself; substring matching on
    # the whole path could assign a file to a subject whose id is contained
    # in another subject's id.
    s = os.path.basename(os.path.dirname(file))
    name_parts = os.path.basename(file).split('-')
    filename = name_parts[0].split('_')[0]

    if filename == "training":
        # The year and month are read from the third and fourth
        # '-'-separated parts of the file name.
        year, month = int(name_parts[2]), int(name_parts[3])
        reader, store = read_training, training
    elif filename == "activity":
        # The year and month are read from the second and third parts.
        year, month = int(name_parts[1]), int(name_parts[2])
        reader, store = read_activity, activity
    else:
        continue

    if (year, month) < first_month:
        continue

    # Close the file deterministically and read it as UTF-8 instead of the
    # platform's locale encoding.
    with open(file, encoding="utf-8") as handle:
        datafile = json.load(handle)
    series, dateTime = reader(datafile)
    store[(s, dateTime)] = series

training_df = pd.DataFrame.from_dict(training).transpose()
activity_df = pd.DataFrame.from_dict(activity).transpose()

# Create main_df: one row per subject and calendar day.
difference = numOfDays(begin_date, end_date)

# Build the range from both endpoints so that end_date itself gets a row
# (periods=difference stopped one day short).
calendar = pd.date_range(begin_date, end_date)
columns = pd.DataFrame(calendar)
day_labels = [day.strftime("%Y-%m-%d") for day in calendar]
col_arr = day_labels * len(subjects)

id_arr = [subject for subject in subjects for _ in day_labels]

arrays = [
    id_arr,
    col_arr
]

main_df = pd.DataFrame(index=arrays)
merged_df = activity_df.join(training_df, how="outer")
main_df = main_df.join(merged_df, how="outer")

# Timestamp suffix of the form _YYYYMMDD_HHMMSS for the output file name.
now = datetime.now()
date_time = now.strftime("_%Y%m%d_%H%M%S")

main_df.to_excel("main_df" + date_time + ".xlsx")
print("done")

Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
kiloCalories not found
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedostossa ei subPeriod-dataa
Harjoitustiedosto

In [4]:
# Leftover inspection cell: displays whatever `file` is currently bound to
# (presumably the last path assigned by a file loop). It depends on kernel
# state from a previously executed cell and fails on a fresh kernel.
file

'../polar_user_data\\HDR101_polar-user-data-export_f76dcea7-ae2d-4e03-864f-189e2b6254c1\\247ohr_2018_01-e61dcba7-2e17-47db-add2-96829652a41e.json'