# Process Fitness Data
Notebook for the extraction and processing of fitness logs
### TOC:
* [Init Environment](#init_environment)
* [Extract Data](#extract_data)
* [Transform Data](#transform_data)
* [Load Data](#load_data)

***
## Init Environment <a class="anchor" id="init_environment"></a>

In [1]:
%load_ext autoreload
%autoreload 2

# imports
import pandas as pd
import datetime
import pprint

from pydrive.drive import GoogleDrive
from pydrive.auth import GoogleAuth


# misc objects
# Auth handle for Google Drive. Constructing it produced no prompt in the recorded
# run; the browser OAuth prompt appeared in the output of the cell that first
# lists Drive files.
gauth = GoogleAuth()
# Drive client bound to the auth handle above; used later to list and download files.
drive = GoogleDrive(gauth)


# misc modules
from getStravaActivities import findStravaActivities
from fitnessHelperFunctions import *

You are using the Extract API 2.0, please save the output as .hyper format


In [2]:
# misc globals

# relative working directories
path_to_conf = "./conf/"
path_to_logs = "./logs/"
path_to_tableau = "./tableau/"

# local temp copy of the tracker spreadsheet, and the extract file names
file_tmp = "tmp_FY20 H1 Workout Tracker.xlsx"
file_out_weightlifting = "weightlifting_extract.hyper"
file_out_cardio = "cardio_extract.hyper"
file_out_strava = "strava_extract.hyper"

# 'Exercise' values that are treated as weightlifting rows
lookup_weightlifting = {"Bench Press", "Deadlifts", "Shoulder Press", "Squat"}

# source column -> output column for weightlifting rows.
# The key order is also used as the column order when the frame is sliced.
columns_weightlifting = {
    "workout_uid": "Workout UID",
    "Rotation": "Rotation",
    "1RM": "Goal 1RM",
    "Workout": "Workout",
    "Exercise": "Exercise",
    "Week": "Week",
    "Sets": "Sets",
    "Reps": "Reps",
    "% 1RM (pace if running)": "% of Goal 1RM",
    "Actual Lift": "Actual Lift",
    "Date": "Date",
    "Time": "Time",
    "Bodyweight": "Bodyweight",
    "Notes": "Notes",
    "timestamp": "Timestamp",
}

# 'Exercise' values that are treated as cardio rows
lookup_cardio = {"Run"}

# source column -> output column for cardio rows; the shared sheet columns
# '% 1RM (pace if running)' and 'Actual Lift' carry pace and distance here.
columns_cardio = {
    "workout_uid": "Workout UID",
    "Rotation": "Rotation",
    "Workout": "Workout",
    "Exercise": "Exercise",
    "Week": "Week",
    "% 1RM (pace if running)": "Average Pace",
    "Actual Lift": "Total Distance",
    "Date": "Date",
    "Time": "Time",
    "Notes": "Notes",
    "timestamp": "Timestamp",
}

# Google-native mimetype -> Office mimetype to request when downloading
mimetypes = {
    # Drive Document files as MS Word files.
    "application/vnd.google-apps.document":
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    # Drive Sheets files as MS Excel files.
    "application/vnd.google-apps.spreadsheet":
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
}


In [4]:
# init: print a banner with the time the run started
init_stamp = datetime.datetime.now().strftime('%D %r')
print("SYSTEM INIT: {0}".format(init_stamp))

SYSTEM INIT: 08/15/20 03:58:31 PM


***
## Extract Data <a class="anchor" id="extract_data"></a>

In [5]:
# extract raw log from google drive

# Title of the tracker spreadsheet to look for in the Drive root folder.
tracker_title = "FY20 H1 Workout Tracker"

# Every non-trashed file that sits directly in the Drive root folder.
file_list = drive.ListFile({'q': "'root' in parents and trashed=false"}).GetList()

tracker_found = False
for file1 in file_list:
    if file1['title'] != tracker_title:
        continue

    # Google-native files are requested in the mapped Office format;
    # any other mimetype is downloaded with mimetype=None.
    download_mimetype = mimetypes.get(file1['mimeType'])

    # write out to .xlsx as tmp file locally; file_tmp is the same path the
    # read step loads, so the download and the read cannot drift apart
    file1.GetContentFile(file_tmp, mimetype=download_mimetype)
    tracker_found = True

# Fail here rather than letting the next step read a stale or missing temp file.
if not tracker_found:
    raise FileNotFoundError(
        "'{0}' was not found in the Google Drive root folder".format(tracker_title)
    )

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=874659251868-8fdkk74gtuje4j65gvjosq2gcu447hga.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.


#### Extract data from Google Drive

In [10]:
# load the temporary workbook into a dataframe
# (the first 3 rows of the sheet are skipped)

df_raw = pd.read_excel(io=file_tmp, skiprows=3)
print(df_raw.shape)
df_raw.head(n=3)

(227, 13)


Unnamed: 0,Rotation,1RM,Workout,Exercise,Week,Sets,Reps,% 1RM (pace if running),Actual Lift,Date,Time,Bodyweight,Notes
0,7.0,,1.0,Shoulder Press,1.0,3.0,5,"65, 75, 85",5x50,2020-07-12 00:00:00,16:00:00,,
1,,,,Shoulder Press,2.0,3.0,3,"70, 80, 90",5x50,2020-07-20 00:00:00,21:00:00,90.8,
2,,,,Shoulder Press,3.0,3.0,5 / 3 / 1,"75, 85, 95",2x60,2020-07-29 00:00:00,15:30:00,89.7,


#### Extract data from Strava

In [7]:
# pull the activity list through the local Strava helper module
df_strava = findStravaActivities()
print(df_strava.shape)
df_strava.head(n=3)

(83, 48)


Unnamed: 0,resource_state,athlete,name,distance,moving_time,elapsed_time,total_elevation_gain,type,workout_type,id,...,average_speed,max_speed,has_heartrate,heartrate_opt_out,display_hide_heartrate_option,elev_high,elev_low,pr_count,total_photo_count,has_kudoed
0,2,"{'id': 46800838, 'resource_state': 1}",Heavy (cause I'm a huge lunk),7008.9,2425,3256,78.1,Run,0,3905601905,...,2.89,8.3,False,False,False,95.8,77.9,0,1,False
1,2,"{'id': 46800838, 'resource_state': 1}",Slip n slide,7534.7,2663,3117,79.3,Run,0,3891218146,...,2.829,9.1,False,False,False,95.8,77.9,0,1,False
2,2,"{'id': 46800838, 'resource_state': 1}",Warmup,3308.9,1080,1353,33.9,Run,0,3841254310,...,3.064,7.8,False,False,False,90.6,71.8,1,0,False


***
## Transform Data <a class="anchor" id="transform_data"></a>

#### Pre-process general data

In [14]:
# drop junk data, retain only recorded exercises
# .copy() makes the filtered frame independent, so the column assignments below
# do not write into a slice of the loaded data (SettingWithCopyWarning).
recorded_rows = df_raw['Exercise'].notna() & df_raw['Week'].notna()
df_raw = df_raw.loc[recorded_rows].copy()

# propagate codes down for uid
# (in the sheet preview Rotation / Workout are only filled on the first row of a group)
df_raw[['Rotation', 'Workout']] = df_raw[['Rotation', 'Workout']].ffill()
df_raw['workout_uid'] = df_raw.apply(make_uid, axis=1)

# format date and time columns
# NOTE(review): the recorded run of this step failed inside make_datetime with
# "combine() argument 1 must be datetime.date, not str" at index 13, i.e. a 'Date'
# cell that was read as text. Confirm how make_datetime should treat such rows.
df_raw['timestamp'] = df_raw.apply(make_datetime, axis=1)

df_raw.head()

TypeError: ('combine() argument 1 must be datetime.date, not str', 'occurred at index 13')

#### Separate datasets

In [13]:
# create separate datasets
# Each subset is selected with a single .loc call and renamed without inplace=True,
# so the result is a new frame rather than a renamed slice of df_raw.

# weightlifting: rows whose Exercise is one of the tracked lifts
map_weightlifting = df_raw['Exercise'].isin(lookup_weightlifting)
df_weightlifting = (
    df_raw
    .loc[map_weightlifting, list(columns_weightlifting.keys())]
    .rename(columns=columns_weightlifting)
)
print("Weightlifting data:", df_weightlifting.shape)

# cardio: rows whose Exercise is one of the tracked cardio activities
map_cardio = df_raw['Exercise'].isin(lookup_cardio)
df_cardio = (
    df_raw
    .loc[map_cardio, list(columns_cardio.keys())]
    .rename(columns=columns_cardio)
)
print("Cardio data:", df_cardio.shape)

KeyError: "['timestamp'] not in index"

#### Process separate datasets

In [40]:
# weightlifting specific processing

# make_1rm turns the 'Actual Lift' entry into a one-rep-max figure
# (the recorded output shows '5x50' -> 58.333333)
df_weightlifting['Calculated 1RM'] = df_weightlifting['Actual Lift'].apply(make_1rm)

# carry the last recorded goal forward onto the rows where it is blank
df_weightlifting['Goal 1RM'] = df_weightlifting['Goal 1RM'].ffill()

df_weightlifting['Metric Type'] = 'Weightlifting'
print("Weightlifting")
df_weightlifting.head(3)

Weightlifting


Unnamed: 0,Workout UID,Rotation,Goal 1RM,Workout,Exercise,Week,Sets,Reps,% of Goal 1RM,Actual Lift,Date,Time,Bodyweight,Notes,Timestamp,Calculated 1RM,Metric Type
0,7.1.1,7.0,,1.0,Shoulder Press,1.0,3.0,5,"65, 75, 85",5x50,2020-07-12,16:00:00,,,2020-07-12 16:00:00,58.333333,Weightlifting
1,7.1.2,7.0,,1.0,Shoulder Press,2.0,3.0,3,"70, 80, 90",,NaT,,,,NaT,,Weightlifting
2,7.1.3,7.0,,1.0,Shoulder Press,3.0,3.0,5 / 3 / 1,"75, 85, 95",,NaT,,,,NaT,,Weightlifting


In [44]:
# strava specific processing
# NOTE: this step replaces df_strava with a trimmed frame that no longer contains
# 'start_date_local', so the Strava extract must be re-run before running it again.

# Parse the start time string once, then derive the date and time parts from it.
# pd.to_datetime keeps the .dt accessor available even when the frame is empty.
strava_start = pd.to_datetime(
    df_strava['start_date_local'].apply(
        lambda x: datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ')
    )
)
df_strava['datetime'] = strava_start
df_strava['date'] = strava_start.dt.date
df_strava['time'] = strava_start.dt.time

# columns kept for the extract
strava_cols = [
    'name', 'distance', 'moving_time', 'elapsed_time', 'total_elevation_gain', 'type', 'id',
    'datetime', 'date', 'time', 'start_latlng', 'end_latlng', 'start_latitude', 'start_longitude',
    'average_speed', 'max_speed', 'elev_high', 'elev_low'
]

# .copy() so adding 'Metric Type' does not assign onto a slice of the full frame
df_strava = df_strava[strava_cols].copy()
df_strava['Metric Type'] = 'Running'
print("Strava")
df_strava.head(3)

Strava


Unnamed: 0,name,distance,moving_time,elapsed_time,total_elevation_gain,type,id,datetime,date,time,start_latlng,end_latlng,start_latitude,start_longitude,average_speed,max_speed,elev_high,elev_low,Metric Type
0,Evening Run,7568.3,2525,3131,75.4,Run,3783975156,2020-07-19 18:06:06,2020-07-19,18:06:06,"[-33.823825, 151.198758]","[-33.823843, 151.199172]",-33.823825,151.198758,2.997,10.2,95.8,77.9,Running
1,"Went for a run, forgot my puffer, had an asthm...",5060.9,1704,2256,71.3,Run,3768887579,2020-07-16 11:09:50,2020-07-16,11:09:50,"[-33.823876, 151.198782]","[-33.823506, 151.198988]",-33.823876,151.198782,2.97,5.9,90.7,48.5,Running
2,Afternoon Run,6705.7,2183,2903,71.6,Run,3739581614,2020-07-10 14:32:06,2020-07-10,14:32:06,"[-33.823841, 151.198873]","[-33.823844, 151.199316]",-33.823841,151.198873,3.072,6.4,95.8,77.9,Running


In [45]:
# running dataset

def _pace_to_speed(pace):
    """Return 1000 / (pace.hour * 60 + pace.minute), or None for a null pace.

    The original comment describes this as a conversion to m/s.
    NOTE(review): presumably the sheet's min:sec-per-km pace is read in as an
    hour:minute time value - confirm.
    """
    if pd.notnull(pace):
        return 1000 / (pace.hour * 60 + pace.minute)
    return None


def _distance_to_metres(distance):
    """Return the number in front of "km" multiplied by 1000, or None if null."""
    if pd.notnull(distance):
        return float(distance.split("km")[0]) * 1000
    return None


# pace -> speed
df_cardio['Average Pace'] = df_cardio['Average Pace'].apply(_pace_to_speed)

# distance text in km -> metres
df_cardio['Total Distance'] = df_cardio['Total Distance'].apply(_distance_to_metres)

df_cardio['Metric Type'] = 'Running'
print("Running (google drive)")
df_cardio.head(3)

Running (google drive)


Unnamed: 0,Workout UID,Rotation,Workout,Exercise,Week,Average Pace,Total Distance,Date,Time,Notes,Timestamp,Metric Type
48,6.5.1,6.0,5.0,Run,1.0,2.793296,6250.0,2020-02-25,20:10:00,,2020-02-25 20:10:00,Running
49,6.5.2,6.0,5.0,Run,2.0,2.941176,6300.0,2020-03-04,18:22:00,,2020-03-04 18:22:00,Running
50,6.5.3,6.0,5.0,Run,3.0,2.770083,6310.0,2020-03-09,19:47:00,,2020-03-09 19:47:00,Running


***
## Load Data <a class="anchor" id="load_data"></a>

In [48]:
# Combine the weightlifting, cardio and Strava frames into one frame.

# Strava column -> shared column name used by the spreadsheet-based frames,
# so the Strava rows line up with the cardio columns after the join.
strava_lookup = {
    "distance" : "Total Distance",
    "id" : "Workout UID",
    "type" : "Exercise",
    "average_speed" : "Average Pace",
    "date" : "Date",
    "time" : "Time",
    "datetime" : "Timestamp"
}
df_mod_strava = df_strava.rename(columns=strava_lookup)

# One pd.concat instead of repeated DataFrame.append (deprecated, and removed in
# pandas 2.0). sort=True keeps the alphabetically sorted column order shown in the
# recorded output and silences the "will change to not sort by default" warning.
# Row order is unchanged: weightlifting, then cardio, then strava.
df_final = pd.concat([df_weightlifting, df_cardio, df_mod_strava], sort=True)
df_final.head(3)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Unnamed: 0,% of Goal 1RM,Actual Lift,Average Pace,Bodyweight,Calculated 1RM,Date,Exercise,Goal 1RM,Metric Type,Notes,...,elev_high,elev_low,end_latlng,max_speed,moving_time,name,start_latitude,start_latlng,start_longitude,total_elevation_gain
0,"65, 75, 85",5x50,,,58.333333,2020-07-12 00:00:00,Shoulder Press,,Weightlifting,,...,,,,,,,,,,
1,"70, 80, 90",,,,,NaT,Shoulder Press,,Weightlifting,,...,,,,,,,,,,
2,"75, 85, 95",,,,,NaT,Shoulder Press,,Weightlifting,,...,,,,,,,,,,


In [49]:
%%time
# write to Tableau compliant format

# note: the "weightlifting" extract is written from df_final and so holds all relevant data
make_conversion(df_final, file_out_weightlifting)
# the Strava frame is also written to its own extract
make_conversion(df_strava, file_out_strava)

clean_logs()

complete_stamp = datetime.datetime.now().strftime('%D %r')
print("SYSTEM COMPLETE: {0}".format(complete_stamp))

(223, 30)
Table 'Extract' does not exist in extract ./tableau/weightlifting_extract.hyper, creating.

  self._column_static_type = self._dataframe.apply(lambda x: pandleau.data_static_type(x), axis=0)
processing table: 223it [00:00, 3430.62it/s]


PYTHON: ./tableau/weightlifting_extract.hyper conversion complete
PYTHON: 20/07/2020 20:38
(78, 19)



processing table: 78it [00:00, 5571.36it/s]

Table 'Extract' does not exist in extract ./tableau/strava_extract.hyper, creating.
PYTHON: ./tableau/strava_extract.hyper conversion complete
PYTHON: 20/07/2020 20:38

PYTHON: Directory cleaned
PYTHON: 20/07/2020 20:38
SYSTEM COMPLETE: 07/20/20 08:38:30 PM
Wall time: 622 ms



