# Fitnotes Processing Notebook
This notebook processes workout data exported from the Fitnotes Android workout-tracking application and builds three tables:
- Sets, showing set-level data during the workout.
- Volume, showing aggregate set, rep, and volume metrics for each exercise.
- Workouts, showing aggregate workout metadata.

In [17]:
#basic imports
import numpy as np
import pandas as pd

from glob import glob
from sqlalchemy import create_engine
# SQLAlchemy engine for the SQLite file at the relative path ../gains.db;
# used further down to read the `weight` table and to write the result tables
engine = create_engine('sqlite:///../gains.db')

In [29]:
def format_columns(df):
    '''
    Normalize the column labels of ``df`` in place and return the same frame.

    Each label is stripped of surrounding whitespace, lowercased, loses the
    " (lbs)" unit suffix (so "Weight (lbs)" becomes "weight"), and has its
    remaining spaces replaced with underscores.
    '''

    df.columns = (df.columns
                  .str.strip()
                  .str.lower()
                  # remove the (lbs) subtitle from the weight column; matched
                  # literally because the default of `regex` differs between
                  # pandas versions and an escaped pattern can silently stop matching
                  .str.replace(' (lbs)', '', regex = False)
                  .str.replace(' ', '_', regex = False)
                 )
    return df

def Workouts(df):
    '''
    Create matrix of aggregated key performance indicators for workouts.

    Parameters
    ----------
    df : DataFrame
        Set-level data. The columns read here are date (datetime), exercise,
        category, routine, weight, volume, set_completed and id.

    Returns
    -------
    DataFrame
        One row per workout date (more than one when the sets of the day's
        dominant category carry several distinct routine values) with the
        columns date, weekday, routine, category, primary_lift, sets, volume,
        completion_rate, days_since_workout and days_since_category.
    '''

    def label_at_max(day_sets, label_col, value_col, default = 'Cardio'):
        # label of the first row holding the largest non-null value; falls back
        # to `default` when every value is missing (e.g. a day without weights)
        known = day_sets.dropna(subset = [value_col])
        if known.empty:
            return default
        label = known[label_col].iloc[known[value_col].values.argmax()]
        return default if pd.isna(label) else label

    # get key workout performance indicators
    agg = {
        'volume': 'sum',
        'set_completed': 'mean',
        'id': 'count',
    }

    gains = (df
             .groupby('date', as_index = False)
             .agg(agg)
             # set_completed becomes completion_rate because all sets comprise the workout
             .rename(columns = {'set_completed': 'completion_rate',
                                'id': 'sets'
                               })
            )

    # `dt.weekday_name` no longer exists in current pandas; day_name() is its replacement
    gains['weekday'] = gains.date.dt.day_name()

    # heaviest exercise and highest-volume category of each day, looked up by
    # date so the result does not depend on the shape groupby.apply returns
    primary_lift = {}
    top_category = {}
    for day, day_sets in df.groupby('date'):
        primary_lift[day] = label_at_max(day_sets, 'exercise', 'weight')
        top_category[day] = label_at_max(day_sets, 'category', 'volume')

    gains['primary_lift'] = gains.date.map(primary_lift)
    gains['category'] = gains.date.map(top_category)

    # the first workout (overall / within a category) has no predecessor: report 0 days
    gains['days_since_workout'] = gains.date.diff().fillna(pd.Timedelta('0 days')).dt.days
    gains['days_since_category'] = gains.groupby('category').date.diff().fillna(pd.Timedelta('0 days')).dt.days

    # attach the routine recorded on the sets of the day's dominant category
    gains = pd.merge_ordered(gains, df[['date', 'category', 'routine']],
                             on = ['date', 'category'],
                             how = 'left',
                            ).drop_duplicates()

    gains = gains[['date', 'weekday', 'routine', 'category', 'primary_lift', 'sets', 'volume', 'completion_rate', 'days_since_workout', 'days_since_category']]

    return gains

def Volume(df):
    '''
    Aggregate statistics for exercises within a workout.

    Parameters
    ----------
    df : DataFrame
        Set-level data with the columns date, exercise, category, routine,
        reps, volume, relative_volume and intensity. It is copied, not modified.

    Returns
    -------
    DataFrame
        One row per (date, exercise) whose summed volume is positive, with
        the columns date, routine, category, exercise, sets, reps,
        avg_intensity, volume and relative_volume.
    '''

    per_set = df.copy(deep = True)

    # one id per row so that counting ids yields the number of sets
    per_set['id'] = per_set.index

    # aggregation names are given as strings: passing the builtins max/sum is
    # deprecated in pandas and would change behavior once they are called directly
    agg = {'category': 'max',
           'routine': 'max',
           'reps': 'sum',
           'volume': 'sum',
           'relative_volume': 'sum',
           'intensity': 'mean',
           'id': 'count'}

    volume = (per_set
                 .groupby(['date', 'exercise'], as_index = False)[list(agg.keys())]
                 .agg(agg)
                 .rename(columns = {'id': 'sets', 'intensity': 'avg_intensity'})
                 .query('volume > 0')
                 # convert reps to int
                 .assign(reps = lambda table: table.reps.astype(int)))

    volume = volume[['date', 'routine', 'category', 'exercise', 'sets', 'reps', 'avg_intensity', 'volume', 'relative_volume']]

    return volume

class SetTransformer:
    '''
    A class that contains functions to format, augment,
    and add features to a workout sets DataFrame.

    The class keeps no state of its own; `augment_sets` chains the other
    methods in the order in which they depend on each other's columns.
    '''

    def __init__(self):
        pass

    def augment_sets(self, fitnotes, weight, routines):
        '''
        Run the pipeline of functions that augment
        and edit features of workout sets.

        fitnotes is the set-level frame, weight a frame with `date` and
        `bodyweight` columns, and routines a frame with `exercise`,
        `start_date`, `end_date` and `routine` columns. Returns the
        augmented set-level frame produced by `other_features`.
        '''
        sets = (fitnotes
                .pipe(self.assign_routines, routines)
                .pipe(self.add_bodyweight_exercises, weight)
                .pipe(self.calculate_one_rep_max)
                .pipe(self.volume_intensity)
                .pipe(self.add_cardio)
                .pipe(self.consolidate_exercises)
                .pipe(self.other_features))

        return sets

    def assign_routines(self, df, routines):
        '''
        Assign workout routine to each workout.

        Adds a `routine` column to df (df itself is written to) and fills it
        for every set whose exercise matches a row of routines and whose date
        lies between that row's start_date and end_date. When several rows
        match, the last one wins; unmatched sets stay NaN.
        '''
        # dimensions of the routines table to copy over; only 'routine' for now
        cols = ['routine']

        # ...instantiate dimension to nan, then...
        for col in cols:
            df[col] = np.nan

            # for every exercise in a routine... (each row is an exercise)
            for idx, row in routines.iterrows():

                # if a performed exercise occurs within the dates of that routine's exercise
                exercise_within_dates = (df.exercise == row.exercise) & df.date.between(row.start_date, row.end_date)

                # ...assign the corresponding column
                df[col] = df[col].mask(exercise_within_dates, row[col])

        return df

    def add_bodyweight_exercises(self, df, weight):
        '''
        Add bodyweight to exercises where I lift my whole body.

        The bodyweight used is the most recent measurement at or before the
        set's date, at most 60 days old. Bodyweight exercises without such a
        measurement end up with a NaN weight.
        '''

        # merge dataframe with bodyweight within 60 days of measurement
        # (merge_asof needs both frames sorted by date)
        df = pd.merge_asof(df, weight[['date', 'bodyweight']], on = 'date', direction = 'backward', tolerance = pd.Timedelta('60 days'))

        # add my bodyweight (rounded down) to the weight lifted on bodyweight exercises
        bodyweight_exercises = ['Pull Up', 'Chin Up', 'Close Grip Dip']
        is_bodyweight = df.exercise.isin(bodyweight_exercises)
        df.loc[is_bodyweight, 'weight'] += np.floor(df.loc[is_bodyweight, 'bodyweight'])

        # drop the bodyweight column
        df = df.drop(['bodyweight'], axis = 1)

        return df

    def calculate_one_rep_max(self, df):
        '''
        Calculate the largest one rep max in the past 30 days.

        This calculation is distinctly separate from calculating
        the largest one rep max in the past 30 workouts. Using 30 days
        instead of 30 workouts helps account for my strength gains
        or decreases within a time span, assuming I continue to
        lift weights with some degree of consistency. The 30 workouts
        metric has a disadvantage in that it does not account for
        workouts that occurred far away from each other in time.
        This could cause me to overestimate my one rep max if
        the last big max was from a workout a long time ago.
        '''

        def helper(group):
            '''
            Helper function to calculate largest one rep max in past 30 days.

            Creating this function allows me to keep one rep max calculations
            within the same functional unit.
            '''

            # calculate the highest one rep max of the day
            reduced = group.groupby('date').one_rep_max.max()

            # create a date range from the beginning to the end of the exercise's lifespan ...
            date_range = pd.date_range(start = reduced.index.min(), end = reduced.index.max(), freq = 'D')

            # ... then reindex using the daily date range to find the largest one rep max in the last 30 days
            rolling_max = (reduced
                           .reindex(date_range)
                           .rolling('30D')
                           .max()
                           .round()
                           .to_frame('one_rep_max'))

            # update one rep max with the new rolling values; all happens inplace
            group.set_index('date', inplace = True)
            group.update(rolling_max)

            return group

        # create a set id to sort on after groupby
        df['id'] = df.index

        # calculate Epley-formulated one rep max
        df['one_rep_max'] = df.eval('weight * (1 + reps / 30)')

        # calculate latest one rep max within 30 days
        # NOTE(review): the drop/reset_index sequence assumes groupby.apply hands
        # the `exercise` column to the helper and returns an (exercise, date)
        # index -- confirm against the installed pandas version
        result = (df
                  .groupby('exercise')
                  .apply(helper)
                  .drop('exercise', axis = 1)
                  .sort_values('id')
                  .reset_index()
                  .drop('id', axis = 1))

        return result

    def volume_intensity(self, df):
        '''
        Calculate intensity, volume, and relative volume.

        intensity is weight / one_rep_max rounded to two decimals, volume is
        weight * reps, and relative_volume is volume * intensity rounded down.
        '''

        # calculate intensity
        df['intensity'] = df.eval('weight / one_rep_max').round(2)

        # calculate total volume and ...
        df['volume'] = df.eval('weight * reps')

        # ... and relative volume, rounded down
        df['relative_volume'] = np.floor(df.eval('weight * reps * intensity'))

        return df

    def add_cardio(self, df):
        '''
        Format cardio exercises.

        Converts `time` to seconds, expresses `distance` in miles, drops
        `distance_unit` and adds `speed` in miles per hour.
        '''
        # convert timestamp string to seconds; the hour component is included
        # so that sessions of an hour or longer are not truncated
        # (presumably the export writes h:mm:ss -- verify against a raw file)
        clock = pd.to_datetime(df.time)
        df.time = clock.dt.hour * 3600 + clock.dt.minute * 60 + clock.dt.second

        # convert meters to miles and drop distance unit entirely
        df.distance = df.distance.mask(df.distance_unit == 'm', df.distance * 0.000621371)
        df = df.drop('distance_unit', axis = 1)

        # calculate speed of timed exercises to miles per hour
        df['speed'] = df.eval('distance / (time / 3600)')

        return df

    def consolidate_exercises(self, df):
        '''
        Merge together exercise names that clearly
        refer to the same exercise.
        '''
        duplicate_exercises = {
            'Pec Dec': 'Seated Machine Fly',
            'Decline Cable Fly': 'Low Cable Cross-over',
            'Bent over Standing Dumbbell Flyes': 'Standing Rear Delt Dumbbell Raise',
            'Decline Dumbbell Curls': 'Seated Incline Dumbbell Curl'
        }

        # the mapping holds exercise names, so it has to be applied to the
        # exercise column (applying it to category never matched anything)
        df.exercise = df.exercise.replace(duplicate_exercises)

        return df


    def other_features(self, df):
        '''
        Impute and format remaining features.

        Drops sets judged incomplete, extracts RPE from the comment text,
        adds id columns, rep-range bins and the stimulus label, and returns
        the frame restricted to the export column order.
        '''

        # determine completed sets for weights and cardio; float so that the
        # column can also hold the NaN assigned below
        df['set_complete'] = 0.0
        df.loc[(df.category == 'Cardio') & (df.distance > 0), 'set_complete'] = 1
        df.loc[(df.weight > 0) | (df.exercise.str.contains('Stretch') & df.time.gt(0)), 'set_complete'] = 1

        # make set_complete null before I began officially tracking that metric (May 18, 2018)
        df.loc[df.date < pd.Timestamp('2018-05-18'), 'set_complete'] = np.nan

        # eliminate incomplete sets for now (NaN rows are kept: NaN != 0)
        df = df.query('set_complete != 0').reset_index(drop = True)

        # extract RPE entries, e.g. "8.5 rpe" -> 8.5
        df['rpe'] = df.comment.str.extract(r'([\d.]+)\s*rpe', expand = False).astype(float)

        # stimulating reps
        df['stim_reps'] = np.where(df.reps < 5, 0, df.rpe - 5)

        # assign ids
        df['set_id'] = df.index
        df['ex_set_no'] = df.groupby(['date', 'exercise']).set_id.cumcount()
        df['workout_id'] = df.groupby('date').ngroup()

        # strength ranges and stimulus categorization
        df['strength_range'] = pd.cut(df.reps, bins = [1, 7, 20], right = False, include_lowest = True)
        df['hypertrophy_range'] = pd.cut(df.reps, bins = [1,7, 12, 20], right = False, include_lowest = True)
        df['stimulus'] = np.where(df.intensity < .7, 'hypertrophy', 'strength')

        ### prepare for export ###

        # set column order
        col_order = ['set_id', 'workout_id', 'date', 'routine', 'category', 'exercise', 'ex_set_no',
                     'weight', 'reps', 'rpe', 'stim_reps', 'intensity', 'one_rep_max', 'volume', 'relative_volume',
                     'strength_range', 'hypertrophy_range', 'stimulus', 'distance', 'time', 'speed', 'comment']

        df = df[col_order]

        return df

def classify_exercises(df):
    '''
    Classify exercise as push, pull, squat, or hip hinge variants,
    as well as main or accessory lifts.

    Writes two columns into df and returns it: `variant` (push, pull,
    squat, hip hinge, or missing when no rule matches) and `label`
    (primary for the big three lifts, accessory otherwise). Rows with a
    missing exercise name match no rule.

    --------------------------------------------------------------
    From rippedbody.com's "A Guide to Exercise Selection
    When You Don’t Have Access to a Coach":

    "Push and press exercises mainly work the triceps, deltoids, and pecs.
    They can be further divided into horizontal and vertical movement patterns...

    "Pull variants and rowing exercises work muscles on the upper
    back and elbow flexors like the biceps. They can be further
    divided into horizontal and vertical movement patterns...


    "Squat-type exercises involve the knees and hips.
    They mainly work the quadriceps and glutes...


    "Hip hinge exercises involve the hips.
    They mainly work the posterior chain consisting of
    muscles on the back of your body like hamstrings, glutes,
    and spinal erectors."
    '''

    def name_matches(pattern):
        # regex search on the exercise name; a missing name counts as no match
        return df.exercise.str.contains(pattern, na = False)

    # applied top to bottom, so a later rule overrides an earlier one
    # (e.g. 'Leg Press' matches 'Press' first but ends up as squat)
    rules = [
        (name_matches(r'Press'), 'push'),
        (name_matches(r'Push[-\s]*[Uu]p'), 'push'),
        (name_matches(r'Row|Pull'), 'pull'),
        (df.exercise == 'Chin Up', 'pull'),
        (name_matches(r'Squat|Leg Press'), 'squat'),
        (name_matches(r'Deadlift'), 'hip hinge'),
    ]

    # create the column up front with an object dtype so that string labels
    # can be written into it even when the first rule matches nothing
    if 'variant' not in df.columns:
        df['variant'] = pd.Series(np.nan, index = df.index, dtype = object)

    for mask, variant in rules:
        df.loc[mask, 'variant'] = variant

    # the exercises that are compound movements contributing most to my fitness goals
    # of strength, size, and physique improvement
    big_three = ['Flat Barbell Bench Press', 'Deadlift', 'Barbell Squat']

    df['label'] = np.where(df.exercise.isin(big_three), 'primary', 'accessory')

    return df

In [30]:
# glob() returns matches in arbitrary order; sort them so that the last entry
# is the lexicographically greatest file name
# (presumably export names embed a sortable timestamp -- verify against the exports folder)
saved_workouts = sorted(glob('exports/Fitnotes*.csv'))

In [31]:
# routine definitions: one row per exercise with the date span it belonged to a routine
routines = pd.read_excel('routines.xlsx', parse_dates = ['start_date', 'end_date'])

# set-level export; max() picks the lexicographically greatest file name, which
# does not depend on the order in which the files were listed
# (presumably export names embed a sortable timestamp -- verify against the exports folder)
fitnotes = pd.read_csv(max(saved_workouts), parse_dates = ['Date']).pipe(format_columns)

# bodyweight measurements; parse_dates is given as a list so that only the
# column named exactly `date` is parsed
weight = pd.read_sql('weight', con = engine, parse_dates = ['date']).rename(columns = {'weight':'bodyweight'})

## Sets, Volume

In [32]:
# build the set-level table by running the full SetTransformer pipeline
# over the raw export, the bodyweight log and the routine definitions
set_transformer = SetTransformer()
sets = set_transformer.augment_sets(fitnotes, weight, routines)

In [33]:
# aggregate the set-level table into one row per date and exercise
volume = Volume(sets)

## Exercises

In [None]:
# one row per distinct exercise, keeping the category of its first recorded set
exercises = (sets[['exercise', 'category']]
             .drop_duplicates(subset = 'exercise')
             .reset_index(drop = True))

In [None]:
# add the `variant` and `label` columns to the exercise table
exercises = classify_exercises(exercises)

In [None]:
# preview the first rows of the exercise table (notebook display only)
exercises.head()

# Export

In [None]:
# write each table to a CSV file in the working directory, without the index column
exercises.to_csv('exercises.csv', index = False)
sets.to_csv('sets.csv', index = False)
volume.to_csv('volume.csv', index = False)
routines.to_csv('routines.csv', index = False)

In [None]:
# write each table to the database behind `engine`, replacing any existing
# table of the same name; the index is not stored
# NOTE(review): `sets` carries the pd.cut interval columns strength_range and
# hypertrophy_range -- confirm the SQLite driver can store them as-is
exercises.to_sql('exercises', con = engine, if_exists = 'replace', index = False)
sets.to_sql('sets', con = engine, if_exists = 'replace', index = False)
volume.to_sql('volume', con = engine, if_exists = 'replace', index = False)
routines.to_sql('routines', con = engine, if_exists = 'replace', index = False)