# Get steps (and other features) from epochs.csv

In [1]:
# imports
import pandas as pd
import pickle

In [2]:
# Read the raw Garmin epochs export; every row is one epoch summary.
df = pd.read_csv('data/epochs.csv')
# Report the frame's dimensions (rows, columns) before any filtering.
n_rows, n_cols = df.shape
print(f'num of examples: {n_rows}\nnum of categories: {n_cols}')
# Preview the first rows as the cell's rich output.
df.head()

num of examples: 2119020
num of categories: 15


Unnamed: 0,userId,userAccessToken,summaryId,activityType,activeKilocalories,steps,distanceInMeters,durationInSeconds,activeTimeInSeconds,startTimeInSeconds,startTimeOffsetInSeconds,met,intensity,meanMotionIntensity,maxMotionIntensity
0,e31d5fa7-7a63-43a6-973a-f2169c0661f7,69420bd2-052d-4df2-9ba1-55d9b2fd7489,sd46aeb3d-626066e3-6,WALKING,93,1506,125.73,900.0,860,1650484963,-18000,11.544035,HIGHLY_ACTIVE,2.411726,4.152604
1,e31d5fa7-7a63-43a6-973a-f2169c0661f7,69420bd2-052d-4df2-9ba1-55d9b2fd7489,sd46aeb3d-62606a67-6,WALKING,3,1174,207.0,900.0,584,1650485863,-18000,42.362816,HIGHLY_ACTIVE,3.986872,5.518433
2,e31d5fa7-7a63-43a6-973a-f2169c0661f7,69420bd2-052d-4df2-9ba1-55d9b2fd7489,sd46aeb3d-62606a67-8,SEDENTARY,0,0,0.0,900.0,143,1650485863,-18000,1.0,SEDENTARY,0.0,0.0
3,e31d5fa7-7a63-43a6-973a-f2169c0661f7,69420bd2-052d-4df2-9ba1-55d9b2fd7489,sd46aeb3d-62606deb-6,WALKING,41,1363,222.45,900.0,188,1650486763,-18000,44.88087,HIGHLY_ACTIVE,3.548326,3.323441
4,e31d5fa7-7a63-43a6-973a-f2169c0661f7,69420bd2-052d-4df2-9ba1-55d9b2fd7489,sd46aeb3d-6260716f-6,WALKING,73,1755,461.11,900.0,269,1650487663,-18000,6.17543,HIGHLY_ACTIVE,5.39798,6.647376


In [3]:
# Distinct user ids present in the raw data, in order of first appearance.
users = df['userId'].unique()
n_users = len(users)
print(f'num of users: {n_users}')

num of users: 213


In [4]:
# Output container, filled further down: userId -> {WeekNumber -> aggregated DataFrame}.
result_dict = dict()

## Data Preprocessing 

*startTimeOffsetInSeconds* is the offset, in seconds, that must be added to *startTimeInSeconds* to derive the "local" time of the device that captured the data (see the Garmin manual, page 23). Because the measurements were taken in Israel, we expect the offsets to be +7200 (UTC+2) or +10800 (UTC+3) seconds. About 10 percent of the data had other, unexpected offsets; we decided to remove those rows.

In [5]:
# Keep only epochs whose UTC offset is +7200 s or +10800 s (the two offsets
# expected for this dataset); rows with any other offset are discarded.
# .copy() makes the filtered frame independent of the original, so adding
# columns to it later does not raise SettingWithCopyWarning.
df = df[df['startTimeOffsetInSeconds'].isin([7200, 10800])].copy()

*startTimeInSeconds* is a Unix timestamp (seconds since the epoch, in UTC). We convert it to a datetime, adding the offset first so that the result is the device's local time.

In [6]:
# Local wall-clock time: the Unix start time shifted by the device offset
# (both in seconds), interpreted as seconds since the epoch.
df['startTimeLocal'] = pd.to_datetime(
    df['startTimeInSeconds'] + df['startTimeOffsetInSeconds'],
    unit='s',
)
# The two raw timestamp columns are redundant once the local time exists.
df = df.drop(columns=['startTimeInSeconds', 'startTimeOffsetInSeconds'])

Add a *WeekNumber* column to each example.

In [7]:
def get_week_number(date):
    """Return the week number of `date` for weeks that run Sunday through Saturday.

    The date is first moved back to the most recent Sunday (a Sunday stays
    where it is), and the ISO week number of that Sunday is returned. Every
    day from a Sunday to the following Saturday therefore gets the same
    number. ISO week numbers range from 1 to 53.

    Parameters
    ----------
    date : pandas.Timestamp or datetime.datetime
        The moment to classify.

    Returns
    -------
    int
        ISO week number of the Sunday that opens the week containing `date`.
    """
    # weekday() is Monday=0 ... Sunday=6; shift so Sunday=0, Monday=1, ..., Saturday=6.
    days_since_sunday = (date.weekday() + 1) % 7
    week_start = date - pd.Timedelta(days=days_since_sunday)
    # isocalendar() yields (ISO year, ISO week, ISO weekday); only the week is needed.
    _, iso_week, _ = week_start.isocalendar()
    return iso_week

In [8]:
# Label every epoch with the Sunday-based week number of its local start time.
df['WeekNumber'] = df['startTimeLocal'].map(get_week_number)

## Extracting steps data 

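Fill in `result_dict`:

Format: `result_dict[userId][WeekNumber]` is a DataFrame with one row per distinct *startTimeLocal* in that user's week. Its columns are *startTimeLocal*, *steps* (sum), *userId*, *intensity* (the highest level among the merged epochs), *activeTimeInSeconds* (sum) and *distanceInMeters* (sum).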

In [9]:
def highest_intensity(intensities):
    """Return the most intense label found in `intensities`.

    Labels are ranked SEDENTARY < ACTIVE < HIGHLY_ACTIVE. If the top rank
    occurs more than once, the first occurrence is returned. A label outside
    these three raises ValueError, as does an empty input.
    """
    ranking = ['SEDENTARY', 'ACTIVE', 'HIGHLY_ACTIVE']

    def rank(level):
        # Position in `ranking` doubles as the intensity rank.
        return ranking.index(level)

    return max(intensities, key=rank)

# Column -> aggregation used when epochs that share a start time are merged
# into a single row.
agg_dict = dict(
    steps='sum',                  # total steps across the merged epochs
    userId='first',               # keep the first userId of the group
    intensity=highest_intensity,  # strongest intensity label in the group
    activeTimeInSeconds='sum',    # total active time
    distanceInMeters='sum',       # total distance
)

In [10]:
# Build result_dict[userId][WeekNumber]: one aggregated frame per user and week,
# with epochs that share a local start time collapsed into one row.
for user, user_df in df.groupby('userId'):
    result_dict[user] = {
        week_num: week_df.groupby('startTimeLocal').agg(agg_dict).reset_index()
        for week_num, week_df in user_df.groupby('WeekNumber')
    }

Save `result_dict` to a pickle file.

In [11]:
# Serialize the nested {userId: {WeekNumber: DataFrame}} mapping to disk.
pickle_file_path = 'steps.pkl'
with open(pickle_file_path, mode='wb') as pickle_out:
    pickle.dump(result_dict, pickle_out)