# Get steps grid (and other features) from epochs.csv

In [1]:
# imports
import pandas as pd
import pickle

In [3]:
# load dataset
df = pd.read_csv('data/epochs.csv')
print(f'num of examples: {df.shape[0]}\nnum of categories: {df.shape[1]}')
df.head()

num of examples: 2119020
num of categories: 15


Unnamed: 0,userId,userAccessToken,summaryId,activityType,activeKilocalories,steps,distanceInMeters,durationInSeconds,activeTimeInSeconds,startTimeInSeconds,startTimeOffsetInSeconds,met,intensity,meanMotionIntensity,maxMotionIntensity
0,e31d5fa7-7a63-43a6-973a-f2169c0661f7,69420bd2-052d-4df2-9ba1-55d9b2fd7489,sd46aeb3d-626066e3-6,WALKING,93,1506,125.73,900.0,860,1650484963,-18000,11.544035,HIGHLY_ACTIVE,2.411726,4.152604
1,e31d5fa7-7a63-43a6-973a-f2169c0661f7,69420bd2-052d-4df2-9ba1-55d9b2fd7489,sd46aeb3d-62606a67-6,WALKING,3,1174,207.0,900.0,584,1650485863,-18000,42.362816,HIGHLY_ACTIVE,3.986872,5.518433
2,e31d5fa7-7a63-43a6-973a-f2169c0661f7,69420bd2-052d-4df2-9ba1-55d9b2fd7489,sd46aeb3d-62606a67-8,SEDENTARY,0,0,0.0,900.0,143,1650485863,-18000,1.0,SEDENTARY,0.0,0.0
3,e31d5fa7-7a63-43a6-973a-f2169c0661f7,69420bd2-052d-4df2-9ba1-55d9b2fd7489,sd46aeb3d-62606deb-6,WALKING,41,1363,222.45,900.0,188,1650486763,-18000,44.88087,HIGHLY_ACTIVE,3.548326,3.323441
4,e31d5fa7-7a63-43a6-973a-f2169c0661f7,69420bd2-052d-4df2-9ba1-55d9b2fd7489,sd46aeb3d-6260716f-6,WALKING,73,1755,461.11,900.0,269,1650487663,-18000,6.17543,HIGHLY_ACTIVE,5.39798,6.647376


In [4]:
users = df['userId'].unique()
print(f'num of users: {len(users)}')

num of users: 213


In [5]:
users_weekly_epoch = {}

## Data Preprocessing 

*startTimeOffsetInSeconds* is the offset in seconds to add to startTimeInSeconds to derive the "local" time of the device that captured the data, Garmin manual page 23. Because the measurements were taken in Israel, we expect the offsets to be +7200 (UTC +2) or +10800 (UTC +3) seconds. About 10 percent of the data had unusual offsets, which we decided to remove.

In [6]:
df = df[df['startTimeOffsetInSeconds'].isin([7200, 10800])]

*startTimeInSeconds* is in unix format. we want to convert it to utc format, taking into account the offset to get the local time.

In [7]:
df['startTimeLocal'] = pd.to_datetime(df['startTimeInSeconds'] + df['startTimeOffsetInSeconds'], unit='s')
df = df.drop('startTimeInSeconds', axis=1)
df = df.drop('startTimeOffsetInSeconds', axis=1)

add *WeekNumber* for each example

In [8]:
# returns the week number (1,52) of the date.
# assuming week starts on a sunday, and not a monday.
def get_week_number(date):
    adjusted_date = date - pd.Timedelta(days=(date.weekday() + 1) % 7) # date of closest, prev sunday
    return adjusted_date.isocalendar()[1]

In [9]:
df['WeekNumber'] = df['startTimeLocal'].apply(get_week_number)

## Extracting steps data 

```python
users_weekly_epoch = {
    userID: {
        week_number: pd.DataFrame(startTimeLocal, steps, userId, activeTimeInSeconds, distanceInMeters, speed, is_running)
    }
}

| Level               | Description                                       |
|---------------------|---------------------------------------------------|
| `users_weekly_epoch`| The main dictionary containing all users' data.   |
| `userID`            | A unique identifier for each soldier.                |
| `week_number`       | A key within each `userID` dictionary representing a specific week. |
| `pd.DataFrame`      | A dataframe filled with the relevant data for that user and week. Split by 15 min intervals.|

Each dataframe contains the following data:

- `startTimeLocal` - local time of user, example: 2022-08-28 19:15:00
- `steps` - number of steps in the 15 minute interval 
- `userId` - soldier's ID
- `activeTimeInSeconds` - number of seconds the user was active in the 15 minute interval. i.e. not sedentary
- `distanceInMeters` - distance travelled by soldier in the 15 minute interval
- `speed` - average speed of soldier in 15 minute interval in km/h, of when he was walking/running. 
- `is_running` - True if soldier was running (speed >= 7.5 km/h) in the 15 minute interval, False o.w.

In [10]:
agg_dict = {
    'steps': 'sum',
    'userId': 'first',  
    'activeTimeInSeconds': 'sum', 
    'distanceInMeters': 'sum'
}

In [11]:
for user, user_df in df.groupby('userId'):
    weekly_user_df = {}
    for week_num, week_df in user_df.groupby('WeekNumber'):
        weekly_df = week_df.groupby('startTimeLocal').agg(agg_dict).reset_index()
        # speed:
        filtered_df = week_df[week_df['activityType'].isin(['WALKING', 'RUNNING'])]
        grouped_df = filtered_df.groupby('startTimeLocal').agg({'activeTimeInSeconds':'sum','distanceInMeters':'sum'}).reset_index()
        grouped_df['speed'] = (grouped_df['distanceInMeters'] / grouped_df['activeTimeInSeconds']) * 3.6 # convert from m/s to km/h
        weekly_df = weekly_df.merge(grouped_df[['startTimeLocal', 'speed']], on='startTimeLocal', how='left')
        weekly_df['speed'] = weekly_df['speed'].fillna(0)
        weekly_df['is_running'] = weekly_df['speed'] >= 7.5
        
        weekly_user_df[week_num] = weekly_df
    
    users_weekly_epoch[user] = weekly_user_df

save to a pickle

In [12]:
pickle_file_path = 'steps_grid.pkl'
with open(pickle_file_path, 'wb') as file:
    pickle.dump(users_weekly_epoch, file)