In [51]:
import pandas as pd
from pathlib import Path
from history_parser.parser import parse_history, LocationHistory

Let's get our data ready.

In [72]:
ACTIVITIES_PATH = 'activities.csv'
activities = pd.read_csv(ACTIVITIES_PATH, encoding='utf-8', parse_dates=['start_timestamp', 'end_timestamp'])
print(f'Data has {activities.shape[0]} rows and {activities.shape[1]} columns')
activities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   start_lat_e7            49 non-null     int64              
 1   start_lon_e7            49 non-null     int64              
 2   end_lat_e7              49 non-null     int64              
 3   end_lon_e7              49 non-null     int64              
 4   start_timestamp         49 non-null     datetime64[ns, UTC]
 5   end_timestamp           49 non-null     datetime64[ns, UTC]
 6   distance                49 non-null     int64              
 7   activity_type           49 non-null     object             
 8   confidence              49 non-null     object             
 9   travel_distance_meters  49 non-null     float64            
dtypes: datetime64[ns, UTC](2), float64(1), int64(5), object(2)
memory usage: 4.0+ KB


In [73]:
activities.head()

Data has 49 rows and 10 columns


Unnamed: 0,start_lat_e7,start_lon_e7,end_lat_e7,end_lon_e7,start_timestamp,end_timestamp,distance,activity_type,confidence,travel_distance_meters
0,483638218,108943138,483666230,108961275,2022-09-01 12:07:28.804000+00:00,2022-09-01 12:15:31.914000+00:00,375,WALKING,HIGH,492.879961
1,483666060,108958319,483643566,108949647,2022-09-01 12:33:48.940000+00:00,2022-09-01 12:37:40.913000+00:00,270,WALKING,HIGH,386.340018
2,483645579,108952060,483664767,108949331,2022-09-01 17:23:24.621000+00:00,2022-09-01 17:25:25.815000+00:00,214,WALKING,HIGH,251.005421
3,483670781,108971635,483646845,108936911,2022-09-01 17:33:27.816000+00:00,2022-09-01 17:51:05.711000+00:00,1356,WALKING,HIGH,1192.562792
4,483638263,108943149,483507289,109088655,2022-09-02 07:50:39.440000+00:00,2022-09-02 08:01:32.054000+00:00,1930,CYCLING,HIGH,2586.854528


In [85]:
# Let's count the number of each activity type in the data=
print('Number of records per activity type:')
activities['activity_type'].value_counts()

Number of records per activity type


WALKING                 27
CYCLING                  8
IN_TRAIN                 8
IN_PASSENGER_VEHICLE     4
IN_BUS                   1
IN_SUBWAY                1
Name: activity_type, dtype: int64

In [86]:
# Calculate total travel distance by activity type *in kilometers*
print('Total distance traveled by activity type:')
activities.groupby('activity_type')['travel_distance_meters'].sum() / 1000

Total distance traveled by activity type:


activity_type
CYCLING                  25.297993
IN_BUS                    4.633910
IN_PASSENGER_VEHICLE     83.148529
IN_SUBWAY                 4.435000
IN_TRAIN                521.113955
WALKING                  37.420584
Name: travel_distance_meters, dtype: float64

In [97]:
# Calculate total travel time by activity type *in hours*
print('Total time spent traveling by activity type:')
activities['duration'] = activities['end_timestamp'] - activities['start_timestamp']
activities.groupby('activity_type')['duration'].sum().dt.total_sec
# Note: we can also calculate the number of hours using the following:
# activities.groupby('activity_type')['duration'].sum().dt.total_seconds() / 3600

activity_type
CYCLING                0 days 02:03:07.247000
IN_BUS                 0 days 00:13:56.103000
IN_PASSENGER_VEHICLE   0 days 05:54:25.553000
IN_SUBWAY              0 days 00:12:48.495000
IN_TRAIN               0 days 05:55:56.508000
WALKING                0 days 05:00:19.494000
Name: duration, dtype: timedelta64[ns]