# Activity Analysis
In this notebook we'll do some analysis of the Google "ActivitySegments" data using pandas. We'll plot our results using Matplotlib.

In [None]:
%matplotlib inline
import pandas as pd

In [None]:
# `date` is imported here so the START_DATE / END_DATE examples below work as written
from datetime import date

# Path to the activities file to analyze (read as a '|'-delimited, UTF-8 CSV)
ACTIVITIES_PATH = 'activities.csv'
# Optional: restrict the analysis to a date range, e.g. START_DATE = date(2020, 1, 1).
# START_DATE is compared (>=) against each row's start_timestamp date and END_DATE
# (<=) against its end_timestamp date; leave a bound as None to disable it.
START_DATE = None
END_DATE = None
# Optional: when True, rows whose confidence is 'LOW' are excluded
EXCLUDE_LOW_CONFIDENCE = True

In [None]:
# Load the '|'-delimited activities file, parsing both timestamp columns as datetimes
activities = pd.read_csv(
    ACTIVITIES_PATH,
    sep='|',
    encoding='utf-8',
    parse_dates=['start_timestamp', 'end_timestamp'],
)

# Apply the configured filters one after another; a setting left as None/False is skipped
if START_DATE:
    # Keep rows that start on or after START_DATE
    activities = activities.loc[lambda df: df['start_timestamp'].dt.date >= START_DATE]
if END_DATE:
    # Keep rows that end on or before END_DATE
    activities = activities.loc[lambda df: df['end_timestamp'].dt.date <= END_DATE]
if EXCLUDE_LOW_CONFIDENCE:
    # Drop rows explicitly marked as low confidence
    activities = activities.loc[lambda df: df['confidence'] != 'LOW']

# Summarize what is left after filtering
print(f'Data has {activities.shape[0]} rows and {activities.shape[1]} columns')
activities.info()

In [None]:
# Preview the first five rows to sanity-check the loaded columns
activities.head(n=5)

In [None]:
# Count how many rows fall under each value of the 'confidence' column
print('Number of records, by level of confidence')
activities.loc[:, 'confidence'].value_counts()

In [None]:
# Count how many rows were recorded for each activity type
print('Number of records per activity type:')
num_records = activities.loc[:, 'activity_type'].value_counts()
num_records

In [None]:
# Sum travel_distance per activity type and divide by 1000 to report km
# (presumably the raw column is in meters - TODO confirm against the export format)
print('Total distance traveled by activity type (km):')
distance = (
    activities.groupby('activity_type')['travel_distance'].sum() / 1000
).sort_values()
distance

In [None]:
# Horizontal bar chart of total distance (km) per activity type
ax = distance.plot.barh()
# Set all labels in one call; the trailing semicolon keeps the call's return
# value from being echoed as a stray repr in the cell output
ax.set(title='Distance traveled by activity type', xlabel='Km', ylabel='');

In [None]:
# Calculate total travel time by activity type *in hours*
print('Total time spent traveling by activity type (hours):')
# Add the per-row duration with .assign(), which returns a new frame instead of
# writing a column into `activities` in place: that frame may be a filtered slice
# of the loaded data, where in-place assignment can raise SettingWithCopyWarning.
# Re-running this cell simply recomputes the same 'duration' column.
activities = activities.assign(
    duration=activities['end_timestamp'] - activities['start_timestamp']
)
# Sum the timedeltas per activity type, convert seconds -> hours, sort ascending
time_spent = (
    activities.groupby('activity_type')['duration'].sum().dt.total_seconds() / 3600
).sort_values()
time_spent

In [None]:
# Horizontal bar chart of total time (hours) per activity type
ax = time_spent.plot.barh()
# Set all labels in one call; the trailing semicolon keeps the call's return
# value from being echoed as a stray repr in the cell output
ax.set(title='Time spent by activity type', xlabel='Hours', ylabel='');