# Clean and prep data for modeling
## Author: Oliver Gladfelter
## Feb 20, 2026

In [60]:
import pandas as pd
import numpy as np
from datetime import datetime

# GPX stuff
import polyline
from geopy.distance import geodesic

In [53]:
def get_time_of_day(hour):
    """Bucket an hour of the day into a named day part.

    Half-open ranges: [5, 12) -> 'morning', [12, 17) -> 'afternoon',
    [17, 21) -> 'evening'. Any other value falls through to 'night'.
    """
    day_parts = (
        (5, 12, 'morning'),
        (12, 17, 'afternoon'),
        (17, 21, 'evening'),
    )
    for first_hour, end_hour, label in day_parts:
        if first_hour <= hour < end_hour:
            return label
    # everything outside the ranges above (late evening / early morning)
    return 'night'

# Collapse the raw sport types into five broad groups (Ride / Walk / Run /
# Hike / Workout). Members are listed per group and then inverted into the
# sport_type -> group lookup used to build the grouped sport-type column.
_SPORT_GROUP_MEMBERS = {
    'Ride': (
        'Ride', 'EBikeRide', 'VirtualRide', 'MountainBikeRide',
        'EMountainBikeRide', 'GravelRide',
    ),
    'Walk': ('Walk', 'Snowshoe'),
    'Run': ('Run', 'TrailRun', 'VirtualRun'),
    'Hike': ('Hike',),
    # everything else is treated as a generic workout
    'Workout': (
        'InlineSkate', 'Workout', 'Rowing', 'AlpineSki', 'Snowboard', 'Swim',
        'Crossfit', 'RollerSki', 'WeightTraining', 'BackcountrySki',
        'Kayaking', 'NordicSki', 'IceSkate', 'Yoga', 'StandUpPaddling',
        'Windsurf', 'RockClimbing', 'Elliptical', 'Surfing', 'Canoeing',
        'Velomobile', 'StairStepper', 'Sail', 'Skateboard', 'Soccer',
        'TableTennis', 'Pilates', 'HighIntensityIntervalTraining',
        'VirtualRow', 'Golf', 'Tennis', 'Kitesurf', 'Pickleball', 'Squash',
        'Badminton', 'Racquetball', 'Wheelchair',
    ),
}

sport_types_dict = {
    sport: group
    for group, sports in _SPORT_GROUP_MEMBERS.items()
    for sport in sports
}

# Load & wrangle the data

In [52]:
# raw export is tab-separated and latin-1 encoded
raw_activities = pd.read_csv("data/activities_export.tsv", sep="\t", encoding="latin-1")

# keep only rows whose 'private' flag is 0 (drops activities set to view="only_me")
is_public = raw_activities['private'] == 0
df = raw_activities[is_public]

# drop workouts that have GPS data but cover less than 200 meters in total.
# Written as "keep if no polyline OR not (distance < 200)" so rows with a
# missing distance are kept, exactly like the negated-AND form.
lacks_gps = df['map_summary_polyline'].isnull()
under_200m = df['distance'] < 200
df = df[lacks_gps | ~under_200m]

print(len(df))

399646


In [54]:
# Derive the per-activity feature columns used for modeling.
# Ratios whose denominator can be 0 (distance, elapsed time) are scrubbed of
# +/-inf so the exported table never contains infinite values.
INF_VALUES = [np.inf, -np.inf]
METERS_PER_MILE = 1609.344  # international mile
FEET_PER_METER = 3.28084

# clean the polyline code (need to ditch the double escape)
df['map_summary_polyline'] = df['map_summary_polyline'].str.replace('\\\\', '\\', regex=False)

# convert start_date_local from str to datetime object
df['start_date'] = pd.to_datetime(df['start_date_local'])

# start_date includes year, month, day, and timestamp --> ex: 2011-04-06 15:20:10
# get hour from start_date, then compute day_part column
df['hour'] = df['start_date'].dt.hour
df['day_part'] = df['hour'].apply(get_time_of_day)

# extract month (which later may be useful in conjunction with is_northern_hemisphere --> Winter, Summer, etc)
df['month'] = df['start_date'].dt.month

# also create is_weekend flag (dayofweek: Monday=0 ... Sunday=6)
df['dayofweek'] = df['start_date'].dt.dayofweek
df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)

# people usually talk distance in terms of miles or kilometers, not meters
df['miles'] = df['distance'] / METERS_PER_MILE
df['kilometers'] = df['distance'] / 1000

# avg speed of the workout
# speed_mph will be inf if elapsed_time == 0, will be NaN if miles == 0 and elapsed_time == 0
# but there are valid reasons why these values might be 0, so replace inf and NaN values with 0s
df['speed_mph'] = df['miles'] / (df['elapsed_time'] / 3600)
df['speed_mph'] = df['speed_mph'].replace(INF_VALUES, np.nan).fillna(0)  # stationary activities

# create flag to show if activity was in the northern hemisphere
# 1 = northern hemisphere, 0 = southern, <NA> = no start latitude recorded.
# A plain (start_lat > 0).astype(int) would label every activity without a
# start latitude as "southern", so missing latitudes are kept missing.
has_start_lat = df['start_lat'].notnull()
df['is_northern_hemisphere'] = (df['start_lat'] > 0).astype('Int64').where(has_start_lat)

# we have total_elevation_gain, which is in meters. But this isn't super helpful on its own:
# I suspect gain-over-distance is more predictive.
# Zero-distance activities have no defined gradient: turn the resulting inf into NaN.
df['meters_per_km'] = (df['total_elevation_gain'] / df['kilometers']).replace(INF_VALUES, np.nan)
df['feet_per_mile'] = (
    (df['total_elevation_gain'] * FEET_PER_METER) / df['miles']
).replace(INF_VALUES, np.nan)  # for the Americans lmao

# having the time metrics available in minutes (as well as seconds) will be helpful
df['moving_minutes'] = df['moving_time'] / 60
df['elapsed_minutes'] = df['elapsed_time'] / 60

# Make a moving:elapsed time ratio (was there constant movement or was there lot of total downtime?)
# elapsed_time == 0 would give inf; treat it as missing instead
df['moving_time_per'] = (df['moving_time'] / df['elapsed_time']).replace(INF_VALUES, np.nan)

# Make a boolean for gear_id : is gear added or not (depends on user input)
df['has_gear'] = df['gear_id'].notnull()

# create a grouped sport type column using dict defined above
# (sport types missing from the dict become NaN)
df['sport_type_grouped'] = df['sport_type'].map(sport_types_dict)

# 'activity_title' is a much more descriptive column name than 'name', 'id' instead of 'strava_activity_id' is for convenience
df = df.rename(columns={'name': 'activity_title', 'strava_activity_id':'id'})

# mask the IDs to hide PII: replace each distinct value with a sequential
# integer (1, 2, 3, ...) assigned in order of first appearance
df['id'] = df['id'].map({id_val: i for i, id_val in enumerate(df['id'].unique(), 1)})
df['user_id'] = df['user_id'].map({uid: i for i, uid in enumerate(df['user_id'].unique(), 1)})

## GPX Features

Workouts recorded with a GPS device store their route as an encoded map polyline. These encoded strings can be decoded into arrays of latitude and longitude coordinates.

See https://developers.google.com/maps/documentation/routes/polylinedecoder. 

We will extract the following information from the GPX data:
- Number of total turns
- Avg turns per mile
- Wobble: how much the trace curves and winds rather than following straight lines (gently rounded, winding paths vs. rigid straight segments)
- Sprawl: the bounding-box diagonal distance (straight-line distance in miles between two opposite corners of the bounding box around a trace), used as a proxy for compact vs. sprawling routes


Note that many of the activities are manual entries, indoor activities, stationary activities, etc., and are therefore missing `map_summary_polyline` values.

In [125]:
def decode_polyline(poly, step=10):
    """
    Decode an encoded polyline (map_summary_polyline, in this case) into a
    down-sampled list of (lat, lng) coordinates.

    params:
        poly: encoded polyline string; null / empty values are skipped
              (indoor activities or workouts with no GPS)
        step: keep every `step`-th decoded point (default 10)

    returns: list of coordinates, or None when the value is missing, cannot
    be decoded, or fewer than 2 points remain after down-sampling
    """
    if pd.isnull(poly) or poly == "":
        return None
    try:
        coords = polyline.decode(poly)[::step]
    except Exception:
        # best effort: a malformed polyline is treated as "no GPS data".
        # Deliberately not a bare `except:` so that KeyboardInterrupt /
        # SystemExit still stop a long-running loop over many rows.
        return None
    return coords if len(coords) >= 2 else None

def bearing(p1, p2):
    """
    Return the initial compass bearing, in degrees, for travelling from p1 to p2.

    params: two points ([lat, lng]) given in degrees
    The inputs are converted to radians for the spherical-geometry formula,
    and the arctan2 result is converted back to degrees, so the value lies
    in [-180, 180] (0 = north, 90 = east, -90 = west).
    """
    lat_from, lng_from = np.radians(p1)
    lat_to, lng_to = np.radians(p2)

    # eastward and northward components of the direction of travel
    east_component = np.sin(lng_to - lng_from) * np.cos(lat_to)
    north_component = (
        np.cos(lat_from) * np.sin(lat_to)
        - np.sin(lat_from) * np.cos(lat_to) * np.cos(lng_to - lng_from)
    )

    angle_rad = np.arctan2(east_component, north_component)
    return np.degrees(angle_rad)

def calc_turn_metrics(coords, total_miles, turn_threshold=45):
    """
    returns a tuple:
        - num_turns: number of turns in a GPS trace
        - turns_per_mile: num_turns / total_miles
        - wobble: summed sub-threshold direction change per mile
    turns_per_mile and wobble are None when total_miles is not > 0.

    params:
        coords: an entire array of lat,lng coordinates
        total_miles: total distance of the activity, in miles
        turn_threshold: angle (in degrees) a direction change must exceed to
            count as a 'turn' (default 45)

    Uses bearing() to get the heading of every segment, then looks at the
    change in heading between consecutive segments.

    Both num_turns and wobble consider changes in direction, but:
      - num_turns only counts changes above turn_threshold
      - wobble sums the magnitude of the changes at or below turn_threshold
    So if someone runs a square, and it's all straight lines, even with the
    rigid turns, the route is not thought of as 'wobbly'. We would likely see
    the highest wobble on trail runs - following a winding path - rather than
    in a city with straight sidewalks.
    """
    # Drop consecutive duplicate points: a zero-length segment has no heading,
    # but bearing() would report 0 (arctan2(0, 0)), i.e. a fake "due north"
    # leg that injects spurious direction changes.
    points = []
    for pt in coords:
        if not points or tuple(pt) != tuple(points[-1]):
            points.append(pt)

    # heading of each segment, then change in heading between adjacent segments
    bearings = [bearing(start, end) for start, end in zip(points, points[1:])]
    changes = [abs(nxt - prev) for prev, nxt in zip(bearings, bearings[1:])]
    # handle wrap-around (going from 350 -> 10 should be thought of as 20 degrees, not 340)
    changes = [c if c <= 180 else 360 - c for c in changes]

    # final metrics
    num_turns = sum(c > turn_threshold for c in changes)  # total number of turns
    if total_miles > 0:
        wobble = sum(c for c in changes if c <= turn_threshold) / total_miles
        turns_per_mile = num_turns / total_miles
    else:
        # zero / missing distance: per-mile rates are undefined
        wobble = None
        turns_per_mile = None

    return num_turns, turns_per_mile, wobble

def calc_bbox_diagonal(coords):
    """
    Return the length (in miles) of the diagonal of the bounding box around a trace,
    measured from the (min lat, min lng) corner to the (max lat, max lng) corner.
    This is a proxy for a compact route (loops in a park) vs a sprawling route
    (big rectangle around a neighborhood).

    param: the array of lat,lng coords
    """
    south = min(pt[0] for pt in coords)
    north = max(pt[0] for pt in coords)
    west = min(pt[1] for pt in coords)
    east = max(pt[1] for pt in coords)
    diagonal = geodesic((south, west), (north, east))
    return diagonal.miles

In [138]:
# Compute the GPS-derived features for every activity.
# Rows are collected positionally and written back in one step, so the four
# feature columns always exist (as float, NaN where there is no usable GPS
# trace) even if no activity has a decodable polyline.
gps_feature_cols = ['num_turns', 'turns_per_mile', 'wobble', 'sprawl']
no_gps_row = (np.nan,) * len(gps_feature_cols)

gps_rows = []
for poly, miles in zip(df['map_summary_polyline'], df['miles']):
    coords = decode_polyline(poly)  # returns None if there is no polyline code
    if not coords:  # skip when null
        gps_rows.append(no_gps_row)
        continue
    num_turns, turns_per_mile, wobble = calc_turn_metrics(coords, miles)
    sprawl = calc_bbox_diagonal(coords)
    # calc_turn_metrics returns None for per-mile rates when miles is not > 0
    gps_rows.append(tuple(
        np.nan if value is None else value
        for value in (num_turns, turns_per_mile, wobble, sprawl)
    ))

# reshape keeps the (n_rows, 4) shape even when df is empty
gps_values = np.array(gps_rows, dtype=float).reshape(len(df), len(gps_feature_cols))
for col_pos, col in enumerate(gps_feature_cols):
    df[col] = gps_values[:, col_pos]

# Export for use in next notebook: `02-eda.ipynb`

In [159]:
# columns carried forward into the exported dataset, grouped by theme
features = [
    # identifiers & sport type
    'id', 'user_id', 'sport_type', 'sport_type_grouped',
    # speed & distance
    'speed_mph', 'distance', 'miles', 'kilometers',
    # time
    'moving_time', 'elapsed_time', 'moving_minutes', 'elapsed_minutes', 'moving_time_per',
    # elevation
    'total_elevation_gain', 'meters_per_km', 'feet_per_mile',
    # activity flags & metadata
    'commute', 'manual', 'has_gear', 'suffer_score', 'kudos_count', 'device_name',
    # when & where
    'start_date', 'hour', 'day_part', 'month', 'dayofweek', 'is_weekend', 'is_northern_hemisphere',
    # GPS-derived features
    'num_turns', 'turns_per_mile', 'wobble', 'sprawl',
]

# keep only those columns, in the order listed
df = df.loc[:, features]

In [160]:
# quick sanity check on the size of the table about to be exported
n_rows, n_cols = df.shape
print(f"Data has {n_rows} rows and {n_cols} columns")

Data has 398442 rows and 32 columns


In [161]:
# write the current df to CSV; index=False leaves the DataFrame index out of the file
df.to_csv("data/processed_activities.csv", index=False)

# Also export a smaller sample

In [9]:
# Re-load the processed export from the path this notebook writes it to
# (data/processed_activities.csv); reading a bare "processed_activities.csv"
# could pick up a stale copy or fail on a fresh top-to-bottom run.
df = pd.read_csv("data/processed_activities.csv")

# Take up to 10,000 rows; capping at len(df) avoids the ValueError that
# sample() raises when asked for more rows than exist, and the fixed seed
# makes the sample reproducible across runs.
sample_size = min(10000, len(df))
df = df.sample(sample_size, random_state=42).reset_index(drop=True)
df.to_csv("sample_workout_data.csv", index=False)