In [1]:
import os
from datetime import datetime

import pandas as pd
import requests

## Get the authorization code and activity data

In [68]:
# Open the printed link in a browser and approve access. After approving you
# are redirected to the redirect_uri given in the link; copy the value of the
# `code` query parameter from that redirect URL.
# The link uses https so the authorization request is not sent in clear text.
print("https://www.strava.com/oauth/authorize?client_id=47842&response_type=code&redirect_uri=http://localhost/exchange_token&approval_prompt=force&scope=activity:read_all")

http://www.strava.com/oauth/authorize?client_id=47842&response_type=code&redirect_uri=http://localhost/exchange_token&approval_prompt=force&scope=activity:read_all


In [2]:
# set strava variables
# Secrets are read from environment variables so they never have to be saved
# inside the notebook. When a variable is unset the value falls back to an
# empty string, which can be replaced by hand for a one-off interactive run
# (do not commit the filled-in value).
client_id = 47842
client_secret = os.environ.get('STRAVA_CLIENT_SECRET', '') # get this from https://www.strava.com/settings/api
code = os.environ.get('STRAVA_AUTH_CODE', '') # get this from the authorization URL's redirect (the `code` query parameter)

In [None]:
# get access token
def get_access_token(client_id, client_secret, code):
    """Exchange an OAuth authorization code for a Strava access token.

    Parameters
    ----------
    client_id
        The application's client id, sent as the ``client_id`` form field.
    client_secret
        The application's client secret, sent as ``client_secret``.
    code
        The authorization code taken from the OAuth redirect URL.

    Returns
    -------
    The value of the ``access_token`` field in the JSON response.

    Raises
    ------
    requests.HTTPError
        If the token endpoint answers with a 4xx/5xx status.
    KeyError
        If the JSON response contains no ``access_token`` field.
    """
    oauth_url = 'https://www.strava.com/oauth/token'
    payload = {
        'client_id': client_id,
        'client_secret': client_secret,
        'code': code,
        'grant_type': 'authorization_code'}
    # Certificate verification is left at the requests default (enabled):
    # this request carries the client secret, so the server must be verified.
    # The timeout keeps a stalled connection from hanging the notebook.
    r = requests.post(oauth_url, data=payload, timeout=30)
    # Fail here with the HTTP status rather than with a KeyError further down.
    r.raise_for_status()
    access_token = r.json()['access_token']

    return access_token

# use access token to get activities
def get_activities(access_token, per_page=200, page=1):
    """Fetch one page of the authenticated athlete's activities.

    Parameters
    ----------
    access_token : str
        OAuth access token; sent only in the ``Authorization`` header.
    per_page : int, optional
        Number of activities requested per page (default 200).
    page : int, optional
        Page number to request (default 1).

    Returns
    -------
    The decoded JSON body of the response (expected to be a list of
    activity records, one per activity).

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    """
    activities_url = 'https://www.strava.com/api/v3/athlete/activities'
    headers = {'Authorization': 'Bearer ' + access_token}
    # The token is deliberately NOT repeated in the query string: the bearer
    # header already authenticates the call, and URLs tend to end up in logs.
    params = {'per_page': per_page, 'page': page}
    response = requests.get(
        activities_url,
        headers=headers,
        params=params,
        timeout=30)
    # Surface HTTP errors here instead of returning an error payload as data.
    response.raise_for_status()
    return response.json()

access_token = get_access_token(client_id, client_secret, code)

# Loop in order to get all activities, since the API returns a limited number
# per call. The page size is named once so the request and the stop
# condition cannot drift apart.
per_page = 200
more_data_to_collect = True
activities_data = []
page_num = 1

while more_data_to_collect:
    print(page_num)
    activities = get_activities(access_token, per_page=per_page, page=page_num)
    # An error payload is not a list of activities; stop with a clear message
    # instead of failing obscurely or merging it into the results.
    if not isinstance(activities, list):
        raise RuntimeError(
            "Unexpected response on page {}: {!r}".format(page_num, activities))
    # extend() appends in place rather than rebuilding the list on every page
    activities_data.extend(activities)
    page_num += 1
    # a page shorter than the requested size is the last one
    if len(activities) < per_page:
        more_data_to_collect = False

# convert list of activities to dataframe
df = pd.DataFrame(activities_data)

## Some basic data prep

In [6]:
# convert string datetime stamp to date object
def dateConvert(value):
    """Parse a 'YYYY-MM-DDTHH:MM:SSZ' timestamp string and return its date part.

    Raises ValueError when `value` does not match that format.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
    parsed = datetime.strptime(value, timestamp_format)
    return parsed.date()

# Retrieve year from date column 
def getYear(value):
    """Return the first four characters of `value` rendered as a string.

    For a timestamp string that starts with the year, this is the year.
    """
    text = str(value)
    year_prefix = text[0:4]
    return year_prefix

def stringAndDropTime(value):
    """Render `value` as a string and keep only the text before the first space.

    For a 'YYYY-MM-DD HH:MM:SS' rendering this leaves just 'YYYY-MM-DD';
    a string without a space is returned unchanged.
    """
    before_space, _, _ = str(value).partition(" ")
    return before_space

# keep only run activities; .copy() gives an independent frame so the column
# assignments below do not write into a slice of the raw data
# (avoids pandas' SettingWithCopyWarning)
df = df[df['type']=='Run'].copy()

# keep only runs whose local start date falls in 2022
df['year'] = df['start_date_local'].apply(getYear)
df = df[df['year']=='2022']

# renumber rows 0..n-1 without keeping the old index as a column
df = df.reset_index(drop=True)

# convert meters to miles (0.000621371 miles per meter; the Strava API
# reports `distance` in meters)
df['miles'] = df['distance'] * 0.000621371

# convert the timestamp string to a date object
df['date'] = df['start_date_local'].apply(dateConvert)

# only need these 2 columns
df = df[["date", "miles"]]

# group by date to combine mileage on days with 2+ separate runs
df = df.groupby('date').sum()
df = df.reset_index()

### Fill In Missing Dates: currently, there are no rows representing dates when I didn't run. For plotting purposes, let's add a row for every date, even when miles run == 0.

In [None]:
# one row per calendar day between the first and last date present in df
dates = pd.date_range(start=df['date'].min(), end=df['date'].max())
dates = pd.DataFrame({'date':dates}) # convert to df
dates = dates[dates['date']>="2018-11-29"] # this is the date I registered with Strava
dates = dates.reset_index(drop=True)

# put both date columns into the same string form (text before the first
# space, i.e. without a time part) so the merge keys compare equal
dates['date'] = dates['date'].apply(stringAndDropTime)
df['date'] = df['date'].apply(stringAndDropTime)

# left merge on the date key so every day is kept; days without a run get 0 miles
runs = dates.merge(df, how='left', on='date').sort_values('date', ascending=True).fillna(0)

# sanity check: total miles; sum only the miles column instead of summing
# every column (which would also concatenate all the date strings)
print(runs['miles'].sum())

In [24]:
# write the per-day mileage table to CSV, leaving out the row index
runs.to_csv(
    path_or_buf="mileage-goal/data/2022_mileage.csv",
    index=False,
)