In [2]:
import requests
import json
import time
import os
import pandas as pd
import dlt
from dlt.sources.filesystem import filesystem, read_csv, read_jsonl
from dotenv import load_dotenv

In [3]:
# === CONFIGURATION ===
load_dotenv()
CLIENT_ID = os.getenv(f'CLIENT_ID')
CLIENT_SECRET = os.getenv(f'CLIENT_SECRET')

TOKENS_FILE = 'secrets/strava_tokens.json'

In [None]:
# === LOAD OR REFRESH TOKENS ===
def load_tokens():
    if os.path.exists(TOKENS_FILE):
        with open(TOKENS_FILE, 'r') as f:
            return json.load(f)
    else:
        raise FileNotFoundError("Token file not found. Authorize first and save your tokens.")

def save_tokens(tokens):
    with open(TOKENS_FILE, 'w') as f:
        json.dump(tokens, f)

def refresh_tokens(tokens):
    if time.time() > tokens['expires_at']:
        print("Access token expired. Refreshing...")
        response = requests.post("https://www.strava.com/oauth/token", data={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'grant_type': 'refresh_token',
            'refresh_token': tokens['refresh_token']
        })
        new_tokens = response.json()
        tokens.update({
            'access_token': new_tokens['access_token'],
            'refresh_token': new_tokens['refresh_token'],
            'expires_at': new_tokens['expires_at']
        })
        save_tokens(tokens)
    return tokens

# === GET ACTIVITIES ===
def get_activities(access_token, per_page=30):
    headers = {'Authorization': f"Bearer {access_token}"}
    page = 1

    while True:
        response = requests.get(
            'https://www.strava.com/api/v3/athlete/activities',
            headers=headers,
            params={'per_page': per_page, 'page': page}
        )
        data = response.json()

        if not data:
            break

        for activity in data:
            yield activity
        
        page += 1

In [None]:

# === GET ACTIVITIES ===
def get_activities(access_token, per_page=30):
    headers = {'Authorization': f"Bearer {access_token}"}
    page = 1

    while True:
        response = requests.get(
            'https://www.strava.com/api/v3/athlete/activities',
            headers=headers,
            params={'per_page': per_page, 'page': page}
        )
        data = response.json()

        if not data:
            break

        for activity in data:
            yield activity
        
        page += 1

In [18]:
def get_athlete(access_token):
    headers = {'Authorization': f"Bearer {access_token}"}

    response = requests.get(
        'https://www.strava.com/api/v3/athlete', #/zones
        headers=headers,
        #params={'per_page': per_page, 'page': page}
    )
    data = response.json()

    return data

athlete_data = get_athlete(ACCESS_TOKEN)
athlete_data

{'message': 'Rate Limit Exceeded',
 'errors': [{'resource': 'Application',
   'field': 'read rate limit',
   'code': 'exceeded'}]}

In [5]:
# === MAIN FLOW ===
tokens = load_tokens()
tokens = refresh_tokens(tokens)
ACCESS_TOKEN=tokens['access_token']

activities_generator = get_activities(ACCESS_TOKEN)

activities = [activity for activity in activities_generator]

# === DISPLAY RESULTS ===
# for act in activities:
#     print(f"{act['start_date'][:10]} - {act['name']} - {act['distance']/1000:.2f} km")

In [6]:
activities

['message', 'errors']

In [5]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
from dlt.sources.helpers.rest_client.paginators import OffsetPaginator

os.environ["ACCESS_TOKEN"] = ACCESS_TOKEN
@dlt.source
def strava_source(
    access_token=dlt.secrets.value
):
    client = RESTClient(
        base_url='https://www.strava.com/api/v3/',
        auth=BearerTokenAuth(token=access_token),
        paginator=OffsetPaginator(
            limit=100,
            limit_param='per_page',
            offset=1,
            offset_param='page',
            stop_after_empty_page=True,
            total_path=None
        )
    )

    @dlt.resource(
        write_disposition="replace",
        #primary_key="id",
        max_table_nesting=None,
    )
    def activities():
        for page in client.paginate("athlete/activities"):
            yield page

    @dlt.resource(
        write_disposition="replace",
        #primary_key="id",
        max_table_nesting=None,
    )
    def athlete():
        for page in client.paginate("athlete"):
            yield page

    return activities, athlete

In [6]:
os.environ["NORMALIZE__DATA_WRITER__DISABLE_COMPRESSION"] = "true"
pipeline_s3 = dlt.pipeline(
    pipeline_name="strava_to_s3",       # you can keep the same name if you like
    destination="filesystem",            # ← switch to filesystem
    dataset_name="strava_activities_s3"  # name for the S3 “folder” in your bucket
)
load_info_s3 = pipeline_s3.run(strava_source(), loader_file_format = "jsonl")
#print(load_info_s3)

PipelineStepFailed: Pipeline execution failed at stage extract when processing package 1748600330.7960694 with exception:

<class 'dlt.extract.exceptions.ResourceExtractionError'>
In processing pipe athlete: extraction of resource athlete in generator athlete caused an exception: 429 Client Error: Too Many Requests for url: https://www.strava.com/api/v3/athlete?page=7201&per_page=100

In [None]:
pipeline = dlt.pipeline(
    pipeline_name="strava_rest_api_redshift",
    destination="redshift",
)
load_info = pipeline.run(strava_source())

In [None]:
# Define a dlt pipeline with automatic normalization
pipeline = dlt.pipeline(
    destination="duckdb",
    pipeline_name="strava_rest_api",
)

# run the pipeline with the new resource
load_info = pipeline.run(strava_source())

In [None]:
pipeline.dataset(dataset_type="default").schema.data_table_names()

['activities', 'activities__start_latlng', 'activities__end_latlng']

In [None]:
pipeline.dataset(dataset_type="default").activities.df()

Unnamed: 0,resource_state,athlete__id,athlete__resource_state,name,distance,moving_time,elapsed_time,total_elevation_gain,type,sport_type,...,upload_id_str,external_id,from_accepted_tag,pr_count,total_photo_count,has_kudoed,_dlt_load_id,_dlt_id,total_elevation_gain__v_double,workout_type
0,2,148269563,1,Morning Walk,1410.3,982,982,,Walk,Walk,...,15611268529,garmin_ping_443437184249,False,0,0,False,1748598390.470637,amFSXtJNuVyt/w,10.8,
1,2,148269563,1,Morning Run,6454.8,1802,1802,0,Run,Run,...,15601242833,garmin_ping_443140876535,False,0,0,False,1748598390.470637,m6t8iGug/dywtQ,,
2,2,148269563,1,Morning Run,6938.3,1801,1801,0,Run,Run,...,15597334634,garmin_ping_443006351934,False,0,0,False,1748598390.470637,GEMoMKHWo2IX2w,,
3,2,148269563,1,Morning Walk,1166.9,897,897,,Walk,Walk,...,15588803482,garmin_ping_442780035147,False,0,0,False,1748598390.470637,tp6ky7yFiivshg,9.7,
4,2,148269563,1,Morning Walk,1422.2,938,938,,Walk,Walk,...,15579558715,garmin_ping_442500649377,False,0,0,False,1748598390.470637,1OAq369aiSRV6A,14.5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,2,148269563,1,Evening Run,568.6,3321,3321,0,Run,Run,...,14524948604,garmin_ping_410575511422,False,0,0,False,1748598390.470637,qojgnsRR9R7FLA,,
72,2,148269563,1,Morning Run,7129.3,2613,2613,0,Run,Run,...,13310815861,stripped_garmin_ping_372336753957,False,0,0,False,1748598390.470637,cpjhAQjaQPcKPw,,
73,2,148269563,1,Morning Run,5099.3,1533,1533,0,Run,Run,...,13310815879,stripped_garmin_ping_372336753968,False,0,0,False,1748598390.470637,VFBYfJXOlrr9Zg,,
74,2,148269563,1,Morning Run,6040.3,1589,1589,0,Run,Run,...,13310818837,stripped_garmin_ping_372336816902,False,0,0,False,1748598390.470637,Bas+zTDJ851PKQ,,


In [None]:
import duckdb

conn = duckdb.connect(f"strava_rest_api.duckdb")
conn.sql(f"SET search_path = 'strava_rest_api'")
conn.sql("DESCRIBE").df()

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,strava_rest_api,strava_rest_api_dataset,_dlt_loads,"[load_id, schema_name, status, inserted_at, sc...","[VARCHAR, VARCHAR, BIGINT, TIMESTAMP WITH TIME...",False
1,strava_rest_api,strava_rest_api_dataset,_dlt_pipeline_state,"[version, engine_version, pipeline_name, state...","[BIGINT, BIGINT, VARCHAR, VARCHAR, TIMESTAMP W...",False
2,strava_rest_api,strava_rest_api_dataset,_dlt_version,"[version, engine_version, inserted_at, schema_...","[BIGINT, BIGINT, TIMESTAMP WITH TIME ZONE, VAR...",False
3,strava_rest_api,strava_rest_api_dataset,activities,"[resource_state, athlete__id, athlete__resourc...","[BIGINT, BIGINT, BIGINT, VARCHAR, DOUBLE, BIGI...",False
4,strava_rest_api,strava_rest_api_dataset,activities__end_latlng,"[value, _dlt_parent_id, _dlt_list_idx, _dlt_id]","[DOUBLE, VARCHAR, BIGINT, VARCHAR]",False
5,strava_rest_api,strava_rest_api_dataset,activities__start_latlng,"[value, _dlt_parent_id, _dlt_list_idx, _dlt_id]","[DOUBLE, VARCHAR, BIGINT, VARCHAR]",False


In [None]:
from dlt.sources.rest_api import RESTAPIConfig, rest_api_source

config: RESTAPIConfig = {
    "client": {
        "base_url": "https://www.strava.com/api/v3/",
        "auth": {
            "token": ACCESS_TOKEN, # <--- we already configured access_token above
        },
        "paginator": OffsetPaginator(
            limit=100,
            limit_param='per_page',
            offset=1,
            offset_param='page',
            stop_after_empty_page=True,
            total_path=None
        )
    },
    "resources": [  # <--- list resources
        {
            "name": "activities",
            "endpoint": {
                "path": "athlete/activities",
            },
            "write_disposition": "replace",  # ✅ Merge mode
            #"primary_key": "id" 
        },
    ],
}

strava_source = rest_api_source(config)


pipeline_from_config = dlt.pipeline(
    pipeline_name="strava_rest_api_from_config",
    destination="duckdb",
    #dataset_name="rest_api_data",
    dev_mode=True,
)

load_info = pipeline_from_config.run(strava_source)

In [None]:
conn = duckdb.connect(f"strava_rest_api_from_config.duckdb")
conn.sql(f"SET search_path = 'strava_rest_api_from_config'")
conn.sql("DESCRIBE").df()

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,strava_rest_api_from_config,strava_rest_api_from_config_dataset_2025051304...,_dlt_loads,"[load_id, schema_name, status, inserted_at, sc...","[VARCHAR, VARCHAR, BIGINT, TIMESTAMP WITH TIME...",False
1,strava_rest_api_from_config,strava_rest_api_from_config_dataset_2025051304...,_dlt_pipeline_state,"[version, engine_version, pipeline_name, state...","[BIGINT, BIGINT, VARCHAR, VARCHAR, TIMESTAMP W...",False
2,strava_rest_api_from_config,strava_rest_api_from_config_dataset_2025051304...,_dlt_version,"[version, engine_version, inserted_at, schema_...","[BIGINT, BIGINT, TIMESTAMP WITH TIME ZONE, VAR...",False
3,strava_rest_api_from_config,strava_rest_api_from_config_dataset_2025051304...,activities,"[resource_state, athlete__id, athlete__resourc...","[BIGINT, BIGINT, BIGINT, VARCHAR, DOUBLE, BIGI...",False
4,strava_rest_api_from_config,strava_rest_api_from_config_dataset_2025051304...,activities__end_latlng,"[value, _dlt_root_id, _dlt_parent_id, _dlt_lis...","[DOUBLE, VARCHAR, VARCHAR, BIGINT, VARCHAR]",False
5,strava_rest_api_from_config,strava_rest_api_from_config_dataset_2025051304...,activities__start_latlng,"[value, _dlt_root_id, _dlt_parent_id, _dlt_lis...","[DOUBLE, VARCHAR, VARCHAR, BIGINT, VARCHAR]",False
6,strava_rest_api_from_config,strava_rest_api_from_config_dataset_2025051304...,_dlt_version,"[version, engine_version, inserted_at, schema_...","[BIGINT, BIGINT, TIMESTAMP WITH TIME ZONE, VAR...",False
7,strava_rest_api_from_config,strava_rest_api_from_config_dataset_2025051304...,activities,"[resource_state, athlete__id, athlete__resourc...","[BIGINT, BIGINT, BIGINT, VARCHAR, DOUBLE, BIGI...",False
8,strava_rest_api_from_config,strava_rest_api_from_config_dataset_2025051304...,activities__end_latlng,"[value, _dlt_root_id, _dlt_parent_id, _dlt_lis...","[DOUBLE, VARCHAR, VARCHAR, BIGINT, VARCHAR]",False
9,strava_rest_api_from_config,strava_rest_api_from_config_dataset_2025051304...,activities__start_latlng,"[value, _dlt_root_id, _dlt_parent_id, _dlt_lis...","[DOUBLE, VARCHAR, VARCHAR, BIGINT, VARCHAR]",False
