In [None]:
import os
import re
import sys

from dotenv import load_dotenv
from google.cloud import bigquery
import pandas as pd

# Put the parent of the current working directory on the import path so the
# `src.*` imports below resolve (assumes the kernel's working directory sits
# one level below the project root — TODO confirm).
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.config.settings import BIGQUERY_DATASET, BIGQUERY_RAW_TABLE, GCP_PROJECT_ID
from src.etl.extractors.strava import (
    StravaEndpoints,
    StravaExtractor,
    get_fresh_access_token,
)

# Get Strava data

In [None]:
# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Credentials passed to get_fresh_access_token() in the next cell.
CLIENT_ID = os.getenv('CLIENT_ID')
CLIENT_SECRET = os.getenv('CLIENT_SECRET')
REFRESH_TOKEN = os.getenv('REFRESH_TOKEN')

# Fail fast with a readable message: os.getenv() silently returns None for an
# unset variable, which would otherwise only surface later as an opaque
# failure inside the token request.
_missing_env = [
    name
    for name, value in (
        ('CLIENT_ID', CLIENT_ID),
        ('CLIENT_SECRET', CLIENT_SECRET),
        ('REFRESH_TOKEN', REFRESH_TOKEN),
    )
    if not value
]
if _missing_env:
    raise RuntimeError(
        'Missing required environment variables: ' + ', '.join(_missing_env)
    )

In [None]:
# Presumably the URL of the Strava activities endpoint — confirm against
# StravaEndpoints.
# NOTE(review): `activities_url` is not referenced by any other cell visible
# in this notebook.
activities_url = StravaEndpoints.get_activities()
# Request an access token using the credentials loaded from the environment.
access_token = get_fresh_access_token(CLIENT_ID, CLIENT_SECRET, REFRESH_TOKEN)

# Bearer-token Authorization header built from the access token.
# NOTE(review): `header` is also unused in the visible cells; StravaExtractor
# is given the token directly.
header = {'Authorization': f'Bearer {access_token}'}

In [None]:
# Fetch every activity through the project's StravaExtractor, authenticated
# with the access token obtained above.
extractor = StravaExtractor(access_token)
all_activities = extractor.fetch_all_activities()

# Show only how many records came back instead of echoing the whole payload:
# a full dump bloats the saved notebook and can expose personal activity data
# when the notebook is shared.
# assumes fetch_all_activities() returns a sized collection (e.g. a list of
# dicts) — TODO confirm
len(all_activities)

In [None]:
# Flatten the fetched activity records into a DataFrame; pd.json_normalize
# expands nested dict fields into dot-separated column names. This raw frame
# is left untouched — the cleaning cell below works on a copy.
df_all_activities_raw = pd.json_normalize(all_activities)
# Preview the first rows only.
df_all_activities_raw.head()

In [None]:
# Data cleaning for BigQuery
# Work on a copy so the raw frame stays available and this cell is re-runnable.
df_all_activities = df_all_activities_raw.copy()


def _to_bigquery_column_name(name):
    """Return ``name`` rewritten as a BigQuery-safe column identifier.

    Every character outside ``[a-zA-Z0-9_]`` is replaced with an underscore.
    BigQuery additionally requires a column name to start with a letter or an
    underscore, so a name that would start with a digit gets an underscore
    prefix.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9_]', '_', str(name))
    if cleaned[:1].isdigit():
        cleaned = f'_{cleaned}'
    return cleaned


df_all_activities.columns = [
    _to_bigquery_column_name(col) for col in df_all_activities.columns
]

# Distinct source names can collapse to the same cleaned name (for example
# "a.b" and "a_b" both become "a_b"). A table cannot hold two columns with the
# same name, so report the clash here instead of letting the upload fail later.
_duplicated_columns = df_all_activities.columns[
    df_all_activities.columns.duplicated()
].tolist()
if _duplicated_columns:
    raise ValueError(
        f'Column names collide after sanitising: {sorted(set(_duplicated_columns))}'
    )

# GCP Exploration

In [None]:
# Service-account key file used to create the BigQuery client (relative path).
CREDENTIALS_PATH = '../credentials/sa-athlete-dashboard.json'
# Target table ID in `project.dataset.table` form, assembled from the values
# imported from src.config.settings.
TABLE_ID = f'{GCP_PROJECT_ID}.{BIGQUERY_DATASET}.{BIGQUERY_RAW_TABLE}'
# Print the resolved ID as a quick check of the configuration.
print(TABLE_ID)

In [None]:
# Create the BigQuery client from the service-account key file. The project
# comes from GCP_PROJECT_ID — the same setting TABLE_ID is built from — rather
# than a hard-coded literal, so the client's project and the target table
# cannot drift apart.
client = bigquery.Client.from_service_account_json(
    CREDENTIALS_PATH, project=GCP_PROJECT_ID
)

## Upload data

In [None]:
# Settings for the load job that uploads the DataFrame.
job_config = bigquery.LoadJobConfig()
# Replace the table's existing contents on every load.
job_config.write_disposition = 'WRITE_TRUNCATE'
# Create the table automatically if it does not exist yet.
job_config.create_disposition = 'CREATE_IF_NEEDED'
# Let the schema be inferred automatically.
job_config.autodetect = True

# Alternative (not active): write_disposition='WRITE_APPEND' keeps existing
# rows — take care to avoid duplicated data when using it.

In [None]:
# Start the load job for the cleaned DataFrame, then block until it finishes;
# the value returned by result() is what the cell displays.
load_job = client.load_table_from_dataframe(
    df_all_activities,
    TABLE_ID,
    job_config=job_config,
)
load_job.result()

In [None]:
# Fetch the table object for TABLE_ID after the upload and display it as a
# quick confirmation that the table exists.
table = client.get_table(TABLE_ID)
table

## Read data

In [None]:
# Look up the table and report its column names and row count.
table = client.get_table(TABLE_ID)
print(f'ℹ️ Tabelle gefunden: {TABLE_ID}')
print(f'Spalten: {[schema.name for schema in table.schema]}')
print(f'Zeilen: {table.num_rows}')

# Query the table back to check the uploaded data.
query = f'SELECT * FROM `{TABLE_ID}`'
print('🔹 Starte Abfrage:', query)

job = client.query(query)
df = job.result().to_dataframe()  # materialise the result as a pandas DataFrame

print('✅ Abfrage abgeschlossen, erste Zeilen:')
# The message above announces the first rows ("erste Zeilen"), so preview with
# head() instead of rendering the whole frame; `df` itself still holds every
# row returned by the query.
df.head()