In [None]:
import os
import pandas as pd
import seaborn as sns

from dotenv import load_dotenv
from google.cloud import bigquery
from google.oauth2 import service_account

import matplotlib.pyplot as plt

# Load variables from a local .env file (if any) before reading the environment.
load_dotenv()

# BigQuery coordinates; each value is None when its variable is not set.
_GCP_PROJECT_ID = os.environ.get('GCP_PROJECT_ID')
_BQ_DATASET_MARTS = os.environ.get('BIGQUERY_DATASET_MARTS')

# BigQuery Connection

In [None]:
# `st` (Streamlit) is not imported anywhere in this notebook, so referencing it
# directly raises NameError, which a broad `except` would hide. Look it up
# explicitly so the fallback is a visible decision rather than a swallowed error.
_st = globals().get("st")

client = None
if _st is not None:
    try:
        creds = service_account.Credentials.from_service_account_info(
            _st.secrets['gcp_service_account']
        )
        client = bigquery.Client(credentials=creds, project=creds.project_id)
    except Exception as exc:
        # Best-effort: keep the fallback, but say why the secrets path was skipped.
        print(f"Streamlit secrets unavailable ({type(exc).__name__}: {exc}); using default credentials.")

if client is None:
    # Local dev fallback (requires GOOGLE_APPLICATION_CREDENTIALS)
    client = bigquery.Client()

# Data Loading

In [None]:
# Get activities
# Fail fast with a readable message instead of sending a query against the
# literal table "None.None.fct_activities" when the environment is not configured.
if not _GCP_PROJECT_ID or not _BQ_DATASET_MARTS:
    raise RuntimeError(
        "GCP_PROJECT_ID and BIGQUERY_DATASET_MARTS must be set (e.g. in .env) "
        "before loading activities."
    )

# Backticks quote the fully-qualified table path so special characters in the
# project or dataset name (hyphens, for example) cannot break the query.
query_activities = f"SELECT * FROM `{_GCP_PROJECT_ID}.{_BQ_DATASET_MARTS}.fct_activities`"
# Raw-table alternative: `{_GCP_PROJECT_ID}.strava_data.raw_activities`

df_activities = client.query(query_activities).to_dataframe()

In [None]:
# Row count of the loaded activities frame.
print(f'Count activities: {df_activities.shape[0]}')

In [None]:
# Show the column labels of the loaded activities frame (rendered as the cell output).
df_activities.columns

# Filter relevant data

In [None]:
# Filter: keep runs only; when an activity_id appears more than once, keep its last row.
# .assign() returns a new frame, so df_activities itself is never modified.
df_activities_filtered = (
    df_activities[df_activities['discipline'] == 'Run']
    .drop_duplicates(subset=['activity_id'], keep='last')
    .assign(
        # Canonical types
        start_date_local=lambda runs: pd.to_datetime(runs["start_date_local"], utc=False),
        # Derived core metric (preferred for modeling).
        # Non-positive distances are masked to NaN first: dividing by 0 would give an
        # infinite pace, which later poisons the pace quantile/std features.
        pace_sec_per_km=lambda runs: (
            runs["moving_time_s"] / runs["distance_km"].where(runs["distance_km"] > 0)
        ),
    )
)
print(f'Len relevant activities: {len(df_activities_filtered)}')

In [None]:
# Remove the column-count limit when rendering frames (the option stays set afterwards).
pd.options.display.max_columns = None

df_activities_filtered.head(n=10)

In [None]:
# Rows whose is_race flag compares equal to True.
df_activities_filtered.loc[df_activities_filtered['is_race'].eq(True)]

In [None]:
from typing import List, Optional
import numpy as np

def build_labels_from_runs(
    runs_df: pd.DataFrame,
    race_flag_col: str = "is_race",  # change if your column is named differently
    fallback_race_ids: Optional[List[int]] = None,
) -> pd.DataFrame:
    """Select the race activities from ``runs_df`` and return them as label rows.

    Races are the rows whose ``race_flag_col`` is truthy. When that column is
    absent, rows whose ``activity_id`` is in ``fallback_race_ids`` are used instead.

    Args:
        runs_df: Activities with at least ``activity_id``, ``athlete_id``,
            ``start_date_local``, ``distance_km`` and ``moving_time_s``.
        race_flag_col: Name of the race-flag column. Missing values count as
            "not a race"; 0/1 flags are accepted as well as booleans.
        fallback_race_ids: Activity IDs to treat as races when the flag column
            does not exist.

    Returns:
        A new frame with columns ``race_id``, ``athlete_id``, ``race_date_local``,
        ``target_distance_km`` and ``finish_time_s``, sorted by ``race_date_local``
        with a fresh 0..n-1 index. ``runs_df`` is not modified.

    Raises:
        ValueError: If the flag column is missing and no fallback IDs are given.
    """
    if race_flag_col in runs_df.columns:
        # Coerce to a genuine boolean mask. Indexing a frame with a non-boolean
        # Series (e.g. 0/1 integers) is interpreted as column selection and fails.
        is_race = runs_df[race_flag_col].fillna(False).astype(bool)
        races = runs_df[is_race]
    elif fallback_race_ids is not None:
        races = runs_df[runs_df["activity_id"].isin(fallback_race_ids)]
    else:
        raise ValueError(
            "No race flag column found and no fallback_race_ids provided. "
            "Either add is_race to your mart table or pass fallback_race_ids."
        )

    # Label schema
    label_names = {
        "activity_id": "race_id",
        "start_date_local": "race_date_local",
        "distance_km": "target_distance_km",
        "moving_time_s": "finish_time_s",
    }
    label_columns = ["race_id", "athlete_id", "race_date_local", "target_distance_km", "finish_time_s"]
    labels = races.rename(columns=label_names)[label_columns]

    return labels.sort_values("race_date_local").reset_index(drop=True)

# ---- If the activities frame does NOT have an is_race column yet, list the race IDs manually:
# fallback_race_ids = [1234567890, 2345678901, ...]
# labels = build_labels_from_runs(df_activities_filtered, fallback_race_ids=fallback_race_ids)

labels = build_labels_from_runs(df_activities_filtered)  # works if the frame has an is_race column
labels.head()

In [None]:
# =========================================
# 3) Feature engineering: leak-proof rolling windows
#    One row per race, computed from training before race_date
# =========================================
import dataclasses
from dataclasses import dataclass
from typing import Dict

@dataclass(frozen=True)
class WindowSpec:
    """Immutable description of one look-back window."""

    name: str  # suffix used in feature names, e.g. "7d"
    days: int  # window length in days


# Look-back windows, shortest first; each one is named "<days>d".
WINDOWS = [WindowSpec(f"{n_days}d", n_days) for n_days in (7, 28, 56, 84)]

def approx_quantiles(x: pd.Series, qs=(0.25, 0.5, 0.75)) -> Dict[str, float]:
    """Return the requested quantiles of ``x`` keyed as ``"q<percent>"``.

    Args:
        x: Values to summarise; missing values are dropped first.
        qs: Quantile levels in [0, 1].

    Returns:
        A dict with one ``"q<percent>"`` entry per level, in the order given
        (e.g. ``{"q25": ..., "q50": ..., "q75": ...}``). Every value is NaN when
        ``x`` has no non-missing entries.
    """
    # Round instead of truncating: 0.29 * 100 is 28.999999999999996 in floating
    # point, so int() alone would label that quantile "q28".
    keys = [f"q{int(round(q * 100))}" for q in qs]

    values = x.dropna()
    if len(values) == 0:
        return {key: np.nan for key in keys}

    # quantile() returns results in the order of the requested levels.
    quantile_values = values.quantile(list(qs)).to_numpy()
    return {key: float(value) for key, value in zip(keys, quantile_values)}

def build_features_for_race(
    runs_df: pd.DataFrame,
    race_row: pd.Series,
    long_run_km: float = 18.0,
) -> Dict[str, float]:
    """Build one feature row for a race from the athlete's runs before it.

    Only runs of the same athlete that started strictly before the race time are
    used, so the race itself and anything after it cannot leak into the features.
    For every window in the module-level ``WINDOWS`` list the function adds run
    count, distance, time, longest run, pace, elevation per km and (when the
    columns exist) heart-rate, power and cadence summaries. Windows of 56 days
    or more also get per-run pace quartiles and standard deviation.

    Args:
        runs_df: Runs with ``athlete_id``, ``start_date_local``, ``distance_km``,
            ``moving_time_s`` and ``pace_sec_per_km``. Optional columns:
            ``elevation_gain_m``, ``has_heartrate``, ``avg_heartrate``,
            ``avg_watts``, ``avg_cadence``.
        race_row: Label row with ``race_id``, ``athlete_id``, ``race_date_local``,
            ``target_distance_km`` and ``finish_time_s``.
        long_run_km: Minimum distance for a run to count as a "long run"
            in ``days_since_long_run``.

    Returns:
        A flat dict of identifiers, the target and the features. Most values are
        floats; ``race_date_local`` is a ``pd.Timestamp`` and the IDs and
        ``finish_time_s`` are ints.
    """
    athlete_id = race_row["athlete_id"]
    race_time = pd.Timestamp(race_row["race_date_local"])
    target_km = float(race_row["target_distance_km"])

    df = runs_df[runs_df["athlete_id"] == athlete_id]
    df = df[df["start_date_local"] < race_time]  # LEAK-PROOF cutoff: strictly before the race

    out: Dict[str, float] = {
        "race_id": int(race_row["race_id"]),
        "athlete_id": int(athlete_id),
        "race_date_local": race_time,
        "target_distance_km": target_km,
        # log() of a non-positive distance is undefined; report NaN rather than -inf.
        "log_target_distance": float(np.log(target_km)) if target_km > 0 else np.nan,
        "finish_time_s": int(race_row["finish_time_s"]),
    }

    # Recency features (global, not windowed); measured in whole calendar days.
    if len(df) > 0:
        last_run_time = df["start_date_local"].max()
        out["days_since_last_run"] = float((race_time.date() - last_run_time.date()).days)
    else:
        out["days_since_last_run"] = np.nan

    df_long = df[df["distance_km"] >= long_run_km]
    if len(df_long):
        last_long_run_time = df_long["start_date_local"].max()
        out["days_since_long_run"] = float((race_time.date() - last_long_run_time.date()).days)
    else:
        out["days_since_long_run"] = np.nan

    # Rolling windows: [race_time - days, race_time)
    for w in WINDOWS:
        start = race_time - pd.Timedelta(days=w.days)
        d = df[df["start_date_local"] >= start]
        n_runs = len(d)

        out[f"runs_{w.name}"] = float(n_runs)
        out[f"dist_km_{w.name}"] = float(d["distance_km"].sum()) if n_runs else 0.0
        out[f"time_s_{w.name}"] = float(d["moving_time_s"].sum()) if n_runs else 0.0
        out[f"longest_run_km_{w.name}"] = float(d["distance_km"].max()) if n_runs else np.nan

        # Overall pace of the window: total time / total distance
        # (equivalent to a distance-weighted mean of the per-run paces).
        total_km = d["distance_km"].sum()
        total_s = d["moving_time_s"].sum()
        out[f"pace_sec_per_km_{w.name}"] = float(total_s / total_km) if total_km > 0 else np.nan

        # Elevation per km
        elev = d["elevation_gain_m"].sum(skipna=True) if "elevation_gain_m" in d else np.nan
        out[f"elev_m_per_km_{w.name}"] = float(elev / total_km) if (total_km > 0 and pd.notna(elev)) else np.nan

        # HR (time-weighted) where available
        if "has_heartrate" in d.columns and "avg_heartrate" in d.columns:
            dh = d[d["has_heartrate"].eq(True)]
            # Weight only the runs that actually carry a heart-rate value. A run
            # flagged has_heartrate but with a missing average would otherwise add
            # its time to the denominator only and drag the mean down.
            dh_valid = dh[dh["avg_heartrate"].notna()]
            denom = dh_valid["moving_time_s"].sum()
            out[f"hr_mean_{w.name}"] = (
                float((dh_valid["avg_heartrate"] * dh_valid["moving_time_s"]).sum() / denom)
                if denom > 0 else np.nan
            )
            out[f"hr_coverage_{w.name}"] = float(len(dh) / n_runs) if n_runs > 0 else np.nan
        else:
            out[f"hr_mean_{w.name}"] = np.nan
            out[f"hr_coverage_{w.name}"] = np.nan

        # Power (time-weighted) if available
        if "avg_watts" in d.columns:
            dp = d[d["avg_watts"].notna()]
            denom = dp["moving_time_s"].sum()
            out[f"watts_mean_{w.name}"] = (
                float((dp["avg_watts"] * dp["moving_time_s"]).sum() / denom) if denom > 0 else np.nan
            )
            out[f"watts_coverage_{w.name}"] = float(len(dp) / n_runs) if n_runs > 0 else np.nan

        # Cadence mean
        if "avg_cadence" in d.columns:
            out[f"cadence_mean_{w.name}"] = float(d["avg_cadence"].mean()) if n_runs else np.nan

        # Pace distribution (per-run, not time-weighted) for longer windows only
        if w.days >= 56:
            # Zero-distance runs produce an infinite pace; treat those as missing so
            # they cannot distort the quartiles or turn the std into NaN/inf.
            pace = d["pace_sec_per_km"].replace([np.inf, -np.inf], np.nan)
            qs = approx_quantiles(pace, qs=(0.25, 0.5, 0.75))
            out[f"pace_p25_{w.name}"] = qs["q25"]
            out[f"pace_p50_{w.name}"] = qs["q50"]
            out[f"pace_p75_{w.name}"] = qs["q75"]
            out[f"pace_std_{w.name}"] = float(pace.std(ddof=1)) if pace.count() >= 2 else np.nan

    # Acute:chronic ratio (7d/28d)
    out["acr_dist_7d_28d"] = float(out["dist_km_7d"] / out["dist_km_28d"]) if out["dist_km_28d"] > 0 else np.nan

    return out

def build_feature_matrix(runs_df: pd.DataFrame, labels_df: pd.DataFrame) -> pd.DataFrame:
    """Build the modelling table: one feature row per race in ``labels_df``.

    Args:
        runs_df: Runs passed unchanged to ``build_features_for_race``.
        labels_df: One row per race, as produced by ``build_labels_from_runs``.

    Returns:
        A frame of feature rows sorted by ``race_date_local`` with a fresh
        0..n-1 index. An empty ``labels_df`` gives an empty frame.
    """
    rows = [build_features_for_race(runs_df, race_row) for _, race_row in labels_df.iterrows()]
    if not rows:
        # A frame built from no rows has no columns, so sorting it by
        # "race_date_local" would raise KeyError; return the empty frame as is.
        return pd.DataFrame(rows)
    return pd.DataFrame(rows).sort_values("race_date_local").reset_index(drop=True)

# One feature row per labeled race, built from the filtered runs.
Xy = build_feature_matrix(runs_df=df_activities_filtered, labels_df=labels)
Xy.head()

In [None]:
# =========================================
# 4) Quick sanity checks (highly recommended)
# =========================================
# Check leakage cutoff: build_features_for_race only keeps runs that start before
# race time, so a computed days_since_last_run can never be negative.
assert (Xy["days_since_last_run"].dropna() >= 0).all(), "Leakage: a training run is dated after its race."

# NaN in days_since_last_run means the race had no earlier run at all.
# Fail only when that is true for every race; otherwise just report how many.
n_without_history = int(Xy["days_since_last_run"].isna().sum())
assert n_without_history < len(Xy), "No race has any prior run before it, so no features could be computed."
if n_without_history:
    print(f"Warning: {n_without_history} of {len(Xy)} races have no prior runs; their history features are empty.")

# Check targets by distance
Xy.groupby("target_distance_km")["finish_time_s"].agg(["count", "mean", "min", "max"])


In [None]:
# =========================================
# 5) Train a first baseline model (time-aware split)
#    (You can replace later with XGBoost; start simple)
# =========================================
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import HistGradientBoostingRegressor

# Define features/target
target = "finish_time_s"
non_features = {"race_id", "athlete_id", "race_date_local", "finish_time_s"}
feature_cols = [c for c in Xy.columns if c not in non_features]

X = Xy[feature_cols]
y = Xy[target].astype(float)

# Time-based CV (Xy is sorted by race date, so each fold tests on later races).
# TimeSeriesSplit needs at least n_splits + 1 samples and n_splits >= 2, so
# shrink the fold count when only a handful of races are labeled.
n_splits = min(5, len(X) - 1)
if n_splits < 2:
    raise ValueError(f"Need at least 3 labeled races for time-series CV, got {len(X)}.")
tscv = TimeSeriesSplit(n_splits=n_splits)

models = {
    "elasticnet": Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        ("model", ElasticNet(alpha=0.05, l1_ratio=0.3, random_state=42, max_iter=20000)),
    ]),
    "hgb": Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("model", HistGradientBoostingRegressor(
            max_depth=5,
            learning_rate=0.05,
            max_iter=500,
            random_state=42
        )),
    ]),
}

# One record per (model, fold) with the mean absolute error in seconds.
scores = []
for name, model in models.items():
    for fold, (train_idx, test_idx) in enumerate(tscv.split(X), start=1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        mae = mean_absolute_error(y_test, pred)
        scores.append({"model": name, "fold": fold, "mae_s": mae})

pd.DataFrame(scores).groupby("model")["mae_s"].agg(["mean","std","min","max"]).sort_values("mean")

In [None]:
# Number of labeled races, followed by a chronological listing of the targets.
n = Xy.shape[0]
print("Labeled races:", n)
print(Xy.sort_values("race_date_local")[["race_date_local","target_distance_km","finish_time_s"]])


In [None]:
# =========================================
# 6) Fit final model on all data (for now) + example predictions
# =========================================
best_model_name = "hgb"  # switch based on CV above
final_model = models[best_model_name]
final_model.fit(X, y)

# In-sample predictions (sanity only) with their absolute error per race.
Xy_pred = (
    Xy[["race_id","race_date_local","target_distance_km","finish_time_s"]]
    .assign(pred_finish_time_s=final_model.predict(X))
    .assign(abs_err_s=lambda frame: (frame["pred_finish_time_s"] - frame["finish_time_s"]).abs())
)

# Error summary per target distance
Xy_pred.groupby("target_distance_km")["abs_err_s"].agg(["count","mean","median","max"]).sort_index()


In [None]:
# =========================================
# 7) Predict "today" race times (no future race_id needed)
#    Build a "virtual race" label row at a given date & distance.
# =========================================
def predict_for_date_and_distance(
    runs_df: pd.DataFrame,
    model,
    athlete_id: int,
    as_of: pd.Timestamp,
    distance_km: float,
    feature_columns: Optional[List[str]] = None,
) -> float:
    """Predict a finish time for a hypothetical race at ``as_of`` over ``distance_km``.

    A placeholder label row is built for the given athlete, date and distance and
    run through ``build_features_for_race``, so the features come only from runs
    before ``as_of``.

    Args:
        runs_df: Runs used to compute the features.
        model: Fitted estimator exposing ``predict``.
        athlete_id: Athlete whose runs are used.
        as_of: Date/time of the virtual race.
        distance_km: Race distance in kilometres.
        feature_columns: Columns, in training order, to feed the model. Defaults
            to the notebook-level ``feature_cols`` list.

    Returns:
        The predicted finish time in seconds.
    """
    if feature_columns is None:
        # Backward-compatible default: the list defined next to the training matrix.
        feature_columns = feature_cols

    fake_label = pd.Series({
        "race_id": -1,
        "athlete_id": athlete_id,
        "race_date_local": as_of,
        "target_distance_km": distance_km,
        "finish_time_s": 0,  # placeholder; it is not part of the model inputs
    })
    feats = build_features_for_race(runs_df, fake_label)
    feats_df = pd.DataFrame([feats])[list(feature_columns)]
    return float(model.predict(feats_df)[0])

def format_time(seconds: float) -> str:
    """Format a duration in seconds as ``H:MM:SS``, or ``M:SS`` when under an hour.

    The value is rounded to the nearest whole second first. Negative durations
    are rendered with a leading minus sign.

    Args:
        seconds: Duration in seconds.

    Returns:
        The formatted duration, e.g. ``"1:01:01"``, ``"25:00"`` or ``"-0:05"``.
    """
    total = int(round(seconds))
    # Floor division and modulo on a negative number wrap around (-5 would
    # render as "59:55"), so format the magnitude and add the sign separately.
    sign = "-" if total < 0 else ""
    minutes, secs = divmod(abs(total), 60)
    hours, minutes = divmod(minutes, 60)
    if hours > 0:
        return f"{sign}{hours:d}:{minutes:02d}:{secs:02d}"
    return f"{sign}{minutes:d}:{secs:02d}"

# Most frequent athlete_id in the filtered runs (intended for a single-athlete dataset).
athlete_id = int(df_activities_filtered["athlete_id"].mode().iloc[0])
as_of = pd.Timestamp.now()  # local notebook time

# Predict 5 km, 10 km, half-marathon and marathon times as of now.
for d in (5.0, 10.0, 21.0975, 42.195):
    pred = predict_for_date_and_distance(
        runs_df=df_activities_filtered,
        model=final_model,
        athlete_id=athlete_id,
        as_of=as_of,
        distance_km=d,
    )
    print(d, "km ->", format_time(pred))
