In [None]:
"""
This script only needs to be run once to download all of the basin's historical data.
The script 'update_daily_append.py' will be automated to run daily and append new rows.

"""
import os, json, time, random, argparse
from io import StringIO
from datetime import datetime, timezone, date

import pandas as pd
import geopandas as gpd
import requests

# GeoPackage file, layer, and attribute column from which reach IDs are read.
GPKG_PATH = r"dnipro_sword_reaches_clip.gpkg"
LAYER_NAME = "dnipro_reaches"
REACH_ID_FIELD = "reach_id"

# Directory that receives one CSV per reach plus the download log.
OUT_DIR = r"hydrocron_timeseries_by_reach"

# Hydrocron time-series endpoint and the query parameters sent with every request.
HYDROCRON_URL = "https://soto.podaac.earthdatacloud.nasa.gov/hydrocron/v1/timeseries"
COLLECTION_NAME = "SWOT_L2_HR_RiverSP_2.0"
FIELDS = "reach_id,time_str,cycle_id,pass_id,wse,slope,width,dschg_gm,dschg_gm_q,reach_q"

# Per-request timeout in seconds, passed to requests.get.
TIMEOUT_S = 60
# (min, max) bounds in seconds for the random pause taken before each request.
SLEEP_BETWEEN_REQUESTS_S = (0.2, 0.6)
# Maximum number of attempts made for a single request before giving up.
MAX_RETRIES = 5


def hydrocron_response_to_df(text: str) -> pd.DataFrame:
    """Convert a Hydrocron response body into a DataFrame.

    Two body shapes are understood:

    * a JSON object whose ``results.csv`` entry holds the CSV payload, and
    * a plain CSV document.

    Args:
        text: Raw response body. ``None`` and blank strings are accepted.

    Returns:
        The parsed table, or an empty DataFrame when the body is blank, is
        not valid JSON despite starting with ``{``, or carries no usable
        ``results.csv`` string (for example an error payload, or
        ``"results": null``).
    """
    text = (text or "").strip()
    if not text:
        return pd.DataFrame()

    # Anything that does not look like a JSON object is treated as raw CSV.
    if not text.startswith("{"):
        return pd.read_csv(StringIO(text))

    try:
        obj = json.loads(text)
    except json.JSONDecodeError:
        return pd.DataFrame()

    # Check types at each level: "results" may be null or a non-object, and
    # "csv" may be missing or not a string. All of those mean "no data".
    results = obj.get("results") if isinstance(obj, dict) else None
    csv_text = results.get("csv") if isinstance(results, dict) else None
    if not isinstance(csv_text, str):
        return pd.DataFrame()

    csv_text = csv_text.strip()
    return pd.read_csv(StringIO(csv_text)) if csv_text else pd.DataFrame()


def request_with_retries(url: str, params: dict) -> requests.Response:
    """GET ``url`` with ``params``, retrying transient failures with backoff.

    A response is returned as soon as its status code is not one of the
    retryable codes (429, 500, 502, 503, 504); other statuses, including
    4xx errors, are handed back to the caller unchanged. Request-level
    failures raised by ``requests`` and retryable status codes trigger
    another attempt, up to ``MAX_RETRIES`` attempts in total.

    Args:
        url: Endpoint to query.
        params: Query-string parameters for the request.

    Returns:
        The first response whose status code is not retryable.

    Raises:
        RuntimeError: If every attempt failed. The last underlying error is
            attached as the exception's cause.
    """
    retryable_statuses = (429, 500, 502, 503, 504)
    last_err = None

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = requests.get(url, params=params, timeout=TIMEOUT_S)
        except requests.RequestException as e:
            last_err = e
        else:
            if r.status_code not in retryable_statuses:
                return r
            last_err = RuntimeError(f"HTTP {r.status_code}")

        # Back off only when another attempt follows; sleeping after the
        # final failure would just delay the error.
        if attempt < MAX_RETRIES:
            # Exponential backoff capped at 30 s, plus jitter.
            backoff = min(30, (2 ** (attempt - 1)) * 0.7) + random.uniform(0, 0.5)
            time.sleep(backoff)

    raise RuntimeError(
        f"Failed after {MAX_RETRIES} retries. Last error: {last_err}"
    ) from last_err


def iso(dt: datetime) -> str:
    """Render ``dt`` as a UTC timestamp of the form ``YYYY-MM-DDTHH:MM:SSZ``."""
    as_utc = dt.astimezone(timezone.utc)
    return f"{as_utc:%Y-%m-%dT%H:%M:%SZ}"


def parse_end_datetime(end_date: str | None) -> datetime:
    if end_date is None:
        return datetime.now(timezone.utc)
    s = end_date.strip()
    if len(s) == 10 and s[4] == "-" and s[7] == "-":
        d = date.fromisoformat(s)
        return datetime(d.year, d.month, d.day, 23, 59, 59, tzinfo=timezone.utc)
    if s.endswith("Z"):
        s = s.replace("Z", "+00:00")
    dt = datetime.fromisoformat(s)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc)


def safe_reach_filename(reach_id: str) -> str:
    """Reduce ``reach_id`` to characters that are safe in a file name.

    The value is stringified and stripped, then only alphanumeric characters,
    hyphens, and underscores are kept. If nothing survives, the placeholder
    ``"unknown_reach"`` is returned.
    """
    extra_allowed = {"-", "_"}
    kept = []
    for char in str(reach_id).strip():
        if char.isalnum() or char in extra_allowed:
            kept.append(char)
    if not kept:
        return "unknown_reach"
    return "".join(kept)


def load_reach_ids() -> list[str]:
    """Read the unique, non-empty reach IDs from the configured GeoPackage layer.

    Null values are dropped, the remainder is converted to stripped strings,
    and a trailing ``.0`` is removed from each value.

    Raises:
        ValueError: If the layer has no ``REACH_ID_FIELD`` column.
    """
    layer = gpd.read_file(GPKG_PATH, layer=LAYER_NAME)
    if REACH_ID_FIELD not in layer.columns:
        raise ValueError(f"Field '{REACH_ID_FIELD}' not found in layer '{LAYER_NAME}'")

    ids = layer[REACH_ID_FIELD].dropna().astype(str)
    ids = ids.str.strip()
    # Strip the ".0" suffix left behind when a numeric ID was stored as a float.
    ids = ids.str.replace(r"\.0$", "", regex=True)

    non_empty = ids.loc[ids != ""]
    return non_empty.unique().tolist()


def main():
    """Download the full Hydrocron time series for every reach.

    For each reach ID returned by ``load_reach_ids`` the Hydrocron endpoint is
    queried over the window from 2022-12-01T00:00:00Z to the current UTC time,
    and the parsed rows are written to ``reach_<id>.csv`` inside ``OUT_DIR``.
    A failure for one reach is recorded and does not stop the run.

    A per-reach log is written to ``download_log_historical.csv`` in
    ``OUT_DIR``. It is saved even when the run is interrupted, so the outcome
    of the reaches processed so far is kept.
    """
    start_iso = "2022-12-01T00:00:00Z"
    end_iso = iso(datetime.now(timezone.utc))

    reach_ids = load_reach_ids()
    total = len(reach_ids)
    os.makedirs(OUT_DIR, exist_ok=True)

    print("Historical Hydrocron Download")
    print("Reaches:", total)
    print("Window:", start_iso, "→", end_iso)

    log = []
    try:
        for i, rid in enumerate(reach_ids, 1):
            out_csv = os.path.join(OUT_DIR, f"reach_{safe_reach_filename(rid)}.csv")

            params = {
                "feature": "Reach",
                "feature_id": rid,
                "start_time": start_iso,
                "end_time": end_iso,
                "output": "csv",
                "collection_name": COLLECTION_NAME,
                "fields": FIELDS,
            }

            try:
                # Random pause between requests to avoid hammering the service.
                time.sleep(random.uniform(*SLEEP_BETWEEN_REQUESTS_S))
                r = request_with_retries(HYDROCRON_URL, params)
                # NOTE(review): non-retryable error statuses (e.g. 4xx) reach
                # this point too and normally parse to an empty frame, so they
                # are logged as "ok" with rows=0 — confirm that is intended.
                df = hydrocron_response_to_df(r.text)
                df.to_csv(out_csv, index=False)
                log.append({"reach_id": rid, "rows": len(df), "status": "ok", "csv": out_csv})
                print(f"[{i}/{total}] {rid} rows={len(df)}")
            except Exception as e:
                # Best effort: record the failure and carry on with the next reach.
                log.append({"reach_id": rid, "rows": None, "status": f"error: {e}", "csv": out_csv})
                print(f"[{i}/{total}] {rid} ERROR: {e}")
    finally:
        # Persist the log even if the loop is interrupted (e.g. Ctrl+C).
        pd.DataFrame(log).to_csv(os.path.join(OUT_DIR, "download_log_historical.csv"), index=False)

    print("Done.")


# Start the download only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


Historical backfill
Reaches: 842
Window: 2022-12-01T00:00:00Z → 2026-02-11T21:47:25Z
[1/842] 22511300103 rows=197
[2/842] 22400900035 rows=187
[3/842] 22601000045 rows=0
[4/842] 22511100015 rows=0
[5/842] 22511100055 rows=167
[6/842] 22511100021 rows=167
[7/842] 22511100031 rows=167
[8/842] 22511100041 rows=0
[9/842] 22511300011 rows=167
[10/842] 22511300291 rows=0
[11/842] 22511300021 rows=197
[12/842] 22511300281 rows=0
[13/842] 22511200011 rows=167
[14/842] 22511300301 rows=0
[15/842] 22511300031 rows=197
[16/842] 22511200021 rows=167
[17/842] 22511300063 rows=0
[18/842] 22511300051 rows=0
[19/842] 22511300071 rows=197
[20/842] 22511300041 rows=197
[21/842] 22511300084 rows=0
[22/842] 22511200031 rows=167
[23/842] 22511300093 rows=0
[24/842] 22511200041 rows=167
[25/842] 22511200051 rows=197
[26/842] 22511200061 rows=197
[27/842] 22511200071 rows=197
[28/842] 22511300113 rows=0
[29/842] 22511300276 rows=0
[30/842] 22511300263 rows=169
[31/842] 22511300123 rows=0
[32/842] 22511300243