In [17]:
import os

import pandas as pd
from sqlalchemy import create_engine, text

# Connection to the ETL Postgres database. The URL (which embeds credentials)
# can be supplied through the ETL_DATABASE_URL environment variable so it does
# not have to live in the notebook; when the variable is unset we fall back to
# the URL this notebook originally hard-coded, so existing setups keep working.
engine = create_engine(
    os.environ.get(
        "ETL_DATABASE_URL",
        "postgresql+psycopg2://postgres:postgres@postgres:5432/etl_db",
    )
)

# Sample of station ids to work with: the first three rows of silver.stations
# when ordered by station_id (the ORDER BY keeps the sample stable across runs).
station_ids = pd.read_sql(
    text("""
        SELECT station_id
        FROM silver.stations
        ORDER BY station_id
        LIMIT 3;
    """),
    engine
)

# Last expression of the cell: display the sampled ids.
station_ids


Unnamed: 0,station_id
0,US009052008
1,US10adam001
2,US10adam002


In [None]:
from pathlib import Path
import requests
import csv

# Base URL of the per-station GHCN-Daily files on the NOAA NCEI server;
# download_weather_to_csv appends "/<station_id>.dly" to it.
BASE_URL = "https://www.ncei.noaa.gov/pub/data/ghcn/daily/all"
# Directory that receives one "<station_id>.csv" per station.
OUT_DIR = Path("/app/data/landing/weather")
# Create the landing directory (and any missing parents) when this cell runs;
# exist_ok makes re-running the cell harmless.
OUT_DIR.mkdir(parents=True, exist_ok=True)

def download_weather_to_csv(station_id: str) -> Path:
    """Download one station's GHCN-Daily ``.dly`` file and flatten it to CSV.

    The file is fetched from ``{BASE_URL}/{station_id}.dly`` and written to
    ``OUT_DIR / f"{station_id}.csv"`` with the columns ``station_id, date,
    element, value, m_flag, q_flag, s_flag`` -- one row per (record, day)
    whose value is present. If the CSV already exists, nothing is downloaded
    and the existing path is returned.

    Each ``.dly`` line is a fixed-width monthly record: station id in
    ``[0:11]``, year in ``[11:15]``, month in ``[15:17]``, element in
    ``[17:21]``, followed by 31 eight-character day slots (5-char value plus
    one character each for the measurement, quality and source flags).

    Args:
        station_id: GHCN station identifier, e.g. ``"US009052008"``.

    Returns:
        Path of the CSV file for this station.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if a day slot holds a non-integer value.
    """
    url = f"{BASE_URL}/{station_id}.dly"
    out_csv = OUT_DIR / f"{station_id}.csv"

    if out_csv.exists():
        print(f"[SKIP] {station_id}")
        return out_csv

    r = requests.get(url, timeout=60)
    r.raise_for_status()

    # Write to a sibling temp file and rename only on success. Otherwise a
    # failure half-way through would leave a truncated CSV behind that the
    # exists() check above would treat as complete on every later run.
    tmp_csv = out_csv.with_name(out_csv.name + ".part")
    try:
        with open(tmp_csv, "w", newline="", encoding="utf-8") as fout:
            writer = csv.writer(fout)
            writer.writerow([
                "station_id",
                "date",
                "element",
                "value",
                "m_flag",
                "q_flag",
                "s_flag",
            ])

            for line in r.text.splitlines():
                # Skip blank lines or lines too short to hold the 21-char header.
                if len(line) < 21:
                    continue

                station = line[0:11].strip()
                year = line[11:15]
                month = line[15:17]
                element = line[17:21]

                for day in range(1, 32):
                    base = 21 + (day - 1) * 8
                    value = line[base:base + 5].strip()

                    # "-9999" is the missing-value sentinel; an empty slot
                    # means the line was right-trimmed before this day.
                    # NOTE(review): days that do not exist in the month
                    # (e.g. Feb 30) are expected to carry -9999 -- confirm.
                    if not value or value == "-9999":
                        continue

                    # Slices (not single-index lookups) so a line whose
                    # trailing blank flags were trimmed yields "" instead of
                    # raising IndexError.
                    m_flag = line[base + 5:base + 6].strip()
                    q_flag = line[base + 6:base + 7].strip()
                    s_flag = line[base + 7:base + 8].strip()

                    # Full ISO date: the month must be part of it, otherwise
                    # rows from different months collide on the same "date".
                    date = f"{year}-{month}-{day:02d}"

                    writer.writerow([
                        station,
                        date,
                        element,
                        int(value),
                        m_flag or None,
                        q_flag or None,
                        s_flag or None,
                    ])

        tmp_csv.replace(out_csv)
    finally:
        # After a successful rename the temp file is gone; on any failure this
        # removes the partial output so the next run retries from scratch.
        if tmp_csv.exists():
            tmp_csv.unlink()

    print(f"[DOWNLOADED + PARSED] {station_id}")
    return out_csv


In [None]:
# Fetch and convert the .dly file of every station sampled above.
for sid in station_ids["station_id"].tolist():
    download_weather_to_csv(station_id=sid)

[DOWNLOADED + PARSED] US009052008
[DOWNLOADED + PARSED] US10adam001
[DOWNLOADED + PARSED] US10adam002
