In [5]:
import os

import pandas as pd
from sqlalchemy import create_engine, text

# Connection URL is taken from the ETL_DB_URL environment variable when it is
# set, so real credentials do not have to live in the notebook. The fallback is
# the URL this cell previously hard-coded, so existing setups behave the same.
DB_URL = os.environ.get(
    "ETL_DB_URL",
    "postgresql+psycopg2://postgres:postgres@postgres:5432/etl_db",
)
engine = create_engine(DB_URL)

# One-column frame holding the station_id of every silver.stations row whose
# state is 'GA'; the download loop further down iterates over this column.
ga_station_ids = pd.read_sql(
    text("""
        SELECT station_id
        FROM silver.stations
        WHERE state = 'GA'
    """),
    engine
)

ga_station_ids


Unnamed: 0,station_id
0,USC00094862
1,US1GAAP0001
2,US1GAAP0002
3,US1GAAP0004
4,US1GAAP0005
...,...
1402,USW00093836
1403,USW00093840
1404,USW00093842
1405,USW00093845


In [6]:
from pathlib import Path
import requests
import csv

from datetime import date as dt_date

# Remote directory the per-station ``.dly`` files are fetched from, and the
# local directory the parsed CSVs are written to.
BASE_URL = "https://www.ncei.noaa.gov/pub/data/ghcn/daily/all"
OUT_DIR = Path("/app/data/landing/weather")
OUT_DIR.mkdir(parents=True, exist_ok=True)  # re-runnable: no error if it already exists

# Inclusive observation window kept by the parser. END_DATE is evaluated once,
# when this cell runs.
START_DATE, END_DATE = dt_date(2015, 1, 1), dt_date.today()

def _iter_dly_rows(line, start, end):
    """Yield one CSV row per usable daily observation in a single ``.dly`` record.

    A record is a 21-character header (station id, year, month, element)
    followed by 31 day slots of 8 characters each: a 5-character value and
    three 1-character flags.

    Args:
        line: One record from the ``.dly`` file, without its line terminator.
        start: First observation date to keep (inclusive).
        end: Last observation date to keep (inclusive).

    Yields:
        ``[station, iso_date, element, value, m_flag, q_flag, s_flag]`` where
        ``value`` is an ``int`` and an empty flag is ``None``.

    Raises:
        ValueError: If the year, month or a kept value is not an integer.
    """
    if not line.strip():
        return  # blank line (e.g. a trailing newline artefact): nothing to parse

    # 21 header chars + 31 * 8 day chars = 269. Pad so that a record whose
    # trailing blanks were stripped still slices cleanly instead of raising
    # IndexError on the single-character flag lookups below.
    line = line.ljust(21 + 31 * 8)

    station = line[0:11].strip()
    year = int(line[11:15])
    month = int(line[15:17])
    element = line[17:21]

    # Every day of this record falls in (year, month); if that month lies
    # wholly outside the window, none of its 31 slots can be kept.
    if not ((start.year, start.month) <= (year, month) <= (end.year, end.month)):
        return

    for day in range(1, 32):
        base = 21 + (day - 1) * 8
        value = line[base:base + 5].strip()

        # "-9999" is the file's missing-value marker; a blank slot (only
        # possible after padding a short record) is treated the same way.
        if not value or value == "-9999":
            continue

        try:
            obs_date = dt_date(year, month, day)
        except ValueError:
            continue  # invalid calendar date (e.g. Feb 30)

        if not (start <= obs_date <= end):
            continue

        yield [
            station,
            obs_date.isoformat(),
            element,
            int(value),
            line[base + 5].strip() or None,
            line[base + 6].strip() or None,
            line[base + 7].strip() or None,
        ]


def download_weather_to_csv(station_id: str) -> Path:
    """Download one station's ``.dly`` file and flatten it into a CSV.

    The file is fetched from ``BASE_URL``, every daily observation between
    ``START_DATE`` and ``END_DATE`` (inclusive) is written as one row, and the
    result is stored as ``OUT_DIR/<station_id>.csv``. If that CSV already
    exists nothing is downloaded.

    The CSV is written to a ``.part`` sibling first and only renamed to its
    final name once parsing has finished, so a failure or interrupt part-way
    through never leaves a truncated CSV that later runs would skip.

    Args:
        station_id: Station identifier; used in both the URL and the file name.

    Returns:
        Path of the CSV for this station.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If a record contains a non-numeric year, month or value.
    """
    url = f"{BASE_URL}/{station_id}.dly"
    out_csv = OUT_DIR / f"{station_id}.csv"

    if out_csv.exists():
        print(f"[SKIP] {station_id}")
        return out_csv

    r = requests.get(url, timeout=60)
    r.raise_for_status()

    # With no declared charset, requests guesses one by scanning the whole
    # body, which is slow for large files. The data is assumed to be plain
    # ASCII (confirm against the NOAA readme), of which UTF-8 is a superset.
    if r.encoding is None:
        r.encoding = "utf-8"

    tmp_csv = out_csv.with_name(out_csv.name + ".part")
    try:
        with open(tmp_csv, "w", newline="", encoding="utf-8") as fout:
            writer = csv.writer(fout)
            writer.writerow([
                "station_id",
                "date",
                "element",
                "value",
                "m_flag",
                "q_flag",
                "s_flag",
            ])

            for line in r.text.splitlines():
                writer.writerows(_iter_dly_rows(line, START_DATE, END_DATE))

        # Publish only a fully written file under the final name.
        tmp_csv.replace(out_csv)
    finally:
        # After a successful rename the temp file is gone; otherwise remove
        # the partial output so it cannot be mistaken for a finished download.
        if tmp_csv.exists():
            tmp_csv.unlink()

    print(f"[DOWNLOADED + PARSED] {station_id}")
    return out_csv


In [7]:
# Fetch every station in turn. A network/HTTP failure for one station is
# reported and recorded instead of aborting the remaining downloads. Re-running
# this cell retries only the failures, because stations that already have a CSV
# are skipped by download_weather_to_csv.
failed_station_ids = []
for sid in ga_station_ids["station_id"]:
    try:
        download_weather_to_csv(sid)
    except requests.RequestException as exc:
        print(f"[FAILED] {sid}: {exc}")
        failed_station_ids.append(sid)

if failed_station_ids:
    print(f"[SUMMARY] {len(failed_station_ids)} station(s) failed")

[SKIP] USC00094862
[SKIP] US1GAAP0001
[SKIP] US1GAAP0002
[SKIP] US1GAAP0004
[SKIP] US1GAAP0005
[SKIP] US1GAAP0006
[SKIP] US1GAAP0007
[SKIP] US1GAAT0002
[SKIP] US1GABB0001
[SKIP] US1GABB0003
[SKIP] US1GABB0004
[SKIP] US1GABB0005
[SKIP] US1GABB0006
[SKIP] US1GABB0007
[SKIP] US1GABB0010
[SKIP] US1GABB0016
[DOWNLOADED + PARSED] US1GABB0019
[DOWNLOADED + PARSED] US1GABB0021
[DOWNLOADED + PARSED] US1GABB0022
[DOWNLOADED + PARSED] US1GABD0001
[DOWNLOADED + PARSED] US1GABD0005
[DOWNLOADED + PARSED] US1GABH0002
[DOWNLOADED + PARSED] US1GABK0001
[DOWNLOADED + PARSED] US1GABK0006
[DOWNLOADED + PARSED] US1GABK0008
[DOWNLOADED + PARSED] US1GABK0010
[DOWNLOADED + PARSED] US1GABL0001
[DOWNLOADED + PARSED] US1GABL0002
[DOWNLOADED + PARSED] US1GABL0003
[DOWNLOADED + PARSED] US1GABN0001
[DOWNLOADED + PARSED] US1GABN0002
[DOWNLOADED + PARSED] US1GABN0005
[DOWNLOADED + PARSED] US1GABR0002
[DOWNLOADED + PARSED] US1GABR0004
[DOWNLOADED + PARSED] US1GABR0005
[DOWNLOADED + PARSED] US1GABR0006
[DOWNLOADED + PA