In [1]:
from pathlib import Path
import csv
import requests

# Base URL of the NOAA GHCN-Daily file listing; file names are appended to it.
BASE_URL = "https://www.ncei.noaa.gov/pub/data/ghcn/daily"
# Landing directory for station metadata. It is created when this cell runs
# (including missing parents) and left untouched if it already exists.
OUT_DIR = Path("/app/data/landing/stations")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Paths of the raw fixed-width station list and of its CSV conversion.
# NOTE(review): download_station_to_csv() rebuilds both paths locally from
# OUT_DIR, so these module-level names are not read by it — confirm whether
# any other cell still needs them.
txt_path = OUT_DIR / "ghcnd-stations.txt"
csv_path = OUT_DIR / "ghcnd-stations.csv"

In [None]:

# Fixed-width layout of ghcnd-stations.txt as (csv_column, start, end) slices.
# Keeping header names and offsets in one table prevents them drifting apart.
_STATION_COLUMNS = (
    ("station_id", 0, 11),
    ("latitude", 12, 20),
    ("longitude", 21, 30),
    ("elevation", 31, 37),
    ("state", 38, 40),
    ("name", 41, 71),
    ("gsn", 72, 75),
    ("hcn", 76, 79),
    ("wmo", 80, 85),
)


def _parse_station_line(line: str) -> list:
    """Slice one fixed-width station record into whitespace-stripped fields."""
    return [line[start:end].strip() for _, start, end in _STATION_COLUMNS]


def download_station_to_csv() -> Path:
    """
    Downloads ghcnd-stations.txt and converts it to CSV.

    The text file is fetched from ``BASE_URL`` unless a copy already exists in
    ``OUT_DIR``. Each fixed-width record is sliced into the columns listed in
    ``_STATION_COLUMNS``; lines whose station id is blank are skipped.

    Both the download and the CSV are written to a sibling temporary file and
    renamed into place, so an interrupted or failed run never leaves a
    truncated file at the final path and never clobbers a previously written
    CSV.

    After a successful conversion the intermediate text file is deleted, so a
    later call normally downloads it again.

    Returns:
        Path to the written CSV file.

    Raises:
        requests.HTTPError: if the download returns an error status.
        UnicodeDecodeError: if the text file is not valid UTF-8.
        OSError: on file-system failures.
    """
    txt_path = OUT_DIR / "ghcnd-stations.txt"
    csv_path = OUT_DIR / "ghcnd-stations.csv"

    # -----------------------
    # Download
    # -----------------------
    if not txt_path.exists():
        r = requests.get(f"{BASE_URL}/ghcnd-stations.txt", timeout=60)
        r.raise_for_status()
        # Write next to the target and rename: a partially written download
        # must never be mistaken for a cached copy by the exists() check above.
        txt_tmp = txt_path.with_name(txt_path.name + ".part")
        txt_tmp.write_bytes(r.content)
        txt_tmp.replace(txt_path)
        print("[DOWNLOADED] ghcnd-stations.txt")
    else:
        print("[SKIP] ghcnd-stations.txt")

    # -----------------------
    # Parse → CSV
    # -----------------------
    csv_tmp = csv_path.with_name(csv_path.name + ".tmp")
    try:
        with open(txt_path, "r", encoding="utf-8") as fin, open(
            csv_tmp, "w", newline="", encoding="utf-8"
        ) as fout:
            writer = csv.writer(fout)
            writer.writerow([column for column, _, _ in _STATION_COLUMNS])

            for line in fin:
                row = _parse_station_line(line)
                if not row[0]:  # blank station id -> not a station record
                    continue
                writer.writerow(row)

        # Publish only a fully written CSV; rename within one directory.
        csv_tmp.replace(csv_path)
    except BaseException:
        # Drop the partial output and keep any earlier CSV intact.
        if csv_tmp.exists():
            csv_tmp.unlink()
        raise

    print(f"[DONE] Station CSV → {csv_path}")
    # Remove the intermediate text file now that the CSV is in place.
    # TODO: stream the response straight into the CSV instead of txt -> csv.
    txt_path.unlink()
    return csv_path

In [5]:
# Run the download + conversion; `stations_csv` is the path of the written CSV.
stations_csv = download_station_to_csv()

# Smoke test: load the CSV back with pandas and preview the first rows.
# NOTE(review): with default dtype inference the `wmo` column is read as float
# (e.g. 41196.0 in the saved output) and empty fields become NaN; pass an
# explicit `dtype=` to read_csv if the IDs must stay as text — confirm intent.
import pandas as pd

df = pd.read_csv(stations_csv)

print(df.head(5))


[DOWNLOADED] ghcnd-stations.txt
[DONE] Station CSV → /app/data/landing/stations/ghcnd-stations.csv
    station_id  latitude  longitude  elevation state                   name  \
0  ACW00011604   17.1167   -61.7833       10.1   NaN  ST JOHNS COOLIDGE FLD   
1  ACW00011647   17.1333   -61.7833       19.2   NaN               ST JOHNS   
2  AE000041196   25.3330    55.5170       34.0   NaN    SHARJAH INTER. AIRP   
3  AEM00041194   25.2550    55.3640       10.4   NaN             DUBAI INTL   
4  AEM00041217   24.4330    54.6510       26.8   NaN         ABU DHABI INTL   

   gsn  hcn      wmo  
0  NaN  NaN      NaN  
1  NaN  NaN      NaN  
2  GSN  NaN  41196.0  
3  NaN  NaN  41194.0  
4  NaN  NaN  41217.0  
