In [1]:
import os
import shutil
from pathlib import Path

import pandas as pd
from sqlalchemy import create_engine

In [2]:
# SQLAlchemy engine used by every cell that reads from or writes to Postgres.
#
# The connection URL (which embeds the username and password) can be supplied
# through the ETL_DATABASE_URL environment variable so that credentials do not
# have to live in the notebook. When the variable is unset, the URL this
# notebook has always used is the fallback, so existing runs are unaffected.
# NOTE(review): the fallback credentials look like local-development defaults;
# confirm they are never valid against a shared or production database.
DEFAULT_DATABASE_URL = "postgresql+psycopg2://postgres:postgres@postgres:5432/etl_db"

engine = create_engine(os.environ.get("ETL_DATABASE_URL", DEFAULT_DATABASE_URL))

In [3]:
# Directories used by the weather ingest: CSVs are read from the landing
# directory and moved to the archive directory once they have been loaded.
WEATHER_LANDING_DIR: Path = Path("/app/data/landing/weather")
WEATHER_ARCHIVE_DIR: Path = Path("/app/data/archive/weather")

# Collect the landing CSVs into a list and order them by path so the listing
# (and the count reported below) is stable from run to run.
csv_files = list(WEATHER_LANDING_DIR.glob("*.csv"))
csv_files.sort()

print(f"Found {len(csv_files)} weather CSV files")


Found 1407 weather CSV files


In [4]:
# Bronze ingest: append every landing CSV to bronze.weather_daily, then move
# the file to the archive directory so it is not picked up again.

# Create the archive directory before ingesting anything. Without this, a
# missing directory is only discovered by shutil.move *after* the file's rows
# have been appended, leaving the file in landing so that a re-run inserts the
# same rows a second time.
WEATHER_ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)

# Take a sorted snapshot of the landing directory instead of iterating the
# lazy glob: files are moved out of that directory inside the loop, and a
# sorted list gives a deterministic processing order and log.
pending_files = sorted(WEATHER_LANDING_DIR.glob("*.csv"))

for csv_path in pending_files:
    print(f"Ingesting {csv_path.name}")

    df = pd.read_csv(csv_path)

    # Rename to match bronze schema
    df = df.rename(columns={"date": "obs_date"})

    # Bronze-only metadata: record which file each row came from
    df["source_file"] = csv_path.name

    # Write to Bronze. "append" adds to the existing table; method="multi"
    # packs several rows into each INSERT and chunksize caps the rows per batch.
    df.to_sql(
        name="weather_daily",
        schema="bronze",
        con=engine,
        if_exists="append",
        index=False,
        method="multi",
        chunksize=10_000
    )

    # Move file to archive only after the insert above returned without raising
    archive_path = WEATHER_ARCHIVE_DIR / csv_path.name
    shutil.move(csv_path, archive_path)

    print(f"Archived → {archive_path.name}")

print("✅ Weather ingestion + archiving complete")


Ingesting US1GAAP0001.csv
Archived → US1GAAP0001.csv
Ingesting US1GAAP0002.csv
Archived → US1GAAP0002.csv
Ingesting US1GAAP0004.csv
Archived → US1GAAP0004.csv
Ingesting US1GAAP0005.csv
Archived → US1GAAP0005.csv
Ingesting US1GAAP0006.csv
Archived → US1GAAP0006.csv
Ingesting US1GAAP0007.csv
Archived → US1GAAP0007.csv
Ingesting US1GAAT0002.csv
Archived → US1GAAT0002.csv
Ingesting US1GABB0001.csv
Archived → US1GABB0001.csv
Ingesting US1GABB0003.csv
Archived → US1GABB0003.csv
Ingesting US1GABB0004.csv
Archived → US1GABB0004.csv
Ingesting US1GABB0005.csv
Archived → US1GABB0005.csv
Ingesting US1GABB0006.csv
Archived → US1GABB0006.csv
Ingesting US1GABB0007.csv
Archived → US1GABB0007.csv
Ingesting US1GABB0010.csv
Archived → US1GABB0010.csv
Ingesting US1GABB0016.csv
Archived → US1GABB0016.csv
Ingesting US1GABB0019.csv
Archived → US1GABB0019.csv
Ingesting US1GABB0021.csv
Archived → US1GABB0021.csv
Ingesting US1GABB0022.csv
Archived → US1GABB0022.csv
Ingesting US1GABD0001.csv
Archived → US1GABD00

In [5]:
# Post-ingest sanity check on bronze.weather_daily: total row count, number of
# distinct stations, and the earliest and latest observation dates.
bronze_summary_sql = """
    SELECT
        COUNT(*) AS rows,
        COUNT(DISTINCT station_id) AS stations,
        MIN(obs_date) AS min_date,
        MAX(obs_date) AS max_date
    FROM bronze.weather_daily
    """

# Left as the cell's final expression so the resulting frame is displayed.
pd.read_sql(sql=bronze_summary_sql, con=engine)


Unnamed: 0,rows,stations,min_date,max_date
0,4690999,918,2015-01-01,2026-02-05
