In [5]:
import pandas as pd
from sqlalchemy import create_engine, text

In [6]:
engine = create_engine(
    "postgresql+psycopg2://postgres:postgres@postgres:5432/etl_db"
)

In [7]:
query = """
SELECT
    *
FROM bronze.weather_daily
"""

In [8]:
df = pd.read_sql(query, engine)
df.head()


Unnamed: 0,station_id,obs_date,element,value,m_flag,q_flag,s_flag,source_file,ingested_at
0,US1GAAP0004,2017-03-16,PRCP,0,,,N,US1GAAP0004.csv,2026-02-10 15:54:52.075489+00:00
1,US1GAAP0004,2017-03-17,PRCP,0,,,N,US1GAAP0004.csv,2026-02-10 15:54:52.075489+00:00
2,US1GAAP0004,2017-03-18,PRCP,0,,,N,US1GAAP0004.csv,2026-02-10 15:54:52.075489+00:00
3,US1GAAP0004,2017-03-19,PRCP,0,,,N,US1GAAP0004.csv,2026-02-10 15:54:52.075489+00:00
4,US1GAAP0004,2017-03-20,PRCP,0,,,N,US1GAAP0004.csv,2026-02-10 15:54:52.075489+00:00


In [9]:
ELEMENT_CONFIG = {
    # Temperature
    "TMAX": {"scale": 10.0, "unit": "celsius"},
    "TMIN": {"scale": 10.0, "unit": "celsius"},
    "TAVG": {"scale": 10.0, "unit": "celsius"},

    # Precipitation / snow
    "PRCP": {"scale": 10.0, "unit": "mm"},
    "SNOW": {"scale": 10.0, "unit": "mm"},
    "SNWD": {"scale": 1.0,  "unit": "mm"},

    # Wind (optional but reasonable)
    # "AWND": {"scale": 10.0, "unit": "m_per_s"},
    # "WSF2": {"scale": 10.0, "unit": "m_per_s"},
    # "WSF5": {"scale": 10.0, "unit": "m_per_s"},
}

In [10]:
#Scale values & assign units
def scale(row):
    cfg = ELEMENT_CONFIG.get(row["element"])
    if not cfg:
        return None, None
    return row["value"] / cfg["scale"], cfg["unit"]

In [11]:
df["unit"] = df["element"].map(
    lambda e: ELEMENT_CONFIG.get(e, {}).get("unit")
)

df["value"] = df["value"] / df["element"].map(
    lambda e: ELEMENT_CONFIG.get(e, {}).get("scale", 1)
)

# Drop rows where unit is null (i.e., element not in config)
df = df[df["unit"].notna()].copy()

In [12]:
#deduplicate
df = df.drop_duplicates(
    subset=["station_id", "obs_date", "element"]
)

In [13]:
df_silver = df[
    ["station_id", "obs_date", "element", "value", "unit"]
].copy()

In [14]:
with engine.begin() as conn:
    conn.execute(text("TRUNCATE TABLE silver.weather_daily"))


In [15]:
df_silver.to_sql(
    name="weather_daily",
    schema="silver",
    con=engine,
    if_exists="append",
    index=False,
    method="multi",
    chunksize=10_000
)


3716927

In [17]:
pd.read_sql(
    """
    SELECT
    *
    FROM silver.weather_daily
    where element != 'PRCP' and value > 0
    limit 100
    """,
    engine
)


Unnamed: 0,station_id,obs_date,element,value,unit,created_at,last_updated
0,US1GAAP0004,2018-01-04,SNWD,76.0,mm,2026-02-10 15:59:46.082856+00:00,2026-02-10 15:59:46.082856+00:00
1,US1GABB0006,2018-01-17,SNOW,3.2,mm,2026-02-10 15:59:46.082856+00:00,2026-02-10 15:59:46.082856+00:00
2,US1GABB0006,2025-01-22,SNOW,5.3,mm,2026-02-10 15:59:46.082856+00:00,2026-02-10 15:59:46.082856+00:00
3,US1GABB0006,2026-01-18,SNOW,0.6,mm,2026-02-10 15:59:46.082856+00:00,2026-02-10 15:59:46.082856+00:00
4,US1GABB0016,2018-01-17,SNOW,0.8,mm,2026-02-10 15:59:46.082856+00:00,2026-02-10 15:59:46.082856+00:00
...,...,...,...,...,...,...,...
95,US1GACB0004,2015-02-24,SNOW,1.8,mm,2026-02-10 15:59:46.082856+00:00,2026-02-10 15:59:46.082856+00:00
96,US1GACB0004,2015-02-26,SNOW,1.3,mm,2026-02-10 15:59:46.082856+00:00,2026-02-10 15:59:46.082856+00:00
97,US1GACB0004,2015-02-24,SNWD,13.0,mm,2026-02-10 15:59:46.082856+00:00,2026-02-10 15:59:46.082856+00:00
98,US1GACB0004,2015-02-26,SNWD,5.0,mm,2026-02-10 15:59:46.082856+00:00,2026-02-10 15:59:46.082856+00:00
