In [27]:
import os

import pandas as pd
from sqlalchemy import create_engine, text

In [28]:
# Database engine for the ETL database.
# The connection URL is read from the ETL_DATABASE_URL environment variable so
# that credentials do not have to live in the notebook; when the variable is
# unset, the original hard-coded URL is used as the fallback.
engine = create_engine(
    os.environ.get(
        "ETL_DATABASE_URL",
        "postgresql+psycopg2://postgres:postgres@postgres:5432/etl_db",
    )
)

In [30]:
# Extract query: every column of bronze.weather_daily, capped at 10000 rows.
# NOTE(review): LIMIT is used without ORDER BY, so which rows are returned is
# not guaranteed to be the same from run to run — confirm the cap is intended
# outside of development, since the silver table is rebuilt from this extract.
query = """
SELECT
    *
FROM bronze.weather_daily
limit 10000
"""

In [31]:
# Run the extract query and load the result into a DataFrame.
df = pd.read_sql(sql=query, con=engine)


In [32]:
# Keep only rows whose q_flag is null; the result is an independent copy.
df = df.loc[lambda frame: frame["q_flag"].isna()].copy()

In [33]:
# Parse obs_date with the year + day-of-year format (YYYY-DDD).
# Entries that cannot be parsed are coerced to NaT and those rows are dropped.
df = df.assign(
    obs_date=lambda frame: pd.to_datetime(
        frame["obs_date"],
        format="%Y-%j",
        errors="coerce",
    )
).loc[lambda frame: frame["obs_date"].notna()]

In [34]:
# Discard rows with a null value, then rows carrying the -9999 sentinel.
df = (
    df
    .loc[lambda frame: frame["value"].notna()]
    .loc[lambda frame: frame["value"] != -9999]
)

In [35]:
# Per-element conversion settings: silver value = bronze value / scale, and the
# resulting unit label. Elements not listed here are dropped downstream.
#
# The element codes follow the NOAA GHCN-Daily naming (assumed to be the bronze
# source — confirm). In that dataset temperatures and PRCP are stored in
# tenths, while SNOW and SNWD are stored in whole millimetres, so both snow
# elements use a scale of 1.0.
ELEMENT_CONFIG = {
    # Temperature (tenths of a degree C -> degrees C)
    "TMAX": {"scale": 10.0, "unit": "celsius"},
    "TMIN": {"scale": 10.0, "unit": "celsius"},
    "TAVG": {"scale": 10.0, "unit": "celsius"},

    # Precipitation (tenths of mm -> mm) / snow (already in mm)
    "PRCP": {"scale": 10.0, "unit": "mm"},
    "SNOW": {"scale": 1.0,  "unit": "mm"},
    "SNWD": {"scale": 1.0,  "unit": "mm"},

    # Wind (optional but reasonable)
    # "AWND": {"scale": 10.0, "unit": "m_per_s"},
    # "WSF2": {"scale": 10.0, "unit": "m_per_s"},
    # "WSF5": {"scale": 10.0, "unit": "m_per_s"},
}

In [36]:
# Scale a raw value and look up its unit
def scale(row):
    """Convert one observation row using ELEMENT_CONFIG.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row["element"]`` and ``row["value"]``.

    Returns
    -------
    tuple
        ``(row["value"] / scale, unit)`` for the row's element, or
        ``(None, None)`` when ELEMENT_CONFIG has no (or an empty) entry for it.
    """
    element_cfg = ELEMENT_CONFIG.get(row["element"])
    if element_cfg:
        return row["value"] / element_cfg["scale"], element_cfg["unit"]
    return None, None

In [37]:
# Scale raw values and attach units, vectorised over the element column.
# This gives the same result as applying `scale` to every row, but it also
# works when df is empty (the row-wise apply with result_type="expand" hands
# back a copy of the whole frame in that case, which breaks the two-column
# assignment) and, by building a new frame with assign(), it does not write
# into a filtered slice.
# Elements missing from ELEMENT_CONFIG map to NaN in both columns.
df = df.assign(
    value=lambda frame: frame["value"] / frame["element"].map(
        {code: cfg["scale"] for code, cfg in ELEMENT_CONFIG.items()}
    ),
    unit=lambda frame: frame["element"].map(
        {code: cfg["unit"] for code, cfg in ELEMENT_CONFIG.items()}
    ),
)

# Drop rows whose element has no configured unit.
df = df[df["unit"].notna()]


In [38]:
# Deduplicate: keep the first row encountered for each
# (station_id, obs_date, element) combination.
df = df.drop_duplicates(
    subset=["station_id", "obs_date", "element"],
    keep="first",
)

In [39]:
# Select the columns written to the silver table, as an independent copy.
df_silver = df.loc[
    :, ["station_id", "obs_date", "element", "value", "unit"]
].copy()

In [40]:
# Replace the contents of silver.weather_daily with df_silver.
# The TRUNCATE and the insert run on the same connection inside a single
# engine.begin() transaction (committed on success, rolled back on error), so
# a failed load cannot leave the table truncated and empty.
with engine.begin() as conn:
    conn.execute(text("TRUNCATE TABLE silver.weather_daily"))
    rows_written = df_silver.to_sql(
        name="weather_daily",
        schema="silver",
        con=conn,
        if_exists="append",
        index=False,
        method="multi",
        chunksize=10_000,
    )

# Display the row count reported by to_sql as the cell output.
rows_written


2449

In [42]:
# Sanity check on the loaded table: total rows, distinct stations and the
# covered date range.
pd.read_sql(
    sql="""
    SELECT
        COUNT(*) AS rows,
        COUNT(DISTINCT station_id) AS stations,
        MIN(obs_date) AS min_date,
        MAX(obs_date) AS max_date
    FROM silver.weather_daily
    """,
    con=engine,
)


Unnamed: 0,rows,stations,min_date,max_date
0,2449,5,2004-01-01,2025-01-31
