# 03 — Weather Merge

In [None]:
print('TODO')

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import requests

RAW      = Path("../data/raw")
INTERIM  = Path("../data/interim")
PROCESSED= Path("../data/processed")
ANALYTICS= Path("../analytics/looker_studio_datasources")
for p in [INTERIM, PROCESSED, ANALYTICS]: p.mkdir(parents=True, exist_ok=True)


In [2]:
counts_path = INTERIM / "traffic_by_hour.csv"      # you created this in 02
assert counts_path.exists(), f"Missing counts: {counts_path}"

counts = pd.read_csv(counts_path, parse_dates=["date_time"])
counts = counts.sort_values("date_time").reset_index(drop=True)

start_ts = counts["date_time"].min()
end_ts   = counts["date_time"].max()
print("Counts window:", start_ts, "→", end_ts)

# We’ll use calendar dates for the weather API
start_date = start_ts.date().isoformat()
end_date   = end_ts.date().isoformat()
start_date, end_date


Counts window: 2025-03-01 00:00:00+11:00 → 2025-03-31 23:00:00+11:00


('2025-03-01', '2025-03-31')

In [3]:
# Melbourne CBD (Flinders St / Town Hall area)
lat, lon = -37.8136, 144.9631

# Open-Meteo archive (ERA5)
base_url = "https://archive-api.open-meteo.com/v1/era5"
params = {
    "latitude": lat,
    "longitude": lon,
    "start_date": start_date,
    "end_date": end_date,
    "hourly": "temperature_2m,precipitation,rain,wind_speed_10m",
    "timezone": "Australia/Melbourne"
}

r = requests.get(base_url, params=params, timeout=60)
r.raise_for_status()
j = r.json()

# Turn hourly arrays into a DataFrame
weather = pd.DataFrame(j["hourly"])
# Make sure the timestamp is proper datetime
weather["date_time"] = pd.to_datetime(weather["time"])   # already in local time because of timezone=
weather = weather.drop(columns=["time"])

# Quick sanity check
print(weather.head(3))
print("Weather rows:", len(weather), "from", weather["date_time"].min(), "to", weather["date_time"].max())


   temperature_2m  precipitation  rain  wind_speed_10m           date_time
0            17.8            0.0   0.0             5.7 2025-03-01 00:00:00
1            17.4            0.0   0.0             3.6 2025-03-01 01:00:00
2            16.8            0.0   0.0             2.2 2025-03-01 02:00:00
Weather rows: 744 from 2025-03-01 00:00:00 to 2025-03-31 23:00:00


In [4]:
# Ensure numeric
for c in ["temperature_2m","precipitation","rain","wind_speed_10m"]:
    weather[c] = pd.to_numeric(weather[c], errors="coerce")

# A tiny helper flag for charts/filters later
weather["rain_flag"] = (weather["precipitation"] > 0).astype(int)

# Temperature band (nice for heatmaps)
bins = [-100, 10, 18, 24, 30, 100]
labels = ["<10°C","10–18°C","18–24°C","24–30°C",">30°C"]
weather["temp_band"] = pd.cut(weather["temperature_2m"], bins=bins, labels=labels)

# Save a copy in interim
weather_out = INTERIM / "weather_hourly_melbourne.csv"
weather.to_csv(weather_out, index=False)
print("Wrote:", weather_out, "| Rows:", len(weather))


Wrote: ../data/interim/weather_hourly_melbourne.csv | Rows: 744


In [5]:
# Keep just the columns we need from counts (adjust names if yours differ)
counts_small = counts[["sensor_id","date_time","hourly_counts"]].copy()

joined = counts_small.merge(
    weather,
    on="date_time",
    how="left",
    validate="m:1"       # each hour maps to one weather row
)

print("Joined rows:", len(joined))
print("Missing weather rows after join:", joined["temperature_2m"].isna().sum())

# The join should keep all count rows and attach weather columns
joined.head(3)


ValueError: You are trying to merge on datetime64[ns, UTC+11:00] and datetime64[ns] columns for key 'date_time'. If you wish to proceed you should use pd.concat

In [6]:
import pandas as pd

MEL_TZ = "Australia/Melbourne"

def ensure_mel_tz(s):
    """Return a datetime Series that is tz-aware in Australia/Melbourne."""
    s = pd.to_datetime(s, errors="coerce")
    if getattr(s.dt, "tz", None) is None:
        # Times are *already local wall time* from Open-Meteo; localize (don't convert)
        return s.dt.tz_localize(MEL_TZ, ambiguous="infer", nonexistent="shift_forward")
    else:
        # If they already have a tz, convert to Melbourne to be safe
        return s.dt.tz_convert(MEL_TZ)

# 1) Standardise dtypes
counts_small["date_time"]  = ensure_mel_tz(counts_small["date_time"])
weather["date_time"]       = ensure_mel_tz(weather["date_time"])

# 2) Align to exact hour on both sides (defensive)
counts_small["date_time"]  = counts_small["date_time"].dt.floor("h")
weather["date_time"]       = weather["date_time"].dt.floor("h")

print("counts tz:", counts_small["date_time"].dt.tz.unique())
print("weather tz:", weather["date_time"].dt.tz.unique())
print("counts range:", counts_small["date_time"].min(), "→", counts_small["date_time"].max())
print("weather range:", weather["date_time"].min(), "→", weather["date_time"].max())


AttributeError: 'Australia/Melbourne' object has no attribute 'unique'

In [7]:
import pandas as pd

MEL_TZ = "Australia/Melbourne"

def ensure_mel_tz(s: pd.Series) -> pd.Series:
    """Return a datetime Series that is tz-aware in Australia/Melbourne."""
    s = pd.to_datetime(s, errors="coerce")
    if getattr(s.dt, "tz", None) is None:
        # Open-Meteo times are already local wall time; attach the Melbourne tz
        return s.dt.tz_localize(MEL_TZ, ambiguous="infer", nonexistent="shift_forward")
    else:
        # If there is a tz already, convert into Melbourne tz
        return s.dt.tz_convert(MEL_TZ)


In [8]:
# If you haven't already:
# counts_small = counts[["sensor_id","date_time","hourly_counts"]].copy()

# Standardise tz
counts_small["date_time"] = ensure_mel_tz(counts_small["date_time"])
weather["date_time"]      = ensure_mel_tz(weather["date_time"])

# Align exactly to the hour to avoid off-by-seconds
counts_small["date_time"] = counts_small["date_time"].dt.floor("h")
weather["date_time"]      = weather["date_time"].dt.floor("h")

# Debug prints (no .unique() here)
print("counts tz:",  counts_small["date_time"].dt.tz)
print("weather tz:", weather["date_time"].dt.tz)
print("counts range:",  counts_small["date_time"].min(), "→", counts_small["date_time"].max())
print("weather range:", weather["date_time"].min(),      "→", weather["date_time"].max())


counts tz: Australia/Melbourne
weather tz: Australia/Melbourne
counts range: 2025-03-01 00:00:00+11:00 → 2025-03-31 23:00:00+11:00
weather range: 2025-03-01 00:00:00+11:00 → 2025-03-31 23:00:00+11:00


In [9]:
wx_cols = ["date_time", "temperature_2m", "precipitation", "rain", "wind_speed_10m"]
weather_small = weather[wx_cols].copy()

joined = counts_small.merge(
    weather_small,
    on="date_time",
    how="left",
    validate="m:1",   # many counts rows -> 1 weather row
)

print("Joined rows:", len(joined))
missing = joined["temperature_2m"].isna().sum()
print("Missing weather rows after join:", missing)

if missing:
    # Show a few problem timestamps if any
    display(joined.loc[joined["temperature_2m"].isna(), ["date_time","sensor_id"]].head(10))


Joined rows: 64040
Missing weather rows after join: 0


In [10]:
# from your repo root
git add notebooks/03_weather_merge.ipynb data/interim/weather_hourly_melbourne.csv
git commit -m "03: weather merge — standardise tz to Australia/Melbourne and hour-align"
git push


SyntaxError: invalid decimal literal (2250711377.py, line 2)