# 02 — Clean & Geo‑Join (Sensors → SA2)

In [None]:
print('TODO')

In [3]:
import os, pathlib
print("cwd:", os.getcwd())
print("notebook file is in:", pathlib.Path(".").resolve())


cwd: /Users/poojithraj/Documents/melbourne-foot-traffic-marketing/notebooks
notebook file is in: /Users/poojithraj/Documents/melbourne-foot-traffic-marketing/notebooks


In [4]:
from pathlib import Path
raw = Path("../data/raw")   # <-- go up one level, then into data/raw
print("Looking in:", raw.resolve())
assert raw.exists(), "raw folder not found—path is wrong"
for p in sorted(raw.glob("*")):
    print(p.name, f"{p.stat().st_size/1e6:.2f} MB")


Looking in: /Users/poojithraj/Documents/melbourne-foot-traffic-marketing/data/raw
com_counts_2025_03.csv 5.04 MB
com_counts_2025_04.csv 4.11 MB
com_counts_2025_05.csv 4.94 MB
sensor_locations.geojson 0.00 MB


In [1]:
import re, glob
from pathlib import Path
import numpy as np
import pandas as pd
import geopandas as gpd

# because this notebook is inside /notebooks, the project root is one level up
BASE = Path("..").resolve()
RAW = BASE / "data" / "raw"
INTERIM = BASE / "data" / "interim"
INTERIM.mkdir(parents=True, exist_ok=True)

MEL_TZ = "Australia/Melbourne"

def std_col(name: str) -> str:
    return re.sub(r"[^a-z0-9]+", "_", name.strip().lower()).strip("_")

def month_bounds_from_filename(p: Path):
    m = re.search(r"(\d{4})[_-](\d{2})", p.stem)
    if not m: return None, None
    y, mth = int(m.group(1)), int(m.group(2))
    start = pd.Timestamp(year=y, month=mth, day=1, tz=MEL_TZ)
    end   = (start + pd.offsets.MonthEnd(1)).replace(hour=23)
    return start, end

def filter_to_its_month(df: pd.DataFrame, src_file: Path) -> pd.DataFrame:
    start, end = month_bounds_from_filename(src_file)
    if start is None: return df
    mask = (df["date_time"] >= start) & (df["date_time"] <= end)
    return df.loc[mask].copy()


In [2]:
def coerce_counts_schema(df: pd.DataFrame) -> pd.DataFrame:
    original_cols = df.columns.tolist()
    df = df.rename(columns={c: std_col(c) for c in df.columns})

    # sensor id
    sid = None
    for c in ("sensor_id","location_id","sensorid","locationid"):
        if c in df.columns: sid = c; break
    assert sid, f"Could not find sensor id. Saw: {original_cols}"

    # counts
    cnt = None
    for c in ("hourly_counts","count","total_of_directions","total"):
        if c in df.columns: cnt = c; break
    assert cnt, f"Could not find counts column. Saw: {original_cols}"

    # timestamp (either a single column or date+hour)
    tscol = None
    for c in ("date_time","datetime","datehour"):
        if c in df.columns: tscol = c; break

    if tscol:
        s = pd.to_datetime(df[tscol], errors="coerce")
        if s.dt.tz is None:
            s = s.dt.tz_localize(MEL_TZ, ambiguous="NaT", nonexistent="NaT")
        else:
            s = s.dt.tz_convert(MEL_TZ)
        df["date_time"] = s
    else:
        dcol = next((c for c in ("sensing_date","date","day") if c in df.columns), None)
        hcol = next((c for c in ("hourday","hour","hr") if c in df.columns), None)
        assert dcol and hcol, f"Need date + hour. Saw: {original_cols}"
        dd = pd.to_datetime(df[dcol], errors="coerce")
        hh = pd.to_numeric(df[hcol], errors="coerce").astype("Int64")
        s  = dd + pd.to_timedelta(hh.astype(float), unit="h")
        s  = s.dt.tz_localize(MEL_TZ, ambiguous="NaT", nonexistent="NaT")
        df["date_time"] = s

    df["sensor_id"] = df[sid].astype(str).str.strip()
    df["hourly_counts"] = pd.to_numeric(df[cnt], errors="coerce")
    return df[["sensor_id","date_time","hourly_counts"]].copy()

# load & combine
csv_paths = sorted(RAW.glob("com_counts_*.csv"))
assert csv_paths, f"No files found in {RAW}"
frames = []
for p in csv_paths:
    t = pd.read_csv(p)
    t = coerce_counts_schema(t)
    t = t.dropna(subset=["sensor_id","date_time","hourly_counts"])
    t = filter_to_its_month(t, p)     # removes stray June/July rows from Apr/May files
    t["source_file"] = p.name
    frames.append(t)

counts = pd.concat(frames, ignore_index=True)

# de-dupe & validate
before = len(counts)
counts = counts.drop_duplicates(subset=["sensor_id","date_time"])
dups = before - len(counts)
neg  = (counts["hourly_counts"] < 0).sum()
assert neg == 0, f"Negative counts found: {neg}"
assert counts["date_time"].isna().sum() == 0, "Unparseable timestamps."

print(f"Rows: {len(counts):,} | Duplicates removed: {dups:,}")
print("Date range:", counts['date_time'].min(), "→", counts['date_time'].max())
print("Sensors:", counts['sensor_id'].nunique())
counts.head(3)


Rows: 64,040 | Duplicates removed: 0
Date range: 2025-03-01 00:00:00+11:00 → 2025-03-31 23:00:00+11:00
Sensors: 96


Unnamed: 0,sensor_id,date_time,hourly_counts,source_file
0,107,2025-03-01 18:00:00+11:00,237,com_counts_2025_03.csv
1,20,2025-03-01 14:00:00+11:00,602,com_counts_2025_03.csv
2,107,2025-03-15 02:00:00+11:00,22,com_counts_2025_03.csv


In [3]:
loc_geo = RAW / "sensor_locations.geojson"
assert loc_geo.exists(), f"{loc_geo} not found"

sensors = geopandas_read = gpd.read_file(loc_geo)
sensors = sensors.rename(columns={c: std_col(c) for c in sensors.columns})

required = {"location_id","sensor_name","sensor_description","installation_date"}
missing = required - set(sensors.columns)
assert not missing, f"Missing fields: {missing}. Present: {list(sensors.columns)}"

latcol = "latitude" if "latitude" in sensors.columns else "lat"
loncol = "longitude" if "longitude" in sensors.columns else "lon"
assert latcol in sensors.columns and loncol in sensors.columns, "latitude/longitude not found."

sensors_clean = sensors.copy()
sensors_clean["sensor_id"]  = sensors_clean["location_id"].astype(str).str.strip()
sensors_clean["latitude"]   = pd.to_numeric(sensors_clean[latcol], errors="coerce")
sensors_clean["longitude"]  = pd.to_numeric(sensors_clean[loncol], errors="coerce")
assert sensors_clean["latitude"].notna().all() and sensors_clean["longitude"].notna().all(), "Some sensors missing coords."

sensors_clean = sensors_clean[["sensor_id","sensor_name","sensor_description","installation_date","latitude","longitude"]]
sensors_clean = sensors_clean.drop_duplicates(subset=["sensor_id"])

print("Sensors table shape:", sensors_clean.shape)
sensors_clean.head(3)


Sensors table shape: (5, 6)


Unnamed: 0,sensor_id,sensor_name,sensor_description,installation_date,latitude,longitude
0,181,Eli368_T,368 Elizabeth Street,2025-03-26,-37.810095,144.961431
1,184,Eli124_T,124 Elizabeth Street,2025-06-28,-37.815124,144.96372
3,185,Eli197_T,197 Elizabeth Street,2025-06-28,-37.813746,144.962762


In [4]:
# coverage
cnt_ids = set(counts["sensor_id"].unique())
loc_ids = set(sensors_clean["sensor_id"].unique())
missing = cnt_ids - loc_ids
coverage = 100 * (1 - len(missing)/max(1,len(cnt_ids)))
print(f"Sensor ID coverage: {coverage:.1f}% (missing {len(missing)})")
if missing:
    print("Example missing IDs:", list(sorted(missing))[:10])

# save
counts_out  = INTERIM / "traffic_by_hour.csv"
sensors_out = INTERIM / "sensor_locations_clean.csv"
counts[["sensor_id","date_time","hourly_counts"]].sort_values(["sensor_id","date_time"]).to_csv(counts_out, index=False)
sensors_clean.to_csv(sensors_out, index=False)
print("Wrote:", counts_out)
print("Wrote:", sensors_out)


Sensor ID coverage: 2.1% (missing 94)
Example missing IDs: ['1', '10', '107', '108', '109', '11', '117', '118', '12', '123']
Wrote: /Users/poojithraj/Documents/melbourne-foot-traffic-marketing/data/interim/traffic_by_hour.csv
Wrote: /Users/poojithraj/Documents/melbourne-foot-traffic-marketing/data/interim/sensor_locations_clean.csv


In [5]:
from pathlib import Path
p = Path("../data/raw/sensor_locations.geojson")
print("Exists:", p.exists(), "| Size MB:", round(p.stat().st_size/1e6, 2))


Exists: True | Size MB: 0.05


In [6]:
from pathlib import Path
p = Path("../data/raw/sensor_locations.geojson")
print("Exists:", p.exists(), "| Size MB:", round(p.stat().st_size/1e6, 2))


Exists: True | Size MB: 0.05


In [7]:
import geopandas as gpd, pandas as pd, re
from pathlib import Path

loc_geo = Path("../data/raw/sensor_locations.geojson")
sensors = gpd.read_file(loc_geo).rename(columns=lambda c: re.sub(r"[^a-z0-9]+","_", c.lower()).strip("_"))

required = {"location_id","sensor_name","sensor_description","installation_date"}
missing = required - set(sensors.columns)
assert not missing, f"Missing fields: {missing}. Present: {list(sensors.columns)}"

latcol = "latitude" if "latitude" in sensors.columns else "lat"
loncol = "longitude" if "longitude" in sensors.columns else "lon"
assert latcol in sensors.columns and loncol in sensors.columns, "latitude/longitude not found."

sensors_clean = sensors.copy()
sensors_clean["sensor_id"]  = sensors_clean["location_id"].astype(str).str.strip()
sensors_clean["latitude"]   = pd.to_numeric(sensors_clean[latcol], errors="coerce")
sensors_clean["longitude"]  = pd.to_numeric(sensors_clean[loncol], errors="coerce")
assert sensors_clean["latitude"].notna().all() and sensors_clean["longitude"].notna().all()

sensors_clean = sensors_clean[["sensor_id","sensor_name","sensor_description","installation_date","latitude","longitude"]]
sensors_clean = sensors_clean.drop_duplicates(subset=["sensor_id"])

print("Sensors table shape:", sensors_clean.shape)
sensors_clean.head(3)


Sensors table shape: (135, 6)


Unnamed: 0,sensor_id,sensor_name,sensor_description,installation_date,latitude,longitude
0,1,Bou292_T,Bourke Street Mall (North),2009-03-24,-37.813494,144.965153
1,4,Swa123_T,Town Hall (West),2009-03-23,-37.81488,144.966088
2,10,BouHbr_T,Victoria Point,2009-04-23,-37.818765,144.947105


In [8]:
cnt_ids = set(counts["sensor_id"].unique())
loc_ids = set(sensors_clean["sensor_id"].unique())
missing = cnt_ids - loc_ids
coverage = 100 * (1 - len(missing)/max(1,len(cnt_ids)))
print(f"Sensor ID coverage: {coverage:.1f}% (missing {len(missing)})")
if missing:
    print("Example missing IDs:", list(sorted(missing))[:10])


Sensor ID coverage: 100.0% (missing 0)


In [9]:
from pathlib import Path
INTERIM = Path("..")/"data"/"interim"
INTERIM.mkdir(parents=True, exist_ok=True)
counts[["sensor_id","date_time","hourly_counts"]].sort_values(["sensor_id","date_time"]).to_csv(INTERIM/"traffic_by_hour.csv", index=False)
sensors_clean.to_csv(INTERIM/"sensor_locations_clean.csv", index=False)
print("Wrote:", INTERIM/"traffic_by_hour.csv")
print("Wrote:", INTERIM/"sensor_locations_clean.csv")


Wrote: ../data/interim/traffic_by_hour.csv
Wrote: ../data/interim/sensor_locations_clean.csv


In [10]:
git add data/interim/*.csv notebooks/02_clean_join_geo.ipynb
git commit -m "Use full sensor locations; coverage ok; save clean tables"
git push


SyntaxError: invalid decimal literal (970244286.py, line 1)

In [11]:
# sanity checks (prints repo status, remote and branch)
!git -C .. rev-parse --is-inside-work-tree
!git -C .. status
!git -C .. remote -v
!git -C .. branch

# stage the notebook you edited (DO NOT add data/interim)
!git -C .. add notebooks/02_clean_join_geo.ipynb

# if you already created KPI CSVs earlier, stage that folder too (safe even if empty)
!git -C .. add -A analytics/looker_studio_datasources

# commit
!git -C .. commit -m "Clean CoM hourly counts + full sensor locations (coverage 100%); save interim outputs"

# push to GitHub (main branch)
!git -C .. push -u origin main


true
On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mdeleted:    notebooks/01_download_coM.ipynb[m
	[31mmodified:   notebooks/02_clean_join_geo.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mMiniforge3-MacOSX-arm64.sh[m
	[31mnotebooks/01_download_coM_FIXED.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")
origin	https://github.com/RajPoo7/melbourne-foot-traffic-marketing.git (fetch)
origin	https://github.com/RajPoo7/melbourne-foot-traffic-marketing.git (push)
* [32mmain[m
fatal: pathspec 'analytics/looker_studio_datasources' did not match any files
[main 8dd3a7a] Clean CoM hourly counts + full sensor locations (coverage 100%); save interim outputs
 1 file changed, 657 insertions(+), 1 deletion(-)
Enumerating objects: 7, 

In [12]:
import pandas as pd
from pathlib import Path

# Load clean hourly data
df = pd.read_csv("../data/interim/traffic_by_hour.csv", parse_dates=["date_time"])
df["weekday"] = df["date_time"].dt.day_name()
df["hour"]    = df["date_time"].dt.hour

# 1) Weekday × hour heatmap
weekday_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
heatmap = (df.groupby(["weekday","hour"])["hourly_counts"]
             .mean()
             .reset_index()
             .rename(columns={"hourly_counts":"avg_count"}))
heatmap["weekday"] = pd.Categorical(heatmap["weekday"], categories=weekday_order, ordered=True)
heatmap = heatmap.sort_values(["weekday","hour"])

# 2) Top 5 "power hours" per sensor
power = (df.groupby(["sensor_id","hour"])["hourly_counts"]
           .mean()
           .reset_index()
           .rename(columns={"hourly_counts":"avg_count"}))
power["rank_in_sensor"] = power.groupby("sensor_id")["avg_count"].rank(method="first", ascending=False)
power_hours = power.query("rank_in_sensor <= 5").sort_values(["sensor_id","rank_in_sensor"])

# Save to a git-tracked folder
outdir = Path("../analytics/looker_studio_datasources")
outdir.mkdir(parents=True, exist_ok=True)
heatmap.to_csv(outdir/"heatmap.csv", index=False)
power_hours.to_csv(outdir/"power_hours.csv", index=False)
print("Wrote:", outdir/"heatmap.csv")
print("Wrote:", outdir/"power_hours.csv")


Wrote: ../analytics/looker_studio_datasources/heatmap.csv
Wrote: ../analytics/looker_studio_datasources/power_hours.csv


In [13]:
# stage the two CSVs and your notebook
!git -C .. add analytics/looker_studio_datasources/*.csv notebooks/02_clean_join_geo.ipynb

# commit & push
!git -C .. commit -m "Export heatmap & power-hours for dashboard"
!git -C .. push


zsh:1: no matches found: analytics/looker_studio_datasources/*.csv
On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mdeleted:    notebooks/01_download_coM.ipynb[m
	[31mmodified:   notebooks/02_clean_join_geo.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mMiniforge3-MacOSX-arm64.sh[m
	[31manalytics/[m
	[31mnotebooks/01_download_coM_FIXED.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")
Everything up-to-date


In [14]:
!git -C .. add notebooks/01_download_coM.ipynb notebooks/01_download_coM_FIXED.ipynb
!git -C .. commit -m "Keep both 01_* notebooks (original + FIXED)"
!git -C .. push


[main 1eb8475] Keep both 01_* notebooks (original + FIXED)
 2 files changed, 178 insertions(+), 25 deletions(-)
 delete mode 100644 notebooks/01_download_coM.ipynb
 create mode 100644 notebooks/01_download_coM_FIXED.ipynb
Enumerating objects: 6, done.
Counting objects: 100% (6/6), done.
Delta compression using up to 8 threads
Compressing objects: 100% (4/4), done.
Writing objects: 100% (4/4), 3.37 KiB | 3.37 MiB/s, done.
Total 4 (delta 2), reused 0 (delta 0), pack-reused 0 (from 0)
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/RajPoo7/melbourne-foot-traffic-marketing.git
   8dd3a7a..1eb8475  main -> main


In [15]:
# Remove the old file from the repo, keep FIXED
!git -C .. rm --cached notebooks/01_download_coM.ipynb  # stops tracking but leaves your local copy
!git -C .. add notebooks/01_download_coM_FIXED.ipynb
!git -C .. commit -m "Replace 01_download_coM with 01_download_coM_FIXED"
!git -C .. push


fatal: pathspec 'notebooks/01_download_coM.ipynb' did not match any files
On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   notebooks/02_clean_join_geo.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mMiniforge3-MacOSX-arm64.sh[m
	[31manalytics/[m

no changes added to commit (use "git add" and/or "git commit -a")
Everything up-to-date


In [16]:
import pandas as pd
from pathlib import Path

# Load clean hourly series
df = pd.read_csv("../data/interim/traffic_by_hour.csv", parse_dates=["date_time"])
df["weekday"] = df["date_time"].dt.day_name()
df["hour"]    = df["date_time"].dt.hour

# 1) Weekday × hour heatmap
weekday_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
heatmap = (df.groupby(["weekday","hour"])["hourly_counts"]
             .mean()
             .reset_index()
             .rename(columns={"hourly_counts":"avg_count"}))
heatmap["weekday"] = pd.Categorical(heatmap["weekday"], categories=weekday_order, ordered=True)
heatmap = heatmap.sort_values(["weekday","hour"])

# 2) Top 5 “power hours” per sensor
power = (df.groupby(["sensor_id","hour"])["hourly_counts"]
           .mean()
           .reset_index()
           .rename(columns={"hourly_counts":"avg_count"}))
power["rank_in_sensor"] = power.groupby("sensor_id")["avg_count"].rank(method="first", ascending=False)
power_hours = power.query("rank_in_sensor <= 5").sort_values(["sensor_id","rank_in_sensor"])

# Save (this folder is tracked by git)
outdir = Path("../analytics/looker_studio_datasources")
outdir.mkdir(parents=True, exist_ok=True)
heatmap.to_csv(outdir/"heatmap.csv", index=False)
power_hours.to_csv(outdir/"power_hours.csv", index=False)
print("Wrote:", outdir/"heatmap.csv")
print("Wrote:", outdir/"power_hours.csv")


Wrote: ../analytics/looker_studio_datasources/heatmap.csv
Wrote: ../analytics/looker_studio_datasources/power_hours.csv


In [17]:
# Stage the two CSVs explicitly (avoids the wildcard issue)
!git -C .. add analytics/looker_studio_datasources/heatmap.csv analytics/looker_studio_datasources/power_hours.csv

# Stage the notebook you edited
!git -C .. add notebooks/02_clean_join_geo.ipynb

# Commit & push
!git -C .. commit -m "Export heatmap & power-hours for dashboard"
!git -C .. push


[main c0cb15f] Export heatmap & power-hours for dashboard
 3 files changed, 909 insertions(+), 2 deletions(-)
 create mode 100644 analytics/looker_studio_datasources/heatmap.csv
 create mode 100644 analytics/looker_studio_datasources/power_hours.csv
Enumerating objects: 11, done.
Counting objects: 100% (11/11), done.
Delta compression using up to 8 threads
Compressing objects: 100% (8/8), done.
Writing objects: 100% (8/8), 8.73 KiB | 4.36 MiB/s, done.
Total 8 (delta 3), reused 0 (delta 0), pack-reused 0 (from 0)
remote: Resolving deltas: 100% (3/3), completed with 3 local objects.[K
To https://github.com/RajPoo7/melbourne-foot-traffic-marketing.git
   1eb8475..c0cb15f  main -> main
