In [2]:
import pandas as pd
from pathlib import Path
import re

# Change BASE_DIR below to the full path if needed.
# To run this cell, keep the data on your computer in the following structure:
#   Desktop/mdm2 data files/
#       README.txt
#       Sensor_Location.csv
#       Hourly count data/    (directory containing all the sensor CSV files)
# The final big_table.csv will be saved in the "mdm2 data files" directory.
# Root data folder; the two input locations below are derived from it.
BASE_DIR = Path("mdm2 data files")
LOC_PATH = BASE_DIR.joinpath("Sensor_Location.csv")   # sensor location table
COUNTS_DIR = BASE_DIR.joinpath("Hourly count data")   # directory of count CSVs

# --- 1) Load locations ---
loc = pd.read_csv(LOC_PATH)

# Map each header, lower-cased and stripped, back to its original spelling so
# the lookups below tolerate differences in case and surrounding whitespace.
loc_cols = {c.lower().strip(): c for c in loc.columns}

# For each field take the first alias present in the file (None if neither is).
sensor_col = next((loc_cols[k] for k in ("sensornumber", "sensor") if k in loc_cols), None)
lat_col = next((loc_cols[k] for k in ("latitude", "lat") if k in loc_cols), None)
lon_col = next((loc_cols[k] for k in ("longitude", "lon") if k in loc_cols), None)

# Stop early if any of the three required columns could not be identified.
if any(found is None for found in (sensor_col, lat_col, lon_col)):
    raise ValueError(f"Unexpected Sensor_Location columns: {list(loc.columns)}")

# Standardise the three column names and store the sensor id as an integer.
loc = loc.rename(
    columns={sensor_col: "sensor_id", lat_col: "latitude", lon_col: "longitude"}
).assign(sensor_id=lambda frame: frame["sensor_id"].astype(int))

# 2) Find the sensor files inside "Hourly count data".
# Only files named "<digits>.csv" are kept; they are ordered by the numeric
# value of the file stem.
sensor_files = [
    csv_path
    for csv_path in COUNTS_DIR.glob("*.csv")
    if re.fullmatch(r"\d+\.csv", csv_path.name)
]
sensor_files.sort(key=lambda csv_path: int(csv_path.stem))

print(f"Found {len(sensor_files)} sensor CSVs in: {COUNTS_DIR}")

# 3) Load + stack
# Fail early with a message that names the directory: pd.concat on an empty
# list raises only "No objects to concatenate", which hides the real cause
# (wrong BASE_DIR, missing folder, or unexpected file names).
if not sensor_files:
    raise ValueError(f"No sensor CSVs named '<number>.csv' found in: {COUNTS_DIR}")

# Read every sensor file and tag its rows with the id taken from the file name.
dfs = []
for p in sensor_files:
    sid = int(p.stem)
    df = pd.read_csv(p)
    df["sensor_id"] = sid
    dfs.append(df)

# One long table; ignore_index gives a fresh 0..n-1 row index.
big = pd.concat(dfs, ignore_index=True)

# 4) Parse datetime + add basic time features
# Most files have a column called 'date' that holds date + time to the hour.
if "date" not in big.columns:
    raise ValueError(f"No 'date' column found in counts data. Columns are: {list(big.columns)}")

# Unparseable values become NaT (errors="coerce"); the raw 'date' text column
# is replaced by the parsed 'datetime' column.
big = big.assign(datetime=pd.to_datetime(big["date"], errors="coerce")).drop(columns=["date"])

# Derived calendar fields taken from the parsed timestamp.
big = big.assign(
    hour=lambda frame: frame["datetime"].dt.hour,
    date_only=lambda frame: frame["datetime"].dt.date,
    dow=lambda frame: frame["datetime"].dt.dayofweek,  # Monday=0
)

# 5) Joining sensor locations
# A left join keeps every count row; sensors without a location entry get NaN
# latitude/longitude. validate="many_to_one" makes pandas raise a MergeError if
# the location table lists a sensor_id more than once, which would otherwise
# silently duplicate that sensor's count rows.
big = big.merge(loc, on="sensor_id", how="left", validate="many_to_one")

# 6) Sanity checks (printed only; nothing is modified here)
# Rows whose timestamp is missing, and rows with no latitude after the join.
print("Bad timestamps:", big["datetime"].isnull().sum())
print("Rows with missing lat/lon:", big["latitude"].isnull().sum())

# Count negative values in each count column, warning when a column is absent.
for col in ("ped", "car", "cyc"):
    if col not in big.columns:
        print(f"WARNING: column '{col}' not found. Available: {list(big.columns)}")
        continue
    print(f"Negative {col} rows:", big[col].lt(0).sum())

# Row count per sensor, sorted ascending, so the "example" sensors printed
# below are the ten with the fewest rows.
rows_per_sensor = big.groupby("sensor_id").size().sort_values()
print("Rows per sensor (min/max):", rows_per_sensor.min(), rows_per_sensor.max())
print("Example sensors:", rows_per_sensor.head(10).index.tolist())

# 7) Save the combined table
# The output path is derived from BASE_DIR so that pointing BASE_DIR at a
# different (e.g. absolute) location also moves the saved file, instead of
# writing to a separately hard-coded relative folder.
OUT_PATH = BASE_DIR / "big_table.csv"
big.to_csv(OUT_PATH, index=False)  # index=False: the row index is not written
print("Saved to:", OUT_PATH.resolve())



Found 58 sensor CSVs in: mdm2 data files/Hourly count data
Bad timestamps: 0
Rows with missing lat/lon: 0
Negative ped rows: 0
Negative car rows: 0
Negative cyc rows: 0
Rows per sensor (min/max): 8784 8784
Example sensors: [1, 35, 36, 37, 38, 39, 40, 41, 42, 43]
Saved to: /Users/shavarshmelikyan/Desktop/mdm2 data files/big_table.csv
