In [1]:
## 1. Import Packages and Setup

import pandas as pd
import geopandas as gpd
import glob
import pathlib
import os

# Data folders live under "<parent of the current working directory>/data"
_DATA_ROOT = pathlib.Path.cwd().parent.joinpath("data")
DATA_RAW_DIR = _DATA_ROOT.joinpath("raw")
DATA_INTERIM_DIR = _DATA_ROOT.joinpath("interim")

# Make sure the interim folder (and any missing parents) is present;
# an already-existing folder is not an error
DATA_INTERIM_DIR.mkdir(parents=True, exist_ok=True)

## 2.1 Triple-Check A: Hourly Data Found?

# 1. Define the search pattern for the three CSV files
file_pattern = str(DATA_RAW_DIR / "com_counts_2025_*.csv")
print(f"Searching for files: {file_pattern}")

# 2. Get the list of file paths.
# glob returns matches in arbitrary order, so the list is sorted to keep the
# row order of the combined DataFrame reproducible between runs.
file_list = sorted(glob.glob(file_pattern))

# Fail fast unless exactly the expected number of files is present.
# Only printing a message would let the run continue on incomplete data, and
# with zero matches pd.concat below would fail with an unrelated-looking
# "No objects to concatenate" error.
expected_files = 3
if len(file_list) != expected_files:
    raise FileNotFoundError(
        f"Found {len(file_list)} files, expected {expected_files}. "
        "Please check your 'data/raw' folder for missing or misnamed files."
    )

# 3. Read the files into one DataFrame (index is renumbered across files)
all_data = [pd.read_csv(file) for file in file_list]
df_hourly = pd.concat(all_data, ignore_index=True)

# 4. Data Cleaning and Transformation to match project standard structure
# Map the raw column names onto the project's standard column names
standard_names = {
    'Location_ID': 'Sensor_ID',
    'Total_of_Directions': 'Hourly_Counts',
}
df_hourly = df_hourly.rename(columns=standard_names)

# Build 'Date_Time' from 'Sensing_Date' plus the 'HourDay' value, which is
# left-padded with zeros to two characters and given a ':00:00' suffix
hour_text = df_hourly['HourDay'].astype(str).str.zfill(2)
timestamp_text = df_hourly['Sensing_Date'] + ' ' + hour_text + ':00:00'
df_hourly['Date_Time'] = pd.to_datetime(timestamp_text)

# Derive the date/time component columns needed by the subsequent notebooks
dt_parts = df_hourly['Date_Time'].dt
df_hourly['Year'] = dt_parts.year
df_hourly['Month'] = dt_parts.month
df_hourly['m_date'] = dt_parts.day          # day of the month
df_hourly['Day'] = dt_parts.day_name()      # weekday name
df_hourly['Time'] = dt_parts.hour

# 5. Filter Data to expected date range
# Keep only rows whose timestamp lies within the expected window (both ends
# inclusive); rogue June/July rows previously caused a ValueError.
start_date = pd.Timestamp('2025-03-01 00:00:00')
end_date = pd.Timestamp('2025-05-31 23:00:00')

in_window = df_hourly['Date_Time'].between(start_date, end_date)
df_hourly = df_hourly.loc[in_window].copy()

# Final check of the Date Range: the earliest and latest remaining timestamps
# should coincide exactly with the window bounds (March 1 to May 31, 2025)
min_date, max_date = df_hourly['Date_Time'].min(), df_hourly['Date_Time'].max()

range_is_exact = (min_date == start_date) and (max_date == end_date)
if not range_is_exact:
    print(f"WARNING: Date range mismatch. Min Date: {min_date}, Max Date: {max_date}")
else:
    print(f"SUCCESS: Data range is correct. Min Date: {min_date}, Max Date: {max_date}")

# Write the combined, checked hourly data to the interim folder
combined_csv_path = DATA_INTERIM_DIR / "hourly_counts_combined.csv"
df_hourly.to_csv(combined_csv_path, index=False)
print(f"Successfully saved combined data to {combined_csv_path}")

## 2.2 Triple-Check B: Sensor Locations Match?

# 1. Load the sensor location GeoJSON file.
# The path is checked explicitly before reading: the I/O engine behind
# gpd.read_file may report a missing file with its own error type rather than
# FileNotFoundError, so an `except FileNotFoundError` handler is not reliable.
sensor_locations_path = DATA_RAW_DIR / "sensor_locations.geojson"
if not sensor_locations_path.is_file():
    raise FileNotFoundError(
        f"Sensor location file not found at {sensor_locations_path}. "
        "Please check file name and location."
    )
gdf_sensors = gpd.read_file(sensor_locations_path)
print(f"SUCCESS: Loaded {len(gdf_sensors)} sensor locations.")

# 2. Get unique Sensor IDs from both data sources.
# .tolist() converts NumPy scalars to native Python values so that any IDs
# reported below print as plain numbers instead of np.int64(...) wrappers.
hourly_ids = set(df_hourly['Sensor_ID'].unique().tolist())
location_ids = set(gdf_sensors['location_id'].unique().tolist())  # Note: The GeoJSON uses 'location_id'

# 3. Check for mismatches
# IDs that are in the hourly counts but NOT in the location file
ids_in_counts_not_locations = hourly_ids - location_ids
print(f"IDs in counts but not in locations: {len(ids_in_counts_not_locations)}")

# IDs that are in the location file but NOT in the hourly counts
ids_in_locations_not_counts = location_ids - hourly_ids
print(f"IDs in locations but not in counts: {len(ids_in_locations_not_counts)}")

# Every sensor that has counts must also have a location. The error is raised
# explicitly (instead of via an `assert` statement) so the check still runs
# when Python is started with -O, which strips assert statements.
if ids_in_counts_not_locations:
    raise AssertionError(
        f"Mismatch found! Missing location data for IDs: {ids_in_counts_not_locations}"
    )

# Save the sensor locations to the interim folder (cleaned version),
# restricted to the columns listed here
clean_columns = [
    'location_id',
    'sensor_name',
    'street_name',
    'installation_date',
    'direction_1',
    'direction_2',
    'status',
    'latitude',
    'longitude',
    'geometry',
]
clean_sensor_path = DATA_INTERIM_DIR / "sensor_locations_clean.geojson"
gdf_sensors[clean_columns].to_file(clean_sensor_path, driver="GeoJSON")
print(f"Successfully saved cleaned sensor data to {clean_sensor_path}")

Searching for files: /Users/poojithraj/Documents/melbourne-foot-traffic-marketing/data/raw/com_counts_2025_*.csv
Successfully saved combined data to /Users/poojithraj/Documents/melbourne-foot-traffic-marketing/data/interim/hourly_counts_combined.csv
SUCCESS: Loaded 11 sensor locations.
IDs in counts but not in locations: 94
IDs in locations but not in counts: 3


AssertionError: Mismatch found! Missing location data for IDs: {np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(14), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(23), np.int64(24), np.int64(25), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(35), np.int64(36), np.int64(37), np.int64(39), np.int64(40), np.int64(41), np.int64(42), np.int64(43), np.int64(44), np.int64(45), np.int64(46), np.int64(47), np.int64(48), np.int64(49), np.int64(50), np.int64(51), np.int64(52), np.int64(53), np.int64(54), np.int64(56), np.int64(58), np.int64(59), np.int64(61), np.int64(62), np.int64(63), np.int64(66), np.int64(67), np.int64(68), np.int64(69), np.int64(70), np.int64(71), np.int64(72), np.int64(75), np.int64(76), np.int64(77), np.int64(78), np.int64(79), np.int64(84), np.int64(85), np.int64(86), np.int64(87), np.int64(107), np.int64(108), np.int64(109), np.int64(117), np.int64(118), np.int64(123), np.int64(124), np.int64(130), np.int64(131), np.int64(132), np.int64(133), np.int64(134), np.int64(135), np.int64(136), np.int64(137), np.int64(138), np.int64(139), np.int64(140), np.int64(141), np.int64(142), np.int64(143), np.int64(161), np.int64(162), np.int64(164), np.int64(165), np.int64(166), np.int64(167), np.int64(179)}