In [1]:
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree

# -------------------------------------------------
# 1) Load CCTV data (British National Grid)
# -------------------------------------------------
CCTV_PATH = "../data/context/cctv/Council_CCTV_cameras.csv"
cctv = pd.read_csv(CCTV_PATH)

east_col = "FEAT_CENT_EAST"
north_col = "FEAT_CENT_NORTH"

cctv[east_col] = pd.to_numeric(cctv[east_col], errors="coerce")
cctv[north_col] = pd.to_numeric(cctv[north_col], errors="coerce")
cctv = cctv.dropna(subset=[east_col, north_col]).copy()

print("CCTV rows:", len(cctv))

# -------------------------------------------------
# 2) OSGB36 → WGS84 conversion (pure math)
# -------------------------------------------------

def osgb36_to_wgs84(E, N):

    # Airy 1830 ellipsoid
    a = 6377563.396
    b = 6356256.909
    F0 = 0.9996012717
    lat0 = np.deg2rad(49)
    lon0 = np.deg2rad(-2)
    N0 = -100000
    E0 = 400000
    e2 = 1 - (b*b)/(a*a)
    n = (a - b) / (a + b)

    lat = lat0
    M = 0

    while True:
        lat_prev = lat
        M = b * F0 * (
            (1 + n + (5/4)*n**2 + (5/4)*n**3)*(lat - lat0)
            - (3*n + 3*n**2 + (21/8)*n**3)*np.sin(lat - lat0)*np.cos(lat + lat0)
            + ((15/8)*n**2 + (15/8)*n**3)*np.sin(2*(lat - lat0))*np.cos(2*(lat + lat0))
            - (35/24)*n**3*np.sin(3*(lat - lat0))*np.cos(3*(lat + lat0))
        )
        lat = (N - N0 - M) / (a*F0) + lat
        if abs(lat - lat_prev) < 1e-10:
            break

    nu = a*F0 / np.sqrt(1 - e2*np.sin(lat)**2)
    rho = a*F0*(1 - e2) / (1 - e2*np.sin(lat)**2)**1.5
    eta2 = nu/rho - 1

    tan_lat = np.tan(lat)
    sec_lat = 1/np.cos(lat)

    dE = E - E0

    lat = lat - (tan_lat/(2*rho*nu))*dE**2 \
        + (tan_lat/(24*rho*nu**3))*(5 + 3*tan_lat**2 + eta2 - 9*tan_lat**2*eta2)*dE**4

    lon = lon0 + (sec_lat/nu)*dE \
        - (sec_lat/(6*nu**3))*(1 + 2*tan_lat**2 + eta2)*dE**3

    return np.rad2deg(lat), np.rad2deg(lon)


# Vectorized conversion
latitudes = []
longitudes = []

for E, N in zip(cctv[east_col], cctv[north_col]):
    lat, lon = osgb36_to_wgs84(E, N)
    latitudes.append(lat)
    longitudes.append(lon)

cctv["latitude"] = latitudes
cctv["longitude"] = longitudes

print("Lat range:", (min(latitudes), max(latitudes)))
print("Lon range:", (min(longitudes), max(longitudes)))

# Bristol should be approx:
# Latitude ~ 51.4 – 51.5
# Longitude ~ -2.7 – -2.5

# -------------------------------------------------
# 3) Load sensor + cluster data
# -------------------------------------------------
MASTER_PATH = "../data/mdm2_data_files/big_table_with_weather_and_rain_with_geo_clusters_v2.csv"
big = pd.read_csv(MASTER_PATH)

sensor_cluster = (
    big[["sensor_id", "longitude", "latitude", "cluster_geo"]]
    .dropna()
    .drop_duplicates("sensor_id")
)

# --- Approx meters from lat/lon (good enough at Bristol latitudes) ---
# 1 degree latitude ~ 111,320 m
# 1 degree longitude ~ 111,320 * cos(latitude) m
def lonlat_to_xy_m(lon, lat, lon0=None, lat0=None):
    if lon0 is None: lon0 = np.mean(lon)
    if lat0 is None: lat0 = np.mean(lat)
    lat_rad = np.deg2rad(lat0)
    x = (lon - lon0) * 111320.0 * np.cos(lat_rad)
    y = (lat - lat0) * 111320.0
    return x, y

# Convert sensors to local x,y meters
sx, sy = lonlat_to_xy_m(sensor_cluster["longitude"].values, sensor_cluster["latitude"].values)
sensor_xy = np.column_stack([sx, sy])

# Convert CCTV to same local x,y meters
cx, cy = lonlat_to_xy_m(cctv["longitude"].values, cctv["latitude"].values,
                        lon0=np.mean(sensor_cluster["longitude"].values),
                        lat0=np.mean(sensor_cluster["latitude"].values))
cctv_xy = np.column_stack([cx, cy])

# KDTree on CCTV points
from scipy.spatial import cKDTree
tree_cctv = cKDTree(cctv_xy)

# Count cameras within radius r for each sensor
r = 300  # meters (change to 200/500 as you like)
counts = tree_cctv.query_ball_point(sensor_xy, r=r)
sensor_cluster["cctv_300m"] = [len(lst) for lst in counts]

print(sensor_cluster[["sensor_id", "cluster_geo", "cctv_300m"]].head())

# Save sensor-level variable
sensor_cluster.to_csv("../reports/sensor_cctv_within_300m.csv", index=False)

# Merge into your big modelling dataset (hourly table)
big = pd.read_csv("../data/mdm2_data_files/big_table_with_weather_and_rain_with_geo_clusters_v2.csv")
big = big.merge(sensor_cluster[["sensor_id", "cctv_300m"]], on="sensor_id", how="left")
big["cctv_300m"] = big["cctv_300m"].fillna(0)

OUT = "../data/mdm2_data_files/big_table_with_weather_rain_clusters_cctv300m.csv"
big.to_csv(OUT, index=False)

print("Saved:", OUT)



# -------------------------------------------------
# 4) Assign CCTV → nearest sensor
# -------------------------------------------------
sensor_coords = sensor_cluster[["longitude", "latitude"]].values
tree = cKDTree(sensor_coords)

cctv_coords = cctv[["longitude", "latitude"]].values
dist, idx = tree.query(cctv_coords, k=1)

cctv["nearest_sensor_id"] = sensor_cluster.iloc[idx]["sensor_id"].values
cctv["cluster_geo"] = sensor_cluster.iloc[idx]["cluster_geo"].values

# -------------------------------------------------
# 5) Cluster summary
# -------------------------------------------------
summary = (
    cctv.groupby("cluster_geo")
    .size()
    .reset_index(name="cctv_count")
)

sensor_counts = (
    sensor_cluster.groupby("cluster_geo")["sensor_id"]
    .nunique()
    .reset_index(name="n_sensors")
)

summary = summary.merge(sensor_counts, on="cluster_geo", how="left")
summary["cctv_per_sensor"] = summary["cctv_count"] / summary["n_sensors"]

print("\n=== CCTV SUMMARY BY CLUSTER ===")
print(summary)

# -------------------------------------------------
# 6) Save outputs
# -------------------------------------------------
summary.to_csv("../reports/cctv_by_cluster_summary.csv", index=False)
cctv.to_csv("../reports/cctv_with_assigned_cluster.csv", index=False)

print("Saved to reports/")

  from pandas.core.computation.check import NUMEXPR_INSTALLED


CCTV rows: 1498
Lat range: (51.4036847765172, 51.60864811193833)
Lon range: (-2.6992860338855915, -2.390290003242058)
       sensor_id  cluster_geo  cctv_300m
0              1            2         52
8248           2            2         53
16496          4            2        184
24745          5            2         49
32992          6            2         49
Saved: ../data/mdm2_data_files/big_table_with_weather_rain_clusters_cctv300m.csv

=== CCTV SUMMARY BY CLUSTER ===
   cluster_geo  cctv_count  n_sensors  cctv_per_sensor
0            0         416         24        17.333333
1            1         234          6        39.000000
2            2         848         16        53.000000
Saved to reports/


In [2]:
df = pd.read_csv("../data/mdm2_data_files/big_table_with_weather_rain_clusters_cctv300m.csv")

summary = (
    df.groupby("cluster_geo")["cctv_300m"]
    .agg(["mean", "median", "min", "max"])
)

print(summary)

                  mean  median  min  max
cluster_geo                             
0            40.373587    25.0    0  121
1             2.999838     1.0    0    7
2            49.877288    49.0    0  184
