In [1]:
import sys
sys.path.append(".")

%load_ext autoreload
%autoreload 2

In [2]:
import math
import os
from pathlib import Path

import h3
import numpy as np
import pandas as pd

# Project root. Defaults to the original checkout location; set the GISRUK_ROOT
# environment variable to run the notebook from another machine or directory.
ROOT = Path(os.environ.get("GISRUK_ROOT", "/Users/pang/Codes/GISRUK")).expanduser()
OUT_DATA = ROOT / "outputs" / "data"
OUT_TAB  = ROOT / "outputs" / "tables"
OUT_MODELS = ROOT / "outputs" / "models"

# Both directories are written to further down (parquet into OUT_DATA, CSV into
# OUT_TAB), so make sure each exists before any cell tries to save.
for _out_dir in (OUT_DATA, OUT_TAB):
    _out_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# --- run configuration ---------------------------------------------------
HORIZON = "1W"   # "1W" / "1M" / "3M"
MODEL_NAME = "Hybrid"  # only used in filenames

# NOTE(review): the input filename hard-codes "hybrid" while the output written
# later is derived from MODEL_NAME -- keep the two in sync if MODEL_NAME changes.
path_pred = OUT_DATA / f"uk_{HORIZON.lower()}_pred_hybrid.parquet"
df = pd.read_parquet(path_pred)

# --- standardize schema --------------------------------------------------
# Accept the legacy "userid" spelling, but never clobber an existing "user_id".
if "userid" in df.columns and "user_id" not in df.columns:
    df = df.rename(columns={"userid": "user_id"})

# Identifier columns are handled as plain strings everywhere downstream.
df["user_id"] = df["user_id"].astype(str)
df["hex_id"] = df["hex_id"].astype(str)

df["start_time"] = pd.to_datetime(df["start_time"])
df["end_time"] = pd.to_datetime(df["end_time"])

# Derive the stay length in minutes when the file does not carry it, then
# coerce to numeric; unparseable or missing durations become 0.0.
if "duration_min" not in df.columns:
    df["duration_min"] = (df["end_time"] - df["start_time"]).dt.total_seconds() / 60.0
df["duration_min"] = pd.to_numeric(df["duration_min"], errors="coerce").fillna(0.0)

# The working label is the model prediction when present, else the existing label.
label_source = "y_pred" if "y_pred" in df.columns else "label"
df["label"] = df[label_source].astype(str)

print("Loaded:", path_pred.name, "rows:", len(df), "users:", df["user_id"].nunique())
df.head()

Loaded: uk_1w_pred_hybrid.parquet rows: 5946 users: 231


Unnamed: 0,user_id,start_time,end_time,activity_duration,hex_id,lat,lon,duration_min,y_pred,label
0,01A0C941FBA63C0D2B54BB78C37F1E08,2021-10-12 08:52:56,2021-10-12 17:09:26,29790,8a194ad30187fff,51.515712,-0.081365,496.5,STUDY,STUDY
1,01A0C941FBA63C0D2B54BB78C37F1E08,2021-10-12 17:48:22,2021-10-12 20:35:30,10028,8a194ad3018ffff,51.514437,-0.080267,167.133333,STUDY,STUDY
2,01A0C941FBA63C0D2B54BB78C37F1E08,2021-10-12 22:05:16,2021-10-13 07:00:03,32087,8a194ac0c49ffff,51.290604,-0.105064,534.783333,HOME,HOME
3,01A0C941FBA63C0D2B54BB78C37F1E08,2021-10-13 07:41:23,2021-10-13 12:03:24,15721,8a194ac0c49ffff,51.290604,-0.105064,262.016667,HOME,HOME
4,01A0C941FBA63C0D2B54BB78C37F1E08,2021-10-13 12:55:26,2021-10-13 13:14:29,1143,8a194ac057a7fff,51.323346,-0.107137,19.05,HEALTH,HEALTH


In [4]:
def cell_to_latlon(cell):
    """Return the centre of an H3 cell as a ``(lat, lon)`` tuple of Python floats.

    Works with either h3 API flavour: ``h3.cell_to_latlng`` is used when the
    installed package exposes it, otherwise ``h3.h3_to_geo``.
    """
    resolver = h3.cell_to_latlng if hasattr(h3, "cell_to_latlng") else h3.h3_to_geo
    latitude, longitude = resolver(cell)
    return float(latitude), float(longitude)

def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points given in degrees.

    Uses the haversine formula on a sphere of radius 6371 km.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude / longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude / longitude of the second point, in degrees.

    Returns
    -------
    float
        Distance in km. NaN inputs propagate to a NaN result.
    """
    R = 6371.0
    p1, p2 = math.radians(lat1), math.radians(lat2)
    half_dphi = math.radians(lat2 - lat1) / 2.0
    half_dlmb = math.radians(lon2 - lon1) / 2.0
    a = math.sin(half_dphi) ** 2 + math.cos(p1) * math.cos(p2) * math.sin(half_dlmb) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt/asin raise a domain error.
    # Comparisons (rather than min/max) are used so that NaN still propagates.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0
    return 2 * R * math.asin(math.sqrt(a))

# Cache one (lat, lon) centroid per distinct hex that appears in this horizon's
# stays, so the distance helpers can look coordinates up instead of calling h3.
centroids = {
    hex_id: cell_to_latlon(hex_id)
    for hex_id in df["hex_id"].drop_duplicates()
}

In [5]:
def time_midpoint_flags(df):
    """Flag each stay as night-time and/or weekday work-hours, judged at its midpoint.

    The midpoint is ``start_time + duration_min / 2``. Both the hour of day and
    the day of week are taken from that midpoint, so a stay that crosses
    midnight is classified by the day its midpoint actually falls on (the
    weekday used to come from ``start_time``, which disagreed with the hour
    for long stays spanning a weekend boundary).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a datetime ``start_time`` column and a numeric ``duration_min`` column.

    Returns
    -------
    (is_night, is_workhour) : tuple of boolean pandas.Series, indexed like ``df``
        ``is_night``    -- midpoint hour in [20, 24) or [0, 6).
        ``is_workhour`` -- midpoint on Monday-Friday with hour in [9, 17).
    """
    mid = df["start_time"] + pd.to_timedelta(df["duration_min"] / 2, unit="m")
    # fractional hour of day, e.g. 13.5 for 13:30 (seconds are ignored)
    hh = mid.dt.hour + mid.dt.minute / 60.0
    is_night = (hh >= 20) | (hh < 6)
    # weekday: Monday=0 ... Sunday=6, evaluated at the midpoint like `hh`
    is_weekday = mid.dt.weekday < 5
    is_workhour = is_weekday & (hh >= 9) & (hh < 17)
    return is_night, is_workhour

def connected_components(hexes, dist_km=1.0):
    """Group hexes into connected components of a centroid-distance graph.

    Two hexes are linked when the haversine distance between their centroids
    (looked up in the module-level ``centroids`` dict) is at most ``dist_km``.
    Hexes missing from ``centroids`` (or with a non-finite latitude) get no
    links and therefore end up as singleton components.

    Returns a list of sets, in order of first appearance in ``hexes``.
    """
    nodes = list(hexes)
    neighbours = {node: set() for node in nodes}

    # Build the undirected adjacency by testing every unordered pair once.
    for idx, first in enumerate(nodes):
        lat_a, lon_a = centroids.get(first, (np.nan, np.nan))
        if not np.isfinite(lat_a):
            continue
        for second in nodes[idx + 1:]:
            lat_b, lon_b = centroids.get(second, (np.nan, np.nan))
            if np.isfinite(lat_b) and haversine_km(lat_a, lon_a, lat_b, lon_b) <= dist_km:
                neighbours[first].add(second)
                neighbours[second].add(first)

    # Iterative depth-first traversal from each not-yet-visited node.
    visited = set()
    components = []
    for start in nodes:
        if start in visited:
            continue
        component = set()
        frontier = [start]
        while frontier:
            current = frontier.pop()
            if current not in visited:
                visited.add(current)
                component.add(current)
                frontier.extend(neighbours[current] - visited)
        components.append(component)
    return components

def pick_primary_cluster(gu, dist_km=1.0, k_candidates=10):
    """Pick a user's primary anchor cluster and its representative hex.

    Parameters
    ----------
    gu : pandas.DataFrame
        One user's candidates with columns ``hex_id`` and ``dwell``
        (dwell already aggregated per hex).
    dist_km : float
        Linking distance passed to ``connected_components``.
    k_candidates : int
        Only the ``k_candidates`` hexes with the largest dwell are considered.

    Returns
    -------
    (cluster, rep) : (set of str, str or None)
        ``cluster`` is the connected component with the largest total dwell and
        ``rep`` the hex with the largest dwell inside it; ``(set(), None)``
        when there are no candidates.
    """
    top = gu.sort_values("dwell", ascending=False).head(k_candidates)
    hexes = top["hex_id"].astype(str).tolist()
    # Key the dwell lookup by the same str-cast ids used for clustering;
    # keying by the raw column silently gave every hex weight 0 for non-str ids.
    w = dict(zip(hexes, top["dwell"].tolist()))

    comps = connected_components(hexes, dist_km=dist_km)
    if not comps:
        return set(), None

    # max() keeps the first of equally heavy components, i.e. the one that
    # contains the earlier (higher-dwell) candidate.
    primary = max(comps, key=lambda c: sum(w.get(h, 0.0) for h in c))

    # Representative hex: max dwell within primary. Walk the candidates in
    # dwell order instead of iterating the set, so ties do not depend on set
    # iteration order.
    rep = max((h for h in hexes if h in primary),
              key=lambda h: w.get(h, 0.0), default=None)
    return set(primary), rep

def kring_expand(hex_set, k=1):
    """Return the union of the k-ring neighbourhoods of every hex in ``hex_set``.

    Uses ``h3.grid_disk`` when the installed h3 exposes it, otherwise
    ``h3.k_ring``. All returned cell ids are converted to ``str``.
    An empty (falsy) input yields an empty set.
    """
    if not hex_set:
        return set()
    disk = h3.grid_disk if hasattr(h3, "grid_disk") else h3.k_ring
    return {str(cell) for origin in hex_set for cell in disk(origin, k)}

def build_primary_anchors(df, dist_km_home=1.0, dist_km_work=1.0,
                          k_candidates=10, expand_k=1,
                          home_min_dwell=60, work_min_dwell=60):
    """Derive each user's primary HOME and WORK anchor hexes.

    For each anchor type the same pipeline runs on a copy of ``df``:
      1. keep stays with the anchor label inside its time window
         (HOME at night, WORK during weekday work hours, per ``time_midpoint_flags``);
      2. sum ``duration_min`` per (user, hex) and drop hexes below the minimum dwell;
      3. per user, select the primary cluster and representative hex with
         ``pick_primary_cluster``;
      4. widen the cluster with ``kring_expand(..., k=expand_k)`` for tolerance.

    Returns
    -------
    (home_set, work_set, primary_home_rep, primary_work_rep)
        Four dicts keyed by user_id: expanded home hex set, expanded work hex
        set, representative home hex, representative work hex. Users without
        any qualifying candidate are absent from the corresponding dicts.
    """
    stays = df.copy()
    stays["is_night"], stays["is_workhour"] = time_midpoint_flags(stays)

    def _anchors_for(label, window_col, min_dwell, dist_km):
        # candidate dwell per user-hex, restricted to the label and time window
        in_window = stays[(stays["label"] == label) & (stays[window_col])]
        dwell = (
            in_window
            .groupby(["user_id", "hex_id"], as_index=False)["duration_min"]
            .sum()
            .rename(columns={"duration_min": "dwell"})
        )
        dwell = dwell[dwell["dwell"] >= min_dwell].copy()

        clusters, reps = {}, {}
        for user, user_dwell in dwell.groupby("user_id", sort=False):
            clusters[user], reps[user] = pick_primary_cluster(
                user_dwell, dist_km=dist_km, k_candidates=k_candidates
            )

        # expand for tolerance
        expanded = {user: kring_expand(cluster, k=expand_k)
                    for user, cluster in clusters.items()}
        return expanded, reps

    home_set, primary_home_rep = _anchors_for("HOME", "is_night", home_min_dwell, dist_km_home)
    work_set, primary_work_rep = _anchors_for("WORK", "is_workhour", work_min_dwell, dist_km_work)

    return home_set, work_set, primary_home_rep, primary_work_rep

# Anchor-detection settings, collected in one place so they are easy to tune.
anchor_params = {
    "dist_km_home": 1.0,
    "dist_km_work": 1.0,
    "k_candidates": 10,
    "expand_k": 1,
    "home_min_dwell": 60,
    "work_min_dwell": 60,
}

home_set, work_set, home_rep, work_rep = build_primary_anchors(df, **anchor_params)

print("Users with primary home:", len(home_set), "primary work:", len(work_set))

Users with primary home: 231 primary work: 166


In [6]:
def od_normalize_labels(df, home_set, work_set, out_col="label_od",
                        home_rep=None, work_rep=None):
    """Demote HOME/WORK stays outside a user's primary anchor set to "OTHER".

    Parameters
    ----------
    df : pandas.DataFrame
        Stays with ``user_id``, ``hex_id`` and ``label`` columns. Not modified.
    home_set, work_set : dict
        user_id -> set of hex ids (as str) accepted as that user's primary
        home / work location.
    out_col : str
        Name of the column receiving the normalized label.
    home_rep, work_rep : dict, optional
        user_id -> representative home / work hex, attached as
        ``primary_home_hex`` / ``primary_work_hex``. When omitted, the
        module-level ``home_rep`` / ``work_rep`` are used if they exist (the
        behaviour this function originally relied on), otherwise the columns
        are left empty.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``out_col``, ``primary_home_hex`` and
        ``primary_work_hex`` added.
    """
    # Backward compatibility: earlier callers did not pass the representatives
    # and the function silently read them from notebook globals.
    if home_rep is None:
        home_rep = globals().get("home_rep", {})
    if work_rep is None:
        work_rep = globals().get("work_rep", {})

    d = df.copy()
    d[out_col] = d["label"].astype(str)

    users = d["user_id"].tolist()
    hexes = d["hex_id"].astype(str).tolist()
    labels = d[out_col].tolist()

    def _outside_anchor(anchor_label, anchors):
        # True where the row carries `anchor_label` but its hex is not in the
        # user's anchor set. A plain pass over the columns replaces row-wise
        # DataFrame.apply, which is slow and does not yield a boolean Series
        # for an empty frame.
        return np.array(
            [lab == anchor_label and hx not in anchors.get(u, ())
             for u, hx, lab in zip(users, hexes, labels)],
            dtype=bool,
        )

    # HOME / WORK not in the corresponding primary set -> OTHER
    demote = _outside_anchor("HOME", home_set) | _outside_anchor("WORK", work_set)
    if demote.any():
        d.loc[demote, out_col] = "OTHER"

    # attach primary reps (for OD extraction)
    d["primary_home_hex"] = d["user_id"].map(home_rep)
    d["primary_work_hex"] = d["user_id"].map(work_rep)

    return d

# Destination for the OD-normalized stays (horizon and model name in the filename).
out_parquet = OUT_DATA / f"uk_{HORIZON.lower()}_pred_{MODEL_NAME.lower()}_od.parquet"

# Normalize HOME/WORK labels against each user's primary anchors, then persist.
df_od = od_normalize_labels(df, home_set, work_set, out_col="label_od")
df_od.to_parquet(out_parquet, index=False)

print("Saved:", out_parquet)

Saved: /Users/pang/Codes/GISRUK/outputs/data/uk_1w_pred_hybrid_od.parquet


In [7]:
def multi_anchor_rate(df, label_col, anchor_label, time_mask, dwell_th=60):
    """Share of users that have two or more qualifying anchor hexes.

    A (user, hex) pair qualifies when the summed ``duration_min`` of the stays
    selected by ``time_mask`` and carrying ``anchor_label`` in ``label_col``
    reaches ``dwell_th`` minutes. Only users with at least one qualifying hex
    enter the denominator.

    Parameters
    ----------
    df : pandas.DataFrame
        Stays with ``user_id``, ``hex_id``, ``duration_min`` and ``label_col``.
    label_col : str
        Column holding the activity label.
    anchor_label : str
        Label to evaluate, e.g. "HOME" or "WORK".
    time_mask : boolean mask
        Row selector for ``df`` (aligned on ``df``'s index when a Series).
    dwell_th : float
        Minimum total dwell in minutes for a hex to count.

    Returns
    -------
    (rate, description) : (float, pandas.Series)
        ``rate`` is the fraction of users with >= 2 qualifying hexes (0.0 when
        no user qualifies); ``description`` is ``describe()`` of the per-user
        hex counts. A 2-tuple is returned on every path: the empty case used
        to return a bare ``0.0``, which broke callers unpacking two values.
    """
    selected = df[time_mask]
    selected = selected[selected[label_col] == anchor_label]

    dwell = selected.groupby(["user_id", "hex_id"])["duration_min"].sum()
    dwell = dwell[dwell >= dwell_th]

    # number of qualifying hexes per user
    cnt = dwell.groupby(level="user_id").size()
    rate = float((cnt >= 2).mean()) if len(cnt) else 0.0
    return rate, cnt.describe()

# The time-window masks are computed once on `df` and reused for `df_od`,
# which is built from a copy of `df` and therefore shares its row index.
is_night, is_workhour = time_midpoint_flags(df)

# raw labels
rate_home_raw, desc_home_raw = multi_anchor_rate(
    df, label_col="label", anchor_label="HOME", time_mask=is_night, dwell_th=60
)
rate_work_raw, desc_work_raw = multi_anchor_rate(
    df, label_col="label", anchor_label="WORK", time_mask=is_workhour, dwell_th=60
)

# OD-normalized labels
rate_home_od, desc_home_od = multi_anchor_rate(
    df_od, label_col="label_od", anchor_label="HOME", time_mask=is_night, dwell_th=60
)
rate_work_od, desc_work_od = multi_anchor_rate(
    df_od, label_col="label_od", anchor_label="WORK", time_mask=is_workhour, dwell_th=60
)

print("HOME multi-rate (night,>=60min) raw:", rate_home_raw, "| od-normalized:", rate_home_od)
print("WORK multi-rate (workhour,>=60min) raw:", rate_work_raw, "| od-normalized:", rate_work_od)

HOME multi-rate (night,>=60min) raw: 0.05627705627705628 | od-normalized: 0.008658008658008658
WORK multi-rate (workhour,>=60min) raw: 0.25903614457831325 | od-normalized: 0.13855421686746988


In [8]:
# Keep only stays of users that have both a primary home and a primary work hex.
anchor_cols = ["primary_home_hex", "primary_work_hex"]
od = df_od.dropna(subset=anchor_cols).copy()

# One row per user: the anchors are constant within a user, so "first" suffices.
od_u = (
    od.groupby("user_id", as_index=False)
      .agg(home_hex=("primary_home_hex", "first"),
           work_hex=("primary_work_hex", "first"))
)

# Label each user's commute as "<home hex> -> <work hex>" and count users per pair.
od_u["od_pair"] = od_u["home_hex"].astype(str).str.cat(od_u["work_hex"].astype(str), sep=" -> ")
od_pair = (
    od_u["od_pair"]
    .value_counts()
    .rename_axis("od_pair")
    .reset_index(name="n_users")
)

out_od = OUT_TAB / f"uk_{HORIZON.lower()}_commute_od_pairs.csv"
od_pair.to_csv(out_od, index=False)
print("Saved:", out_od)
od_pair.head(10)

Saved: /Users/pang/Codes/GISRUK/outputs/tables/uk_1w_commute_od_pairs.csv


Unnamed: 0,od_pair,n_users
0,8a194ad139a7fff -> 8a194ad0858ffff,1
1,8a195da4e457fff -> 8a195da4e8cffff,1
2,8a194ad044affff -> 8a194ad009b7fff,1
3,8a194e6a3317fff -> 8a194e682ccffff,1
4,8a195dad8b57fff -> 8a195dad8b57fff,1
5,8a195da680effff -> 8a194ad304e7fff,1
6,8a194e6e3b57fff -> 8a194e651a97fff,1
7,8a194ad2c1b7fff -> 8a194ad23b07fff,1
8,8a194e7996d7fff -> 8a194e6a64cffff,1
9,8a194ad1856ffff -> 8a195da49d5ffff,1


In [9]:
def max_pairwise_km(hex_set):
    """Largest haversine distance (km) between any two hexes of ``hex_set``.

    Centroids come from the module-level ``centroids`` dict. Hexes that are
    missing from it (or have a non-finite latitude) are skipped, so only hexes
    with a cached centroid contribute. Returns 0.0 when fewer than two usable
    hexes remain.
    """
    # Resolve the usable coordinates once, preserving iteration order.
    coords = []
    for hex_id in hex_set:
        lat, lon = centroids.get(hex_id, (np.nan, np.nan))
        if np.isfinite(lat):
            coords.append((lat, lon))

    widest = 0.0
    for idx, (lat_a, lon_a) in enumerate(coords):
        for lat_b, lon_b in coords[idx + 1:]:
            dist = haversine_km(lat_a, lon_a, lat_b, lon_b)
            if dist > widest:
                widest = dist
    return widest

# Per-user spread (km) of the home set, summarised across users.
# Only hexes with a cached centroid contribute to each user's value.
mx = [max_pairwise_km(user_hexes) for user_hexes in home_set.values()]
pd.Series(mx).describe()

count    231.000000
mean       0.025907
std        0.059261
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        0.390666
dtype: float64