In [1]:
# Extend the import path for project-local packages and enable live reloading of edited modules.
import sys
# NOTE(review): "..." is not a parent-directory reference (that would be ".."); it looks like a
# placeholder for the project root that contains `src/` — confirm before a fresh-kernel run.
sys.path.append("...")
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.config import ACTIVITIES, ABBR, TZ_PARIS, TZ_LONDON
from src.viz_style import apply_nature_style

apply_nature_style()

In [3]:
from src.utils_time import to_local_time_series, split_cross_midnight, week_start_monday
from src.utils_split import split_users_by_hash
from src.regularity import regularity_report, summarize_reg, compute_user_hex_stats, infer_home_work_anchors, make_hex_lookup

In [5]:
from pathlib import Path

# Project root relative to the notebook's working directory; derived tables are written
# below outputs/data, which is created on first use.
ROOT = Path("..")
OUT_DATA = ROOT.joinpath("outputs", "data")
OUT_DATA.mkdir(parents=True, exist_ok=True)

# Raw per-hexagon POI count tables, one per study area.
paris_poi_raw = pd.read_parquet(ROOT / "assets/pois/paris/fr_hex_poi_res10.parquet")
uk_poi_raw = pd.read_parquet(ROOT / "assets/pois/uk/uk_hex_poi_h10.parquet")

# POI category count columns shared by both tables (order matters: lookup arrays follow it).
POI_COLS = [
    "poi_edu_cnt",
    "poi_health_cnt",
    "poi_retail_cnt",
    "poi_leisure_cnt",
    "poi_transport_cnt",
    "poi_accom_cnt",
    "poi_office_cnt",
]

def standardize_poi(df):
    """Return a cleaned copy of a per-hexagon POI table.

    The input frame is not modified. In the returned copy:

    * ``hex_id`` is cast to ``str``;
    * every column named in ``POI_COLS`` exists and is ``float``; values that
      cannot be parsed as numbers, and missing values, become ``0``;
    * ``poi_total_cnt`` holds the row-wise sum of the ``POI_COLS`` columns.
    """
    cleaned = df.copy()
    cleaned["hex_id"] = cleaned["hex_id"].astype(str)

    present = set(cleaned.columns)
    for col in POI_COLS:
        if col in present:
            numeric = pd.to_numeric(cleaned[col], errors="coerce")
            cleaned[col] = numeric.fillna(0).astype(float)
        else:
            # Category absent from this table: treat it as zero everywhere.
            cleaned[col] = 0.0

    cleaned["poi_total_cnt"] = cleaned[POI_COLS].sum(axis=1)
    return cleaned

# Replace both raw POI tables with their standardised versions (same names are used downstream).
paris_poi_raw, uk_poi_raw = map(standardize_poi, (paris_poi_raw, uk_poi_raw))

print("Paris POI rows:", len(paris_poi_raw))
print("UK POI rows:", len(uk_poi_raw))

Paris POI rows: 77235
UK POI rows: 1587402


In [6]:
def prep_stays(path):
    """Load a stays parquet file and normalise the columns used in this notebook.

    Steps:

    * a ``userid`` column is renamed to ``user_id`` when ``user_id`` is absent;
    * ``user_id`` is cast to ``str``;
    * ``start_time`` / ``end_time`` are parsed as datetimes (unparseable -> ``NaT``);
    * ``duration_min`` is derived from the two timestamps when the column is missing;
    * ``hex_id`` is cast to ``str``, with missing ids kept as missing;
    * rows lacking ``hex_id``, ``start_time`` or ``end_time`` are dropped.

    Parameters
    ----------
    path : str or path-like
        Location of the parquet file, passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        A copy containing only rows with a hexagon id and both timestamps.

    Raises
    ------
    KeyError
        If a required column (``user_id``/``userid``, ``start_time``,
        ``end_time``, ``hex_id``) is absent from the file.
    """
    d = pd.read_parquet(path)
    if "user_id" not in d.columns and "userid" in d.columns:
        d = d.rename(columns={"userid": "user_id"})
    d["user_id"] = d["user_id"].astype(str)
    d["start_time"] = pd.to_datetime(d["start_time"], errors="coerce")
    d["end_time"] = pd.to_datetime(d["end_time"], errors="coerce")
    if "duration_min" not in d.columns:
        d["duration_min"] = (d["end_time"] - d["start_time"]).dt.total_seconds() / 60.0

    # Detect missing ids BEFORE the string cast: astype(str) can turn None / pd.NA into the
    # literal strings "None" / "<NA>", which would then survive the dropna below.
    hex_raw = d["hex_id"]
    hex_str = hex_raw.astype(str)
    missing_hex = hex_raw.isna() | hex_str.isin(["", "nan"])
    d["hex_id"] = hex_str.mask(missing_hex)

    d = d.dropna(subset=["hex_id", "start_time", "end_time"]).copy()
    return d

paris_train = prep_stays(OUT_DATA / "paris_stays_train.parquet")
paris_valid = prep_stays(OUT_DATA / "paris_stays_valid.parquet")

# UK 3M file from notebook 02 (choose latest).
# "Latest" is taken as most recently modified: a lexicographic sort of the file names would
# rank e.g. "..._u231" after "..._u1000", so it cannot be used to find the newest export.
uk_3m_candidates = list(OUT_DATA.glob("uk_london_stays_3m_u*.parquet"))
if not uk_3m_candidates:
    raise FileNotFoundError(
        f"No uk_london_stays_3m_u*.parquet file found in {OUT_DATA}; run notebook 02 first."
    )
# The file name is a tie-breaker so the choice is deterministic when timestamps are equal.
uk_3m_path = max(uk_3m_candidates, key=lambda p: (p.stat().st_mtime, p.name))
uk_3m = prep_stays(uk_3m_path)

# Distinct stay hexagons per study area; these are the centres of the ring aggregation.
centers_paris = sorted(set(paris_train["hex_id"]) | set(paris_valid["hex_id"]))
centers_uk = sorted(set(uk_3m["hex_id"]))

print("Centers Paris:", len(centers_paris))
print("Centers UK:", len(centers_uk))

Centers Paris: 29628
Centers UK: 9392


In [7]:
import h3
import math
from collections import defaultdict

# Defaults for the ring-weighted POI aggregation; both values are also embedded in the output file names.
K_RING = 4      # number of H3 rings around each centre (author's note: ~500m buffer approximation at H3 res10)
BETA = 1.5      # distance-decay exponent: ring r is weighted by 1 / (r + 1) ** BETA

def h3_grid_ring(cell, r):
    """Return the cells at grid distance exactly ``r`` from ``cell``.

    Uses ``h3.grid_ring`` when the installed ``h3`` package provides it.
    Otherwise the ring is rebuilt (more slowly) by taking the filled disk of
    radius ``r`` (``grid_disk`` or, failing that, ``k_ring``) and keeping only
    the cells whose distance to ``cell`` (``grid_distance`` or ``h3_distance``)
    equals ``r``. The fallback always returns a list.
    """
    if hasattr(h3, "grid_ring"):
        return h3.grid_ring(cell, r)

    # No direct ring function: fall back to whichever disk/distance API is available.
    expand = h3.grid_disk if hasattr(h3, "grid_disk") else h3.k_ring
    disk = expand(cell, r)
    measure = h3.grid_distance if hasattr(h3, "grid_distance") else h3.h3_distance

    ring = []
    for candidate in disk:
        if measure(cell, candidate) == r:
            ring.append(candidate)
    return ring

def build_lookup_counts(poi_df):
    """Map each ``hex_id`` in ``poi_df`` to a float array of its POI counts.

    The array entries follow the order of ``POI_COLS``. When a ``hex_id``
    occurs more than once, the last row wins.
    """
    counts = poi_df[POI_COLS].to_numpy(dtype=float)
    lookup = {}
    for hex_id, row in zip(poi_df["hex_id"], counts):
        # Copy so every entry owns its data instead of viewing the shared matrix.
        lookup[hex_id] = row.copy()
    return lookup

def huff_agg_for_centers(centers, lookup, k=4, beta=1.5, progress_every=5000):
    """Distance-weighted sum of POI counts over the H3 rings around each centre.

    For every centre the counts of the centre cell (ring 0) and of every cell in
    rings ``1..k`` are added up, each multiplied by ``1 / (r + 1) ** beta`` where
    ``r`` is the ring index. Cells missing from ``lookup`` contribute nothing.

    Parameters
    ----------
    centers : sequence
        Centre cell ids; looked up in ``lookup`` as given and passed to
        ``h3_grid_ring``.
    lookup : dict
        Cell id -> array of counts ordered like ``POI_COLS``. Neighbour ids are
        converted with ``str`` before the lookup.
    k : int
        Outermost ring included.
    beta : float
        Decay exponent of the ring weights.
    progress_every : int
        Print a progress line every this many centres; a falsy value disables it.

    Returns
    -------
    pandas.DataFrame
        One row per centre: ``hex_id``, the ``POI_COLS`` columns and
        ``poi_total_cnt`` (their row-wise sum).
    """
    n_centers = len(centers)
    ring_weights = [1.0 / ((ring + 1.0) ** beta) for ring in range(k + 1)]
    agg = np.zeros((n_centers, len(POI_COLS)), dtype=float)

    for row, center in enumerate(centers):
        acc = agg[row]  # view: in-place additions land directly in the result matrix

        # ring 0: the centre cell itself
        own = lookup.get(center)
        if own is not None:
            acc += ring_weights[0] * own

        # rings 1..k: surrounding cells, one ring at a time
        for ring in range(1, k + 1):
            ring_w = ring_weights[ring]
            for neighbour in h3_grid_ring(center, ring):
                counts = lookup.get(str(neighbour))
                if counts is None:
                    continue
                acc += ring_w * counts

        done = row + 1
        if progress_every and done % progress_every == 0:
            print(f"processed {done:,}/{n_centers:,}")

    result = pd.DataFrame(agg, columns=list(POI_COLS))
    result.insert(0, "hex_id", centers)
    result["poi_total_cnt"] = result[POI_COLS].sum(axis=1)
    return result

In [8]:
# Aggregate the Paris POI counts over rings 0..K_RING around every Paris stay hexagon.
paris_lookup = build_lookup_counts(paris_poi_raw)
paris_poi_huff = huff_agg_for_centers(centers_paris, paris_lookup, k=K_RING, beta=BETA, progress_every=10000)

# Persist the table; the file name records the ring count and decay exponent used.
paris_out = OUT_DATA / f"paris_poi_huff_k{K_RING}_b{BETA}.parquet"
paris_poi_huff.to_parquet(paris_out, index=False)
print("Saved:", paris_out)
# Trailing expression: the notebook renders the first rows as the cell output.
paris_poi_huff.head()

processed 10,000/29,628
processed 20,000/29,628
Saved: /Users/pang/Codes/GISRUK/outputs/data/paris_poi_huff_k4_b1.5.parquet


Unnamed: 0,hex_id,poi_edu_cnt,poi_health_cnt,poi_retail_cnt,poi_leisure_cnt,poi_transport_cnt,poi_accom_cnt,poi_office_cnt,poi_total_cnt
0,8a186132365ffff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8a18613344a7fff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8a1861a0568ffff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8a1861b09027fff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8a1861b5d6dffff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Aggregate the UK POI counts over rings 0..K_RING around every stay hexagon of the UK 3M file.
uk_lookup = build_lookup_counts(uk_poi_raw)
uk_poi_huff = huff_agg_for_centers(centers_uk, uk_lookup, k=K_RING, beta=BETA, progress_every=10000)

# Persist the table; the file name records ring count, decay exponent and the number of
# distinct users in uk_3m.
uk_out = OUT_DATA / f"uk_poi_huff_k{K_RING}_b{BETA}_u{uk_3m['user_id'].nunique()}.parquet"
uk_poi_huff.to_parquet(uk_out, index=False)
print("Saved:", uk_out)
# Trailing expression: the notebook renders the first rows as the cell output.
uk_poi_huff.head()

Saved: /Users/pang/Codes/GISRUK/outputs/data/uk_poi_huff_k4_b1.5_u231.parquet


Unnamed: 0,hex_id,poi_edu_cnt,poi_health_cnt,poi_retail_cnt,poi_leisure_cnt,poi_transport_cnt,poi_accom_cnt,poi_office_cnt,poi_total_cnt
0,8a1870cb699ffff,0.0,0.0,0.0,1.635446,1.68145,1.0,1.0,5.316896
1,8a1874158477fff,0.0,0.0,2.0,0.089443,1.906893,0.0,1.25,5.246336
2,8a18747185a7fff,1.442996,1.332114,1.068549,2.160556,3.662028,1.507328,4.172281,15.345852
3,8a187471a947fff,0.464443,2.8728,2.362353,2.335217,3.292884,1.574232,11.247221,24.149149
4,8a187471a94ffff,0.621336,1.766136,1.484243,1.587796,2.789774,0.925221,6.78375,15.958255


In [10]:
# "Semantic" POI categories: the education, health, retail and leisure count columns, summed
# into poi_sem_cnt by semantic_zero_breakdown. The transport, accommodation and office
# columns of POI_COLS are not part of this subset.
SEM_COLS = ["poi_edu_cnt","poi_health_cnt","poi_retail_cnt","poi_leisure_cnt"]

def attach_poi(stays, poi_df):
    """Left-join POI counts onto stays by ``hex_id``.

    Every stay is kept. Stays whose hexagon has no row in ``poi_df`` receive
    ``0`` in each ``POI_COLS`` column and in ``poi_total_cnt``. Missing values
    in the stays' own columns are left as they are; only the POI count columns
    are zero-filled.

    Parameters
    ----------
    stays : pandas.DataFrame
        Stay records with a ``hex_id`` column.
    poi_df : pandas.DataFrame
        POI table with ``hex_id``, the ``POI_COLS`` columns and ``poi_total_cnt``.

    Returns
    -------
    pandas.DataFrame
        A new frame: the stays' columns followed by the POI count columns.
    """
    poi_value_cols = POI_COLS + ["poi_total_cnt"]
    merged = stays.merge(poi_df[["hex_id"] + poi_value_cols], on="hex_id", how="left")
    # Per-column fill: a blanket fillna(0) would also overwrite genuine gaps in stay attributes.
    return merged.fillna({col: 0 for col in poi_value_cols})

def semantic_zero_breakdown(stays_p):
    """Share of stays with no semantic POIs, split by whether any POI is present.

    A stay counts as "semantic zero" when its ``SEM_COLS`` columns sum to 0 and
    as "total zero" when ``poi_total_cnt`` is 0. The input frame is not modified.

    Returns a dict of plain floats:

    * ``rate_sem0``: semantic-zero stays;
    * ``rate_total0``: total-zero stays;
    * ``rate_sem0_totalpos``: semantic zero although some POI is present;
    * ``rate_sem0_total0``: semantic zero and no POI at all.
    """
    no_sem = stays_p[SEM_COLS].sum(axis=1) == 0
    no_poi = stays_p["poi_total_cnt"] == 0

    masks = {
        "rate_sem0": no_sem,
        "rate_total0": no_poi,
        "rate_sem0_totalpos": no_sem & (~no_poi),
        "rate_sem0_total0": no_sem & no_poi,
    }
    return {key: float(mask.mean()) for key, mask in masks.items()}

# Paris valid: raw vs huff
paris_valid_raw = attach_poi(paris_valid, paris_poi_raw)
paris_valid_huf = attach_poi(paris_valid, paris_poi_huff)

print("Paris VALID raw:", semantic_zero_breakdown(paris_valid_raw))
print("Paris VALID huff:", semantic_zero_breakdown(paris_valid_huf))

# UK 1W: load the most recently modified 1-week export. A lexicographic sort of the file
# names would rank e.g. "..._u231" after "..._u1000", so it cannot identify the newest file.
uk_1w_candidates = list(OUT_DATA.glob("uk_london_stays_1w_u*.parquet"))
if not uk_1w_candidates:
    raise FileNotFoundError(f"No uk_london_stays_1w_u*.parquet file found in {OUT_DATA}")
# The file name is a tie-breaker so the choice is deterministic when timestamps are equal.
uk_1w_path = max(uk_1w_candidates, key=lambda p: (p.stat().st_mtime, p.name))
uk_1w = prep_stays(uk_1w_path)

# uk_poi_huff only has rows for the hexagons of the 3M file; 1-week stays in any other
# hexagon come out of attach_poi with zero counts.
uk_1w_raw = attach_poi(uk_1w, uk_poi_raw)
uk_1w_huf = attach_poi(uk_1w, uk_poi_huff)

print("UK 1W raw:", semantic_zero_breakdown(uk_1w_raw))
print("UK 1W huff:", semantic_zero_breakdown(uk_1w_huf))

Paris VALID raw: {'rate_sem0': 0.36994452038272896, 'rate_total0': 0.27747849159765214, 'rate_sem0_totalpos': 0.09246602878507679, 'rate_sem0_total0': 0.27747849159765214}
Paris VALID huff: {'rate_sem0': 0.013508080726863391, 'rate_total0': 0.01117632869663102, 'rate_sem0_totalpos': 0.002331752030232371, 'rate_sem0_total0': 0.01117632869663102}
UK 1W raw: {'rate_sem0': 0.5144635048772284, 'rate_total0': 0.31113353514968045, 'rate_sem0_totalpos': 0.20332996972754794, 'rate_sem0_total0': 0.31113353514968045}
UK 1W huff: {'rate_sem0': 0.003699966363942146, 'rate_total0': 0.0003363605785401951, 'rate_sem0_totalpos': 0.003363605785401951, 'rate_sem0_total0': 0.0003363605785401951}


In [12]:
def ratio_stats(stays_p):
    """Describe the per-stay share of each semantic POI category.

    For every stay, each of the retail, leisure, education and health counts is
    divided by ``(sum of those four counts) + 1``. The ``+ 1`` keeps the ratio
    defined (and equal to 0) for stays with no semantic POIs. The input frame
    is not modified.

    Returns the ``describe()`` table of the four ratio columns with the 50th,
    90th, 95th and 99th percentiles.
    """
    sem = (
        stays_p["poi_edu_cnt"]
        + stays_p["poi_health_cnt"]
        + stays_p["poi_retail_cnt"]
        + stays_p["poi_leisure_cnt"]
    )
    denom = sem + 1.0

    # Output column -> source count column, in the order the summary is reported.
    sources = {
        "retail_ratio": "poi_retail_cnt",
        "leisure_ratio": "poi_leisure_cnt",
        "edu_ratio": "poi_edu_cnt",
        "health_ratio": "poi_health_cnt",
    }
    ratios = pd.DataFrame({out_col: stays_p[src] / denom for out_col, src in sources.items()})
    return ratios.describe(percentiles=[0.5, 0.9, 0.95, 0.99])

# Heading and summary table on consecutive lines, first for Paris, then for the UK.
print("Paris VALID huff ratio stats:", ratio_stats(paris_valid_huf), sep="\n")

print("\nUK 1W huff ratio stats:", ratio_stats(uk_1w_huf), sep="\n")

Paris VALID huff ratio stats:
       retail_ratio  leisure_ratio     edu_ratio  health_ratio
count  12437.000000   12437.000000  12437.000000  12437.000000
mean       0.311882       0.470395      0.078628      0.046784
std        0.200763       0.174699      0.069573      0.043217
min        0.000000       0.000000      0.000000      0.000000
50%        0.313018       0.458343      0.062762      0.040229
90%        0.572918       0.704179      0.163758      0.093432
95%        0.635998       0.771824      0.212558      0.120723
99%        0.772662       0.876334      0.326606      0.199530
max        0.916428       0.986403      0.642741      0.649801

UK 1W huff ratio stats:
       retail_ratio  leisure_ratio    edu_ratio  health_ratio
count   5946.000000    5946.000000  5946.000000   5946.000000
mean       0.278326       0.345350     0.103984      0.123687
std        0.156386       0.139782     0.094142      0.090281
min        0.000000       0.000000     0.000000      0.000000
50%  

In [11]:
def build_poi_lookup(df):
    """Map each ``hex_id`` (as ``str``) to its POI row.

    Each value is the named tuple produced by ``itertuples`` with the fields
    ``hex_id``, the ``POI_COLS`` columns and ``poi_total_cnt``, in that order.
    The input frame is not modified; when a ``hex_id`` repeats, the last row wins.
    """
    wanted = ["hex_id", *POI_COLS, "poi_total_cnt"]
    table = df[wanted].assign(hex_id=lambda t: t["hex_id"].astype(str))

    lookup = {}
    for row in table.itertuples(index=False):
        lookup[row.hex_id] = row
    return lookup

# Row lookups (hex_id -> named tuple of aggregated counts) for both study areas.
# NOTE(review): this cell carries execution count 11 but sits after the cell numbered 12;
# it only reads paris_poi_huff / uk_poi_huff, so the ordering looks harmless — confirm with
# Restart & Run All.
poi_lookup_paris_huff = build_poi_lookup(paris_poi_huff)
poi_lookup_uk_huff = build_poi_lookup(uk_poi_huff)

# One entry per aggregated hexagon is expected.
print("Lookup Paris huff:", len(poi_lookup_paris_huff))
print("Lookup UK huff:", len(poi_lookup_uk_huff))

Lookup Paris huff: 29628
Lookup UK huff: 9392
