In [1]:
import sys
sys.path.append("...")
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.config import ACTIVITIES, ABBR, TZ_PARIS, TZ_LONDON
from src.viz_style import apply_nature_style

# Apply the shared plotting style imported from src.viz_style
# (its implementation is not shown in this notebook).
apply_nature_style()

In [3]:
from src.utils_time import to_local_time_series, split_cross_midnight, week_start_monday
from src.utils_split import split_users_by_hash
from src.regularity import regularity_report, summarize_reg, compute_user_hex_stats, infer_home_work_anchors, make_hex_lookup

In [10]:
from pathlib import Path

# Project root; every input and output location below is derived from it.
ROOT = Path("...")
DATA_OUT = ROOT / "outputs" / "data"
TAB_OUT = ROOT / "outputs" / "tables"
# Create the tables directory (and any missing parents) if it does not exist yet.
TAB_OUT.mkdir(parents=True, exist_ok=True)

# Per-hexagon POI count tables for the two study areas.
paris_poi_path = ROOT / "assets/pois/paris/fr_hex_poi_res10.parquet"
uk_poi_path = ROOT / "assets/pois/uk/uk_hex_poi_h10.parquet"

paris_poi = pd.read_parquet(paris_poi_path)
uk_poi = pd.read_parquet(uk_poi_path)

# Sanity check: row counts and column names of both tables.
print("Paris POI rows:", len(paris_poi), "cols:", list(paris_poi.columns))
print("UK POI rows:", len(uk_poi), "cols:", list(uk_poi.columns))

Paris POI rows: 77235 cols: ['hex_id', 'poi_accom_cnt', 'poi_edu_cnt', 'poi_health_cnt', 'poi_leisure_cnt', 'poi_office_cnt', 'poi_retail_cnt', 'poi_transport_cnt', 'poi_total_cnt']
UK POI rows: 1587402 cols: ['hex_id', 'poi_accom_cnt', 'poi_edu_cnt', 'poi_health_cnt', 'poi_leisure_cnt', 'poi_office_cnt', 'poi_other_cnt', 'poi_retail_cnt', 'poi_transport_cnt', 'poi_total_cnt']


In [8]:
# POI category count columns that every POI table is harmonised to.
POI_COLS = [
    "poi_edu_cnt",
    "poi_health_cnt",
    "poi_retail_cnt",
    "poi_leisure_cnt",
    "poi_transport_cnt",
    "poi_accom_cnt",
    "poi_office_cnt",
]


def standardize_poi(df):
    """Return a copy of a POI table with a harmonised set of count columns.

    The copy has ``hex_id`` cast to string, every column in ``POI_COLS``
    present as float (columns that were absent are created as zeros, and
    non-numeric or missing entries become 0), and ``poi_total_cnt``
    recomputed as the row-wise sum of ``POI_COLS`` only. Columns outside
    ``POI_COLS`` are left untouched and do not contribute to the total.
    The input frame is not modified.
    """
    out = df.copy()
    out["hex_id"] = out["hex_id"].astype(str)

    # Create absent categories first so the conversion pass below is uniform.
    absent = [col for col in POI_COLS if col not in out.columns]
    for col in absent:
        out[col] = 0

    for col in POI_COLS:
        as_number = pd.to_numeric(out[col], errors="coerce")
        out[col] = as_number.fillna(0).astype(float)

    # Overwrite any pre-existing total so it matches the harmonised categories.
    out["poi_total_cnt"] = out[POI_COLS].sum(axis=1)
    return out

# Harmonise both POI tables; the names are rebound to the standardized copies.
paris_poi = standardize_poi(paris_poi)
uk_poi = standardize_poi(uk_poi)

# Distribution summary of the Paris POI counts (the last expression is displayed).
paris_poi[["poi_total_cnt"]+POI_COLS].describe(percentiles=[0.5,0.9,0.95,0.99])

Unnamed: 0,poi_total_cnt,poi_edu_cnt,poi_health_cnt,poi_retail_cnt,poi_leisure_cnt,poi_transport_cnt,poi_accom_cnt,poi_office_cnt
count,77235.0,77235.0,77235.0,77235.0,77235.0,77235.0,77235.0,77235.0
mean,4.513653,0.242015,0.127157,0.933903,2.059908,0.825183,0.078255,0.247232
std,6.234538,0.729008,0.507858,3.202304,3.477598,1.733872,0.900183,0.871455
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
90%,9.0,1.0,0.0,2.0,5.0,3.0,0.0,1.0
95%,15.0,2.0,1.0,5.0,7.0,4.0,0.0,2.0
99%,33.0,4.0,2.0,15.0,15.0,7.0,2.0,4.0
max,152.0,24.0,29.0,139.0,152.0,66.0,78.0,44.0


In [11]:
paris_train_path = DATA_OUT / "paris_stays_train.parquet"
paris_valid_path = DATA_OUT / "paris_stays_valid.parquet"

paris_train = pd.read_parquet(paris_train_path)
paris_valid = pd.read_parquet(paris_valid_path)


def _pick_cohort_file(data_dir, pattern):
    """Return the file in ``data_dir`` matching ``pattern`` with the largest cohort size.

    The cohort size is the integer after ``_u`` at the end of the file stem
    (e.g. ``uk_london_stays_1w_u231.parquet`` -> 231). Files are ordered by
    that integer, not as plain text, so ``u231`` ranks above ``u99``. Files
    without such a suffix rank lowest, with the file name as a tie-breaker.

    Raises
    ------
    FileNotFoundError
        If no file matches ``pattern``.
    """
    import re  # local import keeps this cell self-contained

    def cohort_key(path):
        found = re.search(r"_u(\d+)$", path.stem)
        return (int(found.group(1)) if found else -1, path.name)

    matches = sorted(data_dir.glob(pattern), key=cohort_key)
    if not matches:
        raise FileNotFoundError(f"No file matching {pattern!r} in {data_dir}")
    return matches[-1]


# UK paths (use your cohort-size encoded filenames from notebook 02)
# Example: uk_london_stays_1w_u231.parquet
uk_1w_path = _pick_cohort_file(DATA_OUT, "uk_london_stays_1w_u*.parquet")
uk_1m_path = _pick_cohort_file(DATA_OUT, "uk_london_stays_1m_u*.parquet")
uk_3m_path = _pick_cohort_file(DATA_OUT, "uk_london_stays_3m_u*.parquet")

uk_1w = pd.read_parquet(uk_1w_path)
uk_1m = pd.read_parquet(uk_1m_path)
uk_3m = pd.read_parquet(uk_3m_path)

def prep_stays(df, user_col="user_id"):
    """Return a cleaned copy of a stays table with canonical columns and types.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``start_time``, ``end_time``, ``hex_id`` and a user
        identifier column.
    user_col : str, default "user_id"
        Name of the user identifier column. When ``user_id`` is absent, this
        column (or, failing that, the legacy ``userid``) is renamed to
        ``user_id``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` (original index labels kept) with ``user_id`` and
        ``hex_id`` as strings, ``start_time``/``end_time`` parsed as
        datetimes, ``duration_min`` computed in minutes when not already
        present, and rows without a usable ``hex_id`` removed.

    Raises
    ------
    KeyError
        If no user identifier column can be found.
    """
    d = df.copy()

    # Resolve the user identifier column; an existing "user_id" always wins.
    if "user_id" not in d.columns:
        for candidate in (user_col, "userid"):
            if candidate in d.columns:
                d = d.rename(columns={candidate: "user_id"})
                break
        else:
            raise KeyError(
                f"no user identifier column found (tried 'user_id', {user_col!r}, 'userid')"
            )
    d["user_id"] = d["user_id"].astype(str)

    d["start_time"] = pd.to_datetime(d["start_time"])
    d["end_time"] = pd.to_datetime(d["end_time"])
    if "duration_min" not in d.columns:
        d["duration_min"] = (d["end_time"] - d["start_time"]).dt.total_seconds() / 60.0

    # Detect missing hexagons BEFORE the string cast: astype(str) can turn
    # None/NA into literal text such as "None", which would otherwise slip
    # past the filter and be kept as if it were a real hexagon id.
    hex_raw = d["hex_id"]
    hex_str = hex_raw.astype(str)
    missing = hex_raw.isna() | hex_str.isin(["", "nan"])
    d["hex_id"] = hex_str
    d = d.loc[~missing].copy()
    return d

# Run the same preparation step on every stays table; the names are rebound
# to the cleaned copies.
paris_train = prep_stays(paris_train)
paris_valid = prep_stays(paris_valid)
uk_1w = prep_stays(uk_1w)
uk_1m = prep_stays(uk_1m)
uk_3m = prep_stays(uk_3m)

# Row counts after preparation (rows without a usable hex_id have been dropped).
print("Loaded stays:",
      "Paris train", len(paris_train),
      "Paris valid", len(paris_valid),
      "UK 1W", len(uk_1w),
      "UK 1M", len(uk_1m),
      "UK 3M", len(uk_3m))

Loaded stays: Paris train 49803 Paris valid 12437 UK 1W 5946 UK 1M 24782 UK 3M 61523


In [12]:
def join_coverage(stays, poi):
    """Return the fraction of stay rows whose ``hex_id`` occurs in ``poi``.

    Both ``hex_id`` columns are compared as strings. The result is a plain
    Python float: the mean of the per-row membership flags.
    """
    known_hexes = set(poi["hex_id"].astype(str).unique())
    stay_hexes = stays["hex_id"].astype(str)
    is_covered = stay_hexes.isin(known_hexes)
    return float(is_covered.mean())

# One (dataset label, coverage) pair per stays table, each measured against
# the POI table of its own study area.
rows = [
    (label, join_coverage(stays, poi))
    for label, stays, poi in (
        ("Paris_train", paris_train, paris_poi),
        ("Paris_valid", paris_valid, paris_poi),
        ("UK_1W", uk_1w, uk_poi),
        ("UK_1M", uk_1m, uk_poi),
        ("UK_3M", uk_3m, uk_poi),
    )
]

# Persist the coverage table, then show it as the cell output.
cov = pd.DataFrame(rows, columns=["dataset","poi_join_coverage"])
cov_path = TAB_OUT / "poi_join_coverage.csv"
cov.to_csv(cov_path, index=False)
print("Saved:", cov_path)
cov

Saved: /Users/pang/Codes/GISRUK/outputs/tables/poi_join_coverage.csv


Unnamed: 0,dataset,poi_join_coverage
0,Paris_train,0.72078
1,Paris_valid,0.722522
2,UK_1W,0.774807
3,UK_1M,0.774998
4,UK_3M,0.773418


In [13]:
# POI categories that are summed into "poi_sem_cnt", the count used for the
# "semantic POI=0 rate" reported below.
SEM_COLS = ["poi_edu_cnt","poi_health_cnt","poi_retail_cnt","poi_leisure_cnt"]

def attach_poi(stays, poi, poi_cols=None):
    """Left-join per-hexagon POI counts onto a stays table.

    Parameters
    ----------
    stays : pandas.DataFrame
        Stays table with a ``hex_id`` column.
    poi : pandas.DataFrame
        POI table with ``hex_id``, the category columns and ``poi_total_cnt``.
    poi_cols : sequence of str, optional
        Category columns to bring over. Defaults to the notebook-level
        ``POI_COLS``.

    Returns
    -------
    pandas.DataFrame
        ``stays`` with the POI columns appended. Stays whose hexagon has no
        POI record get 0 in the appended columns. Missing values in the
        stays' own columns are left as they are: only columns introduced by
        the merge are zero-filled, so genuine gaps in the stay data are not
        silently turned into zeros.
    """
    if poi_cols is None:
        poi_cols = POI_COLS
    wanted = ["hex_id"] + list(poi_cols) + ["poi_total_cnt"]
    merged = stays.merge(poi[wanted], on="hex_id", how="left")

    # Columns not present in `stays` under the same name were introduced by
    # the merge; only those are zero-filled.
    original_cols = set(stays.columns)
    introduced = [col for col in merged.columns if col not in original_cols]
    if introduced:
        merged[introduced] = merged[introduced].fillna(0)
    return merged

def poi_sem_zero_rate(stays_p):
    """Return the share of rows whose summed ``SEM_COLS`` count is zero.

    Side effect: writes the row-wise sum of ``SEM_COLS`` into
    ``stays_p["poi_sem_cnt"]`` on the frame that was passed in.
    """
    sem_total = stays_p[SEM_COLS].sum(axis=1)
    stays_p["poi_sem_cnt"] = sem_total
    has_no_sem_poi = stays_p["poi_sem_cnt"] == 0
    return float(has_no_sem_poi.mean())

def poi_total_stats(stays_p):
    """Summarise ``poi_total_cnt`` with ``describe`` at the 50/90/95/99th percentiles."""
    quantile_levels = [0.5, 0.9, 0.95, 0.99]
    totals = stays_p["poi_total_cnt"]
    return totals.describe(percentiles=quantile_levels)

# For each stays table, join the POI table of its study area, then print the
# share of stays with a zero semantic POI count and the distribution of the
# total POI count.
for name, d, poi in [("Paris_train", paris_train, paris_poi),
                     ("Paris_valid", paris_valid, paris_poi),
                     ("UK_1W", uk_1w, uk_poi),
                     ("UK_1M", uk_1m, uk_poi),
                     ("UK_3M", uk_3m, uk_poi)]:
    dp = attach_poi(d, poi)
    # Note: this call also adds a "poi_sem_cnt" column to dp.
    zr = poi_sem_zero_rate(dp)
    print("\n==", name, "==")
    print("semantic POI=0 rate:", zr)
    print(poi_total_stats(dp))


== Paris_train ==
semantic POI=0 rate: 0.37180491135072186
count    49803.000000
mean         8.169387
std         12.635552
min          0.000000
50%          3.000000
90%         24.000000
95%         34.000000
99%         55.000000
max        152.000000
Name: poi_total_cnt, dtype: float64

== Paris_valid ==
semantic POI=0 rate: 0.36994452038272896
count    12437.000000
mean         8.692289
std         13.135681
min          0.000000
50%          4.000000
90%         26.000000
95%         36.000000
99%         55.000000
max        147.000000
Name: poi_total_cnt, dtype: float64

== UK_1W ==
semantic POI=0 rate: 0.5144635048772284
count    5946.000000
mean        7.358729
std        14.180287
min         0.000000
50%         2.000000
90%        22.000000
95%        36.000000
99%        74.000000
max       251.000000
Name: poi_total_cnt, dtype: float64

== UK_1M ==
semantic POI=0 rate: 0.5188443224921314
count    24782.000000
mean         7.692963
std         15.526703
min          0.

In [14]:
def semantic_zero_breakdown(
    stays_p,
    sem_cols=("poi_edu_cnt", "poi_health_cnt", "poi_retail_cnt", "poi_leisure_cnt"),
):
    """Split the zero-semantic-POI share by whether any POI is present at all.

    Parameters
    ----------
    stays_p : pandas.DataFrame
        Stays with POI columns attached; must contain ``poi_total_cnt`` and
        every column in ``sem_cols``.
    sem_cols : sequence of str, optional
        Columns summed into the semantic POI count. The default is the same
        four categories as the notebook-level ``SEM_COLS``.

    Returns
    -------
    dict
        ``rate_sem0``: share of rows with semantic count == 0.
        ``rate_total0``: share of rows with ``poi_total_cnt`` == 0.
        ``rate_sem0_totalpos``: share with semantic count == 0 but total != 0.
        ``rate_sem0_total0``: share with both semantic count and total == 0.

    The input frame is not modified; no copy is needed because nothing is
    written back to it.
    """
    sem0 = stays_p[list(sem_cols)].sum(axis=1) == 0
    total0 = stays_p["poi_total_cnt"] == 0

    return {
        "rate_sem0": float(sem0.mean()),
        "rate_total0": float(total0.mean()),
        "rate_sem0_totalpos": float((sem0 & (~total0)).mean()),
        "rate_sem0_total0": float((sem0 & total0).mean()),
    }

# Break down the zero-semantic-POI share for the uk_1w stays ...
uk1w_p = attach_poi(uk_1w, uk_poi)
print(semantic_zero_breakdown(uk1w_p))

# ... and for the paris_valid stays, for comparison.
parisv_p = attach_poi(paris_valid, paris_poi)
print(semantic_zero_breakdown(parisv_p))

{'rate_sem0': 0.5144635048772284, 'rate_total0': 0.2251934073326606, 'rate_sem0_totalpos': 0.2892700975445678, 'rate_sem0_total0': 0.2251934073326606}
{'rate_sem0': 0.36994452038272896, 'rate_total0': 0.27747849159765214, 'rate_sem0_totalpos': 0.09246602878507679, 'rate_sem0_total0': 0.27747849159765214}
