In [1]:
import sys
sys.path.append(".")
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.config import ACTIVITIES, ABBR, TZ_PARIS, TZ_LONDON
from src.viz_style import apply_nature_style

apply_nature_style()

In [3]:
from src.utils_time import to_local_time_series, split_cross_midnight, week_start_monday
from src.utils_split import split_users_by_hash
from src.regularity import regularity_report, summarize_reg, compute_user_hex_stats, infer_home_work_anchors, make_hex_lookup

In [4]:
from pathlib import Path

ROOT = Path(".")
OUT_DATA = ROOT / "outputs" / "data"
OUT_MODELS = ROOT / "outputs" / "models"
OUT_FIG = ROOT / "outputs" / "figures"
OUT_MODELS.mkdir(parents=True, exist_ok=True)
OUT_FIG.mkdir(parents=True, exist_ok=True)

train = pd.read_parquet(OUT_DATA / "paris_stays_train.parquet")
valid = pd.read_parquet(OUT_DATA / "paris_stays_valid.parquet")

for d in [train, valid]:
    d["user_id"] = d["user_id"].astype(str)
    d["start_time"] = pd.to_datetime(d["start_time"])
    d["end_time"] = pd.to_datetime(d["end_time"])
    d["duration_min"] = pd.to_numeric(d["duration_min"], errors="coerce")
    d["hex_id"] = d["hex_id"].astype(str).replace({"": np.nan, "nan": np.nan})
    d["y_true"] = d["y_true"].astype(str)
    d.dropna(subset=["hex_id","start_time","end_time","duration_min","y_true"], inplace=True)

poi = pd.read_parquet(OUT_DATA / "paris_poi_huff_k4_b1.5.parquet")
poi["hex_id"] = poi["hex_id"].astype(str)

POI_COLS = ["poi_edu_cnt","poi_health_cnt","poi_retail_cnt","poi_leisure_cnt",
            "poi_transport_cnt","poi_accom_cnt","poi_office_cnt","poi_total_cnt"]
for c in POI_COLS:
    if c not in poi.columns:
        poi[c] = 0

print("Train stays:", len(train), "users:", train["user_id"].nunique())
print("Valid stays:", len(valid), "users:", valid["user_id"].nunique())
print("POI rows:", len(poi))

Train stays: 49803 users: 2482
Valid stays: 12437 users: 621
POI rows: 29628


In [5]:
import h3
import math

def cell_to_latlon(cell):
    if hasattr(h3, "cell_to_latlng"):
        lat, lon = h3.cell_to_latlng(cell)
    else:
        lat, lon = h3.h3_to_geo(cell)
    return float(lat), float(lon)

def haversine_km(lat1, lon1, lat2, lon2):
    R = 6371.0
    p1, p2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2-lat1)
    dlmb = math.radians(lon2-lon1)
    a = math.sin(dphi/2)**2 + math.cos(p1)*math.cos(p2)*math.sin(dlmb/2)**2
    return 2*R*math.asin(math.sqrt(a))

# Precompute centroids for all hex in train+valid
all_hex = pd.Index(pd.concat([train["hex_id"], valid["hex_id"]]).unique()).astype(str)
centroids = {h: cell_to_latlon(h) for h in all_hex}
print("Centroids computed:", len(centroids))

Centroids computed: 29628


In [6]:
import numpy as np
import pandas as pd

def infer_anchors_and_home_area(stays_df, k_home_area=3):
    d = stays_df.copy().sort_values(["user_id","start_time"])
    d["date"] = d["start_time"].dt.date

    mid = d["start_time"] + pd.to_timedelta(d["duration_min"]/2, unit="m")
    hh = mid.dt.hour + mid.dt.minute/60.0
    night = (hh >= 20) | (hh < 6)
    weekday = (d["start_time"].dt.weekday < 5)
    workhour = weekday & (hh >= 9) & (hh < 17)

    d["night_dwell"] = np.where(night, d["duration_min"], 0.0)
    d["work_dwell"]  = np.where(workhour, d["duration_min"], 0.0)

    # per user-hex agg
    g = d.groupby(["user_id","hex_id"], as_index=False).agg(
        night_dwell=("night_dwell","sum"),
        work_dwell=("work_dwell","sum"),
        visit_days=("date", lambda x: x.nunique()),
        dwell=("duration_min","sum"),
    )

    home_hex = g.sort_values(["user_id","night_dwell"], ascending=[True,False]) \
                .drop_duplicates("user_id")[["user_id","hex_id"]].rename(columns={"hex_id":"home_hex"})
    work_hex = g.merge(home_hex, on="user_id", how="left")
    work_hex = work_hex[work_hex["hex_id"] != work_hex["home_hex"]]
    work_hex = work_hex.sort_values(["user_id","work_dwell"], ascending=[True,False]) \
                       .drop_duplicates("user_id")[["user_id","hex_id"]].rename(columns={"hex_id":"work_hex"})

    # home area: top-K by night_dwell
    home_area = g[g["night_dwell"] > 0].sort_values(["user_id","night_dwell"], ascending=[True,False]) \
                  .groupby("user_id").head(k_home_area)
    home_area_lookup = home_area.groupby("user_id")["hex_id"].apply(lambda x: set(x.astype(str))).to_dict()

    anchors = home_hex.merge(work_hex, on="user_id", how="left")
    anchors["home_hex"] = anchors["home_hex"].astype(str)
    anchors["work_hex"] = anchors["work_hex"].astype(str)
    return anchors, home_area_lookup

anchors_train, home_area_train = infer_anchors_and_home_area(train, k_home_area=3)
anchors_valid, home_area_valid = infer_anchors_and_home_area(valid, k_home_area=3)

home_train = dict(zip(anchors_train["user_id"], anchors_train["home_hex"]))
work_train = dict(zip(anchors_train["user_id"], anchors_train["work_hex"]))
home_valid = dict(zip(anchors_valid["user_id"], anchors_valid["home_hex"]))
work_valid = dict(zip(anchors_valid["user_id"], anchors_valid["work_hex"]))

print("Anchors train users:", len(home_train), "valid users:", len(home_valid))

Anchors train users: 2482 valid users: 621


In [7]:
def add_poi_features(df, poi_df):
    d = df.merge(poi_df[["hex_id"]+POI_COLS], on="hex_id", how="left").fillna(0)
    sem = d["poi_edu_cnt"] + d["poi_health_cnt"] + d["poi_retail_cnt"] + d["poi_leisure_cnt"]
    d["sem_total"] = sem
    d["edu_ratio"] = d["poi_edu_cnt"]/(sem+1.0)
    d["health_ratio"] = d["poi_health_cnt"]/(sem+1.0)
    d["retail_ratio"] = d["poi_retail_cnt"]/(sem+1.0)
    d["leisure_ratio"] = d["poi_leisure_cnt"]/(sem+1.0)
    return d

def add_time_duration_features(df):
    d = df.copy()
    st = d["start_time"]
    d["hour"] = st.dt.hour
    d["dow"] = st.dt.weekday
    d["is_weekday"] = (d["dow"] < 5).astype(int)
    d["log_dur"] = np.log1p(d["duration_min"].astype(float))
    return d

def add_distance_features(df, home_lookup, work_lookup, home_area_lookup):
    d = df.copy()
    # hex centroid
    latlon = d["hex_id"].map(lambda h: centroids.get(str(h), (np.nan, np.nan)))
    d["lat"] = [x[0] for x in latlon]
    d["lon"] = [x[1] for x in latlon]

    # anchor centroids
    def anchor_latlon(u, anchor_map):
        h = anchor_map.get(u, None)
        return centroids.get(str(h), (np.nan, np.nan))

    home_ll = d["user_id"].map(lambda u: anchor_latlon(u, home_lookup))
    work_ll = d["user_id"].map(lambda u: anchor_latlon(u, work_lookup))
    d["home_lat"] = [x[0] for x in home_ll]; d["home_lon"] = [x[1] for x in home_ll]
    d["work_lat"] = [x[0] for x in work_ll]; d["work_lon"] = [x[1] for x in work_ll]

    # distances
    d["dist_to_home_km"] = [
        haversine_km(a,b,c,e) if np.isfinite(a) and np.isfinite(c) else np.nan
        for a,b,c,e in zip(d["lat"], d["lon"], d["home_lat"], d["home_lon"])
    ]
    d["dist_to_work_km"] = [
        haversine_km(a,b,c,e) if np.isfinite(a) and np.isfinite(c) else np.nan
        for a,b,c,e in zip(d["lat"], d["lon"], d["work_lat"], d["work_lon"])
    ]

    # distance to home_area (min)
    def dist_to_home_area_min(u, lat, lon):
        hs = home_area_lookup.get(u, None)
        if not hs:
            return np.nan
        best = np.inf
        for hh in hs:
            ll = centroids.get(str(hh))
            if ll is None: 
                continue
            best = min(best, haversine_km(lat, lon, ll[0], ll[1]))
        return best if np.isfinite(best) else np.nan

    d["dist_to_home_area_min_km"] = [
        dist_to_home_area_min(u, la, lo) if np.isfinite(la) else np.nan
        for u, la, lo in zip(d["user_id"], d["lat"], d["lon"])
    ]

    # binary proximity flags
    d["near_home_0p5km"] = (d["dist_to_home_km"] <= 0.5).astype(int)
    d["near_work_0p5km"] = (d["dist_to_work_km"] <= 0.5).astype(int)

    return d

def add_prev_distance(df):
    d = df.sort_values(["user_id","start_time"]).copy()
    prev_hex = d.groupby("user_id")["hex_id"].shift(1)
    prev_ll = prev_hex.map(lambda h: centroids.get(str(h), (np.nan, np.nan)))
    prev_lat = [x[0] for x in prev_ll]; prev_lon = [x[1] for x in prev_ll]

    d["dist_prev_km"] = [
        haversine_km(la, lo, pla, plo) if np.isfinite(la) and np.isfinite(pla) else np.nan
        for la,lo,pla,plo in zip(d["lat"], d["lon"], prev_lat, prev_lon)
    ]
    return d

def build_features(stays, home_lookup, work_lookup, home_area_lookup):
    d = stays.copy()
    d = add_time_duration_features(d)
    d = add_poi_features(d, poi)
    d = add_distance_features(d, home_lookup, work_lookup, home_area_lookup)
    d = add_prev_distance(d)
    return d

X_train_df = build_features(train, home_train, work_train, home_area_train)
X_valid_df = build_features(valid, home_valid, work_valid, home_area_valid)

print(X_train_df.columns)

Index(['user_id', 'start_time', 'end_time', 'duration_min', 'hex_id', 'y_true',
       'hour', 'dow', 'is_weekday', 'log_dur', 'poi_edu_cnt', 'poi_health_cnt',
       'poi_retail_cnt', 'poi_leisure_cnt', 'poi_transport_cnt',
       'poi_accom_cnt', 'poi_office_cnt', 'poi_total_cnt', 'sem_total',
       'edu_ratio', 'health_ratio', 'retail_ratio', 'leisure_ratio', 'lat',
       'lon', 'home_lat', 'home_lon', 'work_lat', 'work_lon',
       'dist_to_home_km', 'dist_to_work_km', 'dist_to_home_area_min_km',
       'near_home_0p5km', 'near_work_0p5km', 'dist_prev_km'],
      dtype='object')


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.ensemble import HistGradientBoostingClassifier

FEATURES = [
    # time + duration
    "hour","dow","is_weekday","duration_min","log_dur",
    # POI counts
    "poi_edu_cnt","poi_health_cnt","poi_retail_cnt","poi_leisure_cnt","poi_total_cnt",
    # POI ratios
    "edu_ratio","health_ratio","retail_ratio","leisure_ratio",
    # distances
    "dist_to_home_km","dist_to_work_km","dist_to_home_area_min_km","near_home_0p5km","near_work_0p5km",
    # prev distance
    "dist_prev_km",
]

def prep_xy(df):
    X = df[FEATURES].replace([np.inf,-np.inf], np.nan)
    # HistGB can handle NaN; for MLP we will impute later
    y = df["y_true"].astype(str).values
    return X, y

Xtr, ytr = prep_xy(X_train_df)
Xva, yva = prep_xy(X_valid_df)

# sample weights to reduce class imbalance impact
w = compute_sample_weight(class_weight="balanced", y=ytr)

# --- GBDT (fast, strong baseline) ---
gbdt = HistGradientBoostingClassifier(
    learning_rate=0.08,
    max_depth=6,
    max_iter=400,
    random_state=0
)
gbdt.fit(Xtr, ytr, sample_weight=w)

pred = gbdt.predict(Xva)

acc = accuracy_score(yva, pred)
macro = f1_score(yva, pred, average="macro", labels=ACTIVITIES)
macro_weak = f1_score(yva, pred, average="macro", labels=["WORK","STUDY","PURCHASE","LEISURE","HEALTH","OTHER"])

print("GBDT ACC:", acc, "Macro:", macro, "Macro excl HOME:", macro_weak)
print(classification_report(yva, pred, labels=ACTIVITIES, digits=3, zero_division=0))

# Save model
import joblib
joblib.dump(gbdt, OUT_MODELS / "paris_gbdt_baseline.joblib")
print("Saved:", OUT_MODELS / "paris_gbdt_baseline.joblib")

GBDT ACC: 0.6265980541931334 Macro: 0.4907338508592717 Macro excl HOME: 0.4275618679574135
              precision    recall  f1-score   support

        HOME      0.877     0.863     0.870      4153
        WORK      0.799     0.646     0.714      2566
       STUDY      0.158     0.392     0.225       237
    PURCHASE      0.518     0.583     0.548      1832
     LEISURE      0.464     0.512     0.487      1290
      HEALTH      0.148     0.528     0.231       301
       OTHER      0.508     0.278     0.360      2058

    accuracy                          0.627     12437
   macro avg      0.496     0.543     0.491     12437
weighted avg      0.673     0.627     0.638     12437

Saved: /Users/pang/Codes/GISRUK/outputs/models/paris_gbdt_baseline.joblib


In [9]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.ensemble import HistGradientBoostingClassifier

WEAK = ["WORK","STUDY","PURCHASE","LEISURE","HEALTH","OTHER"]

def fit_eval(feature_list, name):
    Xtr = X_train_df[feature_list].replace([np.inf,-np.inf], np.nan)
    ytr = X_train_df["y_true"].astype(str).values
    Xva = X_valid_df[feature_list].replace([np.inf,-np.inf], np.nan)
    yva = X_valid_df["y_true"].astype(str).values

    w = compute_sample_weight(class_weight="balanced", y=ytr)

    clf = HistGradientBoostingClassifier(
        learning_rate=0.08, max_depth=6, max_iter=400, random_state=0
    )
    clf.fit(Xtr, ytr, sample_weight=w)
    pred = clf.predict(Xva)

    acc = accuracy_score(yva, pred)
    macro = f1_score(yva, pred, average="macro", labels=ACTIVITIES)
    macro_weak = f1_score(yva, pred, average="macro", labels=WEAK)
    return {"setting": name, "ACC": acc, "Macro": macro, "Macro_weak": macro_weak}

BASE = ["hour","dow","is_weekday","duration_min","log_dur"]

POI_ONLY = ["poi_edu_cnt","poi_health_cnt","poi_retail_cnt","poi_leisure_cnt","poi_total_cnt",
            "edu_ratio","health_ratio","retail_ratio","leisure_ratio"]

ANCHOR_DIST = ["dist_to_home_km","dist_to_work_km","dist_to_home_area_min_km","near_home_0p5km","near_work_0p5km"]

PREV_DIST = ["dist_prev_km"]

ABL = []
ABL.append(fit_eval(BASE, "Base: time+duration"))
ABL.append(fit_eval(BASE+POI_ONLY, "Base + POI"))
ABL.append(fit_eval(BASE+ANCHOR_DIST, "Base + anchor-dist"))
ABL.append(fit_eval(BASE+POI_ONLY+ANCHOR_DIST, "Base + POI + anchor-dist"))
ABL.append(fit_eval(BASE+POI_ONLY+ANCHOR_DIST+PREV_DIST, "All (+prev-dist)"))

pd.DataFrame(ABL).sort_values("Macro_weak", ascending=False)

Unnamed: 0,setting,ACC,Macro,Macro_weak
4,All (+prev-dist),0.626598,0.490734,0.427562
3,Base + POI + anchor-dist,0.614939,0.480119,0.418197
1,Base + POI,0.53775,0.440396,0.390492
2,Base + anchor-dist,0.573772,0.438313,0.370905
0,Base: time+duration,0.480743,0.392999,0.336348


In [12]:
labels = ["HOME","WORK","STUDY","PURCHASE","LEISURE","HEALTH","OTHER"]

def save_row_norm_cm(y_true, y_pred, out_csv):
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cmn = cm / np.maximum(cm.sum(axis=1, keepdims=True), 1)
    np.savetxt(out_csv, cmn, delimiter=",", fmt="%.6f")
    print("Saved:", out_csv)
    return cmn

In [13]:
save_row_norm_cm(yva, pred, OUT_DATA / "cm_gbdt_row_norm.csv")  # pred 是 GBDT 的预测

Saved: /Users/pang/Codes/GISRUK/outputs/data/cm_gbdt_row_norm.csv


array([[0.86274982, 0.01854081, 0.00674211, 0.03274741, 0.03106188,
        0.02143029, 0.02672767],
       [0.03858145, 0.64575214, 0.14497272, 0.04286828, 0.03546376,
        0.05884645, 0.0335152 ],
       [0.02953586, 0.35443038, 0.39240506, 0.02531646, 0.092827  ,
        0.0464135 , 0.05907173],
       [0.05076419, 0.02565502, 0.00491266, 0.58296943, 0.07860262,
        0.14792576, 0.10917031],
       [0.0627907 , 0.05813953, 0.03488372, 0.14341085, 0.51162791,
        0.09767442, 0.09147287],
       [0.05647841, 0.04318937, 0.00996678, 0.21262458, 0.05980066,
        0.5282392 , 0.089701  ],
       [0.10009718, 0.05830904, 0.01895044, 0.24003887, 0.1739553 ,
        0.13022352, 0.27842566]])