In [6]:
import sys
sys.path.append(".")
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

from pathlib import Path

import ipywidgets as widgets
from IPython.display import display, clear_output
import folium
import h3
import math

from src.viz_style import apply_nature_style
from src.utils_time import to_local_time_series

# Apply the project plotting style once at import time (defined in src.viz_style;
# presumably sets matplotlib rcParams -- confirm in that module).
apply_nature_style()

# IANA timezone name passed as `tz` wherever stay timestamps are converted to local time.
TZ_LONDON = "Europe/London"

In [11]:
# All paths are relative to the notebook's working directory.
ROOT = Path(".")
OUT_DATA = ROOT / "outputs" / "data"      # prediction parquet files
OUT_TAB  = ROOT / "outputs" / "tables"    # CSV tables (sample packs, user lists)

# horizon -> {model display name -> prediction parquet path}.
# A path listed here is only a candidate: `available_models` filters by existence on disk,
# so entries whose file has not been produced yet are simply not offered.
REGISTRY = {
    "1W": {
        "Rule-based": OUT_DATA / "uk_1w_pred_rulebased.parquet",
        "GBDT":       OUT_DATA / "uk_1w_pred_gbdt.parquet",
        "HMM":        OUT_DATA / "uk_1w_pred_hmm.parquet",
        "Hybrid":     OUT_DATA / "uk_1w_pred_hybrid.parquet",
        "Hybrid(OD-normalized)": OUT_DATA / "uk_1w_pred_hybrid_od.parquet"
    },
    "1M": {
        "Rule-based": OUT_DATA / "uk_1m_pred_rulebased.parquet",
        "GBDT":       OUT_DATA / "uk_1m_pred_gbdt.parquet",
        "HMM":        OUT_DATA / "uk_1m_pred_hmm.parquet",
        "Hybrid":     OUT_DATA / "uk_1m_pred_hybrid.parquet",
    },
    "3M": {
        "Rule-based": OUT_DATA / "uk_3m_pred_rulebased.parquet",
        "GBDT":       OUT_DATA / "uk_3m_pred_gbdt.parquet",
        "HMM":        OUT_DATA / "uk_3m_pred_hmm.parquet",
        "Hybrid":     OUT_DATA / "uk_3m_pred_hybrid.parquet",
    },

}

def available_models(horizon):
    """Return the model names registered for `horizon` whose prediction file exists.

    Names are returned in REGISTRY insertion order. An unknown `horizon`
    raises KeyError from the REGISTRY lookup.
    """
    found = []
    for model_name, pred_path in REGISTRY[horizon].items():
        if pred_path.exists():
            found.append(model_name)
    return found

# Report, per horizon, which registered models actually have a prediction file on disk.
print("Available prediction files:")
for h in ["1W","1M","3M"]:
    print(h, "->", available_models(h))

Available prediction files:
1W -> ['Rule-based', 'GBDT', 'HMM', 'Hybrid', 'Hybrid(OD-normalized)']
1M -> ['GBDT', 'HMM', 'Hybrid']
3M -> ['GBDT', 'Hybrid']


In [12]:

# Initialise unconditionally so later cells can rely on the name even when no file is present.
group_to_users = {}

# 1) Review sample pack: map each value of the "group" column to its sorted, unique user ids.
sample_path = OUT_TAB / "uk_1w_review_sample_pack.csv"
if not sample_path.exists():
    print("No sample pack found:", sample_path)
else:
    sample_pack = pd.read_csv(sample_path)
    sample_pack["user_id"] = sample_pack["user_id"].astype(str)
    for g in sample_pack["group"].unique():
        members = sample_pack.loc[sample_pack["group"] == g, "user_id"]
        group_to_users[g] = sorted(members.unique())
    print("Loaded sample pack:", sample_path.name, "| groups:", list(group_to_users.keys()))

# 2) Secondary-night-home users (>=60min) become one additional group.
sec_path = OUT_TAB / "uk_1w_secondary_night_home_users_ge60min.csv"
if not sec_path.exists():
    print("No secondary-night-home list found:", sec_path)
else:
    sec_users = pd.read_csv(sec_path)["user_id"].astype(str).unique().tolist()
    group_to_users["secondary_night_home_ge60min"] = sorted(sec_users)
    print("Added group: secondary_night_home_ge60min | n =", len(sec_users))

# Final check: this is the option list the sample dropdown will offer.
print("Final groups:", ["All users"] + sorted(group_to_users.keys()))

Loaded sample pack: uk_1w_review_sample_pack.csv | groups: ['entropy_high', 'entropy_low', 'entropy_drop_high']
Added group: secondary_night_home_ge60min | n = 13
Final groups: ['All users', 'entropy_drop_high', 'entropy_high', 'entropy_low', 'secondary_night_home_ge60min']


In [13]:
# (horizon, model) -> standardised prediction DataFrame; filled lazily by load_pred.
_cache = {}

def load_pred(horizon, model):
    """Load one prediction table, standardise its columns and memoise the result.

    Parameters
    ----------
    horizon, model : keys into REGISTRY (REGISTRY[horizon][model] is the parquet path).

    Returns
    -------
    DataFrame with columns user_id (str), start_time, end_time (datetime),
    duration_min (float, missing/unparseable values -> 0.0), hex_id (str),
    label (str), plus lat / lon when the file has them.
    The same object is returned on every call for a given key, so callers
    must copy before mutating.

    Raises
    ------
    KeyError   if the horizon/model pair is not registered.
    ValueError if the file has none of the known prediction columns.
    """
    key = (horizon, model)
    if key in _cache:
        return _cache[key]

    path = REGISTRY[horizon][model]
    df = pd.read_parquet(path)

    # standardize columns
    if "user_id" not in df.columns and "userid" in df.columns:
        df = df.rename(columns={"userid": "user_id"})
    df["user_id"] = df["user_id"].astype(str)
    df["start_time"] = pd.to_datetime(df["start_time"])
    df["end_time"] = pd.to_datetime(df["end_time"])
    if "duration_min" not in df.columns:
        df["duration_min"] = (df["end_time"] - df["start_time"]).dt.total_seconds()/60.0
    df["duration_min"] = pd.to_numeric(df["duration_min"], errors="coerce").fillna(0.0)
    df["hex_id"] = df["hex_id"].astype(str)

    # prediction column: unify to "label"; the first candidate present wins
    pred_priority = ("label_od", "y_pred", "y_pred_s4", "y_pred_s3")
    pred_col = next((c for c in pred_priority if c in df.columns), None)
    if pred_col is None:
        raise ValueError(
            f"No prediction column in {path} (horizon={horizon!r}, model={model!r}); "
            f"expected one of {list(pred_priority)}, found columns {list(df.columns)}"
        )
    df["label"] = df[pred_col].astype(str)

    # keep only necessary cols (reduce memory)
    keep_cols = ["user_id","start_time","end_time","duration_min","hex_id","label"]
    keep_cols += [c for c in ("lat","lon") if c in df.columns]

    df = df[keep_cols].copy()
    _cache[key] = df
    return df

In [14]:
# Activity label -> hex colour, shared by the timeline and the map.
# Labels missing from this mapping are drawn with the fallback "#777777" at the call sites.
ACT_COL = {
    "HOME":"#1f77b4","WORK":"#ff7f0e","STUDY":"#2ca02c","PURCHASE":"#d62728",
    "LEISURE":"#9467bd","HEALTH":"#8c564b","OTHER":"#e377c2",
}

def week_start_monday_local(ts: pd.Timestamp) -> pd.Timestamp:
    """Return midnight of the Monday that starts the week containing `ts`.

    The result keeps the timezone (or naivety) of `ts`. NaT maps to NaT.

    The day is truncated first and the weekday is then subtracted with a
    calendar-aware DateOffset. A plain Timedelta is absolute time (n * 24h),
    so for tz-aware stamps in a week with a DST change it can land on the wrong
    calendar day (e.g. Sunday 23:30 after the autumn change resolved to Tuesday).
    """
    if pd.isna(ts):
        return pd.NaT
    return ts.normalize() - pd.DateOffset(days=int(ts.weekday()))

def filter_user_week(df, user_id, week_start_str, tz=TZ_LONDON):
    """Return one user's stays whose start falls inside one local week.

    Parameters
    ----------
    df : prediction table with user_id, start_time and end_time columns.
    user_id : compared as a string against df["user_id"].
    week_start_str : anything pd.to_datetime accepts; a naive value is localised to `tz`,
        an aware value is converted to `tz`.
    tz : timezone used for the local-time conversion and for the week window.

    Returns
    -------
    A copy of the matching rows sorted by start_time, with start_time / end_time
    converted by `to_local_time_series`. Only start_time is tested against the
    window, so a stay that begins before the week is excluded even if it ends inside it.

    The window end is computed as 7 calendar days in `tz` rather than 168 absolute
    hours, so weeks containing a DST change still end at the next local Monday midnight.
    """
    d = df[df["user_id"] == str(user_id)].copy()
    d["start_time"] = to_local_time_series(d["start_time"], tz=tz, assume_utc_if_naive=True)
    d["end_time"]   = to_local_time_series(d["end_time"], tz=tz, assume_utc_if_naive=True)
    d = d.sort_values("start_time").copy()

    ws = pd.to_datetime(week_start_str)
    if ws.tzinfo is None:
        ws = ws.tz_localize(tz)
    else:
        # A parsed "+01:00" string carries a fixed offset; move it onto the real zone
        # so the calendar offset below follows that zone's DST rules.
        ws = ws.tz_convert(tz)
    we = ws + pd.DateOffset(days=7)

    d = d[(d["start_time"] >= ws) & (d["start_time"] < we)].copy()
    return d

def night_home_share(d_week: pd.DataFrame, label_col="label"):
    """Mean over users of (night dwell labelled HOME) / (total night dwell).

    A stay counts as "night" when its midpoint (start + duration/2) falls at or
    after 20:00 or before 06:00. Users with no night dwell contribute 0.0.
    Returns 0.0 when there are no users.
    """
    stays = d_week.copy()
    midpoint = stays["start_time"] + pd.to_timedelta(stays["duration_min"]/2, unit="m")
    hour_of_day = midpoint.dt.hour + midpoint.dt.minute/60.0
    is_night = (hour_of_day >= 20) | (hour_of_day < 6)
    is_night_home = is_night & (stays[label_col] == "HOME")

    stays["_night_dwell"] = np.where(is_night, stays["duration_min"], 0.0)
    stays["_night_home"] = np.where(is_night_home, stays["duration_min"], 0.0)

    per_user = stays.groupby("user_id", as_index=False).agg(
        night_dwell=("_night_dwell", "sum"),
        night_home=("_night_home", "sum"),
    )
    # Zero night dwell -> NaN denominator -> share treated as 0.0 below.
    per_user["night_home_share"] = per_user["night_home"] / per_user["night_dwell"].replace(0, np.nan)
    if not len(per_user):
        return 0.0
    return float(per_user["night_home_share"].fillna(0.0).mean())

def tophex_entropy(d_week: pd.DataFrame, label_col="label", top_n=5):
    """Mean Shannon entropy (natural log) of the labels within each user's top hexes.

    For every user the `top_n` hexes with the largest summed duration_min are kept
    (ties broken by first occurrence). For each kept (user, hex) pair the entropy of
    its label distribution is computed over stays (counts, not minutes), and the
    mean over all pairs is returned. Returns 0.0 when nothing remains.
    """
    stays = d_week.copy()

    dwell = stays.groupby(["user_id","hex_id"], as_index=False)["duration_min"].sum()
    dwell["rank"] = dwell.groupby("user_id")["duration_min"].rank(method="first", ascending=False)
    kept_pairs = dwell[dwell["rank"] <= top_n][["user_id","hex_id"]]
    stays = stays.merge(kept_pairs, on=["user_id","hex_id"], how="inner")

    def _label_entropy(labels):
        counts = labels.value_counts().values.astype(float)
        probs = counts / counts.sum()
        # clip guards log(0); probabilities from value_counts are > 0 anyway
        return float(-(probs*np.log(np.clip(probs,1e-12,1.0))).sum())

    entropies = [
        _label_entropy(group[label_col].astype(str))
        for _, group in stays.groupby(["user_id","hex_id"], sort=False)
    ]
    if not entropies:
        return 0.0
    return float(np.mean(entropies))

def label_switch_rate(d_week: pd.DataFrame, label_col="label"):
    """Average number of label changes per (user, calendar day).

    Stays are ordered by start_time within each user; a "switch" is a stay whose
    label differs from the previous stay of the same user on the same date
    (date taken from start_time). The first stay of a day never counts.
    Returns 0.0 for empty input.

    Implemented with a grouped shift instead of DataFrameGroupBy.apply, which
    avoids the per-group Python loop and the pandas deprecation of apply
    operating on the grouping columns.
    """
    d = d_week.sort_values(["user_id","start_time"]).copy()
    if d.empty:
        return 0.0
    d["date"] = d["start_time"].dt.date

    labels = d[label_col].astype(str)
    day_keys = [d["user_id"], d["date"]]
    previous = labels.groupby(day_keys).shift(1)
    # previous is NaN on the first stay of each (user, date): not a switch
    switched = previous.notna() & (labels != previous)

    sw = switched.groupby(day_keys).sum()
    return float(sw.mean()) if len(sw) else 0.0

def proxy_summary(d_week):
    """Return (night_home_share, tophex_entropy, label_switch_rate) for `d_week`,
    each computed with its default arguments."""
    share = night_home_share(d_week)
    entropy = tophex_entropy(d_week)
    switches = label_switch_rate(d_week)
    return share, entropy, switches

In [15]:
def assign_lanes(day_df):
    """Greedy lane assignment so that stays sharing a lane never overlap.

    Rows are processed in start_time order; each row goes to the lowest-numbered
    lane whose last stay has ended (start >= that lane's end), otherwise a new
    lane is opened. Returns (sorted copy with an integer "lane" column, lane count).
    """
    ordered = day_df.sort_values("start_time").copy()
    lane_free_at = []   # end_time of the last stay placed in each lane
    lane_of = {}        # row index label -> lane number

    rows = zip(ordered.index, ordered["start_time"], ordered["end_time"])
    for idx, start, end in rows:
        slot = next(
            (i for i, free_at in enumerate(lane_free_at) if start >= free_at),
            None,
        )
        if slot is None:
            lane_free_at.append(end)
            slot = len(lane_free_at) - 1
        else:
            lane_free_at[slot] = end
        lane_of[idx] = slot

    ordered["lane"] = [lane_of[i] for i in ordered.index]
    return ordered, len(lane_free_at)


def split_cross_midnight_for_ui(d):
    """Split every stay at local midnight so each output row lies within one calendar day.

    Input rows need user_id, start_time, end_time, duration_min, hex_id, label and
    optionally imputed_midnight (defaults to False). Same-day stays pass through with
    their original duration_min; split pieces get a duration recomputed from their
    own start/end and inherit hex, label and the imputed flag.

    Midnights are stepped with a calendar-aware DateOffset. With tz-aware stamps,
    `normalize() + Timedelta(days=1)` is 24 absolute hours, which on a DST change day
    is 23:00 or 01:00 local: in autumn that made the loop stop advancing (it never
    terminated), in spring it produced a piece that still crossed midnight.
    A stay that ends exactly at midnight no longer yields a trailing zero-length piece.

    Returns a new DataFrame with columns
    user_id, start_time, end_time, duration_min, hex_id, label, imputed_midnight.
    """
    one_day = pd.DateOffset(days=1)
    out_rows = []
    for r in d.itertuples(index=False):
        u = str(r.user_id)
        st = r.start_time
        en = r.end_time
        dur = float(r.duration_min)
        hx = str(r.hex_id)
        lab = str(r.label)
        imp = bool(getattr(r, "imputed_midnight", False))

        if en.date() == st.date():
            out_rows.append((u, st, en, dur, hx, lab, imp))
            continue

        cur = st
        while cur.date() < en.date():
            midn = cur.normalize() + one_day   # next local midnight (wall clock)
            out_rows.append((u, cur, midn, (midn-cur).total_seconds()/60.0, hx, lab, imp))
            cur = midn
        # Remainder after the last midnight; skipped when the stay ended exactly on it.
        if en != cur:
            out_rows.append((u, cur, en, (en-cur).total_seconds()/60.0, hx, lab, imp))

    cols = ["user_id","start_time","end_time","duration_min","hex_id","label","imputed_midnight"]
    return pd.DataFrame(out_rows, columns=cols)

def plot_week_timeline(d_week, title, max_lanes=10, impute_gap_min=30):
    """
    Draw one week of stays as horizontal bars: x = local hour of day, one row per weekday.

    Stays are drawn by their true start/end after being split at midnight.
    If a stay starts at 00:00 and the previous observed stay of the same user ended
    much earlier (> impute_gap_min minutes), that 00:00 start is treated as imputed
    and the stay is NOT drawn (left blank).

    Parameters
    ----------
    d_week : DataFrame with user_id, start_time, end_time, duration_min, hex_id, label;
        start_time / end_time are expected to be local datetimes of a single week.
    title : figure title.
    max_lanes : overlapping stays beyond this many lanes are drawn in the last lane.
    impute_gap_min : gap threshold (minutes) for the imputed-midnight rule above.

    Notes
    -----
    * A piece that ends on the following day (i.e. at midnight after splitting) is
      drawn up to x = 24. Previously its end hour evaluated to 0, so the bar was
      drawn backwards over [0, start] instead of [start, 24].
    * Pieces falling on or after the Monday that follows the earliest stay are
      dropped. Previously the tail of a stay crossing Sunday midnight was drawn on
      this week's Monday row.
    * An empty `d_week` produces empty axes instead of raising.
    """
    apply_nature_style()
    dow_names = ["Mon","Tue","Wed","Thu","Fri","Sat","Sun"]
    legend_labels = ["HOME","WORK","STUDY","PURCHASE","LEISURE","HEALTH","OTHER"]

    fig, ax = plt.subplots(figsize=(12, 4.8))

    def draw_rows(rows, dow, lane_h, z, alpha):
        for _, r in rows.iterrows():
            # ---- skip imputed midnight segments (leave blank) ----
            if bool(r["imputed_midnight"]) and r["start_time"].hour == 0:
                continue

            # lanes beyond the cap collapse onto the last visible lane
            lane = min(int(r["lane"]), max_lanes - 1)

            st, en = r["start_time"], r["end_time"]
            x0 = st.hour + st.minute/60 + st.second/3600
            if en.date() > st.date():
                x1 = 24.0   # piece runs to the end of its day
            else:
                x1 = en.hour + en.minute/60 + en.second/3600
            y = dow + 0.10 + lane * lane_h

            col = ACT_COL.get(str(r["label"]), "#777777")
            ax.plot([x0,x1],[y,y], lw=7, color=col, solid_capstyle="butt",
                    alpha=alpha, zorder=z)

    first_start = d_week["start_time"].min() if len(d_week) else pd.NaT
    if pd.notna(first_start):
        d = d_week.sort_values(["user_id","start_time"]).copy()

        # ---- detect likely day-start imputation ----
        d["prev_end"] = d.groupby("user_id")["end_time"].shift(1)
        d["gap_prev_min"] = (d["start_time"] - d["prev_end"]).dt.total_seconds()/60.0

        st_col = d["start_time"]
        is_midnight = (st_col.dt.hour==0) & (st_col.dt.minute==0) & (st_col.dt.second==0)

        # previous stay exists AND the gap up to 00:00 is large -> likely imputed
        d["imputed_midnight"] = is_midnight & d["prev_end"].notna() & (d["gap_prev_min"] > impute_gap_min)

        # keep only needed cols
        d = d[["user_id","start_time","end_time","duration_min","hex_id","label","imputed_midnight"]].copy()

        # calendar-aware week window (wall clock, so DST weeks still end at Monday 00:00)
        week_start = first_start.normalize() - pd.DateOffset(days=int(first_start.weekday()))
        week_end = week_start + pd.DateOffset(days=7)

        # split for per-day plotting, then drop pieces that spill into the next week
        d = split_cross_midnight_for_ui(d)
        d = d[d["start_time"] < week_end].copy()
        d["dow"] = d["start_time"].dt.weekday

        for dow in range(7):
            day = d[d["dow"]==dow].copy()
            if len(day)==0:
                continue

            # assign lanes
            day, nlanes = assign_lanes(day)
            nlanes = min(nlanes, max_lanes)
            lane_h = 0.8 / max(1, nlanes)

            # draw in time order; HOME underlay then others (so non-HOME visible)
            day_home  = day[day["label"]=="HOME"].sort_values("start_time")
            day_other = day[day["label"]!="HOME"].sort_values("start_time")

            draw_rows(day_home,  dow, lane_h, z=1, alpha=0.95)
            draw_rows(day_other, dow, lane_h, z=3, alpha=1.00)

    ax.set_xlim(0,24)
    ax.set_xticks(np.arange(0,24,2))
    ax.set_xlabel("Hour of day (local)")
    ax.set_ylim(0,7)
    ax.set_yticks(np.arange(0.5,7.5,1.0))
    ax.set_yticklabels(dow_names)
    ax.set_title(title)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.tick_params(direction="in")

    handles = [Line2D([0],[0], color=ACT_COL[a], lw=7) for a in legend_labels]
    ax.legend(handles, legend_labels,
              ncol=4, loc="upper center", bbox_to_anchor=(0.5, -0.18), frameon=False)

    plt.tight_layout(rect=[0,0.08,1,1])
    plt.show()
    plt.close(fig)

In [16]:
def h3_to_geojson_polygon(cell):
    """Return the cell boundary as GeoJSON Polygon coordinates.

    The result is a list holding one closed ring of (lon, lat) pairs
    (first point repeated at the end).

    Works with both h3-py API generations: `cell_to_boundary` when present,
    otherwise the v3 name `h3_to_geo_boundary`. The boundary is requested with
    default arguments, i.e. (lat, lon) order, because newer 4.x releases no longer
    accept the `geo_json` keyword while older ones default it to False.
    """
    if hasattr(h3, "cell_to_boundary"):
        boundary = h3.cell_to_boundary(cell)
    else:
        boundary = h3.h3_to_geo_boundary(cell)
    ring = [(lon, lat) for lat, lon in boundary]
    ring.append(ring[0])   # GeoJSON rings must be closed
    return [ring]

def to_parent(cell, res_parent):
    """Return the parent of `cell` at resolution `res_parent`.

    Uses `h3.cell_to_parent` when the installed h3 module has it,
    otherwise the older `h3.h3_to_parent`.
    """
    if hasattr(h3, "cell_to_parent"):
        return h3.cell_to_parent(cell, res_parent)
    return h3.h3_to_parent(cell, res_parent)

def map_user_week(d_week, display_parent_res=8, min_dwell=10):
    """Build a folium map of the hexes visited in `d_week`, coloured by dominant label.

    Parameters
    ----------
    d_week : stays with hex_id, label, duration_min, start_time, end_time.
    display_parent_res : H3 resolution to aggregate hexes to before drawing;
        None draws the original cells.
    min_dwell : hexes whose total duration_min is below this are not drawn.

    Returns
    -------
    folium.Map centred on the hex with the largest total dwell and fitted to all
    drawn hexes; a default map at [51.5, -0.1] when no hex passes `min_dwell`.
    """
    stays = d_week.copy()

    # parent aggregation for visibility
    cell_ids = stays["hex_id"].astype(str)
    if display_parent_res is None:
        stays["hex_show"] = cell_ids
    else:
        stays["hex_show"] = cell_ids.apply(lambda x: to_parent(x, display_parent_res))

    # minutes per (hex, label), then one row per hex with a column per label
    dwell_long = stays.groupby(["hex_show","label"], as_index=False)["duration_min"].sum()
    mix = dwell_long.pivot_table(index="hex_show", columns="label", values="duration_min", fill_value=0)

    dwell_total = mix.sum(axis=1)
    dominant = mix.idxmax(axis=1)

    span = stays.groupby("hex_show").agg(
        earliest=("start_time","min"),
        latest=("end_time","max"),
        n=("hex_show","size")
    )

    summary = pd.DataFrame({"dominant": dominant, "dwell_total": dwell_total}).join(span)
    summary = summary[summary["dwell_total"] >= min_dwell].copy()

    if not len(summary):
        return folium.Map(location=[51.5,-0.1], zoom_start=12, tiles="CartoDB positron")

    # centre on the hex with the most dwell
    top_hex = summary["dwell_total"].idxmax()
    if hasattr(h3, "cell_to_latlng"):
        latc, lonc = h3.cell_to_latlng(top_hex)
    else:
        latc, lonc = h3.h3_to_geo(top_hex)
    fmap = folium.Map(location=[latc, lonc], zoom_start=13, tiles="CartoDB positron")

    # centre marker
    folium.CircleMarker(
        location=[latc, lonc],
        radius=5,
        fill=True,
        fill_opacity=0.9,
        popup="Top dwell hex center"
    ).add_to(fmap)

    def _style_from_properties(feat):
        # colour travels with the feature, so one style function serves every hex
        colour = feat["properties"]["color"]
        return {
            "fillColor": colour,
            "color": colour,
            "weight": 2,
            "fillOpacity": 0.35
        }

    bounds_points = []
    for hx, row in summary.iterrows():
        act = str(row["dominant"])
        col = ACT_COL.get(act, "#777777")

        # three largest labels for this hex (zero-minute labels omitted)
        top3 = mix.loc[hx].sort_values(ascending=False).head(3)
        break_html = "<br>".join([f"{k}: {v:.0f} min" for k,v in top3.items() if v > 0])

        tooltip = f"{act} | {row['earliest']:%m-%d %H:%M}–{row['latest']:%m-%d %H:%M}"
        popup = (f"<b>hex</b>: {hx}<br><b>dominant</b>: {act}<br>"
                 f"<b>dwell_total</b>: {row['dwell_total']:.0f} min<br>"
                 f"<b>top labels</b>:<br>{break_html}<br>"
                 f"<b>span</b>: {row['earliest']:%Y-%m-%d %H:%M} – {row['latest']:%Y-%m-%d %H:%M}<br>"
                 f"<b>stays</b>: {int(row['n'])}")

        coords = h3_to_geojson_polygon(hx)
        bounds_points.extend((lat, lon) for lon, lat in coords[0])

        feature = {
            "type": "Feature",
            "properties": {"color": col},
            "geometry": {"type": "Polygon", "coordinates": coords},
        }

        folium.GeoJson(
            feature,
            style_function=_style_from_properties,
            tooltip=folium.Tooltip(tooltip),
            popup=folium.Popup(popup, max_width=360),
        ).add_to(fmap)

    if bounds_points:
        lats = [p[0] for p in bounds_points]
        lons = [p[1] for p in bounds_points]
        fmap.fit_bounds([[min(lats), min(lons)], [max(lats), max(lons)]])

    return fmap

In [17]:
# ---------- Widgets ----------
# Horizon and model pick the prediction file; the model list starts with what exists for "1W".
horizon_dd = widgets.Dropdown(options=["1W","1M","3M"], value="1W", description="Horizon:")
model_dd = widgets.Dropdown(options=available_models("1W"), description="Model:")

# Sample groups are read from group_to_users once, at widget creation time;
# groups added to the dict afterwards do not appear until this cell is re-run.
group_options = ["All users"] + sorted(group_to_users.keys()) if group_to_users else ["All users"]
group_dd = widgets.Dropdown(options=group_options, value="All users", description="Sample:")

# User and week options are filled in by refresh_users / refresh_weeks.
user_dd = widgets.Dropdown(options=[], description="User:")
week_dd = widgets.Dropdown(options=[], description="Week:")

btn_prev = widgets.Button(description="Prev user", icon="arrow-left")
btn_next = widgets.Button(description="Next user", icon="arrow-right")

# All rendered output (text, tables, figure, map) goes into this box.
out_box = widgets.Output()
# Re-entrancy flag checked by full_refresh.
_lock = {"busy": False}

# ---------- Helpers ----------
def refresh_model_options():
    """Update the model dropdown for the current horizon, keeping the selection if possible.

    The current value is captured BEFORE `options` is assigned: replacing a
    selection widget's options resets its selection to the first entry, so reading
    `model_dd.value` afterwards can never see the previously selected model.
    Falls back to the first available model, or None when none exist.
    """
    previous = model_dd.value
    opts = available_models(horizon_dd.value)
    model_dd.options = opts
    if previous in opts:
        model_dd.value = previous
    elif opts:
        model_dd.value = opts[0]
    else:
        model_dd.value = None

def get_user_pool():
    """Return the list of user ids selectable for the current horizon/model/sample.

    All users of the loaded prediction table are returned (sorted) unless the
    horizon is "1W" and a sample group is selected; then the group's users that
    are present in the table are returned in the group's own order. An unknown
    group, or one with no users in the table, falls back to all users.
    """
    df = load_pred(horizon_dd.value, model_dd.value)
    all_users = sorted(df["user_id"].unique())

    # Sample-pack filtering ONLY for 1W; for 1M/3M always show all users
    if horizon_dd.value != "1W" or group_dd.value == "All users":
        return all_users

    pack_users = group_to_users.get(group_dd.value, [])
    known_users = set(all_users)   # built once, not once per candidate
    pool = [u for u in pack_users if u in known_users]
    return pool if pool else all_users

def refresh_users():
    """Refill the user dropdown for the current context, keeping the selected user if possible.

    With no model selected the dropdown is emptied. Otherwise the pool comes from
    `get_user_pool` (same sample-pack rule as before: group filter only for "1W",
    falling back to all users when the group yields nobody). The previously
    selected user is restored when still in the pool, else the first entry is chosen.
    """
    if model_dd.value is None:
        user_dd.options = []
        user_dd.value = None
        return

    # single source of truth for the pool rule (was duplicated here)
    pool = get_user_pool()

    # preserve current user if possible; capture it before options resets the selection
    current = user_dd.value
    user_dd.options = pool
    if current in pool:
        user_dd.value = current
    else:
        user_dd.value = pool[0] if pool else None

def refresh_weeks():
    """Refill the week dropdown with the Monday week-starts in which the selected user has stays.

    Week starts are derived from start_time converted to local time (TZ_LONDON),
    listed in ascending order as strings, and the first one is selected.
    The dropdown is emptied when no model or user is selected or the user has no rows.
    """
    def _clear_weeks():
        week_dd.options = []
        week_dd.value = None

    if model_dd.value is None or user_dd.value is None:
        _clear_weeks()
        return

    preds = load_pred(horizon_dd.value, model_dd.value)
    user_rows = preds[preds["user_id"] == user_dd.value].copy()
    if not len(user_rows):
        _clear_weeks()
        return

    user_rows["start_time"] = to_local_time_series(user_rows["start_time"], tz=TZ_LONDON, assume_utc_if_naive=True)
    week_starts = user_rows["start_time"].apply(week_start_monday_local).dropna().unique()
    week_dd.options = [str(w) for w in sorted(week_starts)]
    week_dd.value = week_dd.options[0] if week_dd.options else None

def goto_user(delta):
    """Move the user selection by `delta` positions, clamped to the ends of the option list.

    Does nothing when there are no options or nothing is selected.
    """
    choices = list(user_dd.options)
    current = user_dd.value
    if current is None or not choices:
        return
    target = choices.index(current) + delta
    target = min(max(target, 0), len(choices) - 1)
    user_dd.value = choices[target]

# Step through the user list; the click event argument is ignored.
# Re-running this cell registers the callbacks again on the same button objects.
btn_prev.on_click(lambda _: goto_user(-1))
btn_next.on_click(lambda _: goto_user(+1))

def render():
    """Redraw the output box for the current horizon / model / user / week selection.

    Prints the selection, the stays that start at 00:00, the first non-HOME stays and
    the three proxy metrics, then shows the weekly timeline and the hex map.
    Prints a short notice instead when model, user or week is not selected.
    """
    preview_cols = ["start_time","end_time","duration_min","hex_id","label"]

    with out_box:
        clear_output(wait=True)

        selection = (model_dd.value, user_dd.value, week_dd.value)
        if any(choice is None for choice in selection):
            print("No selection / no data.")
            return
        model, user, week = selection

        preds = load_pred(horizon_dd.value, model)
        d_week = filter_user_week(preds, user, week, tz=TZ_LONDON)

        print(f"Horizon={horizon_dd.value} | Model={model} | Sample={group_dd.value}")
        print(f"User={user} | Week={week} | rows={len(d_week)}")

        # stays whose start is exactly on the hour 00 and minute 00
        starts = d_week["start_time"]
        at_midnight = d_week[(starts.dt.hour==0) & (starts.dt.minute==0)].copy()
        print("Segments starting at 00:00 in this slice:", len(at_midnight))
        display(at_midnight[preview_cols].head(10))

        # quick sanity: list non-HOME stays in this week
        non_home = d_week[d_week["label"]!="HOME"].copy().sort_values("start_time")
        print("\nNon-HOME stays (top 15):")
        display(non_home[preview_cols].head(15))

        nh, ent, sw = proxy_summary(d_week)
        print(f"Proxy: night_home_share={nh:.3f} | tophex_entropy={ent:.3f} | switch_rate(day-avg)={sw:.2f}")

        plot_week_timeline(d_week, title=f"{model} — timeline")
        week_map = map_user_week(d_week, display_parent_res=8, min_dwell=10)
        display(week_map)

def full_refresh():
    """Rebuild models, users and weeks for the current context and render once.

    Guarded by `_lock["busy"]`: a call made while another refresh is in progress
    returns immediately. The flag is always cleared, even if a step raises.
    """
    if _lock["busy"]:
        return

    _lock["busy"] = True
    try:
        # sample packs exist for 1W only; any other horizon is forced to "All users"
        # to avoid an empty pool
        sample_selected = group_dd.value != "All users"
        if horizon_dd.value != "1W" and sample_selected:
            group_dd.value = "All users"

        for step in (refresh_model_options, refresh_users, refresh_weeks, render):
            step()
    finally:
        _lock["busy"] = False

# ---------- Wire events ----------
def refresh_for_context_change():
    """Handler for horizon / model / sample changes: run one guarded full refresh.

    Delegating to `full_refresh` means the `_lock` guard applies. Widget values that
    the refresh itself changes (model list, forced sample, user list) fire their
    observers while the refresh is running; those nested calls now return
    immediately instead of starting another full refresh and render.
    """
    full_refresh()

def on_user_change(_=None):
    """Handler for user changes: rebuild the week list and render once.

    When the user changes, do not refresh the user list itself!
    Skipped while another refresh is running (that refresh renders at its end).
    The lock is held while the week list is rebuilt so that the resulting
    week-value change does not trigger a second render.
    """
    if _lock["busy"]:
        return
    _lock["busy"] = True
    try:
        refresh_weeks()
        render()
    finally:
        _lock["busy"] = False

def on_week_change(_=None):
    """Handler for week changes: render, unless a refresh in progress will do so itself."""
    if _lock["busy"]:
        return
    render()

# wiring
# Context widgets (horizon / model / sample) rebuild everything downstream of them.
horizon_dd.observe(lambda _: refresh_for_context_change(), names="value")
model_dd.observe(lambda _: refresh_for_context_change(), names="value")
group_dd.observe(lambda _: refresh_for_context_change(), names="value")

# User changes rebuild the week list; week changes only re-render.
user_dd.observe(lambda _: on_user_change(), names="value")
week_dd.observe(lambda _: on_week_change(), names="value")

# ---------- Init ----------
# Populate the dropdowns and render once before the widgets are shown.
full_refresh()

display(widgets.HBox([horizon_dd, model_dd, group_dd, user_dd, week_dd]))
display(widgets.HBox([btn_prev, btn_next]))
display(out_box)

HBox(children=(Dropdown(description='Horizon:', options=('1W', '1M', '3M'), value='1W'), Dropdown(description=…

HBox(children=(Button(description='Prev user', icon='arrow-left', style=ButtonStyle()), Button(description='Ne…

Output()

In [24]:
from pathlib import Path
import pandas as pd

# NOTE(review): this cell repeats step 2 of the earlier group-loading cell (same file,
# same group key), and Path / pd are already imported at the top of the notebook.
# It also runs after the widgets were built, and the sample dropdown takes its options
# from group_to_users only at creation time -- candidate for removal; confirm it is unused.

# add an extra group: secondary-night-home users (>=60min) from notebook 08
sec_path = OUT_TAB / "uk_1w_secondary_night_home_users_ge60min.csv"
if not sec_path.exists():
    print("No secondary-night-home list found at:", sec_path)
else:
    sec_users = pd.read_csv(sec_path)["user_id"].astype(str).unique().tolist()
    group_to_users["secondary_night_home_ge60min"] = sorted(sec_users)
    print("Added sample group:", "secondary_night_home_ge60min", "| n =", len(sec_users))

Added sample group: secondary_night_home_ge60min | n = 13
