In [None]:
import pandas as pd
from pathlib import Path
import geopandas as gpd
from shapely import wkb

csv_path = Path(r"D:\Siyu Zhao\data\Auckland region park\waitakere_trajectories.csv")

# 1. Load the raw trajectory points. Dtypes are pinned so that the hashed IDs
#    stay strings and the Unix timestamps stay 64-bit integers.
df = pd.read_csv(
    csv_path,
    sep=",",
    header=0,
    dtype={
        "hashed_id": "string",
        "lat": "float64",
        "lon": "float64",
        "time": "int64",
        "polygon_name": "category",
        "geom": "string",
    },
)

# 2. Convert the Unix timestamp (seconds) to a timezone-aware datetime and
#    express it in Auckland local time.
df["datetime"] = pd.to_datetime(df["time"], unit="s", utc=True).dt.tz_convert("Pacific/Auckland")

# 3. Epoch seconds as float. A timezone conversion does not change the
#    underlying instant, so this is simply `time` cast to float; the cast is
#    vectorised instead of calling `.timestamp()` on every row.
df["timestamp"] = df["time"].astype("float64")

# 4. Decode the hex-encoded WKB column into Shapely geometries. Rows with a
#    missing `geom` get None instead of aborting the whole load.
#    Note: `df` remains a plain DataFrame; no GeoDataFrame is built here.
df["geometry"] = df["geom"].apply(
    lambda x: wkb.loads(x, hex=True) if pd.notna(x) else None
)


# # 5. print
# print(df.head())

In [None]:

# Give every point a stable positional identifier so the labels produced
# later can be joined back onto the original rows.
df = df.reset_index(drop=True)
df["row_id"] = df.index

# One time-ordered frame per user (groupby iterates users in sorted key order).
per_user_sorted = [
    user_points.sort_values("time")
    for _, user_points in df.groupby("hashed_id")
]

# `data`: list of float arrays with columns (lat, lon, time), one per user.
data = [ordered[["lat", "lon", "time"]].to_numpy() for ordered in per_user_sorted]

# `row_ids`: the row_id of every point, in exactly the same order as the
# points appear when the arrays in `data` are laid end to end.
row_ids = [
    rid
    for ordered in per_user_sorted
    for rid in ordered["row_id"].tolist()
]

In [None]:
# Convert each user's trajectory into a NumPy array of shape (N, 3) with columns (lat, lon, time).
# data = [
#     group.sort_values("time")[["lat", "lon", "time"]].to_numpy()
#     for _, group in df.groupby("hashed_id")
# ]

# print(data)

In [None]:

from infostop import Infostop
import numpy as np

# Infostop settings. Distances are in metres and durations in seconds
# (200 m / 100 m / 10 min / 24 h).
infostop_params = {
    # Maximum distance to stay in the same place (for a stop).
    "r1": 200,
    # Maximum distance to group stops into one destination.
    "r2": 100,
    # Minimum time a person must stay within a small area to be considered a
    # stop; anything shorter is not a stop.
    "min_staying_time": 10 * 60,
    # Maximum time allowed between two nearby points for them to still count
    # as the same stop; a longer gap splits them into separate stops.
    "max_time_between": 24 * 60 * 60,
}
model = Infostop(**infostop_params)

# Fit on the list of per-user trajectories built earlier; the result is
# flattened into a single label array in a later cell.
labels = model.fit_predict(data)



In [None]:

# Flatten the per-user label arrays into one array, preserving user order.
all_labels = np.concatenate(labels)

# Points labelled -1 are not counted as valid stop points.
is_valid_stop = all_labels != -1
num_valid_stops = is_valid_stop.sum()

print(f"200m，100m，10min有效的停留点个数：{num_valid_stops}")

In [None]:
from visualize import plot_map

# Rendering options passed through to plot_map.
map_options = dict(
    display_data="unique_stationary",
    polygons=True,
    heatmap=True,
    scatter=True,
    scatter_opacity=0.1,
    scatter_radius=3,
    tiles="OpenStreetMap",  # alternative basemap: "CartoDB positron"
    zoom_start=13,
)
folmap = plot_map(model, **map_options)

# Final expression of the cell, so the notebook renders it as the output
# (presumably the underlying map object - confirm against plot_map).
folmap.m

In [None]:

import pandas as pd
import numpy as np

# Pair every labelled point's row_id with the destination label it received;
# both sequences were built in the same per-user, time-sorted order.
df_labels = pd.DataFrame({"row_id": row_ids}).assign(destination_id=all_labels)

# Left join so that every original point is kept.
df_result = pd.merge(df, df_labels, how="left", on="row_id")
# df_result_clean = df_result[df_result["destination_id"] != -1]

valid_label_count = (all_labels != -1).sum()
print("有效停留点标签数：", valid_label_count)
print("最终 merge 后的停留点数：", len(df_result))
print(df_result.head())

In [None]:

# Persist the per-point table (original columns plus destination_id).
output_path = r"D:\Siyu Zhao\data\Auckland region park\infostop.csv"
df_result.to_csv(path_or_buf=output_path, index=False)


In [None]:
import pandas as pd
from pathlib import Path

# Reload the saved per-point labels so the cells below can run without
# repeating the Infostop fit. Column dtypes are re-inferred by pandas here.
path = Path(r"D:\Siyu Zhao\data\Auckland region park\infostop.csv")
df_result = pd.read_csv(filepath_or_buffer=path)


In [None]:

# Step 1: order the points chronologically within each user
df_result = df_result.sort_values(by=["hashed_id", "timestamp"]).reset_index(drop=True)

# Step 2: a row opens a new session when the user or the destination differs
# from the row directly above it. Only these two columns are compared, so
# consecutive points with the same user and destination stay in one session
# no matter how much time separates them.
df_result["prev_user"] = df_result["hashed_id"].shift(1)
df_result["prev_dest"] = df_result["destination_id"].shift(1)

user_changed = df_result["hashed_id"].ne(df_result["prev_user"])
dest_changed = df_result["destination_id"].ne(df_result["prev_dest"])
df_result["new_session"] = (user_changed | dest_changed).astype(int)

# Step 3: the running total of session starts is the session id
df_result["stay_session_id"] = df_result["new_session"].cumsum()

# Step 4: collapse every session to one row
session_aggregations = {
    "hashed_id": pd.NamedAgg(column="hashed_id", aggfunc="first"),
    "destination_id": pd.NamedAgg(column="destination_id", aggfunc="first"),
    "start_ts": pd.NamedAgg(column="timestamp", aggfunc="min"),
    "end_ts": pd.NamedAgg(column="timestamp", aggfunc="max"),
    "point_count": pd.NamedAgg(column="timestamp", aggfunc="count"),
    "lat": pd.NamedAgg(column="lat", aggfunc="mean"),
    "lon": pd.NamedAgg(column="lon", aggfunc="mean"),
}
sessions_grouped = df_result.groupby("stay_session_id")
session_df = sessions_grouped.agg(**session_aggregations).reset_index()

# Step 5: session length in minutes (timestamps are in seconds)
session_df["duration_minutes"] = session_df["end_ts"].sub(session_df["start_ts"]).div(60)

# Step 6: human-readable start/end in Auckland local time
for ts_col, dt_col in (("start_ts", "start_time"), ("end_ts", "end_time")):
    session_df[dt_col] = pd.to_datetime(
        session_df[ts_col], unit="s", utc=True
    ).dt.tz_convert("Pacific/Auckland")

# Step 7: preview the result
preview_columns = [
    "hashed_id",
    "destination_id",
    "start_time",
    "end_time",
    "duration_minutes",
    "lat",
    "lon",
]
print(session_df[preview_columns].head())



In [None]:
# Number of sessions, including those whose destination_id is -1.
print(session_df.shape[0])

In [None]:

# Keep only the sessions whose destination_id is not -1; copy so later edits
# to the cleaned frame cannot touch session_df.
has_destination = session_df["destination_id"].ne(-1)
session_df_clean = session_df.loc[has_destination].copy()

# Session counts after and before filtering, then the kept durations.
print(len(session_df_clean))
print(len(session_df))
print(session_df_clean["duration_minutes"])


In [None]:

# Persist the filtered stay sessions (destination_id != -1).
path = r"D:\Siyu Zhao\data\Auckland region park\infostop_stay_duration.csv"
session_df_clean.to_csv(path_or_buf=path, index=False)

In [None]:
import pandas as pd


# Re-open the saved per-point Infostop table and strip the helper columns
# used for session detection, then write the file back in place.
# NOTE: this rebinds `df`, which earlier cells use for the raw trajectory frame.
df = pd.read_csv(r"D:\Siyu Zhao\data\Auckland region park\infostop.csv")

# errors="ignore": the CSV is written before the session columns are added to
# df_result, so on a fresh run they are absent from the file and a strict drop
# would raise KeyError. Dropping only the columns that exist keeps this cell
# safe to run whether or not the file contains them.
df = df.drop(
    columns=["prev_user", "prev_dest", "new_session", "stay_session_id"],
    errors="ignore",
)

df.to_csv(r"D:\Siyu Zhao\data\Auckland region park\infostop.csv", index=False)

print(len(df))