In [6]:
import pandas as pd
from skmob import TrajDataFrame
from skmob.preprocessing import detection
# 1. Import necessary libraries
import pandas as pd
import geopandas as gpd
from shapely import wkb
from pathlib import Path

# 2. Read the CSV file containing trajectory data
csv_path = Path(r"\\tsclient\D\Siyu Zhao\data\Auckland region park\waitakere_trajectories.csv")


df = pd.read_csv(
    csv_path,
    sep=",",            
    header=0,
    dtype={
        "hashed_id": "string",
        "lat": "float64",          
        "lon": "float64",          
        "time": "int64",           
        "polygon_name": "category",
        "geom": "string"
    }
)

# 3. change unix_timestamp to datetime
df["datetime"] = pd.to_datetime(df["time"], unit="s", utc=True) 

# # 4. Convert the WKB geometry column to a GeoDataFrame
# df["geometry"] = df["geom"].apply(lambda x: wkb.loads(bytes.fromhex(x))) # Convert WKB hex string to Shapely geometry
# gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")  # Set the coordinate reference system to WGS 84

# 5. print
print(df.columns.tolist()) # Display the list of columns in the DataFrame

['hashed_id', 'lat', 'lon', 'time', 'polygon_name', 'geom', 'datetime']


In [None]:
# stay points detection
df = df.rename(columns={"lon": "lng"})  # Rename 'lon' to 'lng' for consistency with skmob

# 6. Create a TrajDataFrame from the DataFrame
tdf = TrajDataFrame(
    df[["hashed_id", "datetime", "lat", "lng"]],
    user_id='hashed_id',
    timestamp=True
)

# 7. Detect stay locations using skmob's detection module
stdf = detection.stay_locations(
    tdf,
    minutes_for_a_stop=5, 
    spatial_radius_km=0.05, 
    leaving_time=True 
)

  stdf = tdf.groupby(groupby, group_keys=False, as_index=False).apply(_stay_locations_trajectory, stop_radius=stop_radius,


In [11]:
# match stay points with POIs

from shapely.geometry import Point

# 8. Read the POIs from a GeoPackage file
pois_path = Path(r"\\tsclient\D\Siyu Zhao\data\Auckland region park\Auckland-parks-poi.gpkg")
pois = gpd.read_file(pois_path)

# 9. Convert the stay points back to a DataFrame
stdf['geometry'] = [Point(xy) for xy in zip(stdf['lng'], stdf['lat'])]  # Create Point geometries from lat/lng
stay_gdf = gpd.GeoDataFrame(stdf, geometry='geometry', crs="EPSG:4326") # Convert to GeoDataFrame with WGS 84 CRS

stay_gdf = stay_gdf.to_crs(epsg=3857) # Convert to Web Mercator CRS
pois = pois.to_crs(epsg=3857)         # Convert POIs to Web Mercator CRS

# 10. Perform a spatial join to find the nearest POI for each stay point
matched = gpd.sjoin_nearest(stay_gdf, pois, how="left", distance_col="distance")

# 11. Filter matched results to keep only those within a certain distance 
matched = matched[matched["distance"] <= 50]

print(matched.head())  # Display the first few rows of the matched DataFrame
print(matched.columns.tolist())  # Display the list of columns in the matched DataFrame
print(matched.shape)  # Display the shape of the matched DataFrame

                                         uid                  datetime  \
0   0001ebc00454fd08b2f233205c357b1f35a8a5fe 2019-05-16 06:03:57+00:00   
5   0004ec53405b3ce3052846a0519058689fc839af 2020-09-27 04:09:32+00:00   
6   0004ec53405b3ce3052846a0519058689fc839af 2020-09-27 04:14:39+00:00   
9   0004ec53405b3ce3052846a0519058689fc839af 2020-09-27 04:33:30+00:00   
10  0004ec53405b3ce3052846a0519058689fc839af 2020-09-27 04:39:37+00:00   

     lat_left         lng           leaving_datetime  \
0  -36.954955  174.473730  2019-05-16 08:28:33+00:00   
5  -36.954766  174.472970  2020-09-27 04:14:39+00:00   
6  -36.952642  174.468510  2020-09-27 04:21:23+00:00   
9  -36.953418  174.474422  2020-09-27 04:39:37+00:00   
10 -36.949889  174.467558  2020-09-27 04:45:39+00:00   

                             geometry  index_right          id   timestamp  \
0    POINT (19422326.78 -4432829.954)          896    79293437  1672816915   
5   POINT (19422242.178 -4432803.625)          896    79293437

In [13]:
import pandas as pd

# 12. Define the tags for tourism and transport categories
tourism_tags = {
    "tourism": [
        "attraction", "viewpoint",  "museum", "gallery",
         "sightseeing", "zoo", "theme_park", "aquarium", 
    ],
    "leisure": [
        "park", "nature_reserve", "garden", "playground", "picnic_site", "dog_park",
    ],
    "natural": [
        "waterfall", "peak", "beach", "spring", "volcano", 
        "cave_entrance" , "wetland", "bay" , "valley" ,
        "lake", "river", "stream",
    ],
    "sport" : [
        "surfing","scuba_diving","fishing","kayaking","rowing","rafting",
         "climbing","running","trail_running","mountain_biking","orienteering",
        "horse_riding","equestrian","paragliding","beachvolleyball","disc_golf"
    ],
    "amenity": [
        "restaurant", "cafe", "bar", "pub", "fast_food", "ice_cream", "toilets",
        "food_court","bbq","drinking_water","shower", "bench",
        "parking", "bicycle_parking", "ferry_terminal","fuel","bicycle_rental", "car_rental","car_sharing","taxi"
    ],
    "shop": [
        "convenience", "supermarket", "bakery", "mall",
        "outdoor","sports","surf","diving","bicycle","camping",
        "souvenir","gift","craft",
    ],
    "historic": [ "monument", "memorial","milestone", "heritage"],
                        
}

transport_tags = {
    "highway": [ "bus_stop","bus_station"],
    "public_transport": ["bus_stop", "ferry", "train_station","boarding_area"],
    "railway": ["subway","station"],
    "aeroway": ["aerodrome", "helipad"],
    "waterway": ["boat_ramp","dock"]
}

combined_tags = {**tourism_tags, **transport_tags} # Combine both dictionaries

# 13.Create a reverse mapping from value to category
value_to_category = {} 
for cat, values in combined_tags.items():
    for val in values:
        value_to_category[val] = cat

# 14. Define a function to extract the POI type from the matched DataFrame
def extract_poi_type(row):
    for col in row.index:
        if col in combined_tags:  # tourism, leisure, natural, etc.
            tag_value = row[col]
            if pd.notna(tag_value):
                return tag_value  # e.g., "park"
    return None

# 15. Apply the function to extract the POI type for each row in the matched DataFrame
matched["poi_type"] = matched.apply(extract_poi_type, axis=1)
# 16. Map the POI type to its category using the reverse mapping
matched["poi_category"] = matched["poi_type"].map(value_to_category)

print(matched.columns.tolist())

['uid', 'datetime', 'lat_left', 'lng', 'leaving_datetime', 'geometry', 'index_right', 'id', 'timestamp', 'visible', 'version', 'tags', 'lon', 'changeset', 'lat_right', 'addr:city', 'addr:housenumber', 'addr:housename', 'addr:postcode', 'addr:street', 'email', 'name', 'opening_hours', 'operator', 'phone', 'ref', 'website', 'tourism', 'zoo', 'leisure', 'outdoor_seating', 'natural', 'amenity', 'bicycle_parking', 'fountain', 'internet_access', 'parking', 'source', 'start_date', 'wikipedia', 'shop', 'historic', 'memorial', 'access', 'foot', 'highway', 'lit', 'public_transport', 'railway', 'osm_type', 'wetland', 'building', 'building:levels', 'drinking_water', 'area', 'surface', 'aeroway', 'distance', 'poi_type', 'poi_category']


In [17]:
# 17. Clean up the matched DataFrame to keep only relevant columns
columns_to_keep = [
    'uid', 'datetime', 'lat_left', 'lng', 'leaving_datetime', 'geometry',
    'index_right', 'id', 'name','lon', 'lat_right', 'distance', 'poi_type'
]
matched_cleaned = matched[columns_to_keep]
print(matched_cleaned.columns.tolist())
print(matched_cleaned.head())

['uid', 'datetime', 'lat_left', 'lng', 'leaving_datetime', 'geometry', 'index_right', 'id', 'name', 'lon', 'lat_right', 'distance', 'poi_type']
                                         uid                  datetime  \
0   0001ebc00454fd08b2f233205c357b1f35a8a5fe 2019-05-16 06:03:57+00:00   
5   0004ec53405b3ce3052846a0519058689fc839af 2020-09-27 04:09:32+00:00   
6   0004ec53405b3ce3052846a0519058689fc839af 2020-09-27 04:14:39+00:00   
9   0004ec53405b3ce3052846a0519058689fc839af 2020-09-27 04:33:30+00:00   
10  0004ec53405b3ce3052846a0519058689fc839af 2020-09-27 04:39:37+00:00   

     lat_left         lng           leaving_datetime  \
0  -36.954955  174.473730  2019-05-16 08:28:33+00:00   
5  -36.954766  174.472970  2020-09-27 04:14:39+00:00   
6  -36.952642  174.468510  2020-09-27 04:21:23+00:00   
9  -36.953418  174.474422  2020-09-27 04:39:37+00:00   
10 -36.949889  174.467558  2020-09-27 04:45:39+00:00   

                             geometry  index_right          id  \
0    POI

In [19]:

# 18. Save the cleaned DataFrame to a CSV file
matched_cleaned.to_csv(
    Path(r"\\tsclient\D\Siyu Zhao\data\Auckland region park\matched_pois.csv"),
    index=False
)