In [46]:
from srai.datasets import GeolifeDataset
import os
from tqdm import tqdm
import glob
import re
import pandas as pd
import geopandas as gpd

In [47]:
# set os var
# os.environ['HF_token'] = '' # not active

In [3]:
geolife = GeolifeDataset()
gdf = geolife.load(hf_token=os.environ['HF_token'])

In [4]:
gdf.head()

Unnamed: 0,latitude,longitude,altitude,time,mode,trajectory_id,user_id,geometry
0,"[39.988992, 39.990964, 39.993207]","[116.327023, 116.327041, 116.326827]","[128.937004593176, 221.128615485564, 217.19159...","[2000-01-01 23:12:19, 2000-01-01 23:13:21, 200...","[unknown, unknown, unknown]",20000101231219,163,"LINESTRING (116.32702 39.98899, 116.32704 39.9..."
1,"[39.9742333333333, 39.9743166666667, 39.974466...","[116.330383333333, 116.33045, 116.33045, 116.3...","[823.490813648294, 823.490813648294, 741.46981...","[2007-04-12 09:31:32, 2007-04-12 09:39:37, 200...","[unknown, unknown, unknown, unknown, unknown, ...",20070412093132,142,"LINESTRING (116.33038 39.97423, 116.33045 39.9..."
2,"[39.9755166666667, 39.97585, 39.9759833333333,...","[116.330283333333, 116.3304, 116.330466666667,...","[351.049868766404, 114.829396325459, 114.82939...","[2007-04-12 10:18:53, 2007-04-12 10:20:15, 200...","[unknown, unknown, bike, bike, walk]",20070412101853,161,"LINESTRING (116.33028 39.97552, 116.33040 39.9..."
3,"[39.9764666666667, 39.9764, 39.97625, 39.9762,...","[116.330066666667, 116.33015, 116.330266666667...","[173.884514435696, 173.884514435696, 173.88451...","[2007-04-12 10:21:16, 2007-04-12 10:21:22, 200...","[bike, bike, bike, bike, bike, bike, bike, bik...",20070412102116,163,"LINESTRING (116.33007 39.97647, 116.33015 39.9..."
4,"[39.97585, 39.9759833333333, 39.9761, 39.97623...","[116.3304, 116.330466666667, 116.3305, 116.330...","[114.829396325459, 114.829396325459, 118.11023...","[2007-04-12 10:23:25, 2007-04-12 10:24:37, 200...","[walk, walk, walk, walk]",20070412102325,161,"LINESTRING (116.33040 39.97585, 116.33047 39.9..."


In [53]:
def read_points(filename: str):
    # print(f"Reading {filename}")
    df = pd.read_csv(filename, skiprows=6, header=None
    )  # WGS84
    columns = [
        "latitude",
        "longitude",
        "zero",
        "altitude",
        "date",
        "date_str",
        "time",
        "trajectory_id",
    ]
    match2 = lambda x: x.split('/')[-1][:-4]
    # # print(match2(filename))
    # match = re.search(r"\\(\d+)\.plt$", filename).group(
    #     1
    # )  # get file name to apply to ID
    match = match2(filename)
    df["trajectory_id"] = match
    df.rename(columns=dict(zip(df.columns, columns)), inplace=True)
    df.drop(["zero"], inplace=True, axis=1)  # zero column is useless

    df["time"] = df["date_str"] + " " + df["time"]
    df["datetime"] = pd.to_datetime(df["time"], format="%Y-%m-%d %H:%M:%S")

    df["mode"] = "unknown"  # default value
    if os.path.exists(os.path.join(geolife_dir, filename, "..", "..", "labels.txt")):
        labels = pd.read_csv(
            os.path.join(geolife_dir, filename, "..", "..", "labels.txt"),
            sep="\s+",
            skiprows=1,
            header=None,
        )
        labels["start_time"] = pd.to_datetime(labels[0] + " " + labels[1])
        labels["end_time"] = pd.to_datetime(labels[2] + " " + labels[3])
        labels.rename(columns={4: "mode"}, inplace=True)
        labels = labels[["start_time", "end_time", "mode"]]

        for i, row in labels.iterrows():
            mask = (df["datetime"] >= row["start_time"]) & (
                    df["datetime"] <= row["end_time"]
            )
            df.loc[mask, "mode"] = row["mode"]

    gdf = gpd.GeoDataFrame(
        df,
        columns=[
            "latitude",
            "longitude",
            "altitude",
            "date",
            "date_str",
            "time",
            "trajectory_id",
            "mode",
        ],
        geometry=gpd.points_from_xy(df["longitude"], df["latitude"]),
        crs="EPSG:4326",
    )

    return gdf


def read_user_points(user_folder: str = 'Data', user_name: str = "000"):
    # create geodataframe with user_id as column
    data_folder = os.path.join(user_folder, user_name)
    plt_files = glob.glob(os.path.join(data_folder, "Trajectory", "*.plt"))
    dataframes = []
    for file in plt_files:
        geo_df = read_points(file)
        geo_df["user_id"] = user_name
        dataframes.append(geo_df)
    try:
        final_geo_df = pd.concat(dataframes, ignore_index=True)
        return final_geo_df
    except ValueError:
        print(f"Error with {user_name}")
        return None
    


def read_all_users(folder):
    subfolders = os.listdir(folder)
    dfs = []
    # print(f"Processing {len(subfolders)} users trajectories")
    for sf in tqdm(
            subfolders,
            desc="Processing users trajectories",
            colour="cyan",
            smoothing=1.0,
            total=len(subfolders),
    ):
        # print(f"Processing {sf}")
        df = read_user_points(folder, sf)
        dfs.append(df)
    df = pd.concat(dfs, ignore_index=True)
    return gpd.GeoDataFrame(df, geometry=df.geometry, crs="EPSG:4326")

In [54]:
geolife_dir = 'Data/'
gdf = read_all_users(geolife_dir)

Processing users trajectories:  17%|[36m█▋        [0m| 32/183 [00:09<01:20,  1.87it/s]

Error with from _source.ipynb


Processing users trajectories: 100%|[36m██████████[0m| 183/183 [01:20<00:00,  2.27it/s]


In [55]:
gdf.head()

Unnamed: 0,latitude,longitude,altitude,date,date_str,time,trajectory_id,mode,geometry,user_id
0,39.974294,116.399741,492.0,39816.056644,2009-01-03,2009-01-03 01:21:34,20090103012134,unknown,POINT (116.39974 39.97429),135
1,39.974292,116.399592,492.0,39816.056655,2009-01-03,2009-01-03 01:21:35,20090103012134,unknown,POINT (116.39959 39.97429),135
2,39.974309,116.399523,492.0,39816.056667,2009-01-03,2009-01-03 01:21:36,20090103012134,unknown,POINT (116.39952 39.97431),135
3,39.97432,116.399588,492.0,39816.05669,2009-01-03,2009-01-03 01:21:38,20090103012134,unknown,POINT (116.39959 39.97432),135
4,39.974365,116.39973,491.0,39816.056701,2009-01-03,2009-01-03 01:21:39,20090103012134,unknown,POINT (116.39973 39.97436),135


In [57]:
gdf.to_file('geolife.gpkg', driver='GPKG')

In [58]:
gdf.to_parquet('geolife.parquet')