# Convert the Geolife dataset to a GeoPandas GeoDataFrame

In [None]:
from srai.datasets import GeolifeDataset
import os
from tqdm import tqdm
import glob
import re
import pandas as pd
import geopandas as gpd

In [None]:
# The token is taken from the "HF_token" environment variable; stop early
# with a clear message when it is missing, since the loading cell needs it.
hf_token = os.environ.get('HF_token')
if hf_token is None:
    raise ValueError("HF_token environment variable is not set.")

In [None]:
# Instantiate the srai Geolife dataset wrapper and load it, passing the token
# read from the environment.
# NOTE(review): `gdf` is reassigned further down by `read_all_users`, so this
# result is only used for the preview that follows.
geolife = GeolifeDataset()

gdf = geolife.load(hf_token=hf_token)

In [None]:
# Preview the first rows of the dataset loaded through srai.
gdf.head()

In [None]:
def read_points(filename: str):
    """Read one Geolife ``.plt`` trajectory file into a GeoDataFrame.

    The first six lines of the file are skipped and the remaining rows are
    parsed as headerless CSV. Each point is tagged with a ``trajectory_id``
    (the file name without its extension) and a transportation ``mode``.

    The mode defaults to ``"unknown"``. If a ``labels.txt`` file exists two
    directory levels above ``filename`` (i.e. next to the folder that holds
    the ``.plt`` files), every point whose timestamp falls inside a labelled
    interval (bounds inclusive) receives that interval's mode. When
    intervals overlap, the one listed last in ``labels.txt`` wins.

    Args:
        filename: Path to a ``.plt`` file.

    Returns:
        A ``GeoDataFrame`` in EPSG:4326 with the columns ``latitude``,
        ``longitude``, ``altitude``, ``date``, ``date_str``, ``time``
        (combined ``"<date_str> <time>"`` string), ``trajectory_id``,
        ``mode`` and a point ``geometry`` built from longitude/latitude.
    """
    df = pd.read_csv(filename, skiprows=6, header=None)  # WGS84
    columns = [
        "latitude",
        "longitude",
        "zero",
        "altitude",
        "date",
        "date_str",
        "time",
    ]
    df.rename(columns=dict(zip(df.columns, columns)), inplace=True)
    df.drop(["zero"], inplace=True, axis=1)  # zero column is useless

    # Use os.path instead of splitting on "/" so the id is also correct for
    # paths that use a different separator.
    df["trajectory_id"] = os.path.splitext(os.path.basename(filename))[0]

    df["time"] = df["date_str"] + " " + df["time"]
    df["datetime"] = pd.to_datetime(df["time"], format="%Y-%m-%d %H:%M:%S")

    df["mode"] = "unknown"  # default value

    # labels.txt sits two levels above the .plt file. Derive that folder from
    # the file's own path: joining "..", ".." onto the file path (or
    # prefixing the data root a second time) never resolves to an existing
    # file, which left every point labelled "unknown".
    user_dir = os.path.dirname(os.path.dirname(os.path.abspath(filename)))
    labels_path = os.path.join(user_dir, "labels.txt")

    if os.path.exists(labels_path):
        # Whitespace-separated rows: start date, start time, end date,
        # end time, mode. The first line is a header and is skipped.
        labels = pd.read_csv(
            labels_path,
            sep=r"\s+",
            skiprows=1,
            header=None,
        )
        labels["start_time"] = pd.to_datetime(labels[0] + " " + labels[1])
        labels["end_time"] = pd.to_datetime(labels[2] + " " + labels[3])
        labels.rename(columns={4: "mode"}, inplace=True)

        timestamps = df["datetime"]  # hoisted: invariant across intervals
        for start, end, mode in zip(
            labels["start_time"], labels["end_time"], labels["mode"]
        ):
            mask = (timestamps >= start) & (timestamps <= end)
            df.loc[mask, "mode"] = mode

    # The helper "datetime" column is intentionally left out of the result.
    gdf = gpd.GeoDataFrame(
        df,
        columns=[
            "latitude",
            "longitude",
            "altitude",
            "date",
            "date_str",
            "time",
            "trajectory_id",
            "mode",
        ],
        geometry=gpd.points_from_xy(df["longitude"], df["latitude"]),
        crs="EPSG:4326",
    )

    return gdf


def read_user_points(user_folder: str = 'Data', user_name: str = "000"):
    """Load every trajectory of a single user into one frame.

    All ``*.plt`` files under ``<user_folder>/<user_name>/Trajectory`` are
    read with ``read_points``; each resulting frame gets a ``user_id`` column
    set to ``user_name`` and the frames are concatenated with a fresh index.

    Args:
        user_folder: Folder that contains one sub-folder per user.
        user_name: Name of the user's sub-folder; also stored as ``user_id``.

    Returns:
        The concatenated frame, or ``None`` (after printing a message) when
        the concatenation raises ``ValueError`` -- e.g. no ``.plt`` file was
        found, so there is nothing to concatenate.
    """
    pattern = os.path.join(user_folder, user_name, "Trajectory", "*.plt")

    trajectories = []
    for plt_path in glob.glob(pattern):
        trajectory = read_points(plt_path)
        trajectory["user_id"] = user_name
        trajectories.append(trajectory)

    try:
        combined = pd.concat(trajectories, ignore_index=True)
    except ValueError:
        print(f"Error with {user_name}")
        return None
    return combined
    


def read_all_users(folder):
    """Build one GeoDataFrame from every user entry found in ``folder``.

    Each name returned by ``os.listdir(folder)`` is handed to
    ``read_user_points`` while a progress bar is displayed, and the per-user
    results are concatenated with a fresh index.

    Args:
        folder: Folder whose entries are treated as per-user sub-folders.

    Returns:
        A ``GeoDataFrame`` in EPSG:4326 holding the points of all users.
    """
    user_names = os.listdir(folder)
    progress = tqdm(
        user_names,
        desc="Processing users trajectories",
        colour="cyan",
        smoothing=1.0,
        total=len(user_names),
    )
    # read_user_points may return None for an entry; pd.concat drops None
    # items silently as long as at least one real frame is present.
    per_user = [read_user_points(folder, name) for name in progress]
    merged = pd.concat(per_user, ignore_index=True)
    return gpd.GeoDataFrame(merged, geometry=merged.geometry, crs="EPSG:4326")

In [None]:
# Root folder of the locally extracted Geolife files; every entry inside it
# is processed as one user by `read_all_users`.
geolife_dir = 'Data/'
# Replaces the `gdf` loaded earlier with the frame built from the local files.
gdf = read_all_users(geolife_dir)

In [None]:
# Preview the first rows of the GeoDataFrame built from the local files.
gdf.head()

In [None]:
# Persist the combined GeoDataFrame as output_data/geolife.parquet.
# exist_ok=True creates the folder when missing and is a no-op when it is
# already there, so no separate (race-prone) existence check is needed.
os.makedirs('output_data', exist_ok=True)
gdf.to_parquet(os.path.join('output_data', 'geolife.parquet'))