In [1]:
import pyarrow.parquet
import os
import pandas as pd
import geopandas as gpd
import glob
from shapely.geometry import Point, LineString
from tqdm.auto import tqdm
import folium
import contextily

Lines 1–6 of each .plt file are header lines that carry no useful information in this dataset and can be ignored. Every following line describes one point (one point per line), with these comma-separated fields:
Field 1: Latitude in decimal degrees.
Field 2: Longitude in decimal degrees.
Field 3: All set to 0 for this dataset.
Field 4: Altitude in feet (-777 if not valid).
Field 5: Date - number of days (with fractional part) that have passed since 12/30/1899.
Field 6: Date as a string.
Field 7: Time as a string.


In [2]:
# Working directory of the notebook; the data paths in the next cell are built from it.
cwd = os.getcwd()
cwd

'C:\\Users\\Kacper Kozaczko\\Desktop\\Stuff\\PWr\\II_semestr\\Spatial\\GEO_EDA\\preprocessing\\geolife'

In [3]:
# Dataset locations, both derived from the working directory.
# geolife_dir is the folder later passed to read_all_users, i.e. the one
# whose subfolders are treated as individual users.
data_dir = os.path.join(cwd, "data")
geolife_dir = os.path.join(data_dir, "users")

In [4]:
def read_trajectory(filename: str):
    """Load one .plt file as a single-row GeoDataFrame holding a LineString.

    Args:
        filename: Path of the .plt file, joined onto ``geolife_dir``.

    Returns:
        A one-row GeoDataFrame (EPSG:4326). ``geometry`` is the LineString
        through all points in file order; ``Altitude``, ``Date``, ``Time``
        and ``Zero`` each hold the list of per-point values.
    """
    field_names = [
        "Latitude",
        "Longitude",
        "Zero",
        "Altitude",
        "Date",
        "Date_str",
        "Time",
    ]
    plt_path = os.path.join(geolife_dir, filename)

    # The first six lines of a .plt file are header lines without point data.
    points = pd.read_csv(plt_path, skiprows=6, header=None)  # WGS84
    points = points.rename(columns=dict(zip(points.columns, field_names)))
    # Merge the separate date and time strings into one timestamp string.
    points["Time"] = points["Date_str"] + " " + points["Time"]

    point_geometries = gpd.points_from_xy(points["Longitude"], points["Latitude"])
    points_gdf = gpd.GeoDataFrame(
        points, columns=field_names, geometry=point_geometries, crs="EPSG:4326"
    )

    trajectory_gdf = gpd.GeoDataFrame(
        geometry=[LineString(points_gdf.geometry)], crs="EPSG:4326"
    )
    # Per-point attributes are stored as one list per column in the single row.
    for attribute in ("Altitude", "Date", "Time", "Zero"):
        trajectory_gdf[attribute] = [points[attribute].values.tolist()]
    return trajectory_gdf


def read_trajectory_labels(filename: str):
    """Load one .plt file as a single-row GeoDataFrame with per-point modes.

    Every point starts with mode ``"unknown"``. If a ``labels.txt`` file exists
    two directory levels above the .plt file, each of its rows (start date,
    start time, end date, end time, mode; whitespace separated, one header
    line) overwrites the mode of all points whose timestamp falls inside the
    closed interval [start, end]. Rows are applied in file order, so a later
    row wins where intervals overlap.

    Args:
        filename: Path of the .plt file, joined onto ``geolife_dir``.

    Returns:
        A one-row GeoDataFrame (EPSG:4326). ``geometry`` is the LineString
        through all points in file order; ``Altitude``, ``Date``, ``Time``,
        ``Zero`` and ``mode`` each hold the list of per-point values.
    """
    columns = ["Latitude", "Longitude", "Zero", "Altitude", "Date", "Date_str", "Time"]
    plt_path = os.path.join(geolife_dir, filename)

    # The first six lines of a .plt file are header lines without point data.
    df = pd.read_csv(plt_path, skiprows=6, header=None)  # WGS84
    df = df.rename(columns=dict(zip(df.columns, columns)))
    df["Time"] = df["Date_str"] + " " + df["Time"]
    df["datetime"] = pd.to_datetime(df["Time"])
    df["mode"] = "unknown"  # default for points not covered by any label

    # Collapse "<file>.plt/../.." lexically. Passing the un-normalised path to
    # the OS only resolves on Windows; on POSIX it fails because the .plt file
    # is not a directory, so the labels would silently never be found.
    labels_path = os.path.normpath(
        os.path.join(plt_path, os.pardir, os.pardir, "labels.txt")
    )
    if os.path.exists(labels_path):
        labels = pd.read_csv(labels_path, sep=r"\s+", skiprows=1, header=None)
        start_times = pd.to_datetime(labels[0] + " " + labels[1])
        end_times = pd.to_datetime(labels[2] + " " + labels[3])
        for start_time, end_time, mode in zip(start_times, end_times, labels[4]):
            # Series.between is inclusive on both ends by default.
            in_interval = df["datetime"].between(start_time, end_time)
            df.loc[in_interval, "mode"] = mode

    gdf = gpd.GeoDataFrame(
        df,
        columns=columns,
        geometry=gpd.points_from_xy(df["Longitude"], df["Latitude"]),
        crs="EPSG:4326",
    )
    trajectory_gdf = gpd.GeoDataFrame(
        geometry=[LineString(gdf.geometry)], crs="EPSG:4326"
    )
    # Per-point attributes are stored as one list per column in the single row.
    for attribute in ("Altitude", "Date", "Time", "Zero", "mode"):
        trajectory_gdf[attribute] = [df[attribute].values.tolist()]
    return trajectory_gdf

In [5]:
# Sample trajectory paths, relative to geolife_dir: <user>/Trajectory/<file>.plt.
# Only filepath2 is read in this cell; `filepath` is defined but not used here.
filepath = os.path.join("000", "Trajectory", "20081023025304.plt")
filepath2 = os.path.join("021", "Trajectory", "20070429083432.plt")
read_trajectory_labels(filepath2)

Unnamed: 0,geometry,Altitude,Date,Time,Zero,mode
0,"LINESTRING (116.33035 39.97557, 116.33023 39.9...","[226.377952755906, 301.837270341207, 328.08398...","[39201.3573148148, 39201.3590277778, 39201.523...","[2007-04-29 08:34:32, 2007-04-29 08:37:00, 200...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[unknown, unknown, taxi, taxi, taxi, taxi, tax..."


In [6]:
def read_user(user_folder, user_name):
    """Read all trajectories of one user into a single frame.

    Args:
        user_folder: Directory that contains the per-user subfolders.
        user_name: Name of the user's subfolder; also stored in ``user_id``.

    Returns:
        The concatenation of the one-row frames returned by
        ``read_trajectory_labels`` for every ``Trajectory/*.plt`` file of the
        user, with an added ``user_id`` column and a fresh 0..n-1 index.

    Raises:
        ValueError: If the user has no ``Trajectory/*.plt`` files.
    """
    trajectory_pattern = os.path.join(user_folder, user_name, "Trajectory", "*.plt")
    # glob yields files in arbitrary, OS-dependent order; sort so that row
    # order is reproducible. File names look like timestamps (for example
    # 20081023025304.plt), so this is presumably chronological as well.
    plt_files = sorted(glob.glob(trajectory_pattern))
    if not plt_files:
        # pd.concat would raise the far less helpful "No objects to concatenate".
        raise ValueError(
            f"No .plt trajectory files found for user {user_name!r} "
            f"(searched {trajectory_pattern})"
        )

    dataframes = []
    for file in plt_files:
        geo_df = read_trajectory_labels(file)
        geo_df["user_id"] = user_name
        dataframes.append(geo_df)
    return pd.concat(dataframes, ignore_index=True)

In [7]:
def read_all_users(folder):
    """Read the trajectories of every user found under ``folder``.

    Args:
        folder: Directory whose subdirectories are each treated as one user.

    Returns:
        A GeoDataFrame (EPSG:4326) with the rows of all users concatenated and
        a fresh 0..n-1 index.
    """
    # Only subdirectories are users; stray files in the folder would otherwise
    # be passed to read_user and fail. Sorting makes the row order
    # reproducible, since os.listdir returns entries in arbitrary order.
    user_names = sorted(
        entry
        for entry in os.listdir(folder)
        if os.path.isdir(os.path.join(folder, entry))
    )
    dfs = [
        read_user(folder, user_name)
        for user_name in tqdm(
            user_names,
            desc="Processing users trajectories",
            colour="cyan",
            smoothing=1.0,
            total=len(user_names),
        )
    ]
    df = pd.concat(dfs, ignore_index=True)
    # pd.concat is re-wrapped so the result explicitly carries geometry and CRS.
    return gpd.GeoDataFrame(df, geometry=df.geometry, crs="EPSG:4326")

In [8]:
# Load every user's trajectories into one GeoDataFrame (one row per .plt file).
geolife = read_all_users(geolife_dir)

Processing users trajectories:   0%|          | 0/182 [00:00<?, ?it/s]

  df['datetime'] = pd.to_datetime(df["Time"])


In [9]:
# Preview the last rows of the combined frame.
geolife.tail()

Unnamed: 0,geometry,Altitude,Date,Time,Zero,mode,user_id
18665,"LINESTRING (116.30535 39.99042, 116.30653 39.9...","[748.031496062992, 748.031496062992, 721.78477...","[39475.1867476852, 39475.187662037, 39475.1893...","[2008-01-28 04:28:55, 2008-01-28 04:30:14, 200...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[unknown, unknown, unknown, unknown, unknown, ...",181
18666,"LINESTRING (112.96523 28.16815, 112.97228 28.1...","[144.356955380577, 209.97375328084, 209.973753...","[39484.122650463, 39484.1251851852, 39484.1257...","[2008-02-06 02:56:37, 2008-02-06 03:00:16, 200...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[unknown, unknown, unknown, unknown, unknown, ...",181
18667,"LINESTRING (116.30895 39.98547, 116.30972 39.9...","[223.097112860892, 223.097112860892, 223.09711...","[39494.3365972222, 39494.3372800926, 39494.337...","[2008-02-16 08:04:42, 2008-02-16 08:05:41, 200...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[unknown, unknown, unknown, unknown, unknown, ...",181
18668,"LINESTRING (116.30278 39.98993, 116.30357 39.9...","[383.858267716535, 383.858267716535, 383.85826...","[39495.0427199074, 39495.0432175926, 39495.046...","[2008-02-17 01:01:31, 2008-02-17 01:02:14, 200...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[unknown, unknown, unknown, unknown, unknown, ...",181
18669,"LINESTRING (111.70922 40.91815, 111.70953 40.9...","[3825.4593175853, 3809.05511811024, 3795.93175...","[39521.1235532407, 39521.1245833333, 39521.126...","[2008-03-14 02:57:55, 2008-03-14 02:59:24, 200...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[unknown, unknown, unknown, unknown, unknown, ...",181


In [11]:
# geolife.to_parquet(os.path.join(data_dir, "geolife.parquet"))

In [10]:
# Interactive map of the first 25 trajectories.
geolife.head(25).explore()

In [150]:
# Distinct user ids present in the combined frame.
geolife.user_id.unique()

array(['000', '001', '002', '003', '004', '005', '006', '007', '008',
       '009', '010', '011', '012', '013', '014', '015', '016', '017',
       '018', '019', '020', '021', '022', '023', '024'], dtype=object)

In [132]:
# Parse user 021's label file: one header line, then whitespace-separated
# start date, start time, end date, end time and transportation mode.
# NOTE(review): this reads from data_dir, while the loaders above read users
# from geolife_dir (data_dir/"users") - confirm which layout is current.
labels = pd.read_csv(
    os.path.join(data_dir, "021", "labels.txt"),
    sep=r"\s+",  # raw string: "\s" is an invalid escape in a normal literal
    skiprows=1,
    header=None,
)
labels["start_time"] = pd.to_datetime(labels[0] + " " + labels[1])
labels["end_time"] = pd.to_datetime(labels[2] + " " + labels[3])
labels = labels.rename(columns={4: "mode"})[["start_time", "end_time", "mode"]]

# Trajectories of the same user, for comparison against the label intervals.
user_21 = geolife[geolife.user_id == "021"]
user_21

Unnamed: 0,geometry,Altitude,Date,Time,Zero,user_id
3041,"LINESTRING (116.33035 39.97557, 116.33023 39.9...","[226.377952755906, 301.837270341207, 328.08398...","[39201.3573148148, 39201.3590277778, 39201.523...","[2007-04-29 08:34:32, 2007-04-29 08:37:00, 200...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",21
3042,"LINESTRING (116.26812 39.90363, 116.26787 39.8...","[209.97375328084, 209.97375328084, 209.9737532...","[39201.9355439815, 39201.9358564815, 39201.936...","[2007-04-29 22:27:11, 2007-04-29 22:27:38, 200...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",21
3043,"LINESTRING (116.01207 29.67932, 116.01060 29.6...","[173.884514435696, 173.884514435696, 173.88451...","[39203.0438773148, 39203.0439814815, 39203.044...","[2007-05-01 01:03:11, 2007-05-01 01:03:20, 200...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",21
3044,"LINESTRING (116.86690 28.21625, 116.86777 28.2...","[134.514435695538, 134.514435695538, 134.51443...","[39204.0065277778, 39204.0066087963, 39204.006...","[2007-05-02 00:09:24, 2007-05-02 00:09:31, 200...","[0, 0, 0, 0, 0]",21
3045,"LINESTRING (116.91310 28.20797, 116.91750 28.2...","[131.233595800525, 131.233595800525, 131.23359...","[39204.0092361111, 39204.0095486111, 39204.009...","[2007-05-02 00:13:18, 2007-05-02 00:13:45, 200...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",21
3046,"LINESTRING (116.96910 28.09580, 116.96797 28.0...","[442.913385826772, 380.577427821522, 390.41994...","[39204.063275463, 39204.0636689815, 39204.0640...","[2007-05-02 01:31:07, 2007-05-02 01:31:41, 200...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",21
3047,"LINESTRING (118.05218 28.91773, 118.05235 28.9...","[5134.51443569554, 5072.17847769029, 5088.5826...","[39205.0410648148, 39205.0436805556, 39205.044...","[2007-05-03 00:59:08, 2007-05-03 01:02:54, 200...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",21
3048,"LINESTRING (117.99585 29.46928, 117.99562 29.4...","[1158.13648293963, 1174.5406824147, 1190.94488...","[39205.9932986111, 39205.99375, 39205.99395833...","[2007-05-03 23:50:21, 2007-05-03 23:51:00, 200...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",21


In [122]:
# Plain-text dump of the parsed label intervals (start, end, mode).
print(labels)

            start_time            end_time  mode
0  2007-04-29 12:34:24 2007-04-29 12:53:45  taxi
1  2007-04-29 22:27:11 2007-04-30 04:28:00   car
2  2007-04-30 04:28:03 2007-04-30 06:49:39   car
3  2007-04-30 07:21:01 2007-04-30 14:46:04   car
4  2007-05-01 01:03:11 2007-05-01 02:30:34   car
5  2007-05-01 02:31:23 2007-05-01 04:00:09   car
6  2007-05-01 06:04:45 2007-05-01 06:08:57   car
7  2007-05-01 06:09:24 2007-05-01 10:27:00  walk
8  2007-05-01 10:27:34 2007-05-01 11:04:33   car
9  2007-05-01 12:32:19 2007-05-01 14:15:27  walk
10 2007-05-02 00:09:24 2007-05-02 01:31:41   car
11 2007-05-02 01:32:10 2007-05-02 02:40:20  walk
12 2007-05-02 02:41:58 2007-05-02 03:10:35   car
13 2007-05-02 03:12:25 2007-05-02 03:33:20  walk
14 2007-05-02 04:00:24 2007-05-02 04:13:19   car
15 2007-05-03 00:59:08 2007-05-03 04:00:00  walk
16 2007-05-03 04:00:00 2007-05-03 05:11:19  walk
17 2007-05-03 23:50:21 2007-05-04 03:47:34  walk


In [13]:
# Interactive map of the last 35 trajectories.
geolife.tail(35).explore()

In [101]:
# ax = geolife.head().plot(figsize=(12,6), markersize=5)
# contextily.add_basemap(ax, zoom=2)

In [19]:
# Check the coordinate reference system attached to the combined frame.
geolife.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [141]:
def read_trajectory(filename: str):
    """Load one .plt file as a single-row GeoDataFrame with per-point modes.

    NOTE: this definition reuses the name ``read_trajectory`` from an earlier
    cell and replaces it once this cell runs. Unlike the earlier version it
    resolves ``filename`` against ``data_dir`` and adds a ``mode`` column.

    Every point starts with mode ``"unknown"``. If a ``labels.txt`` file exists
    two directory levels above the .plt file, each of its rows (start date,
    start time, end date, end time, mode; whitespace separated, one header
    line) overwrites the mode of all points whose timestamp falls inside the
    closed interval [start, end]. Rows are applied in file order, so a later
    row wins where intervals overlap.

    Args:
        filename: Path of the .plt file, joined onto ``data_dir``.

    Returns:
        A one-row GeoDataFrame (EPSG:4326). ``geometry`` is the LineString
        through all points in file order; ``Altitude``, ``Date``, ``Time``,
        ``Zero`` and ``mode`` each hold the list of per-point values.
    """
    columns = ["Latitude", "Longitude", "Zero", "Altitude", "Date", "Date_str", "Time"]
    plt_path = os.path.join(data_dir, filename)

    # The first six lines of a .plt file are header lines without point data.
    df = pd.read_csv(plt_path, skiprows=6, header=None)  # WGS84
    df = df.rename(columns=dict(zip(df.columns, columns)))
    df["Time"] = df["Date_str"] + " " + df["Time"]
    df["datetime"] = pd.to_datetime(df["Time"])
    df["mode"] = "unknown"  # default for points not covered by any label

    # Use the labels of the user who owns this file. The path was previously
    # hard-coded to user "021", so every other user got that user's labels.
    # normpath collapses "<file>.plt/../.." lexically, which also makes the
    # lookup work on POSIX, where traversing through a regular file fails.
    labels_path = os.path.normpath(
        os.path.join(plt_path, os.pardir, os.pardir, "labels.txt")
    )
    if os.path.exists(labels_path):
        labels = pd.read_csv(labels_path, sep=r"\s+", skiprows=1, header=None)
        start_times = pd.to_datetime(labels[0] + " " + labels[1])
        end_times = pd.to_datetime(labels[2] + " " + labels[3])
        for start_time, end_time, mode in zip(start_times, end_times, labels[4]):
            # Series.between is inclusive on both ends by default.
            in_interval = df["datetime"].between(start_time, end_time)
            df.loc[in_interval, "mode"] = mode

    gdf = gpd.GeoDataFrame(
        df,
        columns=columns,
        geometry=gpd.points_from_xy(df["Longitude"], df["Latitude"]),
        crs="EPSG:4326",
    )
    trajectory_gdf = gpd.GeoDataFrame(
        geometry=[LineString(gdf.geometry)], crs="EPSG:4326"
    )
    # Per-point attributes are stored as one list per column in the single row.
    for attribute in ("Altitude", "Date", "Time", "Zero", "mode"):
        trajectory_gdf[attribute] = [df[attribute].values.tolist()]
    return trajectory_gdf

In [142]:
# Try the redefined read_trajectory on one of user 021's trajectories.
filepath2 = os.path.join("021", "Trajectory", "20070429083432.plt")
read_trajectory(filepath2)

Unnamed: 0,geometry,Altitude,Date,Time,Zero,mode
0,"LINESTRING (116.33035 39.97557, 116.33023 39.9...","[226.377952755906, 301.837270341207, 328.08398...","[39201.3573148148, 39201.3590277778, 39201.523...","[2007-04-29 08:34:32, 2007-04-29 08:37:00, 200...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[unknown, unknown, taxi, taxi, taxi, taxi, tax..."
