In [79]:
%load_ext autoreload
%autoreload 2

import os

import os
import sys

# Put the notebook's parent directory on the import path (once), presumably
# so that the `generator` package imported below can be found.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import pandas as pd
import geopandas as gpd
import folium
from shapely.geometry import LineString
from shapely import wkt
import numpy as np
import swifter
from generator.preprocess import *
from generator.preprocess import remove_outlier_trajectories
from generator.road_network import RoadNetwork
import glob
from tqdm import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [80]:
# Load the road network and read every raw cab trace into one long frame.
network = RoadNetwork()
network.load("../osm_data/sf")

# glob returns files in arbitrary (filesystem-dependent) order; sorting makes
# the row order of `df` reproducible across machines and runs.
all_files = sorted(glob.glob(os.path.join("../datasets/trajectories/SF/cabdata", "*.txt")))
if not all_files:
    # Fail early with a readable message instead of pd.concat's
    # "No objects to concatenate".
    raise FileNotFoundError(
        "No *.txt cab traces found in ../datasets/trajectories/SF/cabdata"
    )

data = []

for filename in all_files:
    tdf = pd.read_csv(filename, index_col=None, header=None, delimiter=" ")
    # The taxi id is the second "_"-separated field of the file name (text
    # before the first "."). basename() avoids relying on "/" as separator.
    tdf["tax_id"] = os.path.basename(filename).split(".")[0].split("_")[1]
    data.append(tdf)

# One row per GPS fix; the four unnamed file columns get descriptive names.
df = pd.concat(data, axis=0, ignore_index=True)
df = df.rename(columns={0: "lat", 1: "long", 2: "occupied", 3: "timestamp"})

In [45]:
# Peek at ten rows (positions 50-59) of the current frame.
df.iloc[50:60]

Unnamed: 0,lat,long,occupied,timestamp,tax_id
50,37.78554,-122.42929,1,1213032899,abdremlu
51,37.78459,-122.43728,1,1213032833,abdremlu
52,37.78468,-122.43977,1,1213032791,abdremlu
53,37.78704,-122.44025,1,1213032732,abdremlu
54,37.78834,-122.44049,1,1213032659,abdremlu
55,37.79543,-122.44103,1,1213032509,abdremlu
56,37.79638,-122.44218,0,1213032451,abdremlu
57,37.79638,-122.44218,0,1213032421,abdremlu
58,37.80007,-122.44288,0,1213032362,abdremlu
59,37.80334,-122.44365,0,1213032307,abdremlu


In [81]:

import time  # NOTE(review): not used anywhere in this cell

# Build one (geometry, timestamps) record per occupied ride of every taxi.
# Only runs with occupied == 1 are exported.
rows = []
for _, g in tqdm(df.groupby("tax_id")):
    occupied = g["occupied"] == 1
    # The counter advances on every non-occupied fix, so all fixes of one
    # uninterrupted occupied run share the same key.
    run_key = (~occupied).cumsum()
    for _, t in g[occupied].groupby(run_key):
        # Rides with fewer than 5 fixes are skipped.
        if len(t) >= 5:
            # Row order is reversed before export (presumably the raw logs are
            # newest-first; the sample rows shown above have decreasing
            # timestamps).
            data = t.to_numpy()[::-1]
            # Columns 0/1 are lat/long; the geometry is built as (long, lat).
            seq = LineString(data[:, [1, 0]])
            # Column 3 holds the timestamps of the fixes.
            rows.append((seq, data[:, 3]))

processed_df = pd.DataFrame(rows, columns=["POLYLINE", "timestamp"])
processed_df

100%|██████████| 536/536 [01:06<00:00,  8.10it/s]


Unnamed: 0,POLYLINE,timestamp
0,"LINESTRING (-122.41527 37.7874, -122.40859 37....","[1213083415, 1213083475, 1213083535, 121308360..."
1,"LINESTRING (-122.39951 37.78881, -122.4032 37....","[1213081170, 1213081539, 1213081540, 121308154..."
2,"LINESTRING (-122.41304 37.78711, -122.41499 37...","[1213078088, 1213078127, 1213078189, 121307824..."
3,"LINESTRING (-122.40998 37.80838, -122.41186 37...","[1213077504, 1213077580, 1213077634, 121307770..."
4,"LINESTRING (-122.40998 37.80839, -122.41113 37...","[1213076544, 1213076658, 1213076688, 121307679..."
...,...,...
407148,"LINESTRING (-122.43085 37.77202, -122.42726 37...","[1211050722, 1211050787, 1211050834, 121105089..."
407149,"LINESTRING (-122.40082 37.79218, -122.40131 37...","[1211049437, 1211049498, 1211049565, 121104961..."
407150,"LINESTRING (-122.4051 37.78521, -122.40291 37....","[1211047463, 1211047523, 1211047583, 121104765..."
407151,"LINESTRING (-122.38849 37.61609, -122.39863 37...","[1211045500, 1211045565, 1211045614, 121104575..."


In [82]:
# Clip a copy of the trajectories against the network bounds, then apply the
# minimum-point filter with threshold 5. Both helpers are not defined in this
# notebook (presumably they come from generator.preprocess).
city_bounds = network.bounds
df_clipped = filter_min_points(
    clip_trajectories(processed_df.copy(), city_bounds, polyline_convert=True),
    5,
)
df_clipped

Dask Apply:   0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,timestamp,POLYLINE,coords
0,"[1213083415, 1213083475, 1213083535, 121308360...","LINESTRING (-122.41527 37.78740, -122.40859 37...","[(-122.41527, 37.7874), (-122.40859, 37.78833)..."
1,"[1213081170, 1213081539, 1213081540, 121308154...","LINESTRING (-122.39951 37.78881, -122.40320 37...","[(-122.39951, 37.78881), (-122.4032, 37.78734)..."
2,"[1213078088, 1213078127, 1213078189, 121307824...","LINESTRING (-122.41304 37.78711, -122.41499 37...","[(-122.41304, 37.78711), (-122.41499, 37.78709..."
3,"[1213077504, 1213077580, 1213077634, 121307770...","LINESTRING (-122.40998 37.80838, -122.41186 37...","[(-122.40998, 37.80838), (-122.41186, 37.80509..."
4,"[1213076544, 1213076658, 1213076688, 121307679...","LINESTRING (-122.40998 37.80839, -122.41113 37...","[(-122.40998, 37.80839), (-122.41113, 37.80156..."
...,...,...,...
685278,"[1211050722, 1211050787, 1211050834, 121105089...","LINESTRING (-122.43085 37.77202, -122.42726 37...","[(-122.43085, 37.77202), (-122.42726, 37.77255..."
685279,"[1211049437, 1211049498, 1211049565, 121104961...","LINESTRING (-122.40082 37.79218, -122.40131 37...","[(-122.40082, 37.79218), (-122.40131, 37.793),..."
685280,"[1211047463, 1211047523, 1211047583, 121104765...","LINESTRING (-122.40510 37.78521, -122.40291 37...","[(-122.4051, 37.78521), (-122.40291, 37.78603)..."
685281,"[1211045500, 1211045565, 1211045614, 121104575...","LINESTRING (-122.39403 37.70355, -122.39694 37...","[(-122.39402857932053, 37.7035474), (-122.3969..."


In [84]:
def _timestamps_to_csv_field(ts):
    """Render one trajectory's timestamps as the string "t1, t2, ...".

    Joining the elements explicitly avoids going through str(ndarray), which
    wraps long arrays over several lines and abbreviates very long ones.
    np.ravel also accepts scalars and already-serialized strings, so
    re-running the cell leaves a serialized column unchanged.
    """
    return ", ".join(str(t) for t in np.ravel(ts))


# 1-based running id for every clipped trajectory.
df_clipped["id"] = np.arange(1, df_clipped.shape[0] + 1)
# errors="ignore" keeps the cell re-runnable once "coords" has been dropped.
df_clipped = df_clipped.drop(columns="coords", errors="ignore")
# Serialize the clipped frame's own timestamps. The previous version assigned
# from df["timestamp"] (the raw GPS frame), which threw away the bracket
# stripping done just before and aligned unrelated rows by index.
df_clipped["timestamp"] = df_clipped["timestamp"].map(_timestamps_to_csv_field)
df_clipped.to_csv("../datasets/trajectories/SF/mapped_id_poly_clipped.csv", sep=";", index=False)

In [92]:
# Read the exported ";"-separated trajectory file back in.
# Note: this rebinds `df`, which the loading cell used for the raw GPS fixes.
df = pd.read_csv(
    filepath_or_buffer="../datasets/trajectories/SF/mapped_id_poly_clipped.csv",
    delimiter=";",
)

In [97]:
# Parse the WKT strings into shapely geometries (through swifter's apply) and
# wrap the frame as a GeoDataFrame in EPSG:4326 with POLYLINE as geometry.
parsed_polylines = df["POLYLINE"].swifter.apply(wkt.loads)
df["POLYLINE"] = parsed_polylines
gdf = gpd.GeoDataFrame(df, geometry="POLYLINE", crs="epsg:4326")

Pandas Apply:   0%|          | 0/418139 [00:00<?, ?it/s]

In [94]:
# Run the road network's FMM mapping helper on a trajectory CSV (presumably
# map-matching trajectories to road segments; the method is defined elsewhere).
# NOTE(review): every path below points at Porto data, while this notebook
# loads the "../osm_data/sf" network and writes
# "../datasets/trajectories/SF/mapped_id_poly_clipped.csv". Confirm whether the
# SF equivalents were intended here.
# NOTE(review): the recorded output is "NameError: name 'Network' is not
# defined". This cell never references `Network`, so the error is either raised
# inside fmm_trajectorie_mapping or is stale output from an earlier version of
# the cell. Verify on a fresh kernel.
network.fmm_trajectorie_mapping(
    network_file="../osm_data/porto/edges.shp",
    input_file="../datasets/trajectories/Porto/mapped_id_poly_clipped_timestamp.csv",
    output_file="../datasets/trajectories/Porto/road-segment-mapping.txt"
)

NameError: name 'Network' is not defined