In [23]:
%load_ext autoreload
%autoreload 2

import os

import os
import sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import geopandas as gpd
import folium
from shapely.geometry import LineString
from shapely import wkt
import numpy as np
import swifter
from generator.preprocess import *
from generator.preprocess import remove_outlier_trajectories
from generator.road_network import RoadNetwork
import glob
from tqdm import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [71]:
# read data 
network = RoadNetwork()
network.load("../osm_data/sf")

all_files = glob.glob(os.path.join("../datasets/trajectories/SF/cabdata" , "*.txt"))

data = []

for filename in all_files:
    tdf = pd.read_csv(filename, index_col=None, header=None, delimiter=" ")
    tdf["tax_id"] = filename.split("/")[-1].split(".")[0].split("_")[1]
    data.append(tdf)

df = pd.concat(data, axis=0, ignore_index=True)
df = df.rename(columns={0: "lat", 1: "long", 2: "occupied", 3: "timestamp"})

In [45]:
df[50:60]

Unnamed: 0,lat,long,occupied,timestamp,tax_id
50,37.78554,-122.42929,1,1213032899,abdremlu
51,37.78459,-122.43728,1,1213032833,abdremlu
52,37.78468,-122.43977,1,1213032791,abdremlu
53,37.78704,-122.44025,1,1213032732,abdremlu
54,37.78834,-122.44049,1,1213032659,abdremlu
55,37.79543,-122.44103,1,1213032509,abdremlu
56,37.79638,-122.44218,0,1213032451,abdremlu
57,37.79638,-122.44218,0,1213032421,abdremlu
58,37.80007,-122.44288,0,1213032362,abdremlu
59,37.80334,-122.44365,0,1213032307,abdremlu


In [72]:

import time 
# group for each taxi
rows = []
for _, g in tqdm(df.groupby("tax_id")):
    # group each occupied trajectory
    trajectories_occu = g[g['occupied'] == 1].groupby((g['occupied'] != 1).cumsum())
    trajectories_nooccu = g[g['occupied'] == 0].groupby((g['occupied'] != 0).cumsum())
    for _, t in trajectories_occu:
        if t.shape[0] < 5:
            continue
        data = t.to_numpy()
        data[:, 0], data[:, 1] = data[:, 1], data[:, 0].copy()
        seq = LineString(data[::-1, :2])
        rows.append((seq, data[::-1, 3]))
    
    # for _, t in trajectories_nooccu:
    #     if t.shape[0] < 5:
    #         continue
    #     data = t.to_numpy()
    #     seq = LineString(data[::-1, :2])
    #     rows.append((seq, data[::-1, 3]))

processed_df = pd.DataFrame(rows, columns=["POLYLINE", "timestamp"])
processed_df

100%|██████████| 536/536 [01:02<00:00,  8.61it/s]


Unnamed: 0,POLYLINE,timestamp
0,"LINESTRING (-122.41527 37.7874, -122.40859 37....","[1213083415, 1213083475, 1213083535, 121308360..."
1,"LINESTRING (-122.39951 37.78881, -122.4032 37....","[1213081170, 1213081539, 1213081540, 121308154..."
2,"LINESTRING (-122.41304 37.78711, -122.41499 37...","[1213078088, 1213078127, 1213078189, 121307824..."
3,"LINESTRING (-122.40998 37.80838, -122.41186 37...","[1213077504, 1213077580, 1213077634, 121307770..."
4,"LINESTRING (-122.40998 37.80839, -122.41113 37...","[1213076544, 1213076658, 1213076688, 121307679..."
...,...,...
407148,"LINESTRING (-122.43085 37.77202, -122.42726 37...","[1211050722, 1211050787, 1211050834, 121105089..."
407149,"LINESTRING (-122.40082 37.79218, -122.40131 37...","[1211049437, 1211049498, 1211049565, 121104961..."
407150,"LINESTRING (-122.4051 37.78521, -122.40291 37....","[1211047463, 1211047523, 1211047583, 121104765..."
407151,"LINESTRING (-122.38849 37.61609, -122.39863 37...","[1211045500, 1211045565, 1211045614, 121104575..."


In [73]:
city_bounds = network.bounds
df_clipped = clip_trajectories(processed_df.copy(), city_bounds, polyline_convert=True)
df_clipped = filter_min_points(df_clipped, 5)
df_clipped

Dask Apply:   0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,timestamp,POLYLINE,coords
0,"[1213083415, 1213083475, 1213083535, 121308360...","LINESTRING (-122.41527 37.78740, -122.40859 37...","[(-122.41527, 37.7874), (-122.40859, 37.78833)..."
1,"[1213081170, 1213081539, 1213081540, 121308154...","LINESTRING (-122.39951 37.78881, -122.40320 37...","[(-122.39951, 37.78881), (-122.4032, 37.78734)..."
2,"[1213078088, 1213078127, 1213078189, 121307824...","LINESTRING (-122.41304 37.78711, -122.41499 37...","[(-122.41304, 37.78711), (-122.41499, 37.78709..."
3,"[1213077504, 1213077580, 1213077634, 121307770...","LINESTRING (-122.40998 37.80838, -122.41186 37...","[(-122.40998, 37.80838), (-122.41186, 37.80509..."
4,"[1213076544, 1213076658, 1213076688, 121307679...","LINESTRING (-122.40998 37.80839, -122.41113 37...","[(-122.40998, 37.80839), (-122.41113, 37.80156..."
...,...,...,...
685278,"[1211050722, 1211050787, 1211050834, 121105089...","LINESTRING (-122.43085 37.77202, -122.42726 37...","[(-122.43085, 37.77202), (-122.42726, 37.77255..."
685279,"[1211049437, 1211049498, 1211049565, 121104961...","LINESTRING (-122.40082 37.79218, -122.40131 37...","[(-122.40082, 37.79218), (-122.40131, 37.793),..."
685280,"[1211047463, 1211047523, 1211047583, 121104765...","LINESTRING (-122.40510 37.78521, -122.40291 37...","[(-122.4051, 37.78521), (-122.40291, 37.78603)..."
685281,"[1211045500, 1211045565, 1211045614, 121104575...","LINESTRING (-122.39403 37.70355, -122.39694 37...","[(-122.39402857932053, 37.7035474), (-122.3969..."


In [74]:
processed_df["id"] = np.arange(1, processed_df.shape[0]+1)
processed_df.to_csv("../datasets/trajectories/SF/mapped_id_poly_clipped.csv")

In [78]:
df = pd.read_csv("../datasets/trajectories/SF/mapped_id_poly_clipped.csv", index_col=0)

Unnamed: 0,POLYLINE,timestamp,id
0,"LINESTRING (-122.41527 37.7874, -122.40859 37....",[1213083415 1213083475 1213083535 1213083600 1...,1
1,"LINESTRING (-122.39951 37.78881, -122.4032 37....",[1213081170 1213081539 1213081540 1213081541 1...,2
2,"LINESTRING (-122.41304 37.78711, -122.41499 37...",[1213078088 1213078127 1213078189 1213078247 1...,3
3,"LINESTRING (-122.40998 37.80838, -122.41186 37...",[1213077504 1213077580 1213077634 1213077708 1...,4
4,"LINESTRING (-122.40998 37.80839, -122.41113 37...",[1213076544 1213076658 1213076688 1213076790 1...,5
...,...,...,...
407148,"LINESTRING (-122.43085 37.77202, -122.42726 37...",[1211050722 1211050787 1211050834 1211050894 1...,407149
407149,"LINESTRING (-122.40082 37.79218, -122.40131 37...",[1211049437 1211049498 1211049565 1211049617 1...,407150
407150,"LINESTRING (-122.4051 37.78521, -122.40291 37....",[1211047463 1211047523 1211047583 1211047653 1...,407151
407151,"LINESTRING (-122.38849 37.61609, -122.39863 37...",[1211045500 1211045565 1211045614 1211045755 1...,407152


In [77]:
tdf = pd.read_csv("../datasets/trajectories/Porto/mapped_id_poly_clipped_timestamp.csv", sep=";")
tdf

Unnamed: 0,TRIP_ID,POLYLINE,timestamp,id
0,1372636858620000589,"LINESTRING (-8.618643 41.141412, -8.618499 41....",[ 0 15 30 45 60 75 90 105 120 135 150 1...,1
1,1372637303620000596,"LINESTRING (-8.639847 41.159826, -8.640351 41....",[ 0 15 30 45 60 75 90 105 120 135 150 1...,2
2,1372636951620000320,LINESTRING (-8.630656287759539 41.149491641646...,[ 0 15 30 45 60 75 90 105 120 135 150 1...,3
3,1372636951620000320,LINESTRING (-8.630778800013456 41.149344053784...,[ 0 15 30 45 60 75 90 105 120 135 150 1...,4
4,1372636854620000520,"LINESTRING (-8.574704100000002 41.1519411, -8....",[ 0 15 30 45 60 75 90 105 120 135 150 165],5
...,...,...,...,...
2152272,1404155105620000121,LINESTRING (-8.607240818181818 41.145759818181...,[ 0 15 30 45 60 75 90 105 120 135 150 1...,2152273
2152273,1404171463620000698,LINESTRING (-8.612474825454544 41.146011261818...,[ 0 15 30 45 60 75 90 105 120 135 150 1...,2152274
2152274,1404171367620000670,"LINESTRING (-8.610138 41.140845, -8.610174 41....",[ 0 15 30 45 60 75 90 105 120 135 150 1...,2152275
2152275,1404141826620000248,"LINESTRING (-8.630712 41.154885, -8.63073 41.1...",[ 0 15 30 45 60 75 90 105 120 135 150 1...,2152276
