In [1]:
%load_ext autoreload
%autoreload 2

import os

import os
import sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import geopandas as gpd
import folium
from shapely.geometry import LineString
from shapely import wkt
import numpy as np
import swifter
from generator.preprocess import *
from generator.preprocess import remove_outlier_trajectories
from generator.road_network import RoadNetwork
import glob
from tqdm import tqdm

In [2]:
network = RoadNetwork()
network.load("../osm_data/sf")

In [3]:
# read data 

all_files = glob.glob(os.path.join("../datasets/trajectories/sf/cabdata" , "*.txt"))

data = []

for filename in all_files:
    tdf = pd.read_csv(filename, index_col=None, header=None, delimiter=" ")
    tdf["tax_id"] = filename.split("/")[-1].split(".")[0].split("_")[1]
    data.append(tdf)

df = pd.concat(data, axis=0, ignore_index=True)
df = df.rename(columns={0: "lat", 1: "long", 2: "occupied", 3: "timestamp"})

In [4]:

import time 
# group for each taxi
rows = []
for _, g in tqdm(df.groupby("tax_id")):
    # group each occupied trajectory
    trajectories_occu = g[g['occupied'] == 1].groupby((g['occupied'] != 1).cumsum())
    # trajectories_nooccu = g[g['occupied'] == 0].groupby((g['occupied'] != 0).cumsum())
    for _, t in trajectories_occu:
        if t.shape[0] < 5:
            continue
        data = t.to_numpy()
        data[:, 0], data[:, 1] = data[:, 1], data[:, 0].copy()
        seq = LineString(data[::-1, :2])
        stamps = data[::-1, 3]
        rows.append((seq, stamps - stamps[0]))
    
    # for _, t in trajectories_nooccu:
    #     if t.shape[0] < 5:
    #         continue
    #     data = t.to_numpy()
    #     seq = LineString(data[::-1, :2])
    #     rows.append((seq, data[::-1, 3]))

processed_df = pd.DataFrame(rows, columns=["POLYLINE", "timestamp"])
processed_df

100%|██████████| 536/536 [01:07<00:00,  7.94it/s]


Unnamed: 0,POLYLINE,timestamp
0,"LINESTRING (-122.41527 37.7874, -122.40859 37....","[0, 60, 120, 185, 240, 300]"
1,"LINESTRING (-122.39951 37.78881, -122.4032 37....","[0, 369, 370, 371, 372, 437, 494, 551, 601, 66..."
2,"LINESTRING (-122.41304 37.78711, -122.41499 37...","[0, 39, 101, 159, 219, 260]"
3,"LINESTRING (-122.40998 37.80838, -122.41186 37...","[0, 76, 130, 204, 252, 311, 372, 452, 501, 539]"
4,"LINESTRING (-122.40998 37.80839, -122.41113 37...","[0, 114, 144, 246, 318, 365, 426, 471]"
...,...,...
407148,"LINESTRING (-122.43085 37.77202, -122.42726 37...","[0, 65, 112, 172, 233, 262, 318, 384, 436, 480..."
407149,"LINESTRING (-122.40082 37.79218, -122.40131 37...","[0, 61, 128, 180, 241, 300, 368, 416]"
407150,"LINESTRING (-122.4051 37.78521, -122.40291 37....","[0, 60, 120, 190, 244, 316, 360, 431, 491, 550..."
407151,"LINESTRING (-122.38849 37.61609, -122.39863 37...","[0, 65, 114, 255, 386, 428, 542, 666, 718, 789..."


In [5]:
df.to_csv("../datasets/trajectories/sf/all_gps_points.csv", sep=";", index=False)

In [13]:
city_bounds = network.bounds
clipped = clip_trajectories(processed_df.copy(), city_bounds, polyline_convert=True)
# df_clipped = filter_min_points(df_clipped, 5)
clipped

Unnamed: 0,POLYLINE,timestamp
208244,"LINESTRING (-122.39448 37.79324, -122.39275 37...","[0, 61, 121, 186, 254, 307, 362, 422, 498, 524..."
2812,"MULTILINESTRING ((-122.39537 37.75057, -122.39...","[0, 60, 120, 180, 240, 300, 361, 421, 470]"
91459,"LINESTRING (-122.41175 37.78759, -122.40683 37...","[0, 66, 120, 180, 241, 301, 371, 425, 757, 817..."
90949,"LINESTRING (-122.41159 37.78703, -122.40655 37...","[0, 58, 119, 183, 253, 364, 414, 477, 537, 590..."
262333,"LINESTRING (-122.41187 37.78786, -122.40721 37...","[0, 60, 120, 190, 242, 301, 364, 646, 694, 739..."
...,...,...
233158,"LINESTRING (-122.48205 37.71859, -122.47899 37...","[0, 63, 124, 180, 240, 301, 360, 382, 442, 502..."
192874,"LINESTRING (-122.47447 37.75567, -122.47597 37...","[0, 147, 206, 246, 306, 366, 404, 469, 519, 56..."
25200,"MULTILINESTRING ((-122.49742 37.70355, -122.49...","[0, 2, 54, 98, 158, 218, 279, 344, 410, 475, 5..."
17728,"LINESTRING (-122.47553 37.72099, -122.47550 37...","[0, 60, 120, 180, 240, 294, 354, 415, 476, 536..."


In [14]:
""" 
Correct timestamps
"""

def strictly_increasing(L):
    return all(x+20>=y for x, y in zip(L, L[1:]))


def correct_timestamps(traj, orig_trajs, orig_ts):
    corrected_ts = []
    corrected_traj = []
    idxs = []
    found = False
    for i, g1 in enumerate(traj):
        ridx = 0
        for j, g2 in enumerate(orig_trajs[ridx:]):
            if g1 == g2:
                found = True
                corrected_ts.append(orig_ts[j])
                corrected_traj.append(g1)
                idxs.append(j)
                ridx = j+1
                break
            # if found:
            #     break

    assert len(corrected_traj) == len(corrected_ts)
    # assert strictly_increasing(idxs), (idxs)
    
    return corrected_traj, (np.array(corrected_ts) - corrected_ts[0]).astype(int).tolist()


rows = []
i = 0
orig_polies, orig_ts = processed_df.POLYLINE, processed_df.timestamp
for i, r in tqdm(clipped.iterrows()):
    op = list(orig_polies.loc[r.name].coords)
    ot = orig_ts.loc[r.name]
    if type(r.POLYLINE) == LineString:
        traj = list(r.POLYLINE.coords)
        if len(traj) < 5:
            continue
        ctraj, cts = correct_timestamps(traj, op, ot)
        if len(ctraj) < 5:
            continue
        rows.append([LineString(ctraj), cts])
    else:
        for line in r.POLYLINE:
            traj = list(line.coords)
            if len(traj) < 5:
                continue
            ctraj, cts = correct_timestamps(traj, op, ot)
            if len(ctraj) < 5:
                continue
            rows.append([LineString(ctraj), cts])

df = pd.DataFrame(rows, columns=["POLYLINE", "timestamp"])

401747it [01:34, 4267.08it/s]


In [16]:
df["id"] = np.arange(1, df.shape[0]+1)
df["timestamp"] = df["timestamp"].astype(str)
df["timestamp"] = df["timestamp"].str.replace("[", "")
df["timestamp"] = df["timestamp"].str.replace("]", "")
# df_clipped["timestamp"] = df["timestamp"].str.replace(" ", ", ")
df.to_csv("../datasets/trajectories/sf/mapped_id_poly_clipped_corrected.csv", sep=";", index=False)

In [19]:
df = pd.read_csv("../datasets/trajectories/sf/mapped_id_poly_clipped_corrected.csv", sep=";")
df

Unnamed: 0,POLYLINE,timestamp,id
0,"LINESTRING (-122.39448 37.79324, -122.39275 37...","0, 61, 121, 186, 254, 307, 362, 422, 498, 524",1
1,"LINESTRING (-122.39976 37.74958, -122.40305 37...","0, 60, 120, 181, 241, 290",2
2,"LINESTRING (-122.41175 37.78759, -122.40683 37...","0, 66, 120, 180, 241, 301, 371, 425",3
3,"LINESTRING (-122.41159 37.78703, -122.40655 37...","0, 58, 119, 183, 253, 364, 414, 477, 537",4
4,"LINESTRING (-122.41187 37.78786, -122.40721 37...","0, 60, 120, 190, 242, 301, 364",5
...,...,...,...
418134,"LINESTRING (-122.49827 37.70642, -122.5003 37....","0, 65, 168, 244, 290, 350, 416, 477, 484",418135
418135,"LINESTRING (-122.50954 37.77521, -122.50955 37...","0, 60, 132",418136
418136,"LINESTRING (-122.50912 37.77523, -122.50911 37...","0, 81, 132",418137
418137,"LINESTRING (-122.50915 37.77521, -122.50914 37...","0, 55, 115",418138


In [25]:
list(df.iloc[418135].POLYLINE.coords)

[(-122.50954, 37.77521), (-122.50955, 37.77515), (-122.50955, 37.77516)]

In [20]:
df["POLYLINE"] = df["POLYLINE"].swifter.apply(wkt.loads)
gdf = gpd.GeoDataFrame(df, crs="epsg:4326", geometry="POLYLINE")

Pandas Apply:   0%|          | 0/418139 [00:00<?, ?it/s]

In [4]:
network.fmm_trajectorie_mapping(
    network_file="../osm_data/sf/edges.shp",
    input_file="../datasets/trajectories/SF/mapped_id_poly_clipped.csv",
    output_file="../datasets/trajectories/SF/road-segment-mapping.txt"
)

gps file : ../datasets/trajectories/SF/mapped_id_poly_clipped.csv
id column : id
geom column : POLYLINE
timestamp column : timestamp
x column : x
y column : y
GPS point : false

Result file : ../datasets/trajectories/SF/road-segment-mapping.txt
Output fields: opath pgeom spdist cpath mgeom duration speed 
[2022-08-10 10:33:47.548] [info] [network.cpp:72] Read network from file ../osm_data/sf/edges.shp
Status: success
Time takes 7903.32 seconds
Total points 4269845 matched 4254775
Map match speed 538.353 points/s 

[2022-08-10 10:33:47.698] [info] [network.cpp:170] Number of edges 27039 nodes 9739
[2022-08-10 10:33:47.698] [info] [network.cpp:171] Field index: id 17 source 0 target 1
[2022-08-10 10:33:47.716] [info] [network.cpp:174] Read network done.
[2022-08-10 10:33:47.716] [info] [network_graph.cpp:17] Construct graph from network edges start
[2022-08-10 10:33:47.719] [info] [network_graph.cpp:30] Graph nodes 9739 edges 27039
[2022-08-10 10:33:47.719] [info] [network_graph.cpp:31] 

In [34]:
# preprocess the mapping especially the speed and distance values need to be verified
df = pd.read_csv("../datasets/trajectories/SF/road_segment_map_final.csv", sep=";")
df_prep = remove_outlier_trajectories(df.copy(), min_edges_traversed=3)
df_prep.to_csv("../datasets/trajectories/SF/road_segment_map_final.csv", sep=";")

Pandas Apply:   0%|          | 0/249765 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/249765 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/249765 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/249765 [00:00<?, ?it/s]

In [32]:
df = pd.read_csv("../datasets/trajectories/SF/road_segment_map_final.csv", sep=";")
df = df[df["speed_mean"] * 111000 * 3.6 < 100]
df.to_csv("../datasets/trajectories/SF/road_segment_map_final.csv", sep=";")

In [3]:
"""
Test of Travel Time Dataset generation
"""
from generator.trajectory import Trajectory

traj = Trajectory("../datasets/trajectories/sf/road_segment_map_final.csv", nrows=1000000)

In [4]:
temp = pd.read_csv("../datasets/trajectories/sf/road_segment_map_final_corrected_sf.csv", sep=";")

In [4]:
dft = traj.generate_TTE_datatset()
dft["travel_time"].describe()

count    406415.000000
mean        564.055414
std         322.097666
min          23.000000
25%         357.000000
50%         509.000000
75%         707.000000
max       36740.000000
Name: travel_time, dtype: float64

In [6]:
# delete corrupt trajs and save
temp = temp[~temp["id"].isin(dft[dft["travel_time"] <= 10]["id"].values)]
temp.to_csv("../datasets/trajectories/sf/road_segment_map_final.csv", sep=";")

In [5]:
"""
Generate traj features 
"""
features = traj.generate_speed_features(network)


406415it [01:58, 3418.54it/s]


In [12]:
features[features["avg_speed"] < 0] = 0

In [22]:
features.to_csv("../datasets/trajectories/sf/speed_features_unnormalized.csv")

In [11]:
features.describe()

Unnamed: 0,id,util,avg_speed
count,27039.0,27039.0,25533.0
mean,13518.356633,512.289915,33.769484
std,7806.028642,1518.868633,15.443006
min,0.0,0.0,0.0
25%,6758.5,8.0,25.724797
50%,13518.0,40.0,30.652395
75%,20278.5,255.0,37.424775
max,27038.0,30453.0,175.475249


In [43]:
df = pd.read_csv("../datasets/trajectories/SF/road_segment_map_final.csv", sep=";")
df

Unnamed: 0.6,Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,opath,spdist,pgeom,cpath,mgeom,duration,speed,speed_mean
0,0,0,0,0,0,1,1,98229814981135411575822603,"0.00674403,0.00554895,0.00366038,0.00354957,0....","LINESTRING(-122.415273209 37.7874253071,-122.4...","(9822, 9819, 9817, 9816, 9814, 22457, 26567, 2...","LINESTRING(-122.415273209 37.7874253071,-122.4...",6060655560,"(0.000112400450758, 9.24825066826e-05, 5.63135...",0.000071
1,1,1,1,1,1,3,2,"15095,5421,13318,25692,23632,25836,26127,25864...","0.00514844,0.0114677,0.0187276,0.0031394,0.004...","LINESTRING(-122.399238232 37.7884665865,-122.4...","(15095, 10523, 14223, 5421, 15056, 13873, 2435...","LINESTRING(-122.399238232 37.7884665865,-122.3...",57506038,"(9.03235166076e-05, 0.000229354420738, 0.00031...",0.000179
2,2,2,2,2,2,4,3,9818150639815114881148411484,"0.00257552,0.00442952,0.00265894,0.00353444,4e-05","LINESTRING(-122.413245623 37.7870676516,-122.4...","(9818, 15061, 15063, 9819, 9817, 9815, 6273, 1...","LINESTRING(-122.413245623 37.7870676516,-122.4...",3962586041,"(6.60389382938e-05, 7.144395095e-05, 4.5843850...",0.000049
3,3,3,3,3,3,6,4,"14833,34,14674,22178,14663,7628,13311,13313,13...","0.00613141,0.00352033,0.0018208,0.00407939,0.0...","LINESTRING(-122.409986952 37.8083077416,-122.4...","(14833, 23801, 4432, 14683, 4186, 34, 6196, 10...","LINESTRING(-122.409986952 37.8083077416,-122.4...",804938,"(7.66426028084e-05, 7.18435653714e-05, 4.79158...",0.000065
4,4,4,4,4,4,9,6,214962148621474213861485414849,"0.00623408,0.00542046,0.00759899,0.00701491,0....","LINESTRING(-122.440825717 37.799194896,-122.43...","(21496, 21493, 21489, 21486, 21484, 21480, 214...","LINESTRING(-122.440825717 37.799194896,-122.44...",61601365543,"(0.000102197982729, 9.0340981251e-05, 5.587494...",0.000083
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249760,249760,249760,257173,264135,269434,1076662,408528,"12118,12122,24037,10856,1801,10504,26646,18634...","0.00154045,0.00328032,0.00477082,0.00503109,0....","LINESTRING(-122.418099719 37.752265361,-122.41...","(12118, 12122, 1312, 24890, 20186, 24037, 2337...","LINESTRING(-122.418099719 37.752265361,-122.41...",625934,"(2.48460219937e-05, 5.55986932499e-05, 0.00014...",0.000074
249761,249761,249761,257174,264136,269435,1076666,408529,"15756,7013,7013,7013,7013,7011,6999,7397,2995,...","8.52819e-05,0.000129157,9.97646e-05,0.00010619...","LINESTRING(-122.4035129 37.7964258,-122.403495...","(15756, 7013, 25534, 7011, 7008, 7006, 7005, 3...","LINESTRING(-122.4035129 37.7964258,-122.4034 3...",6261586129,"(1.37551382518e-06, 2.11733194736e-06, 1.72007...",0.000005
249762,249762,249762,257175,264137,269436,1076668,408530,"18023,18026,18273,10941,15067,24749,15963,1551...","0.0016248,0.0030617,0.00209437,0.00172484,0.00...","LINESTRING(-122.41564603 37.7806874569,-122.41...","(18023, 18026, 8007, 18269, 18273, 14962, 1094...","LINESTRING(-122.41564603 37.7806874569,-122.41...",596061360,"(2.75390112065e-05, 5.10282889126e-05, 3.43340...",0.000155
249763,249763,249763,257176,264138,269437,1076670,408531,"6260,6999,24576,15061,15064,14958,20107,13970,...","0.00076415,0.00939688,0.00741844,0.00303346,0....","LINESTRING(-122.402083161 37.7893605665,-122.4...","(6260, 6999, 14223, 5421, 15056, 13873, 24351,...","LINESTRING(-122.402083161 37.7893605665,-122.4...",11464605848,"(6.70307224428e-06, 0.000146826186917, 0.00012...",0.000075
