In [1]:
%load_ext autoreload
%autoreload 2

import os

import os
import sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import geopandas as gpd
import folium
from shapely.geometry import LineString
from shapely import wkt
import numpy as np
import swifter
from generator.preprocess import *
from generator.preprocess import remove_outlier_trajectories
from generator.road_network import RoadNetwork
import glob
from tqdm import tqdm

In [2]:
network = RoadNetwork()
network.load("../osm_data/sf")

In [80]:
# read data 

all_files = glob.glob(os.path.join("../datasets/trajectories/SF/cabdata" , "*.txt"))

data = []

for filename in all_files:
    tdf = pd.read_csv(filename, index_col=None, header=None, delimiter=" ")
    tdf["tax_id"] = filename.split("/")[-1].split(".")[0].split("_")[1]
    data.append(tdf)

df = pd.concat(data, axis=0, ignore_index=True)
df = df.rename(columns={0: "lat", 1: "long", 2: "occupied", 3: "timestamp"})

In [45]:
df[50:60]

Unnamed: 0,lat,long,occupied,timestamp,tax_id
50,37.78554,-122.42929,1,1213032899,abdremlu
51,37.78459,-122.43728,1,1213032833,abdremlu
52,37.78468,-122.43977,1,1213032791,abdremlu
53,37.78704,-122.44025,1,1213032732,abdremlu
54,37.78834,-122.44049,1,1213032659,abdremlu
55,37.79543,-122.44103,1,1213032509,abdremlu
56,37.79638,-122.44218,0,1213032451,abdremlu
57,37.79638,-122.44218,0,1213032421,abdremlu
58,37.80007,-122.44288,0,1213032362,abdremlu
59,37.80334,-122.44365,0,1213032307,abdremlu


In [81]:

import time 
# group for each taxi
rows = []
for _, g in tqdm(df.groupby("tax_id")):
    # group each occupied trajectory
    trajectories_occu = g[g['occupied'] == 1].groupby((g['occupied'] != 1).cumsum())
    # trajectories_nooccu = g[g['occupied'] == 0].groupby((g['occupied'] != 0).cumsum())
    for _, t in trajectories_occu:
        if t.shape[0] < 5:
            continue
        data = t.to_numpy()
        data[:, 0], data[:, 1] = data[:, 1], data[:, 0].copy()
        seq = LineString(data[::-1, :2])
        rows.append((seq, data[::-1, 3]))
    
    # for _, t in trajectories_nooccu:
    #     if t.shape[0] < 5:
    #         continue
    #     data = t.to_numpy()
    #     seq = LineString(data[::-1, :2])
    #     rows.append((seq, data[::-1, 3]))

processed_df = pd.DataFrame(rows, columns=["POLYLINE", "timestamp"])
processed_df

100%|██████████| 536/536 [01:06<00:00,  8.10it/s]


Unnamed: 0,POLYLINE,timestamp
0,"LINESTRING (-122.41527 37.7874, -122.40859 37....","[1213083415, 1213083475, 1213083535, 121308360..."
1,"LINESTRING (-122.39951 37.78881, -122.4032 37....","[1213081170, 1213081539, 1213081540, 121308154..."
2,"LINESTRING (-122.41304 37.78711, -122.41499 37...","[1213078088, 1213078127, 1213078189, 121307824..."
3,"LINESTRING (-122.40998 37.80838, -122.41186 37...","[1213077504, 1213077580, 1213077634, 121307770..."
4,"LINESTRING (-122.40998 37.80839, -122.41113 37...","[1213076544, 1213076658, 1213076688, 121307679..."
...,...,...
407148,"LINESTRING (-122.43085 37.77202, -122.42726 37...","[1211050722, 1211050787, 1211050834, 121105089..."
407149,"LINESTRING (-122.40082 37.79218, -122.40131 37...","[1211049437, 1211049498, 1211049565, 121104961..."
407150,"LINESTRING (-122.4051 37.78521, -122.40291 37....","[1211047463, 1211047523, 1211047583, 121104765..."
407151,"LINESTRING (-122.38849 37.61609, -122.39863 37...","[1211045500, 1211045565, 1211045614, 121104575..."


In [82]:
city_bounds = network.bounds
df_clipped = clip_trajectories(processed_df.copy(), city_bounds, polyline_convert=True)
df_clipped = filter_min_points(df_clipped, 5)
df_clipped

Dask Apply:   0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,timestamp,POLYLINE,coords
0,"[1213083415, 1213083475, 1213083535, 121308360...","LINESTRING (-122.41527 37.78740, -122.40859 37...","[(-122.41527, 37.7874), (-122.40859, 37.78833)..."
1,"[1213081170, 1213081539, 1213081540, 121308154...","LINESTRING (-122.39951 37.78881, -122.40320 37...","[(-122.39951, 37.78881), (-122.4032, 37.78734)..."
2,"[1213078088, 1213078127, 1213078189, 121307824...","LINESTRING (-122.41304 37.78711, -122.41499 37...","[(-122.41304, 37.78711), (-122.41499, 37.78709..."
3,"[1213077504, 1213077580, 1213077634, 121307770...","LINESTRING (-122.40998 37.80838, -122.41186 37...","[(-122.40998, 37.80838), (-122.41186, 37.80509..."
4,"[1213076544, 1213076658, 1213076688, 121307679...","LINESTRING (-122.40998 37.80839, -122.41113 37...","[(-122.40998, 37.80839), (-122.41113, 37.80156..."
...,...,...,...
685278,"[1211050722, 1211050787, 1211050834, 121105089...","LINESTRING (-122.43085 37.77202, -122.42726 37...","[(-122.43085, 37.77202), (-122.42726, 37.77255..."
685279,"[1211049437, 1211049498, 1211049565, 121104961...","LINESTRING (-122.40082 37.79218, -122.40131 37...","[(-122.40082, 37.79218), (-122.40131, 37.793),..."
685280,"[1211047463, 1211047523, 1211047583, 121104765...","LINESTRING (-122.40510 37.78521, -122.40291 37...","[(-122.4051, 37.78521), (-122.40291, 37.78603)..."
685281,"[1211045500, 1211045565, 1211045614, 121104575...","LINESTRING (-122.39403 37.70355, -122.39694 37...","[(-122.39402857932053, 37.7035474), (-122.3969..."


In [84]:
df_clipped["id"] = np.arange(1, df_clipped.shape[0]+1)
df_clipped.drop("coords", inplace=True, axis=1)
df_clipped["timestamp"] = df_clipped["timestamp"].astype(str)
df_clipped["timestamp"] = df_clipped["timestamp"].str.replace("[", "")
df_clipped["timestamp"] = df_clipped["timestamp"].str.replace("]", "")
df_clipped["timestamp"] = df["timestamp"].str.replace(" ", ", ")
df_clipped.to_csv("../datasets/trajectories/SF/mapped_id_poly_clipped.csv", sep=";", index=False)

In [3]:
df = pd.read_csv("../datasets/trajectories/SF/mapped_id_poly_clipped.csv", sep=";")
df

Unnamed: 0,timestamp,POLYLINE,id
0,1213083415 1213083475 1213083535 1213083600 12...,"LINESTRING (-122.41527 37.7874, -122.40859 37....",1
1,1213081170 1213081539 1213081540 1213081541 12...,"LINESTRING (-122.39951 37.78881, -122.4032 37....",2
2,1213078088 1213078127 1213078189 1213078247 12...,"LINESTRING (-122.41304 37.78711, -122.41499 37...",3
3,1213077504 1213077580 1213077634 1213077708 12...,"LINESTRING (-122.40998 37.80838, -122.41186 37...",4
4,1213076544 1213076658 1213076688 1213076790 12...,"LINESTRING (-122.40998 37.80839, -122.41113 37...",5
...,...,...,...
418134,1211050722 1211050787 1211050834 1211050894 12...,"LINESTRING (-122.43085 37.77202, -122.42726 37...",418135
418135,1211049437 1211049498 1211049565 1211049617 12...,"LINESTRING (-122.40082 37.79218, -122.40131 37...",418136
418136,1211047463 1211047523 1211047583 1211047653 12...,"LINESTRING (-122.4051 37.78521, -122.40291 37....",418137
418137,1211045500 1211045565 1211045614 1211045755 12...,"LINESTRING (-122.39402857932053 37.7035474, -1...",418138


In [35]:
df["POLYLINE"] = df["POLYLINE"].swifter.apply(wkt.loads)
gdf = gpd.GeoDataFrame(df, crs="epsg:4326", geometry="POLYLINE")

Pandas Apply:   0%|          | 0/418139 [00:00<?, ?it/s]

In [4]:
network.fmm_trajectorie_mapping(
    network_file="../osm_data/sf/edges.shp",
    input_file="../datasets/trajectories/SF/mapped_id_poly_clipped.csv",
    output_file="../datasets/trajectories/SF/road-segment-mapping.txt"
)

gps file : ../datasets/trajectories/SF/mapped_id_poly_clipped.csv
id column : id
geom column : POLYLINE
timestamp column : timestamp
x column : x
y column : y
GPS point : false

Result file : ../datasets/trajectories/SF/road-segment-mapping.txt
Output fields: opath pgeom spdist cpath mgeom duration speed 
[2022-08-10 10:33:47.548] [info] [network.cpp:72] Read network from file ../osm_data/sf/edges.shp
Status: success
Time takes 7903.32 seconds
Total points 4269845 matched 4254775
Map match speed 538.353 points/s 

[2022-08-10 10:33:47.698] [info] [network.cpp:170] Number of edges 27039 nodes 9739
[2022-08-10 10:33:47.698] [info] [network.cpp:171] Field index: id 17 source 0 target 1
[2022-08-10 10:33:47.716] [info] [network.cpp:174] Read network done.
[2022-08-10 10:33:47.716] [info] [network_graph.cpp:17] Construct graph from network edges start
[2022-08-10 10:33:47.719] [info] [network_graph.cpp:30] Graph nodes 9739 edges 27039
[2022-08-10 10:33:47.719] [info] [network_graph.cpp:31] 

In [25]:
# preprocess the mapping especially the speed and distance values need to be verified
df = pd.read_csv("../datasets/trajectories/SF/road-segment-mapping.csv", sep=";")
df_prep = remove_outlier_trajectories(df.copy(), min_edges_traversed=3, max_speed=1e1)
df_prep.to_csv("../datasets/trajectories/SF/road_segment_map_final.csv", sep=";")

Pandas Apply:   0%|          | 0/341905 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/341905 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/341905 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[(df["speed_mean"] > max_speed)]["speed"] = df[(df["speed_mean"] > max_speed)][


Pandas Apply:   0%|          | 0/270216 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["speed_mean"] = df["speed"].swifter.apply(np.mean)


In [27]:
df_prep.to_csv("../datasets/trajectories/SF/road_segment_map_final.csv", sep=";")

In [29]:
"""
Test of Travel Time Dataset generation
"""
from generator.trajectory import Trajectory

traj = Trajectory("../datasets/trajectories/SF/road_segment_map_final.csv", nrows=1000000)

Pandas Apply:   0%|          | 0/269439 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/269439 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/269439 [00:00<?, ?it/s]

In [30]:
dft = traj.generate_TTE_datatset()
dft["travel_time"].describe()

Pandas Apply:   0%|          | 0/269439 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/269439 [00:00<?, ?it/s]

count    269439.000000
mean        232.744558
std         237.150987
min           2.000000
25%         149.000000
50%         222.000000
75%         282.000000
max       31286.000000
Name: travel_time, dtype: float64

In [35]:
dft.iloc[dft["travel_time"].argmax()]["seg_seq"]

array([19603, 17960, 26550, 12902, 11994,  2991,  6382, 19183, 13273,
       16003,  7451, 25312, 21810, 13000, 20057, 26892, 27001, 27003,
       26996, 26887, 25692, 23633, 23638, 23625, 25769, 23648, 23632,
       23628, 21593, 21592, 21023, 23603, 25697, 23592, 25836, 23600,
        2443, 19476, 19478, 21852, 23596,  8582, 26114, 26128, 25568,
       19474, 21402, 23605, 26126, 26115, 21400, 21854, 23598,  3209,
       21902,  9912, 25862, 21754, 20981,  2363,  9909,  9903,  9902,
        9898,  9587, 21626, 19154, 21629, 21896,   279, 15914, 13104,
       24406, 21306, 19189, 19187, 12888, 17957, 17388,  4714, 16005,
       13274, 19184,  6383,  2990, 11995, 12903,  9544,  9547,  9549,
       23268, 22455, 23097, 26562, 26565, 25931,  6270, 26042, 11484,
       18051])