In [1]:
%load_ext autoreload
%autoreload 2

# Order CUDA devices by PCI bus id and set the visible-device list to "-1".
# NOTE(review): "-1" is not a valid device index, so this presumably hides every
# GPU and forces CPU-only execution — confirm that this is intended.
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" 
os.environ["CUDA_VISIBLE_DEVICES"]="-1"

# NOTE(review): `os` is imported a second time here; the repeat is harmless.
import os
import sys

# Append the parent directory to sys.path (only if it is not already there),
# presumably so the project-local `generator` package can be imported below.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import geopandas as gpd
import folium
from shapely.geometry import LineString
from shapely import wkt
import numpy as np
import swifter
from generator.preprocess import *
from generator.preprocess import remove_outlier_trajectories
from generator.road_network import RoadNetwork

In [15]:
# Load the trajectory dataset (train.csv) into a DataFrame.
# NOTE(review): this path uses a lower-case "porto" directory, while the later
# cells read and write under "Porto" — confirm both resolve on a case-sensitive
# filesystem.
df = pd.read_csv("../datasets/trajectories/porto/train.csv")
# Create the road network object and load it from the OSM data directory.
network = RoadNetwork()
network.load("../osm_data/porto")

In [16]:
# Report the mean in-degree and out-degree of the network's line graph.
line_graph = network.line_graph
nnodes = line_graph.number_of_nodes()

# Each degree view yields (node, degree) pairs; only the degree is summed.
in_total = sum(degree for _, degree in line_graph.in_degree())
out_total = sum(degree for _, degree in line_graph.out_degree())

# First report line: average in-degree (the trailing " \n" separates the lines).
info = "Average in degree: %8.4f \n" % (in_total / float(nnodes))
# Second report line: average out-degree; `deg` keeps the last computed value.
deg = out_total / float(nnodes)
info = info + "Average out degree: %8.4f" % deg

info

'Average in degree:   2.7990\nAverage out degree:   2.7990'

In [4]:
# Display the loaded trip table (bare last expression -> rich notebook output).
df

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."
...,...,...,...,...,...,...,...,...,...
1710665,1404171463620000698,C,,,20000698,1404171463,A,False,"[[-8.612469,41.14602],[-8.612487,41.145993],[-..."
1710666,1404171367620000670,C,,,20000670,1404171367,A,False,"[[-8.610138,41.140845],[-8.610174,41.140935],[..."
1710667,1388745716620000264,C,,,20000264,1388745716,A,False,[]
1710668,1404141826620000248,B,,12.0,20000248,1404141826,A,False,"[[-8.630712,41.154885],[-8.63073,41.154813],[-..."


In [None]:
# Preprocess: clip trajectories to the Porto bounds and filter by a minimum number
# of points per trajectory (takes around 2h on pascal).
# Trajectories that leave the Porto area and later re-enter it are split into
# separate trajectories.
fdf = preprocess_trajectories_porto(df, city_bounds=network.bounds, polyline_convert=True)
# Save the clipped dataframe. The DataFrame index is written as well, because
# `index=False` is not passed.
fdf.to_csv("../datasets/trajectories/Porto/clipped_porto_full_10pmin_2mil.csv")

In [None]:
"""
Map Timestamps to clipped dataframe. Each trajectory has a start timestamp and between each gps point are 15s.
We need to create a column which holds an array for each trajectory with the timestamps for each gps point.
"""
# Read the clipped trajectories back from CSV into `fdf`; this lets the following
# cells run without repeating the long preprocessing step.
fdf = pd.read_csv("../datasets/trajectories/Porto/clipped_porto_full_10pmin_2mil.csv")

In [None]:
from ast import literal_eval
# Map trajectories into the format used for fmm matching and insert a timestamp column.
df_fmm = fdf.loc[:, ["TRIP_ID", "POLYLINE", "coords"]]
# `coords` holds Python literals stored as text (presumably because `fdf` was
# read from CSV); turn every entry back into a Python object.
df_fmm["coords"] = df_fmm["coords"].swifter.apply(literal_eval)
# One relative timestamp per GPS point: 0, 15, 30, ... seconds.
# The list is built from plain Python ints (range) rather than NumPy scalars:
# under NumPy >= 2 the repr of a NumPy scalar is "np.int64(0)", so a list of them
# would be written to the CSV as "[np.int64(0), ...]" instead of "[0, 15, ...]".
# range(0, len(x) * 15, 15) yields the same values as
# np.arange(0, (len(x) - 1) * 15 + 1, 15), including the empty case.
df_fmm["timestamp"] = df_fmm["coords"].swifter.apply(lambda x: list(range(0, len(x) * 15, 15)))
# Consecutive 1-based trajectory id.
df_fmm["id"] = np.arange(1, df_fmm.shape[0]+1)
# The parsed coordinates were only needed to size the timestamp lists.
df_fmm = df_fmm.drop(["coords"], axis=1)
df_fmm.to_csv("../datasets/trajectories/Porto/mapped_id_poly_clipped_timestamp.csv", sep=";", index=False)

In [None]:
# Rewrite the fmm input file with the list brackets removed from the timestamp
# column, so each cell holds a bare comma-separated sequence ("0, 15, 30").
tdf = pd.read_csv("../datasets/trajectories/Porto/mapped_id_poly_clipped_timestamp.csv", sep=";")
# regex=False makes the literal (non-regex) replacement explicit: the default of
# `regex` differs between pandas versions, and "[" alone is not a valid regular
# expression.
tdf["timestamp"] = (
    tdf["timestamp"]
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
)
# The file is overwritten in place; re-running this cell is harmless because the
# brackets are already gone.
tdf.to_csv("../datasets/trajectories/Porto/mapped_id_poly_clipped_timestamp.csv", sep=";", index=False)

In [3]:
from ast import literal_eval
# Intended check: number of timestamps == number of GPS points per trajectory.
# NOTE(review): this cell only parses both columns; no comparison is performed here.
tdf = pd.read_csv("../datasets/trajectories/Porto/mapped_id_poly_clipped_timestamp.csv", sep=";")
# Single-column frame with the coordinates, parsed from text into Python objects.
cdf= fdf.loc[:, ["coords"]]
cdf["coords"] = cdf["coords"].swifter.apply(literal_eval)
# NOTE(review): if the brackets were stripped from this column beforehand, a value
# such as "0, 15, 30" parses to a tuple and a single "0" parses to a bare int —
# confirm before calling len() on the results.
tdf["timestamp"] = tdf["timestamp"].swifter.apply(literal_eval)

In [None]:
# Mean length of the entries in the timestamp column (presumably the average
# number of GPS points per trajectory — assumes the column still holds lists).
df_fmm["timestamp"].str.len().mean()

In [None]:
# Map the Porto GPS points to road segments using fmm -> takes really long!
# NOTE(review): the result is written to "road-segment-mapping.txt", while the
# outlier-filtering cell reads "road-segment-mapping.csv" — confirm whether the
# file is renamed/converted in between or one of the two paths is wrong.
network.fmm_trajectorie_mapping(
    network_file="../osm_data/porto/edges.shp",
    input_file="../datasets/trajectories/Porto/mapped_id_poly_clipped_timestamp.csv",
    output_file="../datasets/trajectories/Porto/road-segment-mapping.txt"
)

In [None]:
# NOTE(review): `literal_eval` is imported but not used in this cell.
from ast import literal_eval

# Preprocess the mapping; especially the speed and distance values need to be verified.
# NOTE(review): this reads "road-segment-mapping.csv", but the fmm cell writes
# "road-segment-mapping.txt" — confirm which file name is correct.
# NOTE(review): `df` is rebound here, replacing the frame loaded from train.csv.
df = pd.read_csv("../datasets/trajectories/Porto/road-segment-mapping.csv", sep=";")
# Filter with min_edges_traversed=3 and max_speed=1e1 (units not visible here — TODO confirm).
df = remove_outlier_trajectories(df,  min_edges_traversed=3, max_speed=1e1)
# Written with ";" as separator; the DataFrame index is written as well.
df.to_csv("../datasets/trajectories/Porto/road_segment_map_final.csv", sep=";")

In [27]:
"""
Test of Travel Time Dataset generation
"""
from generator.trajectory import Trajectory

# Build a Trajectory object from the filtered road-segment mapping file.
traj = Trajectory("../datasets/trajectories/Porto/road_segment_map_final.csv")

In [30]:
# Generate the travel-time-estimation dataset. The method name is spelled
# "datatset" in the project API, so the call must keep that spelling.
dft = traj.generate_TTE_datatset()
# Summary statistics of the resulting travel_time column.
dft["travel_time"].describe()

Pandas Apply:   0%|          | 0/3088468 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1544234 [00:00<?, ?it/s]