In [134]:
%load_ext autoreload
%autoreload 2

import os
# Configure CUDA through environment variables before any GPU-aware library is imported:
# devices are ordered by PCI bus ID and the visible-device list is set to "-1".
# NOTE(review): "-1" presumably hides all GPUs so everything runs on the CPU — confirm.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" 
os.environ["CUDA_VISIBLE_DEVICES"]="-1"

import os
import sys

# Put the parent of the current working directory on the import path (once),
# so that packages living one level above the notebook can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import pandas as pd
import geopandas as gpd
import folium
from shapely.geometry import LineString
from shapely import wkt
import numpy as np
import swifter
from generator.preprocess import *
from generator.preprocess import remove_outlier_trajectories
from generator.road_network import RoadNetwork

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [176]:
# Read the Porto trajectory dataset from disk.
df = pd.read_csv("../datasets/trajectories/Porto/train.csv")

# Build the drivable road network for Porto.
network = RoadNetwork(
    "Porto, Portugal",
    network_type="drive",
    retain_all=True,
    truncate_by_edge=True,
)

In [3]:
# Preprocess: clip the trajectories to the Porto bounds and filter by the minimum number of
# points per trajectory (takes around 2h on pascal).
# Trajectories that leave the Porto area and later re-enter it are split into separate trajectories.
fdf = preprocess_trajectories_porto(df, city_bounds=network.bounds, polyline_convert=True)
# Save the dataframe (index=False is not passed, so the row index is written as well).
fdf.to_csv("../datasets/trajectories/Porto/clipped_porto_full_10pmin_2mil.csv")

Pandas Apply:   0%|          | 0/1710660 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["POLYLINE"] = df["POLYLINE"].swifter.apply(convert_to_line_string)


Dask Apply:   0%|          | 0/24 [00:01<?, ?it/s]

In [4]:
"""
Map Timestamps to clipped dataframe. Each trajectory has a start timestamp and between each gps point are 15s.
We need to create a column which holds an array for each trajectory with the timestamps for each gps point.
"""
# Reload the clipped trajectories from the CSV written by the preprocessing step.
fdf = pd.read_csv("../datasets/trajectories/Porto/clipped_porto_full_10pmin_2mil.csv")



In [6]:
from ast import literal_eval
# Bring the trajectories into the format expected by FMM matching and add a timestamp column.

# Seconds between two consecutive GPS points of a trajectory.
GPS_SAMPLING_INTERVAL_S = 15

# .copy() makes the new frame independent of fdf before columns are added or replaced.
df_fmm = fdf.loc[:, ["TRIP_ID", "POLYLINE", "coords"]].copy()
# The coordinates were read back from CSV as strings; parse them into Python objects.
df_fmm["coords"] = df_fmm["coords"].swifter.apply(literal_eval)
# Relative timestamps 0, 15, 30, ... with one entry per GPS point.
# range() yields plain Python ints; NumPy scalars would be written to the CSV through
# their repr, which is "np.int64(0)" under NumPy >= 2 and would corrupt the column.
df_fmm["timestamp"] = df_fmm["coords"].swifter.apply(
    lambda pts: list(range(0, len(pts) * GPS_SAMPLING_INTERVAL_S, GPS_SAMPLING_INTERVAL_S))
)
# Consecutive 1-based trajectory ids.
df_fmm["id"] = np.arange(1, df_fmm.shape[0] + 1)
# The parsed coordinates were only needed to derive the timestamps.
df_fmm = df_fmm.drop(["coords"], axis=1)
df_fmm.to_csv("../datasets/trajectories/Porto/mapped_id_poly_clipped_timestamp.csv", sep=";", index=False)

Pandas Apply:   0%|          | 0/2152277 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2152277 [00:00<?, ?it/s]

In [7]:
# Rewrite the FMM input file with the list brackets removed from the timestamp column,
# leaving a plain comma-separated sequence per trajectory.
tdf = pd.read_csv("../datasets/trajectories/Porto/mapped_id_poly_clipped_timestamp.csv", sep=";")
# regex=False: "[" and "]" are meant literally. Without it the result depends on the
# pandas version's default for `regex` (a lone "[" is not a valid regular expression).
tdf["timestamp"] = (
    tdf["timestamp"]
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
)
# The file is overwritten in place; re-running this cell is harmless because no brackets remain.
tdf.to_csv("../datasets/trajectories/Porto/mapped_id_poly_clipped_timestamp.csv", sep=";", index=False)

In [25]:
from ast import literal_eval
# Prepare the data for checking that each trajectory has as many timestamps as GPS points.
# Reload the timestamp file from disk.
tdf = pd.read_csv("../datasets/trajectories/Porto/mapped_id_poly_clipped_timestamp.csv", sep=";")
# Parse the stringified coordinate lists of the clipped dataframe.
cdf= fdf.loc[:, ["coords"]]
cdf["coords"] = cdf["coords"].swifter.apply(literal_eval)
# NOTE(review): the bracket-stripping step rewrites this file with timestamps like "0, 15, 30";
# literal_eval turns such a string into a tuple (and a lone "0" into an int) — confirm this is intended.
# NOTE(review): no length comparison between cdf and tdf is performed in this cell.
tdf["timestamp"] = tdf["timestamp"].swifter.apply(literal_eval)

In [20]:
# Mean length of the entries in the "timestamp" column, i.e. the average number of
# timestamps per trajectory as long as the column still holds lists.
df_fmm["timestamp"].str.len().mean()

29.01901335190591

In [None]:
# Map GPS points to road segments.
# The result is written to the path that the subsequent outlier-removal step reads
# ("road-segment-mapping.csv"), so the pipeline runs end to end without renaming files.
network.fmm_trajectorie_mapping(
    input_file="../datasets/trajectories/Porto/mapped_id_poly_clipped_timestamp.csv",
    output_file="../datasets/trajectories/Porto/road-segment-mapping.csv"
)

In [142]:
from ast import literal_eval

# Clean the map-matching result; the speed and distance values in particular need verification.
df = pd.read_csv(
    "../datasets/trajectories/Porto/road-segment-mapping.csv",
    sep=";",
)
df = remove_outlier_trajectories(
    df,
    min_edges_traversed=3,
    max_speed=10.0,
)

Pandas Apply:   0%|          | 0/1827216 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1827216 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1827216 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/253901 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[(df["speed_mean"] > max_speed)]["speed"] = df[


Pandas Apply:   0%|          | 0/1798135 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["speed_mean"] = df["speed"].swifter.apply(np.mean)


In [157]:
# Persist the filtered mapping as a ";"-separated CSV (index=False is not passed,
# so the row index is written as the first column).
df.to_csv("../datasets/trajectories/Porto/road_segment_map_final.csv", sep=";")

In [169]:
# Show the columns, dtypes and non-null counts of the filtered mapping.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1544234 entries, 0 to 2152276
Data columns (total 8 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   id          1544234 non-null  int64  
 1   opath       1544234 non-null  object 
 2   spdist      1544234 non-null  object 
 3   cpath       1544234 non-null  object 
 4   mgeom       1544234 non-null  object 
 5   duration    1544234 non-null  object 
 6   speed       1544234 non-null  object 
 7   speed_mean  1544234 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 138.3+ MB


In [185]:
# Inspect a single matched trajectory.
# .copy() gives an independent row, so parsing the geometry below neither writes into `df`
# nor triggers pandas' SettingWithCopyWarning.
sample = df.iloc[3, :].copy()
# Parse the WKT string of the matched geometry into a shapely object.
sample["mgeom"] = wkt.loads(sample["mgeom"])
# print(len(sample["speed"]), len(list(sample["mgeom"].coords)), len(sample["cpath"]))

# First entry of the trajectory's "cpath", used as a positional row index into the edge table.
s = sample["cpath"][0]
so = network.gdf_edges.iloc[s, :]
so

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample["mgeom"] = wkt.loads(sample["mgeom"])


osmid                                  [715176795, 307804461]
oneway                                                  False
highway                                             secondary
length                                                  71.04
geometry    LINESTRING (-8.610926 41.1451899, -8.6109178 4...
lanes                                                       3
ref                                                       NaN
maxspeed                                                   50
bridge                                                    NaN
name                                 Praça de Almeida Garrett
width                                                     NaN
access                                                    NaN
junction                                                  NaN
tunnel                                                    NaN
area                                                      NaN
Name: (129559333, 3130312347, 0), dtype: object

In [187]:
# Plot suspicious examples on a dark base map.
m = folium.Map(location=[41.1372482, -8.689151], zoom_start=13, tiles="cartodbdark_matter")

# Swap the first two components of every coordinate before handing the points to folium.
data = [(pt[1], pt[0]) for pt in sample["mgeom"].coords]
d2 = [(pt[1], pt[0]) for pt in so["geometry"].coords]

# Matched trajectory geometry in green.
folium.PolyLine(data, color="green", weight=2.5, opacity=1).add_to(m)
# Optional overlay of the single road edge in red:
# folium.PolyLine(d2, color="red", weight=2.5, opacity=1).add_to(m)

m

In [30]:
# Show the "mcoords" entry with index label 15.
# NOTE(review): `df_an` is not defined in any cell visible here — presumably it comes from a
# removed or out-of-view cell; confirm before relying on Restart & Run All.
df_an["mcoords"][15]

[(-8.6306538011, 41.149489488),
 (-8.6302954, 41.1499033),
 (-8.6299508, 41.1503011),
 (-8.6295157, 41.1508085),
 (-8.6294233, 41.1509158),
 (-8.6293113, 41.1510435),
 (-8.6291323, 41.1512455),
 (-8.6290736, 41.1513117),
 (-8.6289862, 41.1514767),
 (-8.6289736, 41.1515796),
 (-8.6287691, 41.1523766),
 (-8.628743, 41.1524782),
 (-8.6289655, 41.1524906),
 (-8.6296472, 41.1525286),
 (-8.6301016, 41.152554),
 (-8.6304904, 41.1525757),
 (-8.6306176, 41.1525836),
 (-8.6308068, 41.1525922),
 (-8.6309917, 41.1526006),
 (-8.6313575, 41.1526165),
 (-8.6314821, 41.1526802),
 (-8.6321685, 41.1527301),
 (-8.6323388, 41.1527262),
 (-8.6323916, 41.1527199),
 (-8.6324663, 41.1527113),
 (-8.6326352, 41.1526917),
 (-8.6328135, 41.152688),
 (-8.6328921, 41.1526593),
 (-8.6331975, 41.1526205),
 (-8.633479, 41.1525859),
 (-8.6335202, 41.1525796),
 (-8.6335704, 41.1525732),
 (-8.6339491, 41.1525251),
 (-8.6341695, 41.1524971),
 (-8.6343718, 41.1524692),
 (-8.6344234, 41.1524621),
 (-8.6345977, 41.1524387),
