In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
import networkx as nx
import osmnx as ox
import json


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%pwd

'/home/cseadmin/dz/TrafficFlowModel/data_process/gis_pipeline'

In [2]:
DATA_PATH = "../../data/"
DATASET = "whc"

DATE_PREFIX = "2020-06-"
START_DAY = 1
END_DAY = 30

DOWNSAMPLING_INTERVAL = 10 #s
TRAJ_SPLIT_INTERVAL = 600
FLOW_AGG_INTERVAL_MINUTE = 5


geo

---

In [3]:
df_edges = gpd.GeoDataFrame.from_file(os.path.join(DATA_PATH, DATASET, f"fmm_{DATASET}", "edges.shp"))

df_edges

Unnamed: 0,fid,u,v,geometry
0,0,39,105,"LINESTRING (114.04424 22.55066, 114.04424 22.5..."
1,1,129,36,"LINESTRING (114.04761 22.55094, 114.04768 22.5..."
2,2,36,68,"LINESTRING (114.04768 22.54928, 114.04667 22.5..."
3,3,68,10,"LINESTRING (114.04667 22.54928, 114.04665 22.5..."
4,4,39,10,"LINESTRING (114.04424 22.55066, 114.04665 22.5..."
...,...,...,...,...
487,487,79,45,"LINESTRING (114.06304 22.53372, 114.06297 22.5..."
488,488,73,79,"LINESTRING (114.06305 22.53176, 114.06304 22.5..."
489,489,102,79,"LINESTRING (114.06133 22.53171, 114.06304 22.5..."
490,490,102,73,"LINESTRING (114.06133 22.53171, 114.06305 22.5..."


In [4]:
df_geo=pd.DataFrame()

df_geo["geo_id"]=df_edges["fid"]
df_geo["type"]="LineString"
df_geo["coordinates"]=df_edges["geometry"].apply(lambda x: list(x.coords))

df_geo

Unnamed: 0,geo_id,type,coordinates
0,0,LineString,"[(114.04423924729905, 22.550663183705367), (11..."
1,1,LineString,"[(114.04761345337049, 22.550935297098224), (11..."
2,2,LineString,"[(114.04767694649549, 22.54928447584822), (114..."
3,3,LineString,"[(114.0466701269419, 22.54928447584822), (114...."
4,4,LineString,"[(114.04423924729905, 22.550663183705367), (11..."
...,...,...,...
487,487,LineString,"[(114.06303991233369, 22.533716708739618), (11..."
488,488,LineString,"[(114.06305083168648, 22.531762144591667), (11..."
489,489,LineString,"[(114.06132557394695, 22.531707547827757), (11..."
490,490,LineString,"[(114.06132557394695, 22.531707547827757), (11..."


In [5]:
df_geo.to_csv(os.path.join(DATA_PATH, DATASET, f"{DATASET}.geo"), index=False)

rel

---

In [6]:
rel=[]
rel_id_counter=0

u_dict={}

for _, row in df_edges.iterrows():
    if row["u"] not in u_dict.keys():
        u_dict[row["u"]]=[]
    u_dict[row["u"]].append(row["fid"])
    
for _, row in df_edges.iterrows():
    for adj_fid in u_dict[row["v"]]:
        rel.append([rel_id_counter, "geo", row["fid"], adj_fid])
        rel_id_counter+=1

df_rel=pd.DataFrame(rel, columns=["rel_id", "type", "origin_id", "destination_id"])

df_rel

Unnamed: 0,rel_id,type,origin_id,destination_id
0,0,geo,0,70
1,1,geo,0,246
2,2,geo,0,252
3,3,geo,1,2
4,4,geo,1,63
...,...,...,...,...
1689,1689,geo,490,244
1690,1690,geo,490,488
1691,1691,geo,491,245
1692,1692,geo,491,489


In [8]:
df_rel.to_csv(os.path.join(DATA_PATH, DATASET, f"{DATASET}.rel"), index=False)

dyna

---

In [4]:
df_fmm_res=pd.read_csv(os.path.join(DATA_PATH, DATASET, f"fmm_{DATASET}", "mr.txt"), sep=";").set_index("id").dropna()
df_fmm_data=pd.read_csv(os.path.join(DATA_PATH, DATASET, f"fmm_{DATASET}", "gps.csv"), sep=";", parse_dates=["time"])

df_fmm_res # 1828761 -> 575713 -> 357217 ----> 1826748
df_fmm_data # 23599651 -> 40249046 -> 39088814 ----> 25828330

Unnamed: 0_level_0,opath,cpath,mgeom
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
65636,447447447447,447,"LINESTRING(114.051336781 22.5324939014,114.051..."
32823,198198448,198199448,"LINESTRING(114.045022079 22.5314131864,114.047..."
50016,489242244,489242244,"LINESTRING(114.061792086 22.5322542882,114.063..."
9378,444198198,444198,"LINESTRING(114.044632613 22.5314018059,114.044..."
6252,25007070,250070,"LINESTRING(114.044239247 22.5506631837,114.044..."
...,...,...,...
1812146,1441452464,1441453913901281303403162464,"LINESTRING(114.045215434 22.5420036316,114.045..."
1812147,"255,8,8,65,65,64,64,60,60,66,304,304,304,272,2...","255,8,65,64,310,311,60,62,66,95,343,342,304,28...","LINESTRING(114.051408403 22.551223899,114.0510..."
1812148,7791010262632327272157,791026273227927627572157,"LINESTRING(114.050582479 22.5511800838,114.051..."
1812149,211202200200447447447425425168164414,211202200446448447425168410164414,"LINESTRING(114.052324582 22.5315317224,114.051..."


Unnamed: 0,id,x,y,time,speed
0,0,114.044230,22.551098,2020-06-01 10:05:36,0.0
1,0,114.050660,22.551142,2020-06-01 10:07:21,5.0
2,0,114.050690,22.551117,2020-06-01 10:08:36,9.0
3,0,114.052180,22.551144,2020-06-01 10:08:51,55.0
4,0,114.056800,22.551168,2020-06-01 10:09:21,40.0
...,...,...,...,...,...
25828325,1826814,114.057686,22.531790,2020-06-30 09:13:41,30.0
25828326,1826815,114.043950,22.531195,2020-06-30 15:13:24,58.0
25828327,1826815,114.051240,22.531390,2020-06-30 15:14:14,41.0
25828328,1826815,114.054790,22.531458,2020-06-30 15:14:44,57.0


In [5]:
def convert_path(row):
    row["opath"]=np.array(row["opath"].split(","), dtype=np.int16)
    row["cpath"]=np.array(row["cpath"].split(","), dtype=np.int16)
    
    return row

df_fmm_res=df_fmm_res.apply(convert_path, axis=1)

df_fmm_res

Unnamed: 0_level_0,opath,cpath,mgeom,cpath_len
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
65636,"[447, 447, 447, 447]",[447],"LINESTRING(114.051336781 22.5324939014,114.051...",1
32823,"[198, 198, 448]","[198, 199, 448]","LINESTRING(114.045022079 22.5314131864,114.047...",3
50016,"[489, 242, 244]","[489, 242, 244]","LINESTRING(114.061792086 22.5322542882,114.063...",3
9378,"[444, 198, 198]","[444, 198]","LINESTRING(114.044632613 22.5314018059,114.044...",2
6252,"[250, 0, 70, 70]","[250, 0, 70]","LINESTRING(114.044239247 22.5506631837,114.044...",3
...,...,...,...,...
1812146,"[144, 145, 246, 4]","[144, 145, 391, 390, 128, 130, 340, 316, 246, 4]","LINESTRING(114.045215434 22.5420036316,114.045...",10
1812147,"[255, 8, 8, 65, 65, 64, 64, 60, 60, 66, 304, 3...","[255, 8, 65, 64, 310, 311, 60, 62, 66, 95, 343...","LINESTRING(114.051408403 22.551223899,114.0510...",30
1812148,"[7, 7, 9, 10, 10, 26, 26, 32, 32, 72, 72, 157]","[7, 9, 10, 26, 27, 32, 279, 276, 275, 72, 157]","LINESTRING(114.050582479 22.5511800838,114.051...",11
1812149,"[211, 202, 200, 200, 447, 447, 447, 425, 425, ...","[211, 202, 200, 446, 448, 447, 425, 168, 410, ...","LINESTRING(114.052324582 22.5315317224,114.051...",11


In [None]:
# no recovery

N=len(df_edges)

flow_matrix=np.zeros((END_DAY - START_DAY + 1, 24 * 60 // FLOW_AGG_INTERVAL_MINUTE, N), dtype=np.int16)

for traj_id in tqdm(df_fmm_res.index):
    time_list=df_fmm_data.loc[df_fmm_data["id"]==traj_id]["time"].values
    road_list=df_fmm_res.loc[traj_id, "opath"]
    
    assert(len(time_list)==len(road_list))
    
    for i in range(len(road_list)):
        time_i=pd.to_datetime(time_list[i])
        day=time_i.day
        mins=time_i.hour*60+time_i.minute
        
        flow_matrix[day-START_DAY][mins//FLOW_AGG_INTERVAL_MINUTE][road_list[i]]+=1

dyna_file=open(os.path.join(DATA_PATH, DATASET, f"{DATASET}_{FLOW_AGG_INTERVAL_MINUTE}min.dyna"), "w")
write_length=dyna_file.write("dyna_id,type,time,entity_id,flow\n") # disable printing return value

dyna_id_counter=0
for day in tqdm(range(flow_matrix.shape[0])):
    if day==18 or day==19:
        continue
    for interval in range(flow_matrix.shape[1]):
        for road in range(flow_matrix.shape[2]):
            write_length=dyna_file.write(f"{dyna_id_counter},"+
                    "state,"+
                    f"{DATE_PREFIX}{str(day+START_DAY).zfill(2)}T{str(interval*FLOW_AGG_INTERVAL_MINUTE//60).zfill(2)}:{str((interval%(60//FLOW_AGG_INTERVAL_MINUTE))*FLOW_AGG_INTERVAL_MINUTE).zfill(2)}:00Z,"+
                    f"{road},"+
                    f"{flow_matrix[day][interval][road]}\n")
            dyna_id_counter+=1
            
dyna_file.close()

  0%|          | 1112/1996483 [00:21<10:47:47, 51.34it/s]

config

---

In [17]:
import json

config={}

config["geo"]={}
config["geo"]["including_types"]=["LineString"]
config["geo"]["LineString"]={}

config["rel"]={}
config["rel"]["including_types"]=["geo"]
config["rel"]["geo"]={}

config["dyna"]={}
config["dyna"]["including_types"]=["state"]
config["dyna"]["state"]={"entity_id": "geo_id", "flow": "num"}

config["info"]={}
config["info"]["data_files"]=f"{DATASET}_{FLOW_AGG_INTERVAL_MINUTE}min_recovered"
config["info"]["geo_file"]=DATASET
config["info"]["rel_file"]=DATASET
config["info"]["data_col"]=["flow"]
config["info"]["output_dim"]=1
# config["info"]["weight_col"]="link_weight"
config["info"]["time_intervals"]=60*FLOW_AGG_INTERVAL_MINUTE
config["info"]["init_weight_inf_or_zero"]="zero" # adj matrix not connected: 0 (inf: infinity)
config["info"]["set_weight_link_or_dist"]="link" # adj matrix 01 (dist: use weight)
config["info"]["calculate_weight_adj"]=False
# config["info"]["weight_adj_epsilon"]=0.1 # disabled when the above is false

json.dump(config, open(os.path.join(DATA_PATH, DATASET, "config.json"), "w", encoding="utf-8"), ensure_ascii=False)
config

{'geo': {'including_types': ['LineString'], 'LineString': {}},
 'rel': {'including_types': ['geo'], 'geo': {}},
 'dyna': {'including_types': ['state'],
  'state': {'entity_id': 'geo_id', 'flow': 'num'}},
 'info': {'data_files': 'sz_taxi_202006_5min_recovered',
  'geo_file': 'sz_taxi_202006',
  'rel_file': 'sz_taxi_202006',
  'data_col': ['flow'],
  'output_dim': 1,
  'time_intervals': 300,
  'init_weight_inf_or_zero': 'zero',
  'set_weight_link_or_dist': 'link',
  'calculate_weight_adj': False}}