In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
import networkx as nx
import osmnx as ox
import json


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%pwd

'/home/cseadmin/dz/TrafficFlowModel/data_process/whc'

In [2]:
DATA_PATH = "../../data/"
DATASET = "whc"

DATE_PREFIX = "2020-06-"
START_DAY = 1
END_DAY = 30

DOWNSAMPLING_INTERVAL = 10 #s
TRAJ_SPLIT_INTERVAL = 600
FLOW_AGG_INTERVAL_MINUTE = 5


geo

---

In [3]:
df_edges = gpd.GeoDataFrame.from_file(os.path.join(DATA_PATH, DATASET, f"fmm_{DATASET}", "edges.shp"))

df_edges

Unnamed: 0,fid,u,v,geometry
0,0,43,39,"LINESTRING (114.09928 22.55853, 114.09115 22.5..."
1,1,19,26,"LINESTRING (114.09948 22.54352, 114.09965 22.5..."
2,2,50,40,"LINESTRING (114.08166 22.54314, 114.08166 22.5..."
3,3,40,5,"LINESTRING (114.08166 22.54078, 114.07778 22.5..."
4,4,27,15,"LINESTRING (114.08160 22.55859, 114.07609 22.5..."
...,...,...,...,...
73,73,33,14,"LINESTRING (114.03816 22.51514, 114.04190 22.5..."
74,74,2,55,"LINESTRING (114.04277 22.52264, 114.04451 22.5..."
75,75,34,25,"LINESTRING (114.05128 22.53949, 114.05111 22.5..."
76,76,29,35,"LINESTRING (114.09719 22.55189, 114.09948 22.5..."


In [4]:
df_geo=pd.DataFrame()

df_geo["geo_id"]=df_edges["fid"]
df_geo["type"]="LineString"
df_geo["coordinates"]=df_edges["geometry"].apply(lambda x: list(x.coords))

df_geo

Unnamed: 0,geo_id,type,coordinates
0,0,LineString,"[(114.0992833370242, 22.55852704326107), (114...."
1,1,LineString,"[(114.09948190807, 22.54352257227957), (114.09..."
2,2,LineString,"[(114.08165693315638, 22.543141485105828), (11..."
3,3,LineString,"[(114.08165693315638, 22.540778721145184), (11..."
4,4,LineString,"[(114.08159504088236, 22.55859372615637), (114..."
...,...,...,...
73,73,LineString,"[(114.03816213759897, 22.515138227240058), (11..."
74,74,LineString,"[(114.04276634656432, 22.52264464445106), (114..."
75,75,LineString,"[(114.0512794153435, 22.53948710505066), (114...."
76,76,LineString,"[(114.09718753369062, 22.5518930387035), (114...."


In [5]:
df_geo.to_csv(os.path.join(DATA_PATH, DATASET, f"{DATASET}.geo"), index=False)

rel

---

In [6]:
rel=[]
rel_id_counter=0

u_dict={}

for _, row in df_edges.iterrows():
    if row["u"] not in u_dict.keys():
        u_dict[row["u"]]=[]
    u_dict[row["u"]].append(row["fid"])
    
for _, row in df_edges.iterrows():
    for adj_fid in u_dict[row["v"]]:
        rel.append([rel_id_counter, "geo", row["fid"], adj_fid])
        rel_id_counter+=1

df_rel=pd.DataFrame(rel, columns=["rel_id", "type", "origin_id", "destination_id"])

df_rel

Unnamed: 0,rel_id,type,origin_id,destination_id
0,0,geo,0,39
1,1,geo,1,40
2,2,geo,2,3
3,3,geo,2,41
4,4,geo,3,42
...,...,...,...,...
113,113,geo,75,36
114,114,geo,75,63
115,115,geo,76,37
116,116,geo,77,1


In [7]:
df_rel.to_csv(os.path.join(DATA_PATH, DATASET, f"{DATASET}.rel"), index=False)

dyna

---

In [8]:
df_fmm_res=pd.read_csv(os.path.join(DATA_PATH, DATASET, f"fmm_{DATASET}", "mr.txt"), sep=";").set_index("id").dropna()
df_fmm_data=pd.read_csv(os.path.join(DATA_PATH, DATASET, f"fmm_{DATASET}", "gps.csv"), sep=";", parse_dates=["time"])

df_fmm_res
df_fmm_data

Unnamed: 0_level_0,opath,cpath
id,Unnamed: 1_level_1,Unnamed: 2_level_1
68760,55,5
87504,555,5
84387,390,390
45331,7777777711,771
70323,3939,39
...,...,...
2811080,2525,25
2811081,2525,25
2811083,454444,4544
2811085,272828286868686829,27286829


Unnamed: 0,id,x,y,time,speed
0,0,114.04106,22.556318,2020-06-01 09:58:21,0.000000
1,0,114.04221,22.556417,2020-06-01 09:58:51,34.000000
2,0,114.04333,22.556404,2020-06-01 09:59:07,17.000000
3,0,114.04351,22.555658,2020-06-01 09:59:21,23.000000
4,0,114.04356,22.553564,2020-06-01 10:00:01,13.000000
...,...,...,...,...,...
97979600,2819175,114.05476,22.531572,2020-06-30 11:42:40,76.000000
97979601,2819176,114.04357,22.557222,2020-06-30 14:44:44,1.013353
97979602,2819176,114.04367,22.557001,2020-06-30 14:46:14,3.000000
97979603,2819176,114.04381,22.556934,2020-06-30 14:46:24,13.000000


In [9]:
def convert_path(row):
    row["opath"]=np.array(row["opath"].split(","), dtype=np.int16)
    row["cpath"]=np.array(row["cpath"].split(","), dtype=np.int16)
    
    return row

df_fmm_res=df_fmm_res.apply(convert_path, axis=1)

df_fmm_res

Unnamed: 0_level_0,opath,cpath
id,Unnamed: 1_level_1,Unnamed: 2_level_1
68760,"[5, 5]",[5]
87504,"[5, 5, 5]",[5]
84387,"[39, 0]","[39, 0]"
45331,"[77, 77, 77, 77, 1, 1]","[77, 1]"
70323,"[39, 39]",[39]
...,...,...
2811080,"[25, 25]",[25]
2811081,"[25, 25]",[25]
2811083,"[45, 44, 44]","[45, 44]"
2811085,"[27, 28, 28, 28, 68, 68, 68, 68, 29]","[27, 28, 68, 29]"


In [11]:
# no recovery

N=len(df_edges)

flow_matrix=np.zeros((END_DAY - START_DAY + 1, 24 * 60 // FLOW_AGG_INTERVAL_MINUTE, N), dtype=np.int16)

for traj_id in tqdm(df_fmm_res.index):
    time_list=df_fmm_data.loc[df_fmm_data["id"]==traj_id]["time"].values
    road_list=df_fmm_res.loc[traj_id, "opath"]
    
    assert(len(time_list)==len(road_list))
    
    for i in range(len(road_list)):
        time_i=pd.to_datetime(time_list[i])
        day=time_i.day
        mins=time_i.hour*60+time_i.minute
        
        flow_matrix[day-START_DAY][mins//FLOW_AGG_INTERVAL_MINUTE][road_list[i]]+=1

dyna_file=open(os.path.join(DATA_PATH, DATASET, f"{DATASET}_{FLOW_AGG_INTERVAL_MINUTE}min.dyna"), "w")
write_length=dyna_file.write("dyna_id,type,time,entity_id,flow\n") # disable printing return value

dyna_id_counter=0
for day in tqdm(range(flow_matrix.shape[0])):
    if day==18 or day==19:
        continue
    for interval in range(flow_matrix.shape[1]):
        for road in range(flow_matrix.shape[2]):
            write_length=dyna_file.write(f"{dyna_id_counter},"+
                    "state,"+
                    f"{DATE_PREFIX}{str(day+START_DAY).zfill(2)}T{str(interval*FLOW_AGG_INTERVAL_MINUTE//60).zfill(2)}:{str((interval%(60//FLOW_AGG_INTERVAL_MINUTE))*FLOW_AGG_INTERVAL_MINUTE).zfill(2)}:00Z,"+
                    f"{road},"+
                    f"{flow_matrix[day][interval][road]}\n")
            dyna_id_counter+=1
            
dyna_file.close()

100%|██████████| 153493/153493 [3:13:58<00:00, 13.19it/s]  
100%|██████████| 30/30 [00:01<00:00, 18.23it/s]


config

---

In [10]:
import json

config={}

config["geo"]={}
config["geo"]["including_types"]=["LineString"]
config["geo"]["LineString"]={}

config["rel"]={}
config["rel"]["including_types"]=["geo"]
config["rel"]["geo"]={}

config["dyna"]={}
config["dyna"]["including_types"]=["state"]
config["dyna"]["state"]={"entity_id": "geo_id", "flow": "num"}

config["info"]={}
config["info"]["data_files"]=f"{DATASET}_{FLOW_AGG_INTERVAL_MINUTE}min"
config["info"]["geo_file"]=DATASET
config["info"]["rel_file"]=DATASET
config["info"]["data_col"]=["flow"]
config["info"]["output_dim"]=1
# config["info"]["weight_col"]="link_weight"
config["info"]["time_intervals"]=60*FLOW_AGG_INTERVAL_MINUTE
config["info"]["init_weight_inf_or_zero"]="zero" # adj matrix not connected: 0 (inf: infinity)
config["info"]["set_weight_link_or_dist"]="link" # adj matrix 01 (dist: use weight)
config["info"]["calculate_weight_adj"]=False
# config["info"]["weight_adj_epsilon"]=0.1 # disabled when the above is false

json.dump(config, open(os.path.join(DATA_PATH, DATASET, "config.json"), "w", encoding="utf-8"), ensure_ascii=False)
config

{'geo': {'including_types': ['LineString'], 'LineString': {}},
 'rel': {'including_types': ['geo'], 'geo': {}},
 'dyna': {'including_types': ['state'],
  'state': {'entity_id': 'geo_id', 'flow': 'num'}},
 'info': {'data_files': 'whc_5min',
  'geo_file': 'whc',
  'rel_file': 'whc',
  'data_col': ['flow'],
  'output_dim': 1,
  'time_intervals': 300,
  'init_weight_inf_or_zero': 'zero',
  'set_weight_link_or_dist': 'link',
  'calculate_weight_adj': False}}