In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
import networkx as nx
import osmnx as ox
import json

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%pwd

'/home/cseadmin/dz/TrafficFlowModel/data_process/gis_pipeline'

In [2]:
DATA_PATH = "../../data/"
TAXI_DATA_PATH = "../../data/taxi_after_proc/clean202006"
DATASET = "sz_taxi_202006"

MIN_LAT = 22.5311
MAX_LAT = 22.5517
MIN_LNG = 114.0439
MAX_LNG = 114.0633

DATE_PREFIX = "2020-06-"
START_DAY = 1
END_DAY = 30

DOWNSAMPLING_INTERVAL = 30
TRAJ_SPLIT_INTERVAL = 600
FLOW_AGG_INTERVAL_MINUTE = 15

geo

---

In [3]:
df_edges = gpd.GeoDataFrame.from_file(os.path.join(DATA_PATH, DATASET, f"fmm_{DATASET}", "edges.shp"))

df_edges

Unnamed: 0,fid,u,v,geometry
0,0,39,105,"LINESTRING (114.04424 22.55066, 114.04424 22.5..."
1,1,129,36,"LINESTRING (114.04761 22.55094, 114.04768 22.5..."
2,2,36,68,"LINESTRING (114.04768 22.54928, 114.04667 22.5..."
3,3,68,10,"LINESTRING (114.04667 22.54928, 114.04665 22.5..."
4,4,39,10,"LINESTRING (114.04424 22.55066, 114.04665 22.5..."
...,...,...,...,...
487,487,79,45,"LINESTRING (114.06304 22.53372, 114.06297 22.5..."
488,488,73,79,"LINESTRING (114.06305 22.53176, 114.06304 22.5..."
489,489,102,79,"LINESTRING (114.06133 22.53171, 114.06304 22.5..."
490,490,102,73,"LINESTRING (114.06133 22.53171, 114.06305 22.5..."


In [4]:
df_geo=pd.DataFrame()

df_geo["geo_id"]=df_edges["fid"]
df_geo["type"]="LineString"
df_geo["coordinates"]=df_edges["geometry"].apply(lambda x: list(x.coords))

df_geo

Unnamed: 0,geo_id,type,coordinates
0,0,LineString,"[(114.04423924729905, 22.550663183705367), (11..."
1,1,LineString,"[(114.04761345337049, 22.550935297098224), (11..."
2,2,LineString,"[(114.04767694649549, 22.54928447584822), (114..."
3,3,LineString,"[(114.0466701269419, 22.54928447584822), (114...."
4,4,LineString,"[(114.04423924729905, 22.550663183705367), (11..."
...,...,...,...
487,487,LineString,"[(114.06303991233369, 22.533716708739618), (11..."
488,488,LineString,"[(114.06305083168648, 22.531762144591667), (11..."
489,489,LineString,"[(114.06132557394695, 22.531707547827757), (11..."
490,490,LineString,"[(114.06132557394695, 22.531707547827757), (11..."


In [5]:
df_geo.to_csv(os.path.join(DATA_PATH, DATASET, f"{DATASET}.geo"), index=False)

rel

---

In [6]:
rel=[]
rel_id_counter=0

u_dict={}

for _, row in df_edges.iterrows():
    if row["u"] not in u_dict.keys():
        u_dict[row["u"]]=[]
    u_dict[row["u"]].append(row["fid"])
    
for _, row in df_edges.iterrows():
    for adj_fid in u_dict[row["v"]]:
        rel.append([rel_id_counter, "geo", row["fid"], adj_fid])
        rel_id_counter+=1

df_rel=pd.DataFrame(rel, columns=["rel_id", "type", "origin_id", "destination_id"])

df_rel

Unnamed: 0,rel_id,type,origin_id,destination_id
0,0,geo,0,70
1,1,geo,0,246
2,2,geo,0,252
3,3,geo,1,2
4,4,geo,1,63
...,...,...,...,...
1689,1689,geo,490,244
1690,1690,geo,490,488
1691,1691,geo,491,245
1692,1692,geo,491,489


In [8]:
df_rel.to_csv(os.path.join(DATA_PATH, DATASET, f"{DATASET}.rel"), index=False)

dyna

---

In [5]:
df_fmm_res=pd.read_csv(os.path.join(DATA_PATH, DATASET, f"fmm_{DATASET}", "mr.txt"), sep=";")
df_fmm_data=pd.read_csv(os.path.join(DATA_PATH, DATASET, f"fmm_{DATASET}", "gps.csv"), sep=";", parse_dates=["time"])

df_fmm_res
df_fmm_data

Unnamed: 0,id,opath,cpath,mgeom
0,78132,244476447,244245477476209210211447,"LINESTRING(114.062939001 22.5317586056,114.061..."
1,81256,488488,488,"LINESTRING(114.063049678 22.5319686462,114.063..."
2,79694,19269,19269,"LINESTRING(114.062691727 22.5502455229,114.062..."
3,57826,443435,443435,"LINESTRING(114.044581738 22.5329280644,114.044..."
4,93752,211443,211202445444443,"LINESTRING(114.051461116 22.5315229562,114.051..."
...,...,...,...,...
1996489,1906027,444444,444,"LINESTRING(114.047618321 22.5314890507,114.046..."
1996490,1906028,190191434,190191187434,"LINESTRING(114.045760819 22.5333764018,114.045..."
1996491,1906029,38638936330047,"386,391,389,142,351,363,362,355,303,300,298,29...","LINESTRING(114.044990728 22.5407215039,114.045..."
1996492,1906030,30072,30029829228928728528272,"LINESTRING(114.055735231 22.5421678383,114.055..."


Unnamed: 0,id,x,y,time
0,0,114.050660,22.551142,2020-06-01 10:07:21
1,0,114.050690,22.551117,2020-06-01 10:08:36
2,0,114.054720,22.551180,2020-06-01 10:09:07
3,0,114.057950,22.546215,2020-06-01 10:11:22
4,0,114.059260,22.545100,2020-06-01 10:12:14
...,...,...,...,...
15645397,1996492,114.044890,22.531273,2020-06-30 10:40:31
15645398,1996492,114.060360,22.531437,2020-06-30 10:41:50
15645399,1996492,114.063095,22.533310,2020-06-30 10:45:01
15645400,1996493,114.062830,22.531683,2020-06-30 11:42:00


In [6]:
df_fmm_res[df_fmm_res.isna().any(axis=1)]

df_fmm_res=df_fmm_res.dropna()
df_fmm_res.isna().sum()

Unnamed: 0,id,opath,cpath,mgeom
5081,75125,,,LINESTRING()
190142,193652,,,LINESTRING()
580873,565420,,,LINESTRING()
870480,872909,,,LINESTRING()
918235,980001,,,LINESTRING()
1210606,1242357,,,LINESTRING()
1600715,1656276,,,LINESTRING()
1654564,1633393,,,LINESTRING()
1672602,1626219,,,LINESTRING()
1683996,1623198,,,LINESTRING()


id       0
opath    0
cpath    0
mgeom    0
dtype: int64

In [None]:

N=len(df_edges)

dyna_file=open(os.path.join(DATA_PATH, DATASET, f"{DATASET}_all.dyna"), "w")
write_length=dyna_file.write("dyna_id,type,time,entity_id,flow\n") # disable printing return value

flow_matrix=np.zeros((END_DAY - START_DAY + 1, 24 * 60 // FLOW_AGG_INTERVAL_MINUTE, N), dtype=np.int16)

for traj_id in tqdm(df_fmm_res["id"].values):
    time_list=df_fmm_data.loc[df_fmm_data["id"]==traj_id]["time"].values
    road_list=np.array(df_fmm_res.loc[df_fmm_res["id"]==traj_id]["opath"].values[0].split(","), dtype=np.int16)
    
    assert(len(time_list)==len(road_list))
    
    for i in range(len(road_list)):
        time_i=pd.to_datetime(time_list[i])
        day=time_i.day
        mins=time_i.hour*60+time_i.minute
        
        flow_matrix[day-START_DAY][mins//FLOW_AGG_INTERVAL_MINUTE][road_list[i]]+=1
        
dyna_id_counter=0   
for day in tqdm(range(flow_matrix.shape[0])):
    for interval in range(flow_matrix.shape[1]):
        for road in range(flow_matrix.shape[2]):
            write_length=dyna_file.write(f"{dyna_id_counter},"+
                    "state,"+
                    f"{DATE_PREFIX}{str(day+START_DAY).zfill(2)}T{str(interval*FLOW_AGG_INTERVAL_MINUTE//60).zfill(2)}:{str((interval%(60//FLOW_AGG_INTERVAL_MINUTE))*FLOW_AGG_INTERVAL_MINUTE).zfill(2)}:00Z,"+
                    f"{road},"+
                    f"{flow_matrix[day][interval][road]}\n")
            dyna_id_counter+=1
            
dyna_file.close()

  0%|          | 1112/1996483 [00:21<10:47:47, 51.34it/s]

In [14]:
np.save(os.path.join(DATA_PATH, DATASET, f"{DATASET}_all.npy"), flow_matrix)

In [43]:
df_dyna_all=pd.read_csv(os.path.join(DATA_PATH, DATASET, f"{DATASET}_all.dyna"))

df_dyna_all

Unnamed: 0,dyna_id,type,time,entity_id,flow
0,0,state,2020-06-01T00:00:00Z,0,24
1,1,state,2020-06-01T00:00:00Z,1,3
2,2,state,2020-06-01T00:00:00Z,2,0
3,3,state,2020-06-01T00:00:00Z,3,0
4,4,state,2020-06-01T00:00:00Z,4,27
...,...,...,...,...,...
1416955,1416955,state,2020-06-30T23:45:00Z,487,0
1416956,1416956,state,2020-06-30T23:45:00Z,488,0
1416957,1416957,state,2020-06-30T23:45:00Z,489,0
1416958,1416958,state,2020-06-30T23:45:00Z,490,0


In [46]:
df_dyna=df_dyna_all.loc[(df_dyna_all["time"]<"2020-06-19T00:00:00Z") | (df_dyna_all["time"]>="2020-06-21T00:00:00Z")]

df_dyna
df_dyna.to_csv(os.path.join(DATA_PATH, DATASET, f"{DATASET}.dyna"), index=False)

Unnamed: 0,dyna_id,type,time,entity_id,flow
0,0,state,2020-06-01T00:00:00Z,0,24
1,1,state,2020-06-01T00:00:00Z,1,3
2,2,state,2020-06-01T00:00:00Z,2,0
3,3,state,2020-06-01T00:00:00Z,3,0
4,4,state,2020-06-01T00:00:00Z,4,27
...,...,...,...,...,...
1416955,1416955,state,2020-06-30T23:45:00Z,487,0
1416956,1416956,state,2020-06-30T23:45:00Z,488,0
1416957,1416957,state,2020-06-30T23:45:00Z,489,0
1416958,1416958,state,2020-06-30T23:45:00Z,490,0


In [20]:
# delete date 0619, 0620
flow_matrix_no1920=np.delete(flow_matrix, [18, 19], axis=0)

np.save(os.path.join(DATA_PATH, DATASET, f"{DATASET}.npy"), flow_matrix_no1920)

config

---

In [10]:
import json

config={}

config["geo"]={}
config["geo"]["including_types"]=["LineString"]
config["geo"]["LineString"]={}

config["rel"]={}
config["rel"]["including_types"]=["geo"]
config["rel"]["geo"]={}

config["dyna"]={}
config["dyna"]["including_types"]=["state"]
config["dyna"]["state"]={"entity_id": "geo_id", "flow": "num"}

config["info"]={}
config["info"]["data_files"]=DATASET
config["info"]["geo_file"]=DATASET
config["info"]["rel_file"]=DATASET
config["info"]["data_col"]=["flow"]
config["info"]["output_dim"]=1
# config["info"]["weight_col"]="link_weight"
config["info"]["time_intervals"]=60*FLOW_AGG_INTERVAL_MINUTE
config["info"]["init_weight_inf_or_zero"]="zero" # adj matrix not connected: 0 (inf: infinity)
config["info"]["set_weight_link_or_dist"]="link" # adj matrix 01 (dist: use weight)
config["info"]["calculate_weight_adj"]=False
# config["info"]["weight_adj_epsilon"]=0.1 # disabled when the above is false

json.dump(config, open(os.path.join(DATA_PATH, DATASET, "config.json"), "w", encoding="utf-8"), ensure_ascii=False)
config

{'geo': {'including_types': ['LineString'], 'LineString': {}},
 'rel': {'including_types': ['geo'], 'geo': {}},
 'dyna': {'including_types': ['state'],
  'state': {'entity_id': 'geo_id', 'flow': 'num'}},
 'info': {'data_files': 'sz_taxi_202006',
  'geo_file': 'sz_taxi_202006',
  'rel_file': 'sz_taxi_202006',
  'data_col': ['flow'],
  'output_dim': 1,
  'time_intervals': 900,
  'init_weight_inf_or_zero': 'zero',
  'set_weight_link_or_dist': 'link',
  'calculate_weight_adj': False}}