In [2]:
import numpy as np
import pandas as pd
import geopandas as gpd
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
import networkx as nx
import osmnx as ox
import json

# Display the result of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%pwd

'/home/cseadmin/dz/TrafficFlowModel/data_process/gis_pipeline'

In [3]:
# --- Input locations and dataset identifier ---
DATA_PATH = "../../data/"
DATASET = "sz_taxi_202006"
TAXI_DATA_PATH = "../../data/taxi_after_proc/clean202006"

# --- Latitude / longitude limits ---
MIN_LAT, MAX_LAT = 22.5311, 22.5517
MIN_LNG, MAX_LNG = 114.0439, 114.0633

# --- Date range: prefix plus first / last day number ---
DATE_PREFIX = "2020-06-"
START_DAY, END_DAY = 1, 30

# --- Sampling / aggregation settings ---
DOWNSAMPLING_INTERVAL = 5  # s
TRAJ_SPLIT_INTERVAL = 600  # unit not stated here; presumably seconds — TODO confirm
FLOW_AGG_INTERVAL_MINUTE = 5  # minutes

In [12]:
def test_dyna_interval(time, start, end):
    hms=time.split("T")[1].split("Z")[0]
    return hms>=f"{start}:00:00" and hms<f"{end}:00:00"

In [4]:
# Read the recovered .dyna table for the configured dataset and aggregation interval.
dyna_csv_path = os.path.join(
    DATA_PATH,
    DATASET,
    f"{DATASET}_{FLOW_AGG_INTERVAL_MINUTE}min_recovered.dyna",
)
df_dyna = pd.read_csv(dyna_csv_path)

df_dyna

Unnamed: 0,dyna_id,type,time,entity_id,flow,speed
0,0,state,2020-06-01T00:00:00Z,0,18,42.333333
1,1,state,2020-06-01T00:00:00Z,1,2,42.000000
2,2,state,2020-06-01T00:00:00Z,2,1,29.057819
3,3,state,2020-06-01T00:00:00Z,3,0,29.057819
4,4,state,2020-06-01T00:00:00Z,4,13,42.545455
...,...,...,...,...,...,...
3967483,3967483,state,2020-06-30T23:55:00Z,487,0,29.057819
3967484,3967484,state,2020-06-30T23:55:00Z,488,0,29.057819
3967485,3967485,state,2020-06-30T23:55:00Z,489,0,29.057819
3967486,3967486,state,2020-06-30T23:55:00Z,490,0,29.057819


In [16]:
# Flag every record whose time of day falls in the 23:00 hour. Series.map calls
# the predicate once per timestamp string, which avoids the per-row Series that
# DataFrame.iterrows builds for each of the ~4 million records.
in_hour_23 = (
    df_dyna["time"]
    .map(lambda stamp: test_dyna_interval(stamp, 23, 24))
    .astype(bool)  # keeps the mask boolean even when the frame is empty
)

# Keep the remaining rows; their original index labels are preserved.
df_dyna = df_dyna.loc[~in_hour_23]
df_dyna

Unnamed: 0,dyna_id,type,time,entity_id,flow,speed
0,0,state,2020-06-01T00:00:00Z,0,18,42.333333
1,1,state,2020-06-01T00:00:00Z,1,2,42.000000
2,2,state,2020-06-01T00:00:00Z,2,1,29.057819
3,3,state,2020-06-01T00:00:00Z,3,0,29.057819
4,4,state,2020-06-01T00:00:00Z,4,13,42.545455
...,...,...,...,...,...,...
3961579,3961579,state,2020-06-30T22:55:00Z,487,6,35.250000
3961580,3961580,state,2020-06-30T22:55:00Z,488,4,45.250000
3961581,3961581,state,2020-06-30T22:55:00Z,489,3,31.500000
3961582,3961582,state,2020-06-30T22:55:00Z,490,3,48.000000


In [17]:
# Write the filtered table to a "_no23" .dyna file, without the index column.
dyna_no23_path = os.path.join(
    DATA_PATH,
    DATASET,
    f"{DATASET}_{FLOW_AGG_INTERVAL_MINUTE}min_recovered_no23.dyna",
)
df_dyna.to_csv(dyna_no23_path, index=False)

In [18]:
import json  # also imported in the first cell; repeated so this cell runs on its own

# Dataset description that is serialized to config.json below.
config = {
    "geo": {
        "including_types": ["LineString"],
        "LineString": {},
    },
    "rel": {
        "including_types": ["geo"],
        "geo": {},
    },
    "dyna": {
        "including_types": ["state"],
        "state": {"entity_id": "geo_id", "flow": "num"},
    },
    "info": {
        # Base name of the filtered .dyna file (written with the "_no23" suffix).
        "data_files": f"{DATASET}_{FLOW_AGG_INTERVAL_MINUTE}min_recovered_no23",
        "geo_file": DATASET,
        "rel_file": DATASET,
        "data_col": ["flow"],
        "output_dim": 1,
        # "weight_col": "link_weight",
        "time_intervals": 60 * FLOW_AGG_INTERVAL_MINUTE,  # minutes -> seconds
        "init_weight_inf_or_zero": "zero",  # adj matrix not connected: 0 (inf: infinity)
        "set_weight_link_or_dist": "link",  # adj matrix 01 (dist: use weight)
        "calculate_weight_adj": False,
        # "weight_adj_epsilon": 0.1,  # disabled when the above is false
    },
}

# Write through a context manager so the file is flushed and closed right away;
# a bare open() handed to json.dump leaves the handle open until it is garbage-collected.
config_path = os.path.join(DATA_PATH, DATASET, "config.json")
with open(config_path, "w", encoding="utf-8") as config_file:
    json.dump(config, config_file, ensure_ascii=False)
config

{'geo': {'including_types': ['LineString'], 'LineString': {}},
 'rel': {'including_types': ['geo'], 'geo': {}},
 'dyna': {'including_types': ['state'],
  'state': {'entity_id': 'geo_id', 'flow': 'num'}},
 'info': {'data_files': 'sz_taxi_202006_5min_recovered_no23',
  'geo_file': 'sz_taxi_202006',
  'rel_file': 'sz_taxi_202006',
  'data_col': ['flow'],
  'output_dim': 1,
  'time_intervals': 300,
  'init_weight_inf_or_zero': 'zero',
  'set_weight_link_or_dist': 'link',
  'calculate_weight_adj': False}}

dl_traffic

In [2]:
import os  # os.path.join is used below; import it here like pandas so the cell is self-contained
import pandas as pd

# Same values as the configuration cell near the top of the notebook.
DATA_PATH = "../../data/"
TAXI_DATA_PATH = "../../data/taxi_after_proc/clean202006"
DATASET = "sz_taxi_202006"
DOWNSAMPLING_INTERVAL = 5 #s
TRAJ_SPLIT_INTERVAL = 600
FLOW_AGG_INTERVAL_MINUTE = 5

# NOTE: unpickling can execute arbitrary code; only load pickle files that were
# produced by this pipeline.
df_dlt=pd.read_pickle(os.path.join(DATA_PATH, DATASET, f"{DATASET}_{FLOW_AGG_INTERVAL_MINUTE}min_recovered_dlt.pkl"))

df_dlt

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,482,483,484,485,486,487,488,489,490,491
2020-06-01 00:00:00+00:00,18,2,1,0,13,5,1,5,5,12,...,6,0,5,3,6,18,10,4,10,6
2020-06-01 00:05:00+00:00,14,0,0,1,11,0,0,3,8,4,...,5,0,5,6,5,20,18,6,12,8
2020-06-01 00:10:00+00:00,20,2,0,0,18,6,0,7,8,12,...,5,0,1,0,3,22,17,13,10,8
2020-06-01 00:15:00+00:00,19,0,1,1,21,5,0,5,7,5,...,2,1,3,3,2,16,4,10,9,12
2020-06-01 00:20:00+00:00,14,2,0,1,16,9,1,10,5,5,...,4,1,7,8,4,16,14,9,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-30 23:35:00+00:00,0,0,0,0,0,1,0,1,1,0,...,1,0,0,1,1,3,3,0,0,0
2020-06-30 23:40:00+00:00,0,0,1,1,1,0,0,0,1,0,...,0,0,1,0,0,2,1,1,0,1
2020-06-30 23:45:00+00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-06-30 23:50:00+00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Gather the index entries whose hour is 23 or later, then remove those rows in place.
index_drop_list = [stamp for stamp in df_dlt.index if stamp.hour >= 23]

df_dlt.drop(index=index_drop_list, inplace=True)
df_dlt

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,482,483,484,485,486,487,488,489,490,491
2020-06-01 00:00:00+00:00,18,2,1,0,13,5,1,5,5,12,...,6,0,5,3,6,18,10,4,10,6
2020-06-01 00:05:00+00:00,14,0,0,1,11,0,0,3,8,4,...,5,0,5,6,5,20,18,6,12,8
2020-06-01 00:10:00+00:00,20,2,0,0,18,6,0,7,8,12,...,5,0,1,0,3,22,17,13,10,8
2020-06-01 00:15:00+00:00,19,0,1,1,21,5,0,5,7,5,...,2,1,3,3,2,16,4,10,9,12
2020-06-01 00:20:00+00:00,14,2,0,1,16,9,1,10,5,5,...,4,1,7,8,4,16,14,9,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-30 22:35:00+00:00,18,0,0,0,9,2,1,3,2,1,...,3,0,1,1,3,8,3,4,6,7
2020-06-30 22:40:00+00:00,10,2,0,0,11,5,0,5,1,4,...,5,1,1,2,5,8,5,3,3,2
2020-06-30 22:45:00+00:00,5,0,1,0,5,2,0,3,4,1,...,3,0,1,1,2,9,5,2,4,2
2020-06-30 22:50:00+00:00,8,0,0,0,10,1,1,2,2,3,...,3,0,0,0,3,7,2,0,4,3


In [7]:
# Pickle the filtered frame under a "_no23" file name.
dlt_no23_path = os.path.join(
    DATA_PATH,
    DATASET,
    f"{DATASET}_{FLOW_AGG_INTERVAL_MINUTE}min_recovered_dlt_no23.pkl",
)
df_dlt.to_pickle(dlt_no23_path)

In [8]:
# Load the pickled speed frame for the configured dataset and aggregation interval.
speed_dlt_path = os.path.join(
    DATA_PATH,
    DATASET,
    f"{DATASET}_{FLOW_AGG_INTERVAL_MINUTE}min_recovered_speed_dlt.pkl",
)
df_speed_dlt = pd.read_pickle(speed_dlt_path)

df_speed_dlt

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,482,483,484,485,486,487,488,489,490,491
2020-06-01 00:00:00+00:00,42.3333,42,29.0578,29.0578,42.5455,29.0578,29.0578,44.5,24.6667,26.9,...,3,29.0578,24,29.0578,48.5,21.9286,48.7143,34.75,41.5556,45.5
2020-06-01 00:05:00+00:00,30.5,29.0578,29.0578,29.0578,45.5455,29.0578,29.0578,33.6667,18.5714,39.25,...,29.0578,29.0578,27.25,32.5,18,32.9375,40.6471,37,41.2222,60
2020-06-01 00:10:00+00:00,34,20.5,29.0578,29.0578,46.125,29.0578,29.0578,36.2,9.25,42.2727,...,31.6667,29.0578,29.0578,29.0578,23.5,32.0588,40.7647,33.25,41.7143,53
2020-06-01 00:15:00+00:00,31.9231,29.0578,29.0578,29.0578,40.8,36,29.0578,43,20.3333,55.6667,...,29.0578,28,29.0578,29,44,32,30.5,38.2222,50.5,55.4
2020-06-01 00:20:00+00:00,27.9091,29.0578,29.0578,29.0578,42.9231,51,29.0578,43.5,14.5,38.8,...,39,32,20,26,26.5,28.3571,39.25,36.6667,36.375,29.0578
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-30 23:35:00+00:00,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,...,29.0578,29.0578,29.0578,29.0578,29.0578,40.3333,47.6667,29.0578,29.0578,29.0578
2020-06-30 23:40:00+00:00,29.0578,29.0578,29.0578,29.0578,55,29.0578,29.0578,29.0578,32,29.0578,...,29.0578,29.0578,29.0578,29.0578,29.0578,20,25,41,29.0578,29.0578
2020-06-30 23:45:00+00:00,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,...,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578
2020-06-30 23:50:00+00:00,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,...,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578,29.0578


In [9]:
# Same filter as for the flow frame: list the index entries at hour 23 or later
# and drop those rows in place.
index_drop_list = [moment for moment in df_speed_dlt.index if moment.hour >= 23]

df_speed_dlt.drop(index=index_drop_list, inplace=True)
df_speed_dlt

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,482,483,484,485,486,487,488,489,490,491
2020-06-01 00:00:00+00:00,42.3333,42,29.0578,29.0578,42.5455,29.0578,29.0578,44.5,24.6667,26.9,...,3,29.0578,24,29.0578,48.5,21.9286,48.7143,34.75,41.5556,45.5
2020-06-01 00:05:00+00:00,30.5,29.0578,29.0578,29.0578,45.5455,29.0578,29.0578,33.6667,18.5714,39.25,...,29.0578,29.0578,27.25,32.5,18,32.9375,40.6471,37,41.2222,60
2020-06-01 00:10:00+00:00,34,20.5,29.0578,29.0578,46.125,29.0578,29.0578,36.2,9.25,42.2727,...,31.6667,29.0578,29.0578,29.0578,23.5,32.0588,40.7647,33.25,41.7143,53
2020-06-01 00:15:00+00:00,31.9231,29.0578,29.0578,29.0578,40.8,36,29.0578,43,20.3333,55.6667,...,29.0578,28,29.0578,29,44,32,30.5,38.2222,50.5,55.4
2020-06-01 00:20:00+00:00,27.9091,29.0578,29.0578,29.0578,42.9231,51,29.0578,43.5,14.5,38.8,...,39,32,20,26,26.5,28.3571,39.25,36.6667,36.375,29.0578
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-30 22:35:00+00:00,30.6,29.0578,29.0578,29.0578,36.875,29.0578,49,26.5,3,59,...,29.0578,29.0578,29.0578,29.0578,29.0578,25.5714,49,43.3333,46.2,42.3333
2020-06-30 22:40:00+00:00,38.4286,30.5,29.0578,29.0578,38.625,29.0578,29.0578,52.5,10,31.5,...,8,29.0578,26,35.5,29.0578,40.5714,41.3333,33.3333,49.3333,29.0578
2020-06-30 22:45:00+00:00,43.25,29.0578,29.0578,29.0578,39.2,49,29.0578,53,28,29.0578,...,29.0578,29.0578,29.0578,29.0578,29.0578,39.1111,47.6,33,43.3333,29.0578
2020-06-30 22:50:00+00:00,27.4,29.0578,29.0578,29.0578,40.1111,29.0578,29.0578,56.5,40,28,...,16,29.0578,29.0578,29.0578,25,24,47.5,29.0578,52,54


In [10]:
# Pickle the filtered speed frame under a "_no23" file name.
speed_dlt_no23_path = os.path.join(
    DATA_PATH,
    DATASET,
    f"{DATASET}_{FLOW_AGG_INTERVAL_MINUTE}min_recovered_speed_dlt_no23.pkl",
)
df_speed_dlt.to_pickle(speed_dlt_no23_path)