# Input data preprocessing

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import subprocess
import sys
def get_repo_root():
    """Return the top-level directory of the git repository containing the cwd.

    The lookup is delegated to ``git rev-parse --show-toplevel``, executed from
    the current working directory; trailing whitespace (the newline git prints)
    is stripped from the result.

    Raises:
        subprocess.CalledProcessError: if the git command exits with a
            non-zero status.
    """
    start_dir = os.getcwd()
    git_cmd = ['git', 'rev-parse', '--show-toplevel']
    toplevel = subprocess.check_output(git_cmd,
                                       cwd=start_dir,
                                       universal_newlines=True)
    return toplevel.rstrip()
# Resolve the repo root once (every call shells out to git) and reuse the value.
ROOT_dir = get_repo_root()
# Make repo-level modules importable. The guard keeps a re-run of this cell
# from appending the same entry to sys.path again.
if ROOT_dir not in sys.path:
    sys.path.append(ROOT_dir)

In [2]:
import pandas as pd
import geopandas as gpd

# City to process; its name selects both the shapefile and the flow CSV.
city = 'istanbul'
# Folder inside the repo that holds the OSM shapefiles.
osm_folder = ROOT_dir + '/dbs/osm/'
# Full path of the shapefile for the selected city.
network = osm_folder + f'{city}.shp'

## 1. Load data

In [34]:
# Read the shapefile configured in `network`.
gdf = gpd.read_file(network)
# Cast the ids to int; gdf is merged on 'osm_id' later in this notebook.
gdf['osm_id'] = gdf['osm_id'].astype(int)
# Flow/speed records for the selected city (one CSV per city under dbs/).
df = pd.read_csv(ROOT_dir + f'/dbs/flow_{city}.csv')
# Disabled drop of leftover index columns — presumably only needed for CSV
# exports that still carry 'Unnamed: 0' / 'Unnamed: 0.1'; TODO confirm.
#df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], inplace=True)
# Display the first record to inspect the available columns.
df.iloc[0]

Date                             2018-01-01
Hour                                      0
Detector                                 28
IST_Speed_dir_0                     98.7333
IST_Speed_dir_1                     86.1333
Flow_dir_0                             2295
Flow_dir_1                             3064
Active_Line_Number                        6
Flow_Lane1                              325
Flow_Lane2                              992
Flow_Lane3                              973
Flow_Lane4                             1249
Flow_Lane5                             1285
Flow_Lane6                              535
Flow_Lane7                                0
Flow_Lane8                                0
Speed1                              89.7333
Speed2                              92.0667
Speed3                              109.633
Speed4                                 95.9
Speed5                              82.0333
Speed6                              76.8667
Speed7                          

### 1.1 For flow_istanbul.csv, separate the directions

In [35]:
def _direction_columns(direction):
    """Map the raw column names of one direction to the unified column names.

    Args:
        direction: direction index used as suffix in the raw column names
            (0 or 1 in this notebook).

    Returns:
        dict: raw column name -> unified column name, in output column order.
    """
    return {
        'Hour': 'time',
        'Detector': 'sensor',
        'Here_segmentID': 'HERE_segID',
        f'Flow_dir_{direction}': 'flow',
        f'Here_Speed_direction_{direction}': 'speed',
        f'OSM_ID_SpeedSensor_dir_{direction}': 'osm_id',
        f'Here_FFS_direction_{direction}': 'speed_ff',
        f'IST_Speed_dir_{direction}': 'speed_gt',
    }


def _select_direction(flow_df, direction):
    """Extract the records of one direction with unified column names.

    Args:
        flow_df: wide table holding the columns of both directions.
        direction: direction index to extract (0 or 1 in this notebook).

    Returns:
        pandas.DataFrame: a new frame with the renamed columns plus a
        'direction' column set to `direction`; `flow_df` is not modified.

    Raises:
        KeyError: if one of the expected raw columns is missing.
    """
    columns = _direction_columns(direction)
    selected = flow_df.loc[:, list(columns)].rename(columns=columns)
    selected['direction'] = direction
    return selected


# Direction 0
name_mapping0 = _direction_columns(0)
df0 = _select_direction(df, 0)

# Direction 1
name_mapping1 = _direction_columns(1)
df1 = _select_direction(df, 1)

# Stack both directions into one long table. ignore_index avoids duplicated
# index labels (each input frame carries the same original index).
# NOTE: `df` is overwritten here, so this cell can only run once per load of
# the raw CSV.
df = pd.concat([df0, df1], ignore_index=True)
df.head()

Unnamed: 0,time,sensor,HERE_segID,flow,speed,osm_id,speed_ff,speed_gt,direction
0,0,28,3_10128,2295,80.2,76846059,76.0,98.733333,0
1,0,41,3_10114,2410,80.33,370173491,73.0,77.275862,0
2,0,87,3_7162+,1532,92.91,550287857,84.0,114.310345,0
3,0,144,3_2774+,1953,100.88,174344130,82.0,101.071429,0
4,0,144,3_2776,1953,96.56,4477300,64.0,101.071429,0


### 1.2 Infer the free-flow speed and capacity from sensors

In [36]:
# Free-flow speed estimate: the highest 'speed_gt' observed for each
# (segment, direction) pair, attached to every record of that pair.
df_ff = (
    df.groupby(['HERE_segID', 'direction'])['speed_gt']
      .max()
      .reset_index()
      .rename(columns={'speed_gt': 'speed_gt_ff'})
)
df = df.merge(df_ff, on=['HERE_segID', 'direction'])

# Capacity estimate: the 95th percentile of 'flow' observed for each
# (segment, direction) pair, attached the same way.
df_cap_p = (
    df.groupby(['HERE_segID', 'direction'])['flow']
      .quantile(q=0.95)
      .reset_index()
      .rename(columns={'flow': 'capacity_gt'})
)
df = df.merge(df_cap_p, on=['HERE_segID', 'direction'])
df.iloc[0]

time                  0
sensor               28
HERE_segID      3_10128
flow               2295
speed              80.2
osm_id         76846059
speed_ff             76
speed_gt        98.7333
direction             0
speed_gt_ff         112
capacity_gt      3693.6
Name: 0, dtype: object

In [37]:
# Attach the 'oneway' and 'lanes' attributes of the network to each record.
# The right side is reduced to one row per osm_id so the left join cannot
# multiply records; records without a match get missing values.
df_n = pd.merge(df, gdf.loc[:, ['osm_id', 'oneway', 'lanes']].drop_duplicates(subset=['osm_id']),
                on='osm_id', how='left')

# Missing lanes are assumed to be small roads with lane number = 1.
# NOTE(review): every value other than the strings '1'-'4' (for example '5')
# is mapped to 1 as well — confirm this is intended for wider roads.
# Plain column assignment is used instead of `df_n.loc[:, col] = ...`:
# since pandas 2.0 the .loc form writes into the existing column and keeps its
# old dtype, whereas this always yields an integer column.
df_n['lanes'] = df_n['lanes'].where(df_n['lanes'].isin(['4', '3', '2', '1']), '1').astype(int)

# Missing values are assumed to be small roads with oneway = 1; any value
# other than 0 or 1 is treated the same way.
df_n['oneway'] = df_n['oneway'].where(df_n['oneway'].isin([0, 1]), 1).astype(int)
df_n.head()

Unnamed: 0,time,sensor,HERE_segID,flow,speed,osm_id,speed_ff,speed_gt,direction,speed_gt_ff,capacity_gt,oneway,lanes
0,0,28,3_10128,2295,80.2,76846059,76.0,98.733333,0,112.0,3693.6,1,3
1,1,28,3_10128,2307,80.68,76846059,76.0,98.533333,0,112.0,3693.6,1,3
2,2,28,3_10128,1821,80.63,76846059,76.0,99.814815,0,112.0,3693.6,1,3
3,3,28,3_10128,1431,86.28,76846059,76.0,104.766667,0,112.0,3693.6,1,3
4,4,28,3_10128,1009,89.8,76846059,76.0,106.166667,0,112.0,3693.6,1,3


In [27]:
# Number of records for each (oneway, lanes) combination after cleaning.
df_n.groupby(by=['oneway', 'lanes']).size()

oneway  lanes
0       1         18851
1       1        252374
        2         17150
        3         72752
        4         23107
dtype: int64

In [28]:
# Report how many distinct HERE segment ids the merged table contains.
print('Number of road segments:', df_n['HERE_segID'].unique().size)

Number of road segments: 70


## 2. Define capacity theoretically

In [29]:
# Theoretical capacity keyed by (lanes, oneway).
# Units are not stated here — presumably vehicles per hour; TODO confirm.
capacity_dict = {
    (1, 1): 1200,
    (1, 0): 700,
    (2, 1): 2400,
    (2, 0): 1500,
    (3, 1): 3600,
    (3, 0): 2800,
    (4, 1): 4800,
    (4, 0): 4000,
}

In [38]:
# Look up the theoretical capacity of every record from its (lanes, oneway)
# pair; a pair that is not in capacity_dict raises KeyError.
df_n.loc[:, 'capacity'] = [capacity_dict[(lanes, oneway)]
                           for lanes, oneway in zip(df_n['lanes'], df_n['oneway'])]
# Write the enriched table into the same dbs/ folder as the input CSV.
df_n.to_csv(ROOT_dir + f'/dbs/flow_{city}_m.csv', index=False)
