# Get the simulation-based distance errors by using Haversine distance instead of network distance
Some areas are too large for the simulation to run, so this part focuses on the distance errors in the selected urban areas.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [15]:
import os
import subprocess
import sys
import yaml
import pandas as pd
from pprint import pprint
import geopandas as gpd
import json

def get_repo_root():
    """Return the top-level directory of the enclosing git repository.

    ``git rev-parse --show-toplevel`` is run from the parent directory of the
    relative path ``'__file__'``. That is a string literal, not the
    ``__file__`` variable, so the starting point is the current working
    directory.

    Returns:
        The repository root path as a string, with trailing whitespace
        (the newline printed by git) stripped.

    Raises:
        subprocess.CalledProcessError: if the git command exits non-zero.
    """
    start_dir = os.path.dirname(os.path.abspath('__file__'))  # os.getcwd()
    git_cmd = ['git', 'rev-parse', '--show-toplevel']
    toplevel = subprocess.check_output(git_cmd,
                                       cwd=start_dir,
                                       universal_newlines=True)
    return toplevel.rstrip()
# Resolve the repository root once (each call spawns a git subprocess) and
# reuse it both for the import path and for building data paths.
ROOT_dir = get_repo_root()
sys.path.append(ROOT_dir)

# Per-region configuration, keyed by region identifier.
with open(ROOT_dir + '/lib/regions.yaml') as f:
    region_manager = yaml.load(f, Loader=yaml.FullLoader)

def get_region_area(region=None):
    """Return the area covered by the zones of ``region``.

    The zone polygons configured for ``region`` in ``region_manager`` are
    read from disk, rows with a missing zone id or a missing geometry are
    dropped, the rest is reprojected to the region's ``utm_epsg`` CRS and
    merged into a single shape.

    Args:
        region: Key of the region in ``region_manager``.

    Returns:
        The area of the merged shape divided by 10**6, i.e. km^2 provided the
        projected CRS uses metres (presumably true for a UTM EPSG code;
        confirm against regions.yaml).
    """
    cfg = region_manager[region]
    utm_epsg, zone_id, zones_path = cfg['utm_epsg'], cfg['zone_id'], cfg['zones_path']

    zones = gpd.read_file(ROOT_dir + zones_path)
    has_zone_id = zones[zone_id].notnull()
    zones = zones.loc[has_zone_id].rename(columns={zone_id: "zone"})
    # Zone ids are forced to integers; astype raises if they cannot be cast.
    zones.zone = zones.zone.astype(int)
    has_geometry = zones.geometry.notnull()
    zones = zones.loc[has_geometry].to_crs(utm_epsg)

    # Dissolving on a constant column unions every zone into one row.
    boundary = zones.assign(a=1).dissolve(by='a')
    area = boundary['geometry'].area / 10**6
    return area.values[0]

## 1. Find the regions for analysis

In [11]:
runid = 7

# Keep only the configured regions whose network-distance trip file for this
# run exists on disk.
regions = []
for region_key in region_manager:
    trips_file = ROOT_dir + f'/dbs/{region_key}/visits/visits_{runid}_trips_dom_network.csv'
    if os.path.exists(trips_file):
        regions.append(region_key)
pprint(regions)

['barcelona',
 'madrid',
 'surabaya',
 'johannesburg',
 'capetown',
 'kualalumpur',
 'cebu',
 'guadalajara',
 'stpertersburg',
 'nairobi']


## 2. Calculate the urban areas (km^2)

In [16]:
# Area of every selected region, keyed by region identifier.
region_area_dict = {}
for region_key in regions:
    region_area_dict[region_key] = get_region_area(region=region_key)
pprint(region_area_dict)

# Append the areas as one JSON document per line; earlier lines are kept.
with open(ROOT_dir + '/results/region_area_urban.txt', 'a') as outfile:
    json.dump(region_area_dict, outfile)
    outfile.write('\n')

{'barcelona': 584.8563942083829,
 'capetown': 2457.0526781278363,
 'cebu': 4878.703238331435,
 'guadalajara': 218.8857087984455,
 'johannesburg': 1663.4723173706666,
 'kualalumpur': 272.3380464548566,
 'madrid': 604.4652865522514,
 'nairobi': 712.7124295209519,
 'stpertersburg': 1450.6040527648972,
 'surabaya': 330.6512424353324}


In [28]:
# Areas rounded to one decimal, listed in the order of `regions`.
pprint([float(format(region_area_dict[x], ".1f")) for x in regions])

[584.9, 604.5, 330.7, 1663.5, 2457.1, 272.3, 4878.7, 218.9, 1450.6, 712.7]


In [37]:
# One row per region with its area, ordered from smallest to largest.
df_area = pd.DataFrame.from_dict(region_area_dict, orient='index', columns=['area'])
df_area = df_area.sort_values(by='area')
pprint(df_area.index)

# Display names of the regions in the same (area-sorted) order.
pprint([region_manager[region_key]['name'] for region_key in df_area.index])

Index(['guadalajara', 'kualalumpur', 'surabaya', 'barcelona', 'madrid',
       'nairobi', 'stpertersburg', 'johannesburg', 'capetown', 'cebu'],
      dtype='object')
['Guadalajara, Mexico',
 'Kuala Lumpur, Malaysia',
 'Surabaya, Indonesia',
 'Barcelona, Spain',
 'Madrid, Spain',
 'Nairobi, Kenya',
 'Saint Petersburg, Russia',
 'Johannesburg, South Africa',
 'Cape Town, South Africa',
 'Cebu, Philippines']


## 3. Merge distance files and save

In [22]:
def region_data_loader(region=None, runid=None, walk_offset=0.4):
    """Load the trip distances of one region for a given run.

    Reads ``visits_{runid}_trips_dom_network.csv`` from the region's
    ``visits`` folder under ``ROOT_dir``, tags every row with the region and
    adds a constant offset to the network distance.

    Args:
        region: Region identifier; used in the file path and written to the
            ``region`` column.
        runid: Run identifier used in the file name.
        walk_offset: Constant added to ``distance_network`` to compensate for
            the distance walked in 5 minutes. Defaults to 0.4 (presumably
            km, matching the distance columns; confirm against the
            simulation output).

    Returns:
        A DataFrame with the columns ``region``, ``distance`` and
        ``distance_network`` (offset already applied).
    """
    df = pd.read_csv(ROOT_dir + f'/dbs/{region}/visits/visits_{runid}_trips_dom_network.csv')
    df['region'] = region
    # Compensate walking distance in 5 min
    df['distance_network'] = df['distance_network'] + walk_offset
    return df.loc[:, ['region', 'distance', 'distance_network']]
# Stack the per-region trip tables into a single frame.
list_df = [region_data_loader(region=x, runid=runid) for x in regions]
df = pd.concat(list_df)

# Keep trips with `distance` above 0.1 whose network distance is not shorter
# than `distance`. The explicit copy makes the filtered frame independent, so
# adding a column below does not trigger pandas' SettingWithCopyWarning.
df = df.loc[(df.distance > 0.1) & (df.distance_network >= df.distance), :].copy()

# NOTE: despite its name, 'diff' holds the ratio distance_network / distance,
# not a difference.
df['diff'] = df['distance_network'] / df['distance']
df.head()

Unnamed: 0,region,distance,distance_network,diff
0,barcelona,1.39165,2.116665,1.520975
1,barcelona,10.944446,12.594353,1.150753
2,barcelona,10.02126,12.011026,1.198554
3,barcelona,1.957362,2.667059,1.362578
4,barcelona,5.295774,6.724929,1.269867


In [23]:
# Median network-to-direct distance ratio for each region.
df.groupby('region')['diff'].agg('median')

region
barcelona        1.447533
capetown         1.483635
cebu             1.542900
guadalajara      1.400996
johannesburg     1.423787
kualalumpur      1.521055
madrid           1.421416
nairobi          1.617161
stpertersburg    1.395736
surabaya         1.477869
Name: diff, dtype: float64

In [24]:
# Persist the merged per-trip table (row index omitted) under dbs/.
df.to_csv(f'{ROOT_dir}/dbs/distance_error_simulation.csv', index=False)