# Get the data-based distance errors introduced by using Haversine distance instead of network distance

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import subprocess
import sys
import yaml
import pandas as pd

def get_repo_root():
    """Return the top-level directory of the git repository containing the working directory.

    A notebook has no ``__file__``, so the lookup starts from the current
    working directory and asks git for the enclosing work tree.

    Returns:
        str: Output of ``git rev-parse --show-toplevel`` with trailing
        whitespace removed.

    Raises:
        subprocess.CalledProcessError: if the git command exits non-zero.
    """
    start_dir = os.getcwd()
    git_command = ['git', 'rev-parse', '--show-toplevel']
    toplevel = subprocess.check_output(git_command,
                                       cwd=start_dir,
                                       universal_newlines=True)
    return toplevel.rstrip()
# Resolve the repository root once (each call spawns a git subprocess) and
# make the repo importable so that the `lib.*` modules can be loaded.
ROOT_dir = get_repo_root()
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if ROOT_dir not in sys.path:
    sys.path.append(ROOT_dir)

import lib.sweden_sv as sweden_sv
import lib.netherlands as netherlands
import lib.genericvalidation as genericvalidation

# Region configuration read from lib/regions.yaml.
# NOTE(review): this uses PyYAML's FullLoader; if the file could ever come from
# an untrusted source, switch to yaml.safe_load.
with open(ROOT_dir + '/lib/regions.yaml') as regions_file:
    region_manager = yaml.full_load(regions_file)

# Per-region input files plus the loader object used to obtain the zones.
sweden_entry = {
    'home_locations_path': ROOT_dir + "/dbs/sweden/homelocations.csv",
    'tweets_calibration': ROOT_dir + "/dbs/sweden/geotweets_c.csv",
    'tweets_validation': ROOT_dir + "/dbs/sweden/geotweets_v.csv",
    'gt': sweden_sv.GroundTruthLoader()
}
netherlands_entry = {
    'home_locations_path': ROOT_dir + "/dbs/netherlands/homelocations.csv",
    'tweets_calibration': ROOT_dir + "/dbs/netherlands/geotweets_c.csv",
    'tweets_validation': ROOT_dir + "/dbs/netherlands/geotweets_v.csv",
    'gt': netherlands.GroundTruthLoader()
}
region_path = {
    'sweden': sweden_entry,
    'netherlands': netherlands_entry
}

## 1. Load trip survey data
Sweden, the Netherlands

### 1.1 Sweden

In [11]:
# Load the Swedish survey trips and reduce them to weekday trips with complete
# records, keeping only origin zone, destination zone and reported distance.
region = 'sweden'
df_se = pd.read_csv(ROOT_dir + f"/dbs/{region}/survey_deso/day_trips.csv")
df_se = df_se.loc[:, ["sub_id", 'trip_id', 'trip_main_id', 'distance_main',
                      'date', "origin_main_deso", "desti_main_deso", 'trip_weight']]
# Keep a single row per (sub_id, trip_id, trip_main_id) key.
df_se = df_se.drop_duplicates(subset=["sub_id", 'trip_id', 'trip_main_id'])
# Parse all dates in one vectorised call instead of one to_datetime call per row.
# NOTE(review): assumes the `date` column uses one consistent format.
df_se["T"] = pd.to_datetime(df_se["date"])
# dayofweek counts Monday=0 ... Sunday=6, so 5 and 6 mark Saturday and Sunday.
is_weekend = df_se["T"].dt.dayofweek.isin([5, 6])
# Re-bind the result instead of calling dropna(inplace=True) on a filtered
# frame, which can emit SettingWithCopyWarning.
df_se = df_se.loc[~is_weekend, :].dropna(axis=0, how='any')
df_se = df_se.loc[:, ["origin_main_deso",
                      "desti_main_deso",
                      'distance_main']].rename(columns={"origin_main_deso": 'origin',
                                                        "desti_main_deso": 'destination',
                                                        'distance_main': 'distance_network'})

In [None]:
# Prepare the zone-based (ODM) distance calculation: fetch the loader stored
# under 'gt' for the current `region` and have it load its zones, which are
# read afterwards through `ground_truth.zones`.
region_entry = region_path[region]
ground_truth = region_entry['gt']
ground_truth.load_zones()

In [9]:
# Zone-to-zone distances for Sweden, flattened into one row per
# (origin, destination) pair.
zone_pair_distances = genericvalidation.zone_distances(ground_truth.zones)
distances_se = (
    pd.DataFrame(zone_pair_distances)
    .rename_axis(['origin', 'destination'])  # name the two index levels
    .reset_index()                           # turn them into ordinary columns
)
distances_se.columns = ["origin", "destination", 'distance']

Calculating distances between zones...


In [10]:
# Preview the first rows of the Swedish zone-pair distance table.
distances_se.head()

Unnamed: 0,origin,destination,distance
0,0114A0010,0114A0010,0.0
1,0114A0010,0114C1010,4.795251
2,0114A0010,0114C1020,3.548429
3,0114A0010,0114C1030,4.160562
4,0114A0010,0114C1040,2.703179


In [12]:
# Attach the zone-to-zone distance to every survey trip. pd.merge defaults to
# an inner join, so trips whose (origin, destination) pair is absent from
# `distances_se` are dropped.
# Removing any existing `distance` column first keeps this cell idempotent:
# without it a second run would produce `distance_x` / `distance_y` and break
# the later `df_se.distance` filter.
df_se = pd.merge(df_se.drop(columns=['distance'], errors='ignore'),
                 distances_se,
                 on=["origin", "destination"])
df_se.head()


Unnamed: 0,origin,destination,distance_network,distance
0,0127C1060,0127C1060,10.03,0.0
1,0127C1060,0127C1060,10.03,0.0
2,1498C1020,1499C1010,25.95,22.808517
3,1480C1350,1480C1340,0.6,0.368819
4,0180C2420,0180C2420,0.1,0.0


### 1.2 The Netherlands

In [13]:
def trip_row(df):
    """Collapse the rows of one group into a single row.

    The result is the group's first row, with ``dest_zip`` and ``dest_time``
    replaced by the values from the group's last row.
    NOTE(review): this presumably relies on the rows of a group being in
    travel order - confirm against the survey file.

    Args:
        df (pd.DataFrame): Non-empty frame containing at least the columns
            ``dest_zip`` and ``dest_time``.

    Returns:
        pd.Series: A new Series; ``df`` itself is left untouched.

    Raises:
        IndexError: if ``df`` has no rows.
    """
    # Copy first: `df.iloc[0]` can be a view of the frame, in which case the
    # assignments below would write into the caller's data (and pandas would
    # emit SettingWithCopyWarning).
    row = df.iloc[0].copy()
    last = df.iloc[-1]
    row['dest_zip'] = last['dest_zip']
    row['dest_time'] = last['dest_time']
    return row

# Load the Dutch survey file (OViN 2017) and reduce it to one row per
# (OPID, trip_id) with origin zip, destination zip and reported distance.
region = 'netherlands'
sheet1 = pd.read_excel(ROOT_dir + f"/dbs/{region}/mobility_data/OViN2017_Databestand.xlsx")
trips = sheet1[
    ['OPID', 'AfstV', 'Wogem', 'Jaar', 'Maand', 'Dag', 'VerplID',
     'VertUur', 'VertPC', 'AankUur', 'AankPC', 'FactorV']]
# Translate the survey's column names to English.
trips = trips.rename(columns={
    'Wogem': 'home_city',
    'Jaar': 'year',
    'Maand': 'month',
    'Dag': 'day',
    'VerplID': 'trip_id',
    'VertUur': 'origin_time',
    'VertPC': 'origin_zip',
    'AankUur': 'dest_time',
    'AankPC': 'dest_zip',
    'FactorV': 'weight_trip',
    'AfstV': 'distance'
})

# Process
# Rows lacking a trip id or a distance cannot be used.
trips = trips.dropna(subset=['trip_id', 'distance'])
# Re-assign the column instead of writing through `.loc[:, ...]`: the latter
# is an in-place set that warns (or fails) when the stored dtype is integer.
trips = trips.assign(distance=trips['distance'] / 10)  # hectometer to km
# Collapse each (OPID, trip_id) group into a single row via trip_row.
trips = trips.groupby(['OPID', 'trip_id']).apply(trip_row)
trips = trips.astype({'origin_zip': 'int64', 'dest_zip': 'int64'})
df_nt = trips.loc[:, ['origin_zip',
                      'dest_zip',
                      'distance']].rename(columns={'origin_zip': 'origin',
                                                   'dest_zip': 'destination',
                                                   'distance': 'distance_network'})

In [14]:
# Prepare the zone-based (ODM) distance calculation: fetch the loader stored
# under 'gt' for the current `region` and have it load its zones, which are
# read afterwards through `ground_truth.zones`.
region_entry = region_path[region]
ground_truth = region_entry['gt']
ground_truth.load_zones()

In [15]:
# Zone-to-zone distances for the Netherlands, flattened into one row per
# (origin, destination) pair.
zone_pair_distances = genericvalidation.zone_distances(ground_truth.zones)
distances_nt = (
    pd.DataFrame(zone_pair_distances)
    .rename_axis(['origin', 'destination'])  # name the two index levels
    .reset_index()                           # turn them into ordinary columns
)
distances_nt.columns = ["origin", "destination", 'distance']

Calculating distances between zones...


In [16]:
# Preview the first rows of the Dutch zone-pair distance table.
distances_nt.head()

Unnamed: 0,origin,destination,distance
0,1011,1011,0.0
1,1011,1012,0.736195
2,1011,1013,3.463281
3,1011,1014,3.789964
4,1011,1015,1.68383


In [17]:
# Attach the zone-to-zone distance to every survey trip. pd.merge defaults to
# an inner join, so trips whose (origin, destination) pair is absent from
# `distances_nt` are dropped.
# Removing any existing `distance` column first keeps this cell idempotent:
# without it a second run would produce `distance_x` / `distance_y` and break
# the later `df_nt.distance` filter.
df_nt = pd.merge(df_nt.drop(columns=['distance'], errors='ignore'),
                 distances_nt,
                 on=["origin", "destination"])
df_nt.head()

Unnamed: 0,origin,destination,distance_network,distance
0,9718,9712,0.5,1.076508
1,9718,9712,0.5,1.076508
2,9718,9712,2.1,1.076508
3,9718,9712,2.5,1.076508
4,9718,9712,1.2,1.076508


## 2. Merge two areas' data and save it

In [18]:
# Label every row with the region it came from.
df_se = df_se.assign(region='sweden')
df_nt = df_nt.assign(region='netherlands')

# Keep only trips whose zone-to-zone distance is above 0.1.
# NOTE(review): the threshold is presumably in km - confirm the unit returned
# by zone_distances.
keep_se = df_se['distance'] > 0.1
keep_nt = df_nt['distance'] > 0.1
df_se = df_se[keep_se]
df_nt = df_nt[keep_nt]

# Stack both regions into one table (row labels are kept as they are).
df = pd.concat([df_se, df_nt])

In [19]:
# Write the combined table to dbs/distance_error_data.csv under the repo root,
# without the row index.
df.to_csv(ROOT_dir + '/dbs/distance_error_data.csv', index=False)