In [1]:
import pandas as pd, numpy as np
from choicemodels.tools import distancematrix as dm

# input csv of census tract centroids; read below using its GEOID10, lat and lng columns
tract_centroids_file = '../data/bay_tract_centroids.csv'
# output csv that the great-circle distance vector is written to at the end of the notebook
distance_matrix_file = '../data/bay_tracts_distance_matrix.csv'

  from pandas.core import datetools


## Load the data

In [2]:
# read lat/lng as float64 so the coordinates keep enough significant digits,
# and GEOID10 as text so the tract IDs keep their leading zeros
dtypes = dict(GEOID10=str, lat=np.float64, lng=np.float64)
df = pd.read_csv(tract_centroids_file, dtype=dtypes, encoding='utf-8')
df = df.sort_values(by='GEOID10')

# number of tracts loaded
len(df)

1588

In [3]:
# alternatively, create a randomized dataframe of length n to test performance relative to size
#n = 5000
#df = pd.DataFrame({'GEOID10':range(n), 'lng':np.random.random(n), 'lat':np.random.random(n)})

In [4]:
# index the dataframe by place identifier (i.e., census tract ID)
# the guard keeps this cell safe to re-run: once GEOID10 has become the index it
# is no longer a column, so calling set_index a second time would raise a KeyError
if df.index.name != 'GEOID10':
    df = df.set_index('GEOID10')

## Calculate distance matrices, reindexed as multi-index vectors

#### First, the Euclidean distance vector in units of degrees

In [5]:
%%time
# euclidean distance between every pair of tract centroids, in units of degrees;
# the result comes back as a single flat vector indexed by pairs of tract IDs
# rather than as a square matrix
df_eu_dm = dm.distance_matrix(df, method='euclidean')

Wall time: 74 ms


In [6]:
# sanity check: the flat vector should hold one entry per ordered pair of tracts (n * n)
print(df_eu_dm.shape)

# preview the first five tract pairs and their distances in degrees
df_eu_dm.head(n=5)

(2521744L,)


06001400100  06001400100    0.000000
             06001400200    0.026261
             06001400300    0.035165
             06001400400    0.032078
             06001400500    0.037980
dtype: float64

#### Next, the great-circle distance vector in units of meters

In [7]:
%%time
# great-circle distance between every pair of tract centroids, in units of meters,
# returned in the same flat, pair-indexed layout as the euclidean vector
# NOTE(review): the saved output of this cell contains a warning fragment pointing at
# `arc = np.arccos(cos)` -- presumably floating-point rounding pushes the cosine just
# outside [-1, 1] for identical points; confirm the self-distances come out as 0
df_gc_dm = dm.distance_matrix(df, method='greatcircle')

  arc = np.arccos(cos)


Wall time: 1.53 s


In [8]:
# sanity check: same length as the euclidean vector, one entry per ordered pair of tracts
print(df_gc_dm.shape)

# preview the first five tract pairs and their distances in meters
df_gc_dm.head(n=5)

(2521744L,)


06001400100  06001400100       0
             06001400200    2659
             06001400300    3595
             06001400400    3111
             06001400500    3579
dtype: int32

In [9]:
# write the great-circle distance vector to csv, keeping the tract-pair index
# as the leading columns of each row
# NOTE(review): `header` is left at the pandas default, which is not the same across
# pandas versions for a Series -- pin it explicitly if downstream readers rely on it
df_gc_dm.to_csv(
    distance_matrix_file,
    encoding='utf-8',
    index=True
)