In [320]:
%matplotlib inline

import json
import requests
import pandas as pd
import geopandas as gpd
import polyline as pl
import googlemaps
from itertools import permutations, combinations
from shapely.geometry import LineString

# Data prep for flows tutorial

This notebook gathers the steps required to compute street and straight-line distances between all bike share stations within the city of San Francisco, and to obtain the number of trips that were taken between them over the period from September 2015 to August 2016. It relies on the following two files:

1. Stations

In [321]:
# Endpoint requested further down to obtain the station list as JSON
url = 'http://www.bayareabikeshare.com/stations/json'

## Distances

* Reading the stations file. Keep only those in San Francisco:

In [330]:
# Fetch the station feed. Fail loudly on a hung connection or an HTTP error
# instead of handing an error page to the JSON parser.
resp = requests.get(url, timeout=30)
resp.raise_for_status()
js = resp.json()

# One row per entry of the feed's station list
stns = pd.DataFrame(js['stationBeanList'])

# Keep only the stations whose landmark is San Francisco
stns = stns.loc[stns['landMark'] == 'San Francisco', :]
stns.head(3)

Unnamed: 0,altitude,availableBikes,availableDocks,city,id,is_renting,landMark,lastCommunicationTime,latitude,location,longitude,postalCode,stAddress1,stAddress2,stationName,status,statusKey,statusValue,testStation,totalDocks
14,,5,14,San Francisco,39,True,San Francisco,2017-03-11 09:44:57,37.783871,San Francisco,-122.408433,,Powell Street BART,Market,Powell Street BART,IN_SERVICE,1,In Service,False,19
15,,5,10,San Francisco,41,True,San Francisco,2017-03-11 09:42:26,37.795001,,-122.39997,,Clay at Battery,Clay Street,Clay at Battery,IN_SERVICE,1,In Service,False,15
16,,6,9,San Francisco,42,True,San Francisco,2017-03-11 09:44:36,37.79728,,-122.398436,,Davis at Jackson,Davis Street,Davis at Jackson,IN_SERVICE,1,In Service,False,15


* Compile all possible origin-destination pairs between stations.

In [348]:
# Every ordered (origin, destination) pair of distinct stations, plus the
# same pair as a set so that both directions can be matched to each other.
od_ids = pd.DataFrame(
    [(orig, dest, {orig, dest}) for orig, dest in permutations(stns['id'], 2)],
    columns=['orig', 'dest', 'set'],
)

In [349]:
# Sanity check: show the rows whose unordered pair is stations 41 and 42
od_ids.loc[od_ids['set'] == {41, 42}]

Unnamed: 0,orig,dest,set
42,41,42,"{41, 42}"
83,42,41,"{41, 42}"


* Pull lines + distance from Google

In [350]:
# Read the Google Maps API key from the first line of the local `key` file.
# The context manager closes the handle, which a bare `open(...)` leaves open.
with open('key') as key_file:
    key = key_file.readline().strip('\n')
gmaps = googlemaps.Client(key=key)

In [80]:
%%time
for id, pair in od_ids_u.iterrows():
    xy1 = stns.loc[\
            stns['station_id']==pair['orig'], ['lat', 'long']\
                  ].iloc[0].tolist()
    xy2 = stns.loc[\
            stns['station_id']==pair['dest'], ['lat', 'long']\
                  ].iloc[0].tolist()
    drs = gmaps.directions(xy1, xy2, mode='bicycling')
    line = drs[0]['overview_polyline']['points']
    od_ids_u.loc[id, 'line'] = line

CPU times: user 8.35 s, sys: 485 ms, total: 8.84 s
Wall time: 1min 21s


In [82]:
# Save just in case: keep a CSV copy of the pairs and their encoded lines
od_ids_u.to_csv('lines.csv')

* Encode the routes as `shapely` line objects by decoding them with [`polyline`](https://pypi.python.org/pypi/polyline/1.3.2).

In [242]:
def rearrange(l):
    '''
    Reverse every coordinate pair in `l` (latitude, longitude becomes
    longitude, latitude) so the points follow the XY order that
    `LineString` expects.

    Returns a new list; the input sequence is not modified.
    '''
    return [point[::-1] for point in l]

In [243]:
# Decode each encoded polyline, flip its points to XY order and wrap the
# result in a shapely LineString.
od_ids_u['geometry'] = od_ids_u['line'].apply(
    lambda encoded: LineString(rearrange(pl.decode(encoded)))
)

* Join lines to table with all trips

In [244]:
def idify(od):
    '''
    Build a direction-independent ID for a pair of values: the smallest
    and the largest of them, joined by a dash.
    '''
    return '-'.join((str(min(od)), str(max(od))))

In [259]:
# Tag both tables with the direction-independent pair ID ...
od_ids['id'] = od_ids[['orig', 'dest']].apply(idify, axis='columns')
od_ids_u['id'] = od_ids_u[['orig', 'dest']].apply(idify, axis='columns')

# ... then copy the geometry of each unique pair onto every directed pair
# and drop the helper columns that were only needed for the match.
od = (
    od_ids
    .join(od_ids_u.set_index('id')[['geometry']], on='id')
    .drop(['set', 'id'], axis='columns')
)

* Turn the table into a `GeoDataFrame`

In [260]:
# Promote the table to a GeoDataFrame whose geometry column is the route
# lines, declared as EPSG:4326. The CRS is passed as an authority string
# because the `{'init': 'epsg:4326'}` dict form is deprecated.
od = gpd.GeoDataFrame(od.drop('geometry', axis=1),
                      geometry=od['geometry'],
                      crs='EPSG:4326')

* Project to the NAD83 / California Albers projection ([`EPSG:3310`](http://epsg.io/3310)), expressed in metres.

In [261]:
# Reproject to EPSG:3310 so that lengths are expressed in metres
od = od.to_crs(epsg=3310)

* Obtain street distances

In [268]:
# Length of each route geometry, in the units of the current CRS
od['street_dist'] = od.length

* Obtain straight distances

In [281]:
def straight_dist(line):
    '''
    Length of the straight segment that joins the first and the last
    coordinate of `line`.
    '''
    coords = line.coords
    start, end = coords[0], coords[-1]
    return LineString([start, end]).length

In [282]:
# Apply `straight_dist` to every route geometry
od['straight_dist'] = od['geometry'].apply(straight_dist)

* Index on string ID

In [286]:
# Directional ID of the form "<orig>-<dest>", built column-wise and then
# used as the index of the table
od['id'] = od['orig'].astype(str) + '-' + od['dest'].astype(str)
od = od.set_index('id')

## Trips

* Attach number of trips

In [287]:
# Preview the first rows of the table
od.head()

Unnamed: 0_level_0,orig,dest,geometry,street_dist,straight_dist
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
41-42,41,42,LINESTRING (-211028.4643687008 -21929.58136633...,464.017539,282.679311
41-45,41,45,LINESTRING (-211028.4643687008 -21929.58136633...,775.880245,274.139521
41-46,41,46,LINESTRING (-211028.4643687008 -21929.58136633...,1061.183692,419.825049
41-47,41,47,LINESTRING (-211028.4643687008 -21929.58136633...,975.231435,745.941569
41-48,41,48,LINESTRING (-211028.4643687008 -21929.58136633...,1240.265533,561.747815


* Attach the following information to each line
    * Origin ID
    * Destination ID
    * N. of trips
    * Straight distance
    * Street distance

* Write out as a `GeoJSON`

In [256]:
# Write the table to disk with the GeoJSON driver.
# NOTE(review): `od` was reprojected to EPSG:3310 earlier in the notebook,
# so the file holds projected coordinates — confirm readers expect that.
od.to_file('flows.geojson', driver='GeoJSON')