In [1]:
import itertools
import json
import numpy as np
import os
import pandas as pd
import pickle
import pprint
import random
import re
import requests
import seaborn
import sys

DATA_DIR = 'data/capital_bikeshare_2013_to_pres'

coordinates = {}

random.seed(90210)

## Data Analysis

In [2]:
r = requests.get('https://secure.capitalbikeshare.com/data/stations.json')

for station in r.json()['stations']:
    coordinates[station['s']] = (station['la'], station['lo'])

In [2]:
df = None

for filename in os.listdir(DATA_DIR):
    if filename.endswith('.csv'):
        tmp_df = pd.read_csv(os.path.join(DATA_DIR, filename))
        df = tmp_df if df is None else df.append(tmp_df)

In [5]:
# there are 11 rows with empty End Station columns
len(df[(df['Start Station'].isnull())]), len(df[(df['End Station'].isnull())])

(0, 11)

In [6]:
starts = df[df['Start Station'].notnull()]['Start Station']
ends = df[df['End Station'].notnull()]['End Station']

In [7]:
stations = set(np.concatenate((starts.values, ends.values)))

In [8]:
print '{:8d} rows'.format(len(df))
print '{:8d} stations'.format(len(stations))

 5964097 rows
     378 stations


In [14]:
# There are some coordinates we got from their site that aren't applicable to our dataset
for loc in set(coordinates) - set(stations):
    try:
        del coordinates[loc]
    except KeyError:
        pass

In [15]:
desired_stations = set(stations) - set(coordinates.keys())
print '{:d}/{:d} locations needed'.format(len(desired_stations), len(stations))

30/378 locations needed


## Geocoding

In [16]:
from geopy.geocoders import Nominatim, GoogleV3

# append Washington D.C. to all nominatim requests
nominatim = Nominatim(format_string='%s Washington D.C.')
google_v3 = GoogleV3()

In [17]:
def intelligent_geocode_guess(s, geolocator):
    m = re.match('(.*)\[(.*)\]', s)
        
    if '/' in s:
        parts = s.split('/')
    elif m:
        parts = m.groups()
    else:
        parts = [s]
        
    assert len(parts) in (1, 2)
        
    return filter(None, [geolocator.geocode(part) for part in parts])

In [18]:
from geopy.distance import great_circle

def geocode_locations(locations, geolocator, gap_threshold=.2):
    # gap threshold is in miles
    results = {}

    for location in locations:
        try:
            guess = intelligent_geocode_guess(location, geolocator)
        
            if len(guess) == 1:
                results[location] = (guess[0].latitude, guess[0].longitude)
            elif len(guess) == 2:
                g1, g2 = guess
                gap = great_circle((g1.latitude, g1.longitude),
                                   (g2.latitude, g2.longitude)).miles
                
                if gap > gap_threshold:
                    print 'Gap threshold of {:f} miles between guesses for "{:s}"'.format(gap, location) 
        except Exception as e:
            print e
    
    return results

In [19]:
print '{:d} desired stations to geocode.'.format(len(desired_stations))

30 desired stations to geocode.


In [20]:
# Try Nominatim, then default to Google
nominatim_results = geocode_locations(desired_stations, nominatim)

Service timed out
Gap threshold of 0.960233 miles between guesses for "McPherson Square / 14th & H St NW"


In [21]:
coordinates = dict(coordinates.items() + nominatim_results.items())

In [22]:
desired_stations = set(stations) - set(coordinates.keys())

In [23]:
print '{:d} desired stations to geocode.'.format(len(desired_stations))

24 desired stations to geocode.


In [24]:
google_results = geocode_locations(desired_stations, google_v3)

Gap threshold of 637.345842 miles between guesses for "Thomas Jefferson Cmty Ctr / 2nd St S & Ivy"
Gap threshold of 0.549459 miles between guesses for "12th & Hayes St /  Pentagon City Metro"
Gap threshold of 1160.609771 miles between guesses for "8th & F St NW / National Portrait Gallery"
Gap threshold of 0.549459 miles between guesses for "Pentagon City Metro / 12th & Hayes St"


In [25]:
coordinates = dict(coordinates.items() + google_results.items())
desired_stations = set(stations) - set(coordinates.keys())
print '{:d} desired stations to geocode.'.format(len(desired_stations))

6 desired stations to geocode.


In [26]:
desired_stations

{'12th & Hayes St /  Pentagon City Metro',
 '8th & F St NW / National Portrait Gallery',
 'Court House Metro / Wilson Blvd & N Uhle St',
 'McPherson Square / 14th & H St NW',
 'Pentagon City Metro / 12th & Hayes St',
 'Thomas Jefferson Cmty Ctr / 2nd St S & Ivy'}

In [27]:
# manual corrections
coordinates['McPherson Square / 14th & H St NW'] = (38.900221, -77.031883)
coordinates['Thomas Jefferson Cmty Ctr / 2nd St S & Ivy'] = (38.869522, -77.093467)
coordinates['12th & Hayes St /  Pentagon City Metro'] = (38.863021, -77.059269)
coordinates['8th & F St NW / National Portrait Gallery'] = (38.897195, -77.022944)
coordinates['Pentagon City Metro / 12th & Hayes St'] = (38.863021, -77.059269)
coordinates['Court House Metro / Wilson Blvd & N Uhle St'] = (38.891556, -77.084864)

In [28]:
desired_stations = set(stations) - set(coordinates.keys())
print '{:d} desired stations to geocode.'.format(len(desired_stations))

0 desired stations to geocode.


## Verification

### Sanity Checks

In [29]:
# A very generous bounding box to detect outliars
lat_min, lat_max = (38.5481, 39.6503)
lon_min, lon_max = (-77.683, -76.0912)

for (loc, (lat, lon)) in coordinates.iteritems():
    if not (lat_min < lat < lat_max) or not (lon_min < lon < lon_max):
        print loc, lat, lon

Alta Tech Office 29.4645374 -98.4837589
1714 Warehouse  46.6897396 -118.9417081
Mo Co Warehouse 39.585491 -94.946311
18th & Hayes St 45.00592 -93.234741


In [30]:
# weird outliars detected
coordinates['1714 Warehouse '] = (38.876701, -77.017907)
coordinates['18th & Hayes St'] = (38.857315, -77.056481)

# can not find
try:
    del coordinates['Mo Co Warehouse']
    del coordinates['Alta Tech Office']
except KeyError:
    pass

In [31]:
# Sanity check the minimum/maximum latitudes and longitudes
print min([lat for lat, _ in coordinates.values()])
print max([lat for lat, _ in coordinates.values()])

print min([lon for _, lon in coordinates.values()])
print max([lon for _, lon in coordinates.values()])

38.801111
39.123513
-77.202501
-76.931862


In [3]:
anomalies = ['Mo Co Warehouse', 'Alta Tech Office']

anomaly_mask = ((df['Start Station'].isin(anomalies)) | 
                (df['End Station'].isin(anomalies)) | 
                (df['Start Station'].isnull()) | 
                (df['End Station'].isnull()))

len(df[anomaly_mask])

30

### Random Sampling

In [37]:
sample = random.sample([{'loc': c, 'lat': coordinates[c][0], 'lon': coordinates[c][1]} for c in coordinates], 10)

In [38]:
df_rand = pd.DataFrame.from_dict(sample)

In [39]:
df_rand[['loc', 'lat', 'lon']]

Unnamed: 0,loc,lat,lon
0,Rhode Island Ave Metro,38.920682,-76.995876
1,Clarendon Metro / Wilson Blvd & N Highland St,38.88786,-77.094875
2,Bethesda Metro,38.984691,-77.094537
3,3rd & D St SE,38.8851,-77.0023
4,Veterans Pl & Pershing Dr,38.997445,-77.023894
5,3000 Connecticut Ave NW / National Zoo,38.930282,-77.055599
6,14th & Harvard St NW,38.9268,-77.0322
7,3rd & H St NE,38.900412,-77.001949
8,Frederick Ave & Horners Ln,39.094772,-77.145213
9,Carroll & Ethan Allen Ave,38.977933,-77.006472


## Exporting

In [40]:
with open(os.path.join(DATA_DIR, 'capital_bikeshare_coordinates_mapping.json'), 'wb') as outfile:
    json.dump(coordinates, outfile)

## Applying to Original Problem

In [4]:
with open(os.path.join(DATA_DIR, 'capital_bikeshare_coordinates_mapping.json'), 'rb') as infile:
    mapping = json.load(infile)
    
len(mapping)

376

In [26]:
# This only returns rows which match anomaly_mask
df[~(df['Start Station'].isin(mapping)) | ~(df['End Station'].isin(mapping))]

Unnamed: 0,Bike number,Bike#,Duration,End Station,End date,End terminal,Member Type,Start Station,Start date,Start terminal,Start time,Subscriber Type,Subscription Type,Total duration (ms)
42832,,W01360,0h 6m 41s,,7/5/2014 11:00,,,Veterans Pl & Pershing Dr,7/5/2014 10:53,32043,,,Registered,
128210,,W01160,0h 25m 42s,,7/13/2014 13:17,,,15th St & Massachusetts Ave SE,7/13/2014 12:51,31626,,,Registered,
368971,,W01297,0h 3m 22s,,8/4/2014 17:39,,,15th & P St NW,8/4/2014 17:35,31201,,,Registered,
658611,,? (0x74BEBCE4),0h 0m 3s,Alta Tech Office,9/11/2013 6:19,32902.0,,Alta Tech Office,9/11/2013 6:19,32902,,,Subscriber,
658612,,? (0x74BEBCE4),0h 25m 6s,Alta Tech Office,9/11/2013 6:45,32902.0,,Alta Tech Office,9/11/2013 6:19,32902,,,Subscriber,
19005,,W01352,0h 12m 17s,Mo Co Warehouse,1/8/2014 8:53,32900.0,Registered,Silver Spring Metro/Colesville Rd & Wayne Ave,1/8/2014 8:41,32012,,,,
23224,,W00308,0h 10m 16s,Mo Co Warehouse,1/9/2014 8:45,32900.0,Registered,Silver Spring Metro/Colesville Rd & Wayne Ave,1/9/2014 8:35,32012,,,,
27560,,W20012,0h 14m 18s,Mo Co Warehouse,1/10/2014 8:52,32900.0,Registered,Silver Spring Metro/Colesville Rd & Wayne Ave,1/10/2014 8:38,32012,,,,
44513,,W01306,0h 9m 4s,Mo Co Warehouse,1/14/2014 8:42,32900.0,Registered,Silver Spring Metro/Colesville Rd & Wayne Ave,1/14/2014 8:33,32012,,,,
47325,,W01306,0h 11m 48s,Mo Co Warehouse,1/14/2014 19:08,32900.0,Registered,Mo Co Warehouse,1/14/2014 18:57,32900,,,,
