# Area Extractor

This notebook looks up the area and neighborhood of every Citi Bike station in New York City. The information is obtained from the StreetEasy API ("Area by Location" endpoint), queried with each station's latitude and longitude.

In [1]:
import csv
import json
import os
import pickle

import pandas as pd
import requests
from pandas.io.json import json_normalize

In [2]:
# Parse the station feed. The context manager closes the file handle even if
# parsing raises, instead of leaving it open for the life of the kernel.
with open('stations.json', 'r') as f:
    stations_json = json.load(f)

# Flatten the list stored under 'stationBeanList' into one row per station.
stations_full = json_normalize(stations_json['stationBeanList'])

In [3]:
# Show which fields the station feed provides before choosing the ones to keep.
stations_full.columns

Index([u'altitude', u'availableBikes', u'availableDocks', u'city', u'id', u'landMark', u'lastCommunicationTime', u'latitude', u'location', u'longitude', u'postalCode', u'stAddress1', u'stAddress2', u'stationName', u'statusKey', u'statusValue', u'testStation', u'totalDocks'], dtype='object')

In [4]:
# Keep only the identifier, coordinates and display name of each station,
# then count the non-null entries per column as a completeness check.
station_columns = ['id', 'latitude', 'longitude', 'stationName']
stations = stations_full[station_columns]
stations.count()

id             508
latitude       508
longitude      508
stationName    508
dtype: int64

In [7]:
# Preview the first rows of the trimmed station table.
stations.head()

Unnamed: 0,id,latitude,longitude,stationName
0,72,40.767272,-73.993929,W 52 St & 11 Ave
1,79,40.719116,-74.006667,Franklin St & W Broadway
2,82,40.711174,-74.000165,St James Pl & Pearl St
3,83,40.683826,-73.976323,Atlantic Ave & Fort Greene Pl
4,116,40.741776,-74.001497,W 17 St & 8 Ave


In [21]:
streetEasy_key1 = 'a00fc395e74bd46ece55d52b98b783445bcf2acc'
streetEasy_key2 = '615e0d6b82f9198a3b690baf51383e58b978f635'
streetEasy_key3 = 'fd0c7be4a56199306a34118253c55c580c5d04ca'
streetEasy_key4 = 'aac52f8de6deb59c4ac946347f5a0e7b2003f50d'
streetEasy_key5 = '8778d47f2bc849f2b6d6d0388cd4280382908578'
streetEasy_key6 = 'a1c1df657e16669aade950d84e1e5047fc850daf'
streetEasy_key7 = 'f2634896e8b6fd2071d6387d464fe762c3a20d13'

**get_area(row, streetEasy_key) Function**
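1. Calls the StreetEasy API, authenticating with the given `streetEasy_key`.
2. Looks up the area and neighborhood from the station's latitude and longitude.
3. Adds the returned fields (`city`, `neighborhood`, `neighborhood_id`, `area`, `area_id`) to the station's row. If the lookup fails, placeholder values (`"-"` for text fields, `-1` for ids) are stored instead.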

In [35]:
def get_area(row, streetEasy_key, timeout=10, session=None):
    """Add StreetEasy area and neighborhood fields to one station row.

    Queries the StreetEasy "areas for location" endpoint with the row's
    longitude and latitude and stores the answer on the row. When the lookup
    fails for any reason, placeholder values are stored instead so that a
    batch run keeps going; the failure is printed rather than hidden.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must provide 'id', 'latitude', 'longitude' and 'stationName'.
        It is modified in place and also returned.
    streetEasy_key : str
        StreetEasy API key sent with the request.
    timeout : float, optional
        Seconds to wait for the HTTP response (default 10).
    session : object, optional
        Anything with a ``get(url, params=..., timeout=...)`` method, such as
        a ``requests.Session``. Defaults to the ``requests`` module itself.

    Returns
    -------
    The same ``row`` with 'city', 'neighborhood', 'neighborhood_id', 'area'
    and 'area_id' set ("-" / -1 placeholders when the lookup failed).
    """
    lat = row['latitude']
    lon = row['longitude']
    http = requests if session is None else session
    try:
        # repr(float(...)) yields a plain decimal literal even when the value
        # is a NumPy scalar whose own repr is not a bare number.
        query = [
            ('lon', repr(float(lon))),
            ('lat', repr(float(lat))),
            ('key', streetEasy_key),
            ('format', 'json'),
        ]
        response = http.get('http://streeteasy.com/nyc/api/areas/for_location',
                            params=query, timeout=timeout)
        response.raise_for_status()
        neibs = response.json()
        fields = [
            ('city', neibs['city']),
            # Drops the first three characters of the subtitle -- presumably
            # a leading "in " prefix; TODO confirm against the API response.
            ('neighborhood', neibs['subtitle'][3:]),
            ('neighborhood_id', neibs['parent_id']),
            ('area', neibs['name']),
            ('area_id', neibs['id']),
        ]
    except Exception as err:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
        # stops a long batch run.
        print('get_area: lookup failed for station ' + repr(row['id'])
              + ': ' + repr(err))
        fields = [
            ('city', "-"),
            ('neighborhood', "-"),
            ('neighborhood_id', -1),
            ('area', "-"),
            ('area_id', -1),
        ]
    # Assign one key at a time (not Series.update, which ignores new labels)
    # so the new columns are appended in this fixed order.
    for column, value in fields:
        row[column] = value
    print(repr(row['id']) + '\t' + repr(row['stationName']) + '\t' + repr(row['area']))
    return row

In [None]:
# One (start, stop, key) entry per batch, listed in request order. The work is
# spread over several API keys -- presumably to stay under a per-key request
# quota; TODO confirm. Note that streetEasy_key1 is not used by any batch.
_batch_plan = [
    (0, 90, streetEasy_key2),
    (90, 180, streetEasy_key3),
    (180, 270, streetEasy_key4),
    (270, 360, streetEasy_key5),
    (360, 450, streetEasy_key6),
    (450, 490, streetEasy_key7),
    (490, 495, streetEasy_key2),
    (495, 500, streetEasy_key3),
    (500, 506, streetEasy_key4),
    (506, None, streetEasy_key6),   # stop=None: through the last station
]


def _augment_batch(start, stop, key):
    """Run get_area over stations[start:stop] with a single API key."""
    return stations[start:stop].apply(lambda row: get_area(row, key), axis=1)


# Keep the original per-batch names so later cells can still reference them.
(augmented_stations1, augmented_stations2, augmented_stations3,
 augmented_stations4, augmented_stations5, augmented_stations6,
 augmented_stations7, augmented_stations8, augmented_stations9,
 augmented_stations10) = [
    _augment_batch(start, stop, key) for start, stop, key in _batch_plan
]

In [46]:
# Stack the per-batch results, in batch order, into the final table holding
# the neighborhood information of all stations.
augmented_batches = [
    augmented_stations1,
    augmented_stations2,
    augmented_stations3,
    augmented_stations4,
    augmented_stations5,
    augmented_stations6,
    augmented_stations7,
    augmented_stations8,
    augmented_stations9,
    augmented_stations10,
]
augmented_stations = pd.concat(augmented_batches)

In [39]:
# Non-null count per column of the combined table.
augmented_stations.count()

id                 508
latitude           508
longitude          508
stationName        508
city               508
neighborhood       508
neighborhood_id    508
area               508
area_id            508
dtype: int64

**Dumping this table into a pickle file**

In [43]:
# The context manager flushes and closes the file even if pickle.dump raises,
# so a failed dump cannot leave an open handle behind.
with open('augmented_stations.pickle', 'wb') as pckl_file:
    pickle.dump(augmented_stations, pckl_file)

**Loading table from the pickle file**

In [44]:
# NOTE: unpickling can execute arbitrary code, so only load a pickle file that
# this notebook (or another trusted source) produced.
# Opening through a context manager closes the handle once loading finishes.
with open('augmented_stations.pickle', 'rb') as pckl_in:
    augmented_stations_pickle = pickle.load(pckl_in)

In [45]:
# Non-null count per column of the table read back from the pickle file.
augmented_stations_pickle.count()

id                 508
latitude           508
longitude          508
stationName        508
city               508
neighborhood       508
neighborhood_id    508
area               508
area_id            508
dtype: int64