In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import seaborn as sns
from shapely.geometry import Point
from geopandas import GeoSeries, GeoDataFrame
from haversine import haversine
%matplotlib inline

In [2]:
# Import the subway entrance geolocation into geopandas
metro_geo = gpd.read_file('data/geolocation/subway_entrances.geojson')
metro_geo.head(3)

Unnamed: 0,geometry,line,name,objectid,url
0,POINT (-73.86835600032798 40.84916900104506),2-5,Birchall Ave & Sagamore St at NW corner,1734,http://web.mta.info/nyct/service/
1,POINT (-73.86821300022677 40.84912800131844),2-5,Birchall Ave & Sagamore St at NE corner,1735,http://web.mta.info/nyct/service/
2,POINT (-73.87349900050798 40.84122300105249),2-5,Morris Park Ave & 180th St at NW corner,1736,http://web.mta.info/nyct/service/


In [53]:
# Import bike station information (only stations that showed up between 2016-04 and 2017-03)
bikes = pd.read_csv('data/station_info_201604_201703.csv').drop('Unnamed: 0', axis=1)
# Row positions removed from the station table.
# NOTE(review): the notebook does not record why these seven rows are excluded
# (presumably problematic/duplicate station entries) -- confirm against the CSV.
rows_to_drop = [20, 51, 64, 116, 275, 470, 475]
# A sorted preview used to sit here, but its result was neither assigned nor
# displayed (it was not the last expression of the cell), so it was removed.
bikes = bikes.drop(bikes.index[rows_to_drop])

In [4]:
# Look up the (latitude, longitude) pair of a station from its station ID
def get_location_by_ID(data, station_id):
    """Return the coordinates stored for ``station_id`` as a flat tuple.

    Parameters
    ----------
    data : pandas.DataFrame
        Station table with ``st_id``, ``latitude`` and ``longitude`` columns.
    station_id
        Value matched against ``data.st_id``.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` when exactly one row matches. The values of
        all matching rows are flattened, so no match gives ``()`` and several
        matches give a tuple longer than two.

    Notes
    -----
    An unknown ID only prints an error message; the lookup still runs and
    the empty tuple is returned.
    """
    known_ids = data.st_id.values
    if not (station_id in known_ids):
        print('Error: the station ID not found in the database')
    matches = data.loc[data.st_id == station_id, ['latitude', 'longitude']]
    return tuple(matches.values.flat)

# Look up the name of a station from its station ID
def get_name_by_ID(data, station_id):
    """Return the ``name`` entries of the rows whose ``st_id`` equals ``station_id``.

    Parameters
    ----------
    data : pandas.DataFrame
        Station table with ``st_id`` and ``name`` columns.
    station_id
        Value matched against ``data.st_id``.

    Returns
    -------
    pandas.Series
        The matching slice of the ``name`` column (original index kept).
        This is a Series, not a scalar string; it is empty when the ID is
        unknown, in which case an error message is also printed.
    """
    id_is_known = station_id in data.st_id.values
    if not id_is_known:
        print('Error: the station ID not found in the database')
    return data.loc[data.st_id == station_id, 'name']

# calculate (estimated) distance between two stations via stations IDs
def station_distance(data, station_id1, station_id2):
    """Return the haversine distance between two stations given their IDs.

    Parameters
    ----------
    data : pandas.DataFrame
        Station table with ``st_id``, ``latitude`` and ``longitude`` columns.
    station_id1, station_id2
        Station IDs looked up in ``data.st_id``.

    Returns
    -------
    float
        ``haversine(loc1, loc2)`` for the two stations' (latitude, longitude)
        pairs; kilometres with the haversine package's default unit.

    Raises
    ------
    ValueError
        If either ID is not present in ``data.st_id``.
    """
    known_ids = data.st_id.values
    missing = [sid for sid in (station_id1, station_id2) if sid not in known_ids]
    if missing:
        # Fail here with a clear message. Previously the message was only
        # printed and execution continued, so haversine() received the empty
        # tuple that get_location_by_ID returns for an unknown ID.
        raise ValueError(
            'Error: the station ID not found in the database: {}'.format(missing)
        )
    loc1 = get_location_by_ID(data, station_id1)
    loc2 = get_location_by_ID(data, station_id2)
    return haversine(loc1, loc2)

In [5]:
station_distance(bikes, 3146,  325)

4.983283090716133

In [6]:
def distance_to_station(station_location, another_location):
    """Return the haversine distance between a station and another point.

    Both arguments are passed straight to ``haversine`` and are expected to be
    ``(latitude, longitude)`` pairs, as that function requires. The result is
    whatever ``haversine`` returns (kilometres with the package's default unit).
    """
    return haversine(station_location, another_location)

In [7]:
# For each location in locations1, find the closest location in locations2.
# The result is a one-column DataFrame: the values are the minimum distances and
# the index holds the position of the matched element in locations2.
def calculate_minimum_distances(locations1, locations2):
    """Find, for every point in ``locations1``, its nearest point in ``locations2``.

    Parameters
    ----------
    locations1, locations2 : sized iterables of (latitude, longitude) pairs
        Each pair is passed to ``haversine`` unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per element of ``locations1`` (in iteration order) with a
        single column ``0`` holding the smallest haversine distance. The
        DataFrame index is the integer position, within ``locations2``, of
        the element that produced that minimum.
    """
    source_count = len(locations1)
    target_count = len(locations2)
    nearest_distance = np.zeros(source_count)
    # Positions are collected as floats and converted to int on return.
    nearest_position = np.zeros(source_count)
    for row, origin in enumerate(locations1):
        to_targets = np.fromiter(
            (haversine(origin, target) for target in locations2),
            dtype=float,
            count=target_count,
        )
        nearest_distance[row] = np.min(to_targets)
        nearest_position[row] = np.argmin(to_targets)

    return pd.DataFrame(nearest_distance, index=nearest_position.astype(int))

In [8]:
# Pair each station's latitude and longitude into one (latitude, longitude)
# tuple per row; list(zip(...)) is enough -- the extra tuple(...) wrapper was redundant.
bikes['st_location'] = list(zip(bikes.latitude, bikes.longitude))

In [9]:
# Build a (latitude, longitude) tuple for every subway entrance.
# The point's y coordinate is used as latitude and x as longitude, so the
# tuple order matches the (lat, lon) pairs used for the bike stations.
# The column is assigned in one step instead of writing element by element via
# `metro_geo['metro_location'].iloc[ind] = ...`: that chained assignment
# triggers SettingWithCopyWarning and does not update the frame when pandas
# copy-on-write is active; it also fed index labels to the positional `.iloc`.
metro_geo['metro_location'] = [(point.y, point.x) for point in metro_geo.geometry]

In [10]:
dists = calculate_minimum_distances(bikes['st_location'], metro_geo['metro_location'])

In [11]:
# For the station with the given station_id, find the closest entry in locations.
# Returns a dict with the station ID, the position of the closest entry and its distance.
def calculate_minimum_distances2(stations_data, station_id, locations):
    """Locate the element of ``locations`` nearest to one station.

    Parameters
    ----------
    stations_data : pandas.DataFrame
        Station table handed to ``get_location_by_ID``.
    station_id
        ID of the station whose coordinates are used as the origin.
    locations : sized iterable of (latitude, longitude) pairs
        Candidate points; each is passed to ``haversine`` unchanged.

    Returns
    -------
    dict
        ``{'st_id': station_id, 'closest_loc_index': <position in locations>,
        'closest_distance': <smallest haversine distance>}``.
    """
    candidate_count = len(locations)
    origin = get_location_by_ID(stations_data, station_id)
    to_candidates = np.fromiter(
        (haversine(origin, candidate) for candidate in locations),
        dtype=float,
        count=candidate_count,
    )
    smallest = np.min(to_candidates)
    smallest_position = np.argmin(to_candidates).astype(int)

    return {
        'st_id': station_id,
        'closest_loc_index': smallest_position,
        'closest_distance': smallest,
    }

In [12]:
res = calculate_minimum_distances2(bikes, 72, metro_geo.metro_location)

In [13]:
res

{'closest_distance': 0.8367660141242963, 'closest_loc_index': 890, 'st_id': 72}

In [55]:
# For every bike station ID, collect the result dict of
# calculate_minimum_distances2 (nearest entry of metro_geo.metro_location and
# its distance) into a list, one dict per station.
subway_distances = []
for st in bikes.st_id:
    res = calculate_minimum_distances2(bikes, st, metro_geo.metro_location)
    subway_distances.append(res)

In [56]:
sub_dist = pd.DataFrame(subway_distances)

In [58]:
bikes = pd.merge(bikes, sub_dist, on='st_id')

In [59]:
bikes

Unnamed: 0,st_id,name,latitude,longitude,closest_distance,closest_loc_index
0,72,W 52 St & 11 Ave,40.767272,-73.993929,0.836766,890
1,79,Franklin St & W Broadway,40.719116,-74.006667,0.012754,421
2,82,St James Pl & Pearl St,40.711174,-74.000165,0.372382,454
3,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,0.176488,1732
4,116,W 17 St & 8 Ave,40.741776,-74.001497,0.062889,905
5,119,Park Ave & St Edwards St,40.696089,-73.978034,0.606591,1896
6,120,Lexington Ave & Classon Ave,40.686768,-73.959282,0.229292,331
7,127,Barrow St & Hudson St,40.731724,-74.006744,0.353316,1324
8,128,MacDougal St & Prince St,40.727103,-74.002971,0.131415,551
9,137,E 56 St & Madison Ave,40.761628,-73.972924,0.226170,1240


In [63]:
station_info = pd.read_csv('data/station_info_201604_201703.csv').drop('Unnamed: 0', axis=1)
station_info

Unnamed: 0,st_id,name,latitude,longitude
0,72,W 52 St & 11 Ave,40.767272,-73.993929
1,79,Franklin St & W Broadway,40.719116,-74.006667
2,82,St James Pl & Pearl St,40.711174,-74.000165
3,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323
4,116,W 17 St & 8 Ave,40.741776,-74.001497
5,119,Park Ave & St Edwards St,40.696089,-73.978034
6,120,Lexington Ave & Classon Ave,40.686768,-73.959282
7,127,Barrow St & Hudson St,40.731724,-74.006744
8,128,MacDougal St & Prince St,40.727103,-74.002971
9,137,E 56 St & Madison Ave,40.761628,-73.972924


In [64]:
# Drop seven rows selected by position and print the row count before and after.
# NOTE(review): the reason for removing these specific positions is not stated
# in this notebook -- confirm against the source CSV.
print(len(station_info))
station_info = station_info.drop(station_info.index[[20, 51, 64, 116, 275, 470, 475]])
print(len(station_info))

664
657


In [66]:
# NOTE(review): the result of the next line is not assigned and is not the
# cell's last expression, so it is neither stored nor displayed; station_info
# itself is left unchanged and is what the cell shows.
station_info.sort_values('st_id', ascending=True).head(3).reset_index()
station_info

Unnamed: 0,st_id,name,latitude,longitude
0,72,W 52 St & 11 Ave,40.767272,-73.993929
1,79,Franklin St & W Broadway,40.719116,-74.006667
2,82,St James Pl & Pearl St,40.711174,-74.000165
3,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323
4,116,W 17 St & 8 Ave,40.741776,-74.001497
5,119,Park Ave & St Edwards St,40.696089,-73.978034
6,120,Lexington Ave & Classon Ave,40.686768,-73.959282
7,127,Barrow St & Hudson St,40.731724,-74.006744
8,128,MacDougal St & Prince St,40.727103,-74.002971
9,137,E 56 St & Madison Ave,40.761628,-73.972924
