# Finding Station Locations

## Import the libraries 

In [14]:
import pandas as pd
import numpy as np
import sys

# Report interpreter and library versions so the run can be reproduced later.
for label, version in (
    ("Python Version:", sys.version),
    ("Pandas Version:", pd.__version__),
    ("Numpy Version:", np.__version__),
):
    print(label, version)

Python Version: 3.6.3 |Anaconda, Inc.| (default, Oct  6 2017, 12:04:38) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
Pandas Version: 0.20.3
Numpy Version: 1.13.3


## Read in the data
### Read in station location data
We pulled the station location data from the MTA website.

In [5]:
# MTA station list; carries the GTFS stop id and latitude/longitude per stop.
STATIONS_URL = 'http://web.mta.info/developers/data/nyct/subway/Stations.csv'
stations = pd.read_csv(STATIONS_URL)
stations.head()

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude
0,1,1,R01,BMT,Astoria,Astoria - Ditmars Blvd,Q,N W,Elevated,40.775036,-73.912034
1,2,2,R03,BMT,Astoria,Astoria Blvd,Q,N W,Elevated,40.770258,-73.917843
2,3,3,R04,BMT,Astoria,30 Av,Q,N W,Elevated,40.766779,-73.921479
3,4,4,R05,BMT,Astoria,Broadway,Q,N W,Elevated,40.76182,-73.925508
4,5,5,R06,BMT,Astoria,36 Av,Q,N W,Elevated,40.756804,-73.929575


### Read in our cleaned turnstile data

In [6]:
# Load the cleaned turnstile records from the local pickle.
# NOTE: unpickling executes arbitrary code; only load files you produced yourself.
TURNSTILE_PICKLE = 'data/cleaned_turnstile_data.pkl'
turns = pd.read_pickle(TURNSTILE_PICKLE)
turns.head()

Unnamed: 0,c/a,unit,scp,station,linename,division,date,time,desc,entries,...,group_id,station_id,entries_diff,exit_diff,first_row_group,entries_outlier,exit_outlier,all_traffic,week,hour
196410,R730,R431,00-00-01,EASTCHSTER/DYRE,5,IRT,03/25/2017,05:00:00,REGULAR,5899854,...,0,EASTCHSTER/DYRE5IRT,1.0,5.0,False,False,False,6.0,12,5
196411,R730,R431,00-00-01,EASTCHSTER/DYRE,5,IRT,03/25/2017,09:00:00,REGULAR,5899857,...,0,EASTCHSTER/DYRE5IRT,3.0,5.0,False,False,False,8.0,12,9
196412,R730,R431,00-00-01,EASTCHSTER/DYRE,5,IRT,03/25/2017,13:00:00,REGULAR,5899857,...,0,EASTCHSTER/DYRE5IRT,0.0,1.0,False,False,False,1.0,12,13
196413,R730,R431,00-00-01,EASTCHSTER/DYRE,5,IRT,03/25/2017,17:00:00,REGULAR,5899857,...,0,EASTCHSTER/DYRE5IRT,0.0,4.0,False,False,False,4.0,12,17
196414,R730,R431,00-00-01,EASTCHSTER/DYRE,5,IRT,03/25/2017,21:00:00,REGULAR,5899857,...,0,EASTCHSTER/DYRE5IRT,0.0,2.0,False,False,False,2.0,12,21


### Read in a join table

As you might have noticed, there is no common field to merge the stations data with our turnstile data :( But don't worry!! We found a super handy table that links the GTFS stop ID from the stations table to a booth ID, which corresponds to the control area ID (c/a) in our turnstile data (Yay!!!)

Let's read in that join table

*File downloaded from: https://docs.google.com/spreadsheets/d/10sz0xWODQ02Kemx6ovS0NLQ_gA0YV9YQtdD7uiCcyjI/edit?usp=sharing&authkey=CMTzrvwE*

In [7]:
# Remote/Booth -> stop id lookup table (2013 snapshot), stored locally as CSV.
JOIN_TABLE_CSV = 'data/Remote-Booth-Station - turnstiles 2013.csv'
join_table = pd.read_csv(JOIN_TABLE_CSV)
join_table.head()

Unnamed: 0,Remote,Booth,stop_id.2013,Station.2013,Line.2013,Division.2013
0,R051,A002,R11,LEXINGTON AVE,456NQR,BMT
1,R050,A004,R11,LEXINGTON AVE,456NQR,BMT
2,R079,A006,R13,5 AVE-59 ST,NQR,BMT
3,R079,A007,R13,5 AVE-59 ST,NQR,BMT
4,R080,A010,R14,57 ST-7 AVE,NQR,BMT


## Join the turnstile data with station locations

### Join the stations data with our join file to get booth id

In [8]:
# Left join: keep every booth row from the join table, even when its
# 2013 stop id has no match in the stations list (station columns become NaN).
stations_booth = join_table.merge(
    stations,
    how='left',
    left_on='stop_id.2013',
    right_on='GTFS Stop ID',
)
stations_booth.head()

Unnamed: 0,Remote,Booth,stop_id.2013,Station.2013,Line.2013,Division.2013,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude
0,R051,A002,R11,LEXINGTON AVE,456NQR,BMT,7.0,613.0,R11,BMT,Astoria,Lexington Av/59 St,M,N W R,Subway,40.76266,-73.967258
1,R050,A004,R11,LEXINGTON AVE,456NQR,BMT,7.0,613.0,R11,BMT,Astoria,Lexington Av/59 St,M,N W R,Subway,40.76266,-73.967258
2,R079,A006,R13,5 AVE-59 ST,NQR,BMT,8.0,8.0,R13,BMT,Astoria,5 Av/59 St,M,N W R,Subway,40.764811,-73.973347
3,R079,A007,R13,5 AVE-59 ST,NQR,BMT,8.0,8.0,R13,BMT,Astoria,5 Av/59 St,M,N W R,Subway,40.764811,-73.973347
4,R080,A010,R14,57 ST-7 AVE,NQR,BMT,9.0,9.0,R14,BMT,Broadway - Brighton,57 St - 7 Av,M,N Q R W,Subway,40.764664,-73.980658


In [9]:
# Rows with no latitude are join-table rows whose stop id found no station.
total_rows = len(stations_booth)
missing = stations_booth['GTFS Latitude'].isnull().sum()
matched_share = round((total_rows - missing) / total_rows, 2)

print("We're missing the stop id in our join table for {} stations".format(missing))
print("...but we have booth data for {} of our total stations!".format(matched_share))

We're missing the stop id in our join table for 41 stations
...but we have booth data for 0.95 of our total stations!


### Join the turnstile data with our stations data

In [10]:
# Inner join: only turnstile rows whose control area (c/a) appears as a Booth
# in the booth/station table survive; unmatched rows are dropped.
turns_loc = turns.merge(
    stations_booth,
    how='inner',
    left_on='c/a',
    right_on='Booth',
)
turns_loc.head()

Unnamed: 0,c/a,unit,scp,station,linename,division,date,time,desc,entries,...,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude
0,R730,R431,00-00-01,EASTCHSTER/DYRE,5,IRT,03/25/2017,05:00:00,REGULAR,5899854,...,442.0,501,IRT,Dyre Av,Eastchester - Dyre Av,Bx,5,At Grade,40.8883,-73.830834
1,R730,R431,00-00-01,EASTCHSTER/DYRE,5,IRT,03/25/2017,09:00:00,REGULAR,5899857,...,442.0,501,IRT,Dyre Av,Eastchester - Dyre Av,Bx,5,At Grade,40.8883,-73.830834
2,R730,R431,00-00-01,EASTCHSTER/DYRE,5,IRT,03/25/2017,13:00:00,REGULAR,5899857,...,442.0,501,IRT,Dyre Av,Eastchester - Dyre Av,Bx,5,At Grade,40.8883,-73.830834
3,R730,R431,00-00-01,EASTCHSTER/DYRE,5,IRT,03/25/2017,17:00:00,REGULAR,5899857,...,442.0,501,IRT,Dyre Av,Eastchester - Dyre Av,Bx,5,At Grade,40.8883,-73.830834
4,R730,R431,00-00-01,EASTCHSTER/DYRE,5,IRT,03/25/2017,21:00:00,REGULAR,5899857,...,442.0,501,IRT,Dyre Av,Eastchester - Dyre Av,Bx,5,At Grade,40.8883,-73.830834


In [11]:
# Count joined turnstile rows that still lack coordinates, and the share that have them.
total_records = len(turns_loc)
missing_t = turns_loc['GTFS Latitude'].isnull().sum()
located_share = round((total_records - missing_t) / total_records, 2)

print("We're missing the corresponding c/a id for {} turnstile records".format(missing_t))
print("...but we have location data for {} of our total turnstile records!".format(located_share))

We're missing the corresponding c/a id for 362630 turnstile records
...but we have location data for 0.93 of our total turnstile records!


Check out the station names from the turnstile data and station loc data to make sure they agree

In [12]:
# Pair each record's turnstile station name with the matched MTA stop name
# (one tuple per row), then show the distinct pairs for a visual sanity check.
diff_stations = list(zip(turns_loc['station'], turns_loc['Stop Name']))
set(diff_stations)

{('PELHAM BAY PARK', 'Pelham Bay Park'),
 ('QUEENS PLAZA', 'Queens Plaza'),
 ('ROOSEVELT ISLND', 'Roosevelt Island'),
 ("B'WAY-LAFAYETTE", 'Broadway-Lafayette St'),
 ('EAST 105 ST', 'E 105 St'),
 ('61 ST WOODSIDE', 'Woodside - 61 St'),
 ('WHITLOCK AV', 'Whitlock Av'),
 ('NECK RD', 'Neck Rd'),
 ('GRAND ARMY PLAZ', 'Grand Army Plaza'),
 ('81 ST-MUSEUM', '81 St - Museum of Natural History'),
 ('NOSTRAND AV', 'Nostrand Av'),
 ('CYPRESS AV', 'Cypress Av'),
 ('NASSAU ST', 'Nassau Av'),
 ('SUTTER AV', 'Sutter Av'),
 ('KINGS HWY', 'Kings Hwy'),
 ('225 ST', '225 St'),
 ('HEWES ST', 'Hewes St'),
 ('FLATBUSH AV-B.C', 'Flatbush Av - Brooklyn College'),
 ('JKSN HT-ROOSVLT', 'Jackson Hts - Roosevelt Av'),
 ('LAFAYETTE AV', 'Lafayette Av'),
 ('167 ST', '167 St'),
 ('BROADWAY JCT', 'Broadway Jct'),
 ('GRAND-NEWTOWN', 'Grand Av - Newtown'),
 ('67 AV', '67 Av'),
 ('137 ST CITY COL', '137 St - City College'),
 ('EUCLID AV', 'Euclid Av'),
 ('57 ST', '57 St'),
 ('59 ST', 'Lexington Av/59 St'),
 ('BEACH 25 

In [20]:
# Collect the (turnstile name, stop name) pairs where either side is missing.
# NaN never compares equal to itself, so `np.nan in pair` only matches when the
# element is the very same np.nan object; pd.isnull tests the value instead and
# also catches None or a NaN that is a different float object.
# Note: diff_stations holds one pair per turnstile record, so the count below
# is a number of records rather than a number of distinct stations.
nans = [pair for pair in diff_stations if any(pd.isnull(name) for name in pair)]
print('{} Stations do not have location information: {} of total'.format(len(nans), len(nans)/len(diff_stations)))

362630 Stations do not have location information: 0.06985371807360694 of total


## Pickle the turnstile data with lat/longs

In [36]:
turns_loc.to_pickle('data/turns_data_locations.pkl')