# Finding Station Locations

## Import the libraries 

In [2]:
import pandas as pd
import numpy as np
import sys

# Record the interpreter and library versions used for this run so the
# results can be reproduced later.
version_report = [
    ("Python Version:", sys.version),
    ("Pandas Version:", pd.__version__),
    ("Numpy Version:", np.__version__),
]
for label, version in version_report:
    print(label, version)

Python Version: 3.6.3 |Anaconda, Inc.| (default, Oct  6 2017, 12:04:38) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
Pandas Version: 0.20.3
Numpy Version: 1.14.0


## Read in the data
### Read in station location data
We pulled the station location data from the MTA website.

In [3]:
# Station reference table published by the MTA. It is fetched over the
# network each time this cell runs, so the cell needs internet access.
stations_url = 'http://web.mta.info/developers/data/nyct/subway/Stations.csv'
stations = pd.read_csv(stations_url)
stations.head()

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude
0,1,1,R01,BMT,Astoria,Astoria - Ditmars Blvd,Q,N W,Elevated,40.775036,-73.912034
1,2,2,R03,BMT,Astoria,Astoria Blvd,Q,N W,Elevated,40.770258,-73.917843
2,3,3,R04,BMT,Astoria,30 Av,Q,N W,Elevated,40.766779,-73.921479
3,4,4,R05,BMT,Astoria,Broadway,Q,N W,Elevated,40.76182,-73.925508
4,5,5,R06,BMT,Astoria,36 Av,Q,N W,Elevated,40.756804,-73.929575


### Read in our cleaned turnstile data

In [4]:
# Load the cleaned turnstile records from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that were produced by this project.
turns_path = 'data/cleaned_turnstile_data.pkl'
turns = pd.read_pickle(turns_path)
turns.head()

Unnamed: 0,c/a,unit,scp,station,linename,division,date,time,desc,entries,...,entries_diff,exit_diff,first_row_group,entries_outlier,exit_outlier,week,hour,timegroup,timegroupstr,wkdaynbr
54505,N086,R282,00-00-02,SPRING ST,CE,IND,03/25/2017,04:00:00,REGULAR,338971959,...,89.0,12.0,False,False,False,12,4,1,12am - 4am,6
54506,N086,R282,00-00-02,SPRING ST,CE,IND,03/25/2017,08:00:00,REGULAR,338971992,...,33.0,7.0,False,False,False,12,8,2,4am - 8am,6
54507,N086,R282,00-00-02,SPRING ST,CE,IND,03/25/2017,12:00:00,REGULAR,338972160,...,168.0,51.0,False,False,False,12,12,3,8am - 12pm,6
54508,N086,R282,00-00-02,SPRING ST,CE,IND,03/25/2017,16:00:00,REGULAR,338972687,...,527.0,60.0,False,False,False,12,16,4,12pm - 4pm,6
54509,N086,R282,00-00-02,SPRING ST,CE,IND,03/25/2017,20:00:00,REGULAR,338973443,...,756.0,47.0,False,False,False,12,20,5,4pm - 8pm,6


### Read in a join table

As you might have noticed, there is no common field on which to merge the stations data with our turnstile data :( But don't worry!! We found a super handy table that links the stop ID from the stations table to a booth ID, which corresponds to the control area ID (c/a) in our turnstile data (Yay!!!)

Let's read in that join table

*File downloaded from: https://docs.google.com/spreadsheets/d/10sz0xWODQ02Kemx6ovS0NLQ_gA0YV9YQtdD7uiCcyjI/edit?usp=sharing&authkey=CMTzrvwE*

In [5]:
# Lookup table that links each booth (control area) to a stop id.
join_table_path = 'data/Remote-Booth-Station - turnstiles 2013.csv'
join_table = pd.read_csv(join_table_path)
join_table.head()

Unnamed: 0,Remote,Booth,stop_id.2013,Station.2013,Line.2013,Division.2013
0,R051,A002,R11,LEXINGTON AVE,456NQR,BMT
1,R050,A004,R11,LEXINGTON AVE,456NQR,BMT
2,R079,A006,R13,5 AVE-59 ST,NQR,BMT
3,R079,A007,R13,5 AVE-59 ST,NQR,BMT
4,R080,A010,R14,57 ST-7 AVE,NQR,BMT


## Join the turnstile data with station locations

### Join the stations data with our join file to get booth id

In [6]:
# Left join: every booth row of the join table is kept, and the station
# columns are filled in wherever the 2013 stop id matches a GTFS Stop ID.
stations_booth = join_table.merge(
    stations,
    how='left',
    left_on='stop_id.2013',
    right_on='GTFS Stop ID',
)
stations_booth.head()

Unnamed: 0,Remote,Booth,stop_id.2013,Station.2013,Line.2013,Division.2013,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude
0,R051,A002,R11,LEXINGTON AVE,456NQR,BMT,7.0,613.0,R11,BMT,Astoria,Lexington Av/59 St,M,N W R,Subway,40.76266,-73.967258
1,R050,A004,R11,LEXINGTON AVE,456NQR,BMT,7.0,613.0,R11,BMT,Astoria,Lexington Av/59 St,M,N W R,Subway,40.76266,-73.967258
2,R079,A006,R13,5 AVE-59 ST,NQR,BMT,8.0,8.0,R13,BMT,Astoria,5 Av/59 St,M,N W R,Subway,40.764811,-73.973347
3,R079,A007,R13,5 AVE-59 ST,NQR,BMT,8.0,8.0,R13,BMT,Astoria,5 Av/59 St,M,N W R,Subway,40.764811,-73.973347
4,R080,A010,R14,57 ST-7 AVE,NQR,BMT,9.0,9.0,R14,BMT,Broadway - Brighton,57 St - 7 Av,M,N Q R W,Subway,40.764664,-73.980658


In [7]:
# Join-table rows that found no match in the stations table have no latitude.
unmatched_mask = stations_booth['GTFS Latitude'].isnull()
missing = unmatched_mask.sum()

row_total = len(stations_booth)
matched_share = round((row_total - missing) / row_total, 2)

print("We're missing the stop id in our join table for {} stations".format(missing))
print("...but we have booth data for {} of our total stations!".format(matched_share))

We're missing the stop id in our join table for 41 stations
...but we have booth data for 0.95 of our total stations!


### Join the turnstile data with our stations data

In [8]:
# Inner join on the control area id: turnstile rows whose c/a has no matching
# Booth in stations_booth are dropped from the result.
# NOTE(review): if a Booth value occurs more than once in stations_booth, the
# matching turnstile rows are repeated once per occurrence -- confirm Booth is
# unique before summing entries downstream.
turns_loc = turns.merge(
    stations_booth,
    how='inner',
    left_on='c/a',
    right_on='Booth',
)
turns_loc.head()

Unnamed: 0,c/a,unit,scp,station,linename,division,date,time,desc,entries,...,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude
0,N086,R282,00-00-02,SPRING ST,CE,IND,03/25/2017,04:00:00,REGULAR,338971959,...,168.0,A33,IND,8th Av - Fulton St,Spring St,M,C E,Subway,40.726227,-74.003739
1,N086,R282,00-00-02,SPRING ST,CE,IND,03/25/2017,08:00:00,REGULAR,338971992,...,168.0,A33,IND,8th Av - Fulton St,Spring St,M,C E,Subway,40.726227,-74.003739
2,N086,R282,00-00-02,SPRING ST,CE,IND,03/25/2017,12:00:00,REGULAR,338972160,...,168.0,A33,IND,8th Av - Fulton St,Spring St,M,C E,Subway,40.726227,-74.003739
3,N086,R282,00-00-02,SPRING ST,CE,IND,03/25/2017,16:00:00,REGULAR,338972687,...,168.0,A33,IND,8th Av - Fulton St,Spring St,M,C E,Subway,40.726227,-74.003739
4,N086,R282,00-00-02,SPRING ST,CE,IND,03/25/2017,20:00:00,REGULAR,338973443,...,168.0,A33,IND,8th Av - Fulton St,Spring St,M,C E,Subway,40.726227,-74.003739


In [9]:
# Turnstile records that matched a booth but still carry no latitude.
no_location_mask = turns_loc['GTFS Latitude'].isnull()
missing_t = no_location_mask.sum()

record_total = len(turns_loc)
located_share = round((record_total - missing_t) / record_total, 2)

print("We're missing the corresponding c/a id for {} turnstile records".format(missing_t))
print("...but we have location data for {} of our total turnstile records!".format(located_share))

We're missing the corresponding c/a id for 369104 turnstile records
...but we have location data for 0.93 of our total turnstile records!


Compare the station names from the turnstile data with the stop names from the station location data to make sure they agree.

In [10]:
# Pair each record's turnstile station name with the stop name it was matched
# to, then display the distinct (turnstile name, stop name) combinations.
diff_stations = list(zip(turns_loc['station'], turns_loc['Stop Name']))
set(diff_stations)

{('ATL AV-BARCLAY', 'Atlantic Av - Barclays Ctr'),
 ('PARK PLACE', 'Grand Central - 42 St'),
 ('PARK PLACE', 'Park Pl'),
 ('138/GRAND CONC', '138 St - Grand Concourse'),
 ('BURKE AV', 'Burke Av'),
 ('SARATOGA AV', 'Saratoga Av'),
 ('UNION ST', 'Union St'),
 ('METS-WILLETS PT', 'Mets - Willets Point'),
 ('JOURNAL SQUARE', nan),
 ('3 AV 138 ST', '3 Av - 138 St'),
 ('30 AV', '30 Av'),
 ('135 ST', '135 St'),
 ('W 8 ST-AQUARIUM', 'W 8 St - NY Aquarium'),
 ('MARBLE HILL-225', 'Marble Hill - 225 St'),
 ('BEACH 60 ST', 'Beach 60 St'),
 ('68ST-HUNTER CO', '68 St - Hunter College'),
 ('CENTRAL AV', 'Central Av'),
 ('MT EDEN AV', 'Mt Eden Av'),
 ('BROADWAY JCT', 'Broadway Jct'),
 ('BUSHWICK AV', 'Bushwick Av - Aberdeen St'),
 ('WORLD TRADE CTR', 'World Trade Center'),
 ('KOSCIUSZKO ST', 'Kosciuszko St'),
 ('BUHRE AV', 'Buhre Av'),
 ('HOYT ST', 'Hoyt St'),
 ('59 ST COLUMBUS', '59 St - Columbus Circle'),
 ('BURNSIDE AV', 'Burnside Av'),
 ('BRIARWOOD', 'Briarwood - Van Wyck Blvd'),
 ('174 ST', '174 

In [11]:
# Number of distinct station names in the joined turnstile data.
station_count = turns_loc.station.nunique()

# Collect the distinct (station, stop name) pairs that have a missing side.
# A membership test such as `np.nan in pair` only succeeds when the element is
# the very same object as np.nan, because NaN never compares equal to itself;
# any other NaN object (or None) would be silently skipped. pd.isnull detects
# every missing marker. Missing entries are normalised to np.nan so that
# otherwise-equal pairs still collapse into one set element.
nans = {
    tuple(np.nan if pd.isnull(value) else value for value in pair)
    for pair in set(diff_stations)  # de-duplicate first so the null checks run once per distinct pair
    if any(pd.isnull(value) for value in pair)
}

print('{} Stations do not have location information: {} of total'.format(len(nans), len(nans)/station_count))

26 Stations do not have location information: 0.06951871657754011 of total


In [12]:
# Display the station / stop-name pairs that have no location match.
nans.copy()

{('145 ST', nan),
 ('9TH STREET', nan),
 ('BEACH 105 ST', nan),
 ('BEACH 98 ST', nan),
 ('CITY / BUS', nan),
 ('EXCHANGE PLACE', nan),
 ('GROVE STREET', nan),
 ('HARRISON', nan),
 ('JFK JAMAICA CT1', nan),
 ('JOURNAL SQUARE', nan),
 ('LACKAWANNA', nan),
 ('NEWARK BM BW', nan),
 ('NEWARK C', nan),
 ('NEWARK HM HE', nan),
 ('NEWARK HW BMEBE', nan),
 ('ORCHARD BEACH', nan),
 ('PATH WTC', nan),
 ('PATH WTC 2', nan),
 ('PAVONIA/NEWPORT', nan),
 ('QUEENSBORO PLZ', nan),
 ('RIT-MANHATTAN', nan),
 ('RIT-ROOSEVELT', nan),
 ('THIRTY ST', nan),
 ('THIRTY THIRD ST', nan),
 ('TWENTY THIRD ST', nan),
 ('W 4 ST-WASH SQ', nan)}

## Add new column for clean Station ID for use in visualizations

In [13]:
# Build the station identifier by concatenating the stop name and the line
# name directly (no separator). Rows where either part is missing get NaN.
stop_names = turns_loc['Stop Name']
line_names = turns_loc['Line']
turns_loc['station_id'] = stop_names + line_names

In [14]:
# Preview the first rows to confirm the new station_id column is present.
turns_loc.head(n=5)

Unnamed: 0,c/a,unit,scp,station,linename,division,date,time,desc,entries,...,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude,station_id
0,N086,R282,00-00-02,SPRING ST,CE,IND,03/25/2017,04:00:00,REGULAR,338971959,...,A33,IND,8th Av - Fulton St,Spring St,M,C E,Subway,40.726227,-74.003739,Spring St8th Av - Fulton St
1,N086,R282,00-00-02,SPRING ST,CE,IND,03/25/2017,08:00:00,REGULAR,338971992,...,A33,IND,8th Av - Fulton St,Spring St,M,C E,Subway,40.726227,-74.003739,Spring St8th Av - Fulton St
2,N086,R282,00-00-02,SPRING ST,CE,IND,03/25/2017,12:00:00,REGULAR,338972160,...,A33,IND,8th Av - Fulton St,Spring St,M,C E,Subway,40.726227,-74.003739,Spring St8th Av - Fulton St
3,N086,R282,00-00-02,SPRING ST,CE,IND,03/25/2017,16:00:00,REGULAR,338972687,...,A33,IND,8th Av - Fulton St,Spring St,M,C E,Subway,40.726227,-74.003739,Spring St8th Av - Fulton St
4,N086,R282,00-00-02,SPRING ST,CE,IND,03/25/2017,20:00:00,REGULAR,338973443,...,A33,IND,8th Av - Fulton St,Spring St,M,C E,Subway,40.726227,-74.003739,Spring St8th Av - Fulton St


## Pickle the raw turnstile data with lat/longs

In [15]:
# Persist the record-level turnstile data, now enriched with station
# coordinates, for reuse in later steps.
locations_pickle_path = 'data/turns_data_locations.pkl'
turns_loc.to_pickle(locations_pickle_path)

## Find total entry volume by hour block of day

In [16]:
# Sum entries for every station / coordinates / weekday / time block / date
# combination. groupby leaves out rows with a missing value in any grouping
# key, so records without a station match do not appear in the result.
block_keys = [
    'station_id',
    'GTFS Latitude',
    'GTFS Longitude',
    'wkdaynbr',
    'timegroupstr',
    'date',
]
stations_block = turns_loc.groupby(block_keys, as_index=False)[['entries_diff']].sum()

print(len(stations_block))

# Busiest blocks first.
stations_block = stations_block.sort_values(by='entries_diff', ascending=False)
stations_block.head(10)

507679


Unnamed: 0,station_id,GTFS Latitude,GTFS Longitude,wkdaynbr,timegroupstr,date,entries_diff
338552,Grand Central - 42 StLexington - Shuttle,40.751776,-73.976848,4,4pm - 8pm,04/20/2017,43977.0
338714,Grand Central - 42 StLexington - Shuttle,40.751776,-73.976848,5,4pm - 8pm,03/31/2017,42156.0
476741,Times Sq - 42 StBroadway - 7Av,40.75529,-73.987495,2,4pm - 8pm,06/27/2017,40638.0
476891,Times Sq - 42 StBroadway - 7Av,40.75529,-73.987495,3,4pm - 8pm,04/26/2017,40582.0
476884,Times Sq - 42 StBroadway - 7Av,40.75529,-73.987495,3,4pm - 8pm,03/30/2016,40487.0
476883,Times Sq - 42 StBroadway - 7Av,40.75529,-73.987495,3,4pm - 8pm,03/29/2017,40410.0
476910,Times Sq - 42 StBroadway - 7Av,40.75529,-73.987495,3,4pm - 8pm,06/29/2016,40360.0
476896,Times Sq - 42 StBroadway - 7Av,40.75529,-73.987495,3,4pm - 8pm,05/11/2016,40182.0
476731,Times Sq - 42 StBroadway - 7Av,40.75529,-73.987495,2,4pm - 8pm,05/23/2017,40050.0
476890,Times Sq - 42 StBroadway - 7Av,40.75529,-73.987495,3,4pm - 8pm,04/20/2016,39729.0


Write the results out to CSV files for the Tableau visualization.

In [17]:
# Export the per-block entry totals without the DataFrame index column.
station_volume_csv = 'data/station_vol_by_hour.csv'
stations_block.to_csv(station_volume_csv, index=False)

In [18]:
# Export the record-level turnstile data with locations, again without the
# DataFrame index column.
locations_csv = 'data/turns_data_locations.csv'
turns_loc.to_csv(locations_csv, index=False)