# Find Optimal Station for Given Time Segment

## Import Libraries

In [3]:
import pandas as pd
import numpy as np
import sys

# Record the interpreter and library versions this notebook was run with.
for label, version in (
    ("Python Version:", sys.version),
    ("Pandas Version:", pd.__version__),
    ("Numpy Version:", np.__version__),
):
    print(label, version)

Python Version: 3.6.3 |Anaconda, Inc.| (default, Oct  6 2017, 12:04:38) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
Pandas Version: 0.20.3
Numpy Version: 1.14.0


## Read in turnstile data with locations
(This was pickled in notebook 2_FindingStationLocations)

In [10]:
# Turnstile data with station locations, pickled by notebook
# 2_FindingStationLocations.
df = pd.read_pickle(
    'data/turns_data_locations.pkl'
)

In [11]:
# Show the distinct values of each column used to define a time segment.
for label, column in (
    ('Weekdays', 'wkdaynbr'),
    ('Hour Blocks', 'timegroup'),
    ('Hour Block Strings', 'timegroupstr'),
):
    print(label, df[column].unique())


Weekdays [6 0 1 2 3 4 5]
Hour Blocks [1 2 3 4 5 6]
Hour Block Strings ['12am - 4am' '4am - 8am' '8am - 12pm' '12pm - 4pm' '4pm - 8pm'
 '8pm - 12am']


## Find the station with the highest average daily entries for a given segment

In [12]:
# Sum entries_diff for every station / weekday / time block on each date.
daily_sum = (
    df
    .groupby(
        ['station_id', 'wkdaynbr', 'timegroup', 'timegroupstr', 'date'],
        as_index=False,
    )['entries_diff']
    .sum()
)
daily_sum.head()

Unnamed: 0,station_id,wkdaynbr,timegroup,timegroupstr,date,entries_diff
0,1 AvCanarsie,0,1,12am - 4am,03/26/2017,1766.0
1,1 AvCanarsie,0,1,12am - 4am,03/27/2016,1732.0
2,1 AvCanarsie,0,1,12am - 4am,04/02/2017,1804.0
3,1 AvCanarsie,0,1,12am - 4am,04/03/2016,2249.0
4,1 AvCanarsie,0,1,12am - 4am,04/09/2017,1515.0


In [13]:
# Average the per-date totals over all dates sharing a station, weekday and
# time block, then order the segments from busiest to quietest.
daily_avg = (
    daily_sum
    .groupby(
        ['station_id', 'wkdaynbr', 'timegroup', 'timegroupstr'],
        as_index=False,
    )['entries_diff']
    .mean()
    .sort_values(by='entries_diff', ascending=False)
)
daily_avg.head(10)

Unnamed: 0,station_id,wkdaynbr,timegroup,timegroupstr,entries_diff
17158,Times Sq - 42 StBroadway - 7Av,3,5,4pm - 8pm,38475.214286
17164,Times Sq - 42 StBroadway - 7Av,4,5,4pm - 8pm,37161.107143
17152,Times Sq - 42 StBroadway - 7Av,2,5,4pm - 8pm,36247.75
17170,Times Sq - 42 StBroadway - 7Av,5,5,4pm - 8pm,35110.928571
17146,Times Sq - 42 StBroadway - 7Av,1,5,4pm - 8pm,34557.678571
3562,34 St - Herald SqBroadway - Brighton,5,5,4pm - 8pm,32560.285714
3544,34 St - Herald SqBroadway - Brighton,2,5,4pm - 8pm,31674.5
3556,34 St - Herald SqBroadway - Brighton,4,5,4pm - 8pm,31613.535714
3550,34 St - Herald SqBroadway - Brighton,3,5,4pm - 8pm,31566.107143
12166,Grand Central - 42 StLexington - Shuttle,4,5,4pm - 8pm,31192.607143


In [14]:
def find_optimal_station(time, day, df=None):
    """Return the station with the largest ``entries_diff`` for one segment.

    Parameters
    ----------
    time : int
        Value matched against the ``timegroup`` column.
    day : int
        Value matched against the ``wkdaynbr`` column.
    df : pandas.DataFrame, optional
        Frame holding ``station_id``, ``wkdaynbr``, ``timegroup`` and
        ``entries_diff`` columns. When omitted, the notebook-level
        ``daily_avg`` frame is looked up at call time.

    Returns
    -------
    tuple
        ``(station_id, entries_diff)`` taken from the row with the highest
        ``entries_diff`` among the rows matching ``day`` and ``time``.

    Raises
    ------
    KeyError
        If no row of ``df`` matches the requested ``day`` and ``time``.
    """
    if df is None:
        # Resolve the default lazily: a ``df=daily_avg`` default would be
        # frozen when this cell runs, so re-running the cell that builds
        # ``daily_avg`` would leave the function using the old frame.
        df = daily_avg

    in_segment = (df['wkdaynbr'] == day) & (df['timegroup'] == time)
    subset = df[in_segment].sort_values(by='entries_diff', ascending=False)
    subset = subset.reset_index(drop=True)

    if subset.empty:
        # Same exception type as the bare ``.loc[0, ...]`` lookup would
        # raise, but with a message that names the missing segment.
        raise KeyError(
            'no rows for day={!r}, time={!r}'.format(day, time)
        )

    # Echo the winning row so the notebook output shows the full record.
    print(subset.head(1))

    station = subset.loc[0, 'station_id']
    avg_entries = subset.loc[0, 'entries_diff']

    return station, avg_entries

In [15]:
# Example lookup for the segment with timegroup 5 and wkdaynbr 3.
find_optimal_station(time=5, day=3)

                       station_id  wkdaynbr  timegroup timegroupstr  \
0  Times Sq - 42 StBroadway - 7Av         3          5    4pm - 8pm   

   entries_diff  
0  38475.214286  


('Times Sq - 42 StBroadway - 7Av', 38475.21428571428)

In [21]:
# Largest average entries_diff observed in each weekday / time block pair.
max_avg = (
    daily_avg
    .groupby(['wkdaynbr', 'timegroup'], as_index=False)['entries_diff']
    .max()
)
max_avg.head()

Unnamed: 0,wkdaynbr,timegroup,entries_diff
0,0,1,13868.285714
1,0,2,3109.214286
2,0,3,8294.5
3,0,4,15516.5
4,0,5,20478.25


In [22]:
# Join the per-segment maxima back onto daily_avg to recover which station
# (and time-block label) produced each maximum, busiest segment first.
optimized = (
    max_avg
    .merge(daily_avg, how='inner', on=['wkdaynbr', 'timegroup', 'entries_diff'])
    .sort_values(by='entries_diff', ascending=False)
)

In [28]:
# Report the row count, then preview the top of the table.
print('Length of optimized station time df:', optimized.shape[0])
optimized.head(n=5)

Length of optimized station time df: 42


Unnamed: 0,wkdaynbr,timegroup,entries_diff,station_id,timegroupstr
22,3,5,38475.214286,Times Sq - 42 StBroadway - 7Av,4pm - 8pm
28,4,5,37161.107143,Times Sq - 42 StBroadway - 7Av,4pm - 8pm
16,2,5,36247.75,Times Sq - 42 StBroadway - 7Av,4pm - 8pm
34,5,5,35110.928571,Times Sq - 42 StBroadway - 7Av,4pm - 8pm
10,1,5,34557.678571,Times Sq - 42 StBroadway - 7Av,4pm - 8pm


Write the optimized segments out as a CSV file for visualization in Tableau.

In [27]:
# Export without the index column.
optimized.to_csv(
    'data/optimized_segment.csv',
    index=False,
)