# Baking Stations

At the interface layer I'd like to be able to bring up the actual counts of the datasets of interest. This I can place in a flat file&mdash;all the database stuff isn't necessary.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import requests
import io
import zipfile
from tqdm import tqdm

In [2]:
r = requests.get('https://s3.amazonaws.com/tripdata/201606-citibike-tripdata.zip')
with zipfile.ZipFile(io.BytesIO(r.content)) as ar:
    trip_data = pd.read_csv(ar.open('201606-citibike-tripdata.csv'))

In [3]:
july_22 = pd.read_csv("../data/final/all_june_22_citibike_trips.csv", index_col=0)

In [4]:
all_station_ids = set(trip_data['start station id'].values).union(set(trip_data['end station id'].values))

In [5]:
trip_data.head(1)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,1470,6/1/2016 00:00:18,6/1/2016 00:24:48,380,W 4 St & 7 Ave S,40.734011,-74.002939,3236,W 42 St & Dyer Ave,40.758985,-73.9938,19859,Subscriber,1972.0,1


In [6]:
subframes = []
start_counts = []
end_counts = []

for station_id in tqdm(all_station_ids):
    started_here = trip_data[trip_data['start station id'] == station_id]
    ended_here = trip_data[trip_data['end station id'] == station_id]
    start_counts.append(len(july_22[july_22['start station id'] == station_id]))
    end_counts.append(len(july_22[july_22['end station id'] == station_id]))
    if len(started_here) > 0:
        s = started_here.iloc[0]
        subframes.append({'latitude': s['start station latitude'],
                          'longitude': s['start station longitude'],
                          'station name': s['start station name'],
                          'station id': s['start station id']})
    elif len(ended_here) > 0:
        s  = ended_here.iloc[0]
        subframes.append({'latitude': s['end station latitude'],
                          'longitude': s['end station longitude'],
                          'station name': s['end station name'],
                          'station id': s['end station id']})
    else:
        print("ERRROR")
        
stations = pd.DataFrame(subframes).set_index('station id', drop=True)
stations['incoming trips'] = start_counts
stations['outgoing trips'] = end_counts
stations['all trips'] = np.array(start_counts) + np.array(end_counts)
stations.index.name = 'station id'
del subframes

100%|████████████████████████████████████████| 489/489 [00:05<00:00, 94.56it/s]


In [7]:
stations.head(5)

Unnamed: 0_level_0,latitude,longitude,station name,incoming trips,outgoing trips,all trips
station id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
72,40.767272,-73.993929,W 52 St & 11 Ave,133,147,280
79,40.719116,-74.006667,Franklin St & W Broadway,113,114,227
82,40.711174,-74.000165,St James Pl & Pearl St,53,46,99
83,40.683826,-73.976323,Atlantic Ave & Fort Greene Pl,41,51,92
116,40.741776,-74.001497,W 17 St & 8 Ave,265,267,532


In [8]:
# stations = pd.read_csv("../data/final/july_22_station_metadata.csv", index_col=0)

In [9]:
# drop bad stations
stations = stations.drop([3257, 3240, 3213, 3202, 3196, 3192, 3191, 3186, 3184, 3183, 3252])

In [10]:
stations = stations.drop([255])  # this is depot that is way out of the range of the viz.

In [11]:
# code in a column for station kind: active, inactive (when it's in maintainence), depot (when it's a bike depot).
stations['kind'] = 'active'

def depot_or_not(srs):
    if "Depot" in srs['station name']:
        return "depot"
    elif srs['all trips'] == 0:
        return "inactive"
    else:
        return "active"
    
stations['kind'] = stations.apply(depot_or_not, axis='columns')

In [12]:
stations[stations['kind'] == 'depot']  # actually depots only have outbound trips.
# Since they pollute the view we can actually just remove them.

Unnamed: 0_level_0,latitude,longitude,station name,incoming trips,outgoing trips,all trips,kind
station id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3017,40.751483,-73.996764,NYCBS Depot - FAR,0,2,2,depot
3219,40.729193,-73.976655,NYCBS Depot - STY,0,6,6,depot
3250,40.71691,-73.983838,NYCBS Depot - PIT,0,0,0,depot


In [13]:
stations = stations.drop([3017, 3219, 3250])

In [14]:
stations[stations['kind'] == 'inactive']  # out for maintanance

Unnamed: 0_level_0,latitude,longitude,station name,incoming trips,outgoing trips,all trips,kind
station id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
233,40.694757,-73.990527,Cadman Plaza W & Pierrepont St,0,0,0,inactive
291,40.713126,-73.984844,Madison St & Montgomery St,0,0,0,inactive
503,40.738274,-73.98752,E 20 St & Park Ave,0,0,0,inactive
3085,40.71469,-73.95739,Roebling St & N 4 St,0,0,0,inactive
3136,40.766505,-73.971476,5 Ave & E 63 St,0,0,0,inactive
3169,40.787209,-73.981281,Riverside Dr & W 82 St,0,0,0,inactive
3259,40.74937,-73.999234,9 Ave & W 28 St,0,0,0,inactive
3260,40.727064,-73.996621,Mercer St & Bleecker St,0,0,0,inactive


In [15]:
stations.to_csv("../data/final/june_22_station_metadata.csv")

Conclusion:

Ok, so at this point I am going to defer on doing the bike roundabout thing because it actually involves a *lot* more work.

I will concentrate on mapping stations instead, for now, and gathering up all of the data (which I am realizing will also need to be transformed yet again later...sigh).

In [16]:
# all_bike_ids = set(july_22['bike id'])
# station_counts = pd.Series(0, index=stations.index)

# for bike_id in all_bike_ids:
#     trips = july_22[july_22['start station id'] == station_id]
#     start_station_trip = july_22.sort_values(by='starttime', ascending=False).iloc[0]
#     start_station = start_station_trip['start station id']
#     station_counts