In [43]:
import pandas as pd
import numpy as np

df_boston = pd.read_csv('./data/boston_2015.csv', dtype={'start_station_id': np.int64, 'end_station_id': 'string', 'end_station_name': 'string', 'start_station_name': 'string', 'bike_id': np.int64, 'user_type': 'string'})
print(f'Total number of rows: {len(df_boston)}')

Total number of rows: 1122558


In [44]:
df_boston.head()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,start_station_name,end_station_name,bike_id,user_type
0,2015-01-01 00:21:44,2015-01-01 00:30:47,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,277,Subscriber
1,2015-01-01 00:27:03,2015-01-01 00:34:21,80,95,MIT Stata Center at Vassar St / Main St,Cambridge St - at Columbia St / Webster Ave,648,Subscriber
2,2015-01-01 00:31:31,2015-01-01 00:35:46,91,68,One Kendall Square at Hampshire St / Portland St,Central Square at Mass Ave / Essex St,555,Subscriber
3,2015-01-01 00:53:46,2015-01-01 01:00:58,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,1307,Subscriber
4,2015-01-01 01:07:06,2015-01-01 01:19:21,105,88,Lower Cambridgeport at Magazine St/Riverside Rd,Inman Square at Vellucci Plaza / Hampshire St,177,Customer


In [45]:
df_boston['user_type'].unique()

<StringArray>
['Subscriber', 'Customer']
Length: 2, dtype: string

# Preprocessing
### Set data type of 'end_station_id'

In [46]:
df_boston['end_station_id'].unique()

<StringArray>
[ '96',  '95',  '68',  '88',  '76', '118',  '75',  '67',  '36',  '23',
 ...
 '169', '174', '175', '159', '171', '178', '176', '179', '180', '177']
Length: 157, dtype: string

In [47]:
# Value '\\N' seems to be anomaly -> occurs only once, so drop!
num_occurences = len(df_boston.loc[df_boston["end_station_id"] == "\\N"])
print(f'Number of "\\\\N" occurences in end_station_id column: {num_occurences}')
df_boston.drop(index=df_boston.loc[df_boston["end_station_id"] == "\\N"].index, inplace=True, axis=1)

# now set column to dtype np.int64
df_boston = df_boston.astype({'end_station_id': np.int64})
df_boston.info()

Number of "\\N" occurences in end_station_id column: 1
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1122557 entries, 0 to 1122557
Data columns (total 8 columns):
 #   Column              Non-Null Count    Dtype 
---  ------              --------------    ----- 
 0   start_time          1122557 non-null  object
 1   end_time            1122557 non-null  object
 2   start_station_id    1122557 non-null  int64 
 3   end_station_id      1122557 non-null  int64 
 4   start_station_name  1122557 non-null  string
 5   end_station_name    1122557 non-null  string
 6   bike_id             1122557 non-null  int64 
 7   user_type           1122557 non-null  string
dtypes: int64(3), object(2), string(3)
memory usage: 77.1+ MB


### Set data type of time columns

In [48]:
df_boston['start_time'] = pd.to_datetime(df_boston['start_time'], format='%Y-%m-%d %X')
df_boston['end_time'] = pd.to_datetime(df_boston['end_time'], format='%Y-%m-%d %X')
df_boston.dtypes

start_time            datetime64[ns]
end_time              datetime64[ns]
start_station_id               int64
end_station_id                 int64
start_station_name            string
end_station_name              string
bike_id                        int64
user_type                     string
dtype: object

#### Check if station id and names are unique tuples

In [49]:
# len(df_boston['start_station_name'].unique())
is_unique = True
i = 0
all_station_ids = df_boston['start_station_id'].unique()

while i < len(all_station_ids) & is_unique:
    is_unique = len(df_boston.loc[df_boston['start_station_id'] == i]['start_station_name'].unique()) == 1
    i += 1

print("Station IDs and names are unique", is_unique)


Station IDs and names are unique True


In [50]:
# Check if station id and name always match
unique_start_tuples = np.unique(df_boston[['start_station_id', 'start_station_name']].values.astype('str'), axis=0)
unique_start_tuples

array([['1', '18 Dorrance Warehouse'],
       ['10', 'B.U. Central - 725 Comm. Ave.'],
       ['100', 'Davis Square'],
       ['102', 'Powder House Circle - Nathan Tufts Park'],
       ['103', 'JFK Crossing at Harvard St. / Thorndike St.'],
       ['104',
        'Harvard University Radcliffe Quadrangle at Shepard St / Garden St'],
       ['105', 'Lower Cambridgeport at Magazine St/Riverside Rd'],
       ['106', 'Mt Pleasant Ave / Dudley Town Common'],
       ['107', 'Ames St at Main St'],
       ['108',
        'Harvard University / SEAS Cruft-Pierce Halls at 29 Oxford St'],
       ['109', 'TD Garden - Causeway at Portal Park #1'],
       ['11', 'Longwood Ave / Binney St'],
       ['110', 'Harvard University Gund Hall at Quincy St / Kirkland S'],
       ['111', 'Packard Ave / Powderhouse Blvd'],
       ['112', 'Somerville Hospital at Highland Ave / Crocker St'],
       ['113', 'Andrew Station - Dorchester Ave at Humboldt Pl'],
       ['114', 'Teele Square at 239 Holland St'],
       [

In [51]:
#TODO: check for entire duplicates (also end stations)
unique_start_tuples[:,0:1]
u, c = np.unique(unique_start_tuples[:,0:1], return_counts=True)
dup = u[c > 1]
dup 

array([], dtype='<U70')

# Feature Engineering
#### Calculate trip_length 

In [52]:
df_boston['trip_length'] = (df_boston['end_time'] - df_boston['start_time'])
df_boston['trip_length'] = df_boston['trip_length'] / np.timedelta64(1, 's')
df_boston.head()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,start_station_name,end_station_name,bike_id,user_type,trip_length
0,2015-01-01 00:21:44,2015-01-01 00:30:47,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,277,Subscriber,543.0
1,2015-01-01 00:27:03,2015-01-01 00:34:21,80,95,MIT Stata Center at Vassar St / Main St,Cambridge St - at Columbia St / Webster Ave,648,Subscriber,438.0
2,2015-01-01 00:31:31,2015-01-01 00:35:46,91,68,One Kendall Square at Hampshire St / Portland St,Central Square at Mass Ave / Essex St,555,Subscriber,255.0
3,2015-01-01 00:53:46,2015-01-01 01:00:58,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,1307,Subscriber,432.0
4,2015-01-01 01:07:06,2015-01-01 01:19:21,105,88,Lower Cambridgeport at Magazine St/Riverside Rd,Inman Square at Vellucci Plaza / Hampshire St,177,Customer,735.0


# First Data Exploration
### Understanding the revenue modell

![caption](data/thehubway_revenue_modell_2015.png)

In [53]:
# Pricing info pulled from: https://web.archive.org/web/20150206035343/http://www.thehubway.com:80/pricing
# INFO: bluebikes was previously called 'thehubway', see https://en.wikipedia.org/wiki/Bluebikes
# ASSUMTION: The revenue modell did not change over the year 2015. This was checked through all entries of the wayback-machine, which occured in 7 day intervalls. We can be pretty certain of this. 


# Get # of subscribers and customers
print(df_boston.groupby('user_type').size())
# TODO: Include this in revenue modell

# Thoughts on the revenue modell: We don't see individual user-ids for each ride. This means that we can only infer the "revenue" generated by each trip through the respective user_type.
# E.g. -> if we have 10 subscribers for a given timeframe, who undertake 100 trips, each trip would generate 1/10 of the users membership fee in the given timeframe
# Membership numbers for 2015: https://web.archive.org/web/20160208155519/http://www.thehubway.com/mediakit -> anual members: 13.248, Causal Passes: 102.445 (24 & 72 Hours) with a total of 1,1319,310 Trips (???)

no_trips_from_dataset_2015 = 11319310
no_trips_from_website_2015 = len(df_boston)
print(f'We only have {no_trips_from_website_2015/no_trips_from_dataset_2015} of the data!')

# Big Open Question: How should we deal with the missing data? (we only have 1/10 of the trip data apparently) ? -> https://s3.amazonaws.com/hubway-data/index.html this contains the raw data, it appears that there are a lot more than 112k rides..
# Open Question: How should we include the membership fees and casual passes costs in the revenue modell?


# get proprtion all the rides who might incur overtime fees (trip_length >30 min) to understand if this usage pattern is relevant for the revenue model
proportion_overtime_rides = len(df_boston[df_boston['trip_length'] >= (30*60)]) / len(df_boston) 
print(proportion_overtime_rides)

# Conclusion: Around 7% of rides were overtime -> we should include this in the revenue modell
# TODO: Engineer the following features for the revenue modell: revenue_through_overtime (calculated on the basis of trip_duration)


user_type
Customer      370585
Subscriber    751972
dtype: int64
We only have 0.09917185764856692 of the data!
0.06948154971195226


### merging of boston station dataset and the df_boston data

In [54]:
#boston blue bikes stations data
df_stations_data = pd.read_csv('./data/current_bluebikes_stations.csv', header=1)
df_january = pd.read_csv('./data/201501-hubway-tripdata.csv', usecols=['start station id', 'start station latitude', 'start station longitude', 'end station id', 'end station id', 'end station latitude', 'end station longitude'])
df_sep = pd.read_csv('./data/201509-hubway-tripdata.csv', usecols=['start station id', 'start station latitude', 'start station longitude', 'end station id', 'end station id', 'end station latitude', 'end station longitude'])
df_july = pd.read_csv('./data/201507-hubway-tripdata.csv', usecols=['start station id', 'start station latitude', 'start station longitude', 'end station id', 'end station id', 'end station latitude', 'end station longitude'])
df_oct = pd.read_csv('./data/201510-hubway-tripdata.csv', usecols=['start station id', 'start station latitude', 'start station longitude', 'end station id', 'end station id', 'end station latitude', 'end station longitude'])
df_dec = pd.read_csv('./data/201512-hubway-tripdata.csv', usecols=['start station id', 'start station latitude', 'start station longitude', 'end station id', 'end station id', 'end station latitude', 'end station longitude'])
unique_stations_external=np.unique(np.concatenate((df_january[['start station id', 'start station latitude', 'start station longitude']].values, df_january[['end station id', 'end station latitude', 'end station longitude']].values,df_july[['start station id', 'start station latitude', 'start station longitude']].values, df_july[['end station id', 'end station latitude', 'end station longitude']].values, df_sep[['start station id', 'start station latitude', 'start station longitude']].values, df_sep[['end station id', 'end station latitude', 'end station longitude']].values, df_oct[['start station id', 'start station latitude', 'start station longitude']].values, df_oct[['end station id', 'end station latitude', 'end station longitude']].values, df_dec[['start station id', 'start station latitude', 'start station longitude']].values, df_dec[['end station id', 'end station latitude', 'end station longitude']].values)), axis=0)

In [55]:
unique_stations_original=np.unique(np.concatenate((df_boston['start_station_name'].values, df_boston['end_station_name'].values)))
print(f'# stations from original data: {len(unique_stations_original)} | # stations from external data: {len(unique_stations_external)}')
df_unique_stations = pd.DataFrame(unique_stations_external, columns=['station_id', 'station_latitude', 'station_longitude'])

# stations from original data: 156 | # stations from external data: 156


In [56]:
df_unique_stations.columns = ['start_station_id', 'start_station_latitude', 'start_station_longitude']
df_boston = df_boston.merge(df_unique_stations, how='left', on='start_station_id')
df_unique_stations.columns = ['end_station_id', 'end_station_latitude', 'end_station_longitude']
df_boston = df_boston.merge(df_unique_stations, how='left', on='end_station_id')
df_boston.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1122557 entries, 0 to 1122556
Data columns (total 13 columns):
 #   Column                   Non-Null Count    Dtype         
---  ------                   --------------    -----         
 0   start_time               1122557 non-null  datetime64[ns]
 1   end_time                 1122557 non-null  datetime64[ns]
 2   start_station_id         1122557 non-null  int64         
 3   end_station_id           1122557 non-null  int64         
 4   start_station_name       1122557 non-null  string        
 5   end_station_name         1122557 non-null  string        
 6   bike_id                  1122557 non-null  int64         
 7   user_type                1122557 non-null  string        
 8   trip_length              1122557 non-null  float64       
 9   start_station_latitude   1122557 non-null  float64       
 10  start_station_longitude  1122557 non-null  float64       
 11  end_station_latitude     1122557 non-null  float64       
 12  

In [57]:
df_boston.head()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,start_station_name,end_station_name,bike_id,user_type,trip_length,start_station_latitude,start_station_longitude,end_station_latitude,end_station_longitude
0,2015-01-01 00:21:44,2015-01-01 00:30:47,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,277,Subscriber,543.0,42.387995,-71.119084,42.373379,-71.111075
1,2015-01-01 00:27:03,2015-01-01 00:34:21,80,95,MIT Stata Center at Vassar St / Main St,Cambridge St - at Columbia St / Webster Ave,648,Subscriber,438.0,42.361962,-71.092053,42.372969,-71.094445
2,2015-01-01 00:31:31,2015-01-01 00:35:46,91,68,One Kendall Square at Hampshire St / Portland St,Central Square at Mass Ave / Essex St,555,Subscriber,255.0,42.366277,-71.09169,42.36507,-71.1031
3,2015-01-01 00:53:46,2015-01-01 01:00:58,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,1307,Subscriber,432.0,42.387995,-71.119084,42.373379,-71.111075
4,2015-01-01 01:07:06,2015-01-01 01:19:21,105,88,Lower Cambridgeport at Magazine St/Riverside Rd,Inman Square at Vellucci Plaza / Hampshire St,177,Customer,735.0,42.356954,-71.113687,42.374035,-71.101427


In [58]:
unique_stations_original=np.unique(np.concatenate((df_boston['start_station_name'].values, df_boston['end_station_name'].values)))
unique_stations_external = df_stations_data['Name'].values

In [59]:
print(f"#stations in stations dataset: {len(df_stations_data.loc[df_stations_data['Deployment Year'] < 2015.0])}")
print(f"#stations in original dataset: {len(unique_stations_original)}")

#stations in stations dataset: 130
#stations in original dataset: 156


In [60]:
stations_no_match = list(filter(lambda x: not(np.any(unique_stations_external == x)), unique_stations_original))
len(stations_no_match)

stations_no_match


['18 Dorrance Warehouse',
 'Agganis Arena - 925 Comm Ave.',
 'Allston Green District - Commonwealth Ave & Griggs St',
 'Andrew Station - Dorchester Ave at Humboldt Pl',
 'Aquarium Station - 200 Atlantic Ave.',
 'BIDMC - Brookline at Burlington St',
 'Back Bay / South End Station',
 'Beacon St / Mass Ave',
 'Beacon St at Washington / Kirkland',
 'Boston Convention & Exhibition Center',
 'Boston Medical Center -  East Concord at Harrison Ave',
 'Boston Public Library - 700 Boylston St.',
 'Boylston / Mass Ave',
 'Boylston St / Berkeley St',
 'Boylston St / Washington St',
 'Boylston St. at Arlington St.',
 'Boylston at Fairfield',
 'Brigham Cir / Huntington Ave',
 'Brighton Center',
 'Brookline Village - Pearl Street @ MBTA',
 'Buswell St. at Park Dr.',
 'Cambridge St. at Joy St.',
 'Charles Circle - Charles St. at Cambridge St.',
 'Charles St at Beacon St',
 'Charlestown - Main St at Austin St',
 'Charlestown - Warren St at Chelsea St',
 'Chinatown Gate Plaza - Surface Rd. at Beach St.'

In [61]:
stations_match = list(filter(lambda x: np.any(unique_stations_external == x), unique_stations_original))
stations_match

['359 Broadway - Broadway at Fayette Street',
 'Alewife Station at Russell Field',
 'Ames St at Main St',
 'B.U. Central - 725 Comm. Ave.',
 'Binney St / Sixth St',
 'Bunker Hill Community College',
 'Cambridge Main Library at Broadway / Trowbridge St',
 'Cambridge St - at Columbia St / Webster Ave',
 'CambridgeSide Galleria - CambridgeSide PL at Land Blvd',
 'Central Sq Post Office / Cambridge City Hall at Mass Ave / Pleasant St',
 'Central Square at Mass Ave / Essex St',
 'Conway Park - Somerville Avenue',
 'Dana Park',
 'Danehy Park',
 'Davis Square',
 'EF - North Point Park',
 'Fan Pier',
 'Harvard Kennedy School at Bennett St / Eliot St',
 'Harvard Law School at Mass Ave / Jarvis St',
 'Harvard Square at Brattle St / Eliot St',
 'Harvard Square at Mass Ave/ Dunster',
 'Harvard University / SEAS Cruft-Pierce Halls at 29 Oxford St',
 'Harvard University Housing - 115 Putnam Ave at Peabody Terrace',
 'Harvard University Radcliffe Quadrangle at Shepard St / Garden St',
 'Harvard Unive

# Plot start & end stations

In [62]:
df_boston_grouped_start= df_boston.groupby(by=['start_station_id', 'start_station_latitude', 'start_station_longitude']).count()
# df_boston_grouped_start = pd.DataFrame(df_boston_grouped_start)
df_boston_grouped_start

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,start_time,end_time,end_station_id,start_station_name,end_station_name,bike_id,user_type,trip_length,end_station_latitude,end_station_longitude
start_station_id,start_station_latitude,start_station_longitude,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,42.387151,-71.075978,56,56,56,56,56,56,56,56,56,56
3,42.340021,-71.100812,5018,5018,5018,5018,5018,5018,5018,5018,5018,5018
4,42.345392,-71.069616,9512,9512,9512,9512,9512,9512,9512,9512,9512,9512
5,42.341814,-71.090179,6175,6175,6175,6175,6175,6175,6175,6175,6175,6175
6,42.361174,-71.065142,13018,13018,13018,13018,13018,13018,13018,13018,13018,13018
...,...,...,...,...,...,...,...,...,...,...,...,...
176,42.386748,-71.119019,287,287,287,287,287,287,287,287,287,287
177,42.362648,-71.100061,858,858,858,858,858,858,858,858,858,858
178,42.359573,-71.101295,1197,1197,1197,1197,1197,1197,1197,1197,1197,1197
179,42.355601,-71.103945,894,894,894,894,894,894,894,894,894,894


In [63]:
import plotly.express as px
color_scale = [(0, 'orange'), (1,'red')]

fig = px.scatter_mapbox(df_boston, 
                        lat="start_station_latitude", 
                        lon="start_station_longitude", 
                        color="start_time",
                        color_continuous_scale=color_scale,
                        size="start_time",
                        zoom=8, 
                        height=800,
                        width=800)

fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

TypeError: unsupported operand type(s) for /: 'Timestamp' and 'int'