In [1]:
import pandas as pd
import numpy as np

df_boston = pd.read_csv('./data/boston_2015.csv', dtype={'start_station_id': np.int64, 'end_station_id': 'string', 'end_station_name': 'string', 'start_station_name': 'string', 'bike_id': np.int64, 'user_type': 'string'})
print(f'Total number of rows: {len(df_boston)}')

Total number of rows: 1122558


In [4]:
df_boston.head()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,start_station_name,end_station_name,bike_id,user_type
0,2015-01-01 00:21:44,2015-01-01 00:30:47,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,277,Subscriber
1,2015-01-01 00:27:03,2015-01-01 00:34:21,80,95,MIT Stata Center at Vassar St / Main St,Cambridge St - at Columbia St / Webster Ave,648,Subscriber
2,2015-01-01 00:31:31,2015-01-01 00:35:46,91,68,One Kendall Square at Hampshire St / Portland St,Central Square at Mass Ave / Essex St,555,Subscriber
3,2015-01-01 00:53:46,2015-01-01 01:00:58,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,1307,Subscriber
4,2015-01-01 01:07:06,2015-01-01 01:19:21,105,88,Lower Cambridgeport at Magazine St/Riverside Rd,Inman Square at Vellucci Plaza / Hampshire St,177,Customer


# Preprocessing
### Set data type of 'end_station_id'

In [5]:
df_boston['end_station_id'].unique()

<StringArray>
[ '96',  '95',  '68',  '88',  '76', '118',  '75',  '67',  '36',  '23',
 ...
 '169', '174', '175', '159', '171', '178', '176', '179', '180', '177']
Length: 157, dtype: string

In [6]:
# Value '\\N' seems to be anomaly -> occurs only once, so drop!
num_occurences = len(df_boston.loc[df_boston["end_station_id"] == "\\N"])
print(f'Number of "\\\\N" occurences in end_station_id column: {num_occurences}')
df_boston.drop(index=df_boston.loc[df_boston["end_station_id"] == "\\N"].index, inplace=True, axis=1)

# now set column to dtype np.int64
df_boston = df_boston.astype({'end_station_id': np.int64})
df_boston.info()

Number of "\\N" occurences in end_station_id column: 1
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1122557 entries, 0 to 1122557
Data columns (total 8 columns):
 #   Column              Non-Null Count    Dtype 
---  ------              --------------    ----- 
 0   start_time          1122557 non-null  object
 1   end_time            1122557 non-null  object
 2   start_station_id    1122557 non-null  int64 
 3   end_station_id      1122557 non-null  int64 
 4   start_station_name  1122557 non-null  string
 5   end_station_name    1122557 non-null  string
 6   bike_id             1122557 non-null  int64 
 7   user_type           1122557 non-null  string
dtypes: int64(3), object(2), string(3)
memory usage: 77.1+ MB


### Set data type of time columns

In [7]:
df_boston['start_time'] = pd.to_datetime(df_boston['start_time'], format='%Y-%m-%d %X')
df_boston['end_time'] = pd.to_datetime(df_boston['end_time'], format='%Y-%m-%d %X')
df_boston.dtypes

start_time            datetime64[ns]
end_time              datetime64[ns]
start_station_id               int64
end_station_id                 int64
start_station_name            string
end_station_name              string
bike_id                        int64
user_type                     string
dtype: object

#### Check if station id and names are unique tuples

In [8]:
# len(df_boston['start_station_name'].unique())
df_boston.loc[df_boston['start_station_id'] == 96]['start_station_name'].unique()

<StringArray>
['Cambridge Main Library at Broadway / Trowbridge St']
Length: 1, dtype: string

In [9]:
# Check if station id and name always match
unique_start_tuples = np.unique(df_boston[['start_station_id', 'start_station_name']].values.astype('str'), axis=0)
unique_start_tuples

array([['1', '18 Dorrance Warehouse'],
       ['10', 'B.U. Central - 725 Comm. Ave.'],
       ['100', 'Davis Square'],
       ['102', 'Powder House Circle - Nathan Tufts Park'],
       ['103', 'JFK Crossing at Harvard St. / Thorndike St.'],
       ['104',
        'Harvard University Radcliffe Quadrangle at Shepard St / Garden St'],
       ['105', 'Lower Cambridgeport at Magazine St/Riverside Rd'],
       ['106', 'Mt Pleasant Ave / Dudley Town Common'],
       ['107', 'Ames St at Main St'],
       ['108',
        'Harvard University / SEAS Cruft-Pierce Halls at 29 Oxford St'],
       ['109', 'TD Garden - Causeway at Portal Park #1'],
       ['11', 'Longwood Ave / Binney St'],
       ['110', 'Harvard University Gund Hall at Quincy St / Kirkland S'],
       ['111', 'Packard Ave / Powderhouse Blvd'],
       ['112', 'Somerville Hospital at Highland Ave / Crocker St'],
       ['113', 'Andrew Station - Dorchester Ave at Humboldt Pl'],
       ['114', 'Teele Square at 239 Holland St'],
       [

In [10]:
#TODO: check for entire duplicates (also end stations)
unique_start_tuples[:,0:1]
u, c = np.unique(unique_start_tuples[:,0:1], return_counts=True)
dup = u[c > 1]
dup 

array([], dtype='<U70')

# Feature Engineering
#### Calculate trip_length 

In [11]:
df_boston['trip_length'] = (df_boston['end_time'] - df_boston['start_time'])
df_boston['trip_length'] = df_boston['trip_length'] / np.timedelta64(1, 's')
df_boston.head()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,start_station_name,end_station_name,bike_id,user_type,trip_length
0,2015-01-01 00:21:44,2015-01-01 00:30:47,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,277,Subscriber,543.0
1,2015-01-01 00:27:03,2015-01-01 00:34:21,80,95,MIT Stata Center at Vassar St / Main St,Cambridge St - at Columbia St / Webster Ave,648,Subscriber,438.0
2,2015-01-01 00:31:31,2015-01-01 00:35:46,91,68,One Kendall Square at Hampshire St / Portland St,Central Square at Mass Ave / Essex St,555,Subscriber,255.0
3,2015-01-01 00:53:46,2015-01-01 01:00:58,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,1307,Subscriber,432.0
4,2015-01-01 01:07:06,2015-01-01 01:19:21,105,88,Lower Cambridgeport at Magazine St/Riverside Rd,Inman Square at Vellucci Plaza / Hampshire St,177,Customer,735.0


# First Data Exploration
### Understanding the revenue modell

![caption](data/thehubway_revenue_modell_2015.png)

In [37]:
# Pricing info pulled from: https://web.archive.org/web/20150206035343/http://www.thehubway.com:80/pricing
# INFO: bluebikes was previously called 'thehubway', see https://en.wikipedia.org/wiki/Bluebikes
# ASSUMTION: The revenue modell did not change over the year 2015. This was checked through all entries of the wayback-machine, which occured in 7 day intervalls. We can be pretty certain of this. 


# Get # of subscribers and customers
print(df_boston.groupby('user_type').size())
# TODO: Include this in revenue modell

# Thoughts on the revenue modell: We don't see individual user-ids for each ride. This means that we can only infer the "revenue" generated by each trip through the respective user_type.
# E.g. -> if we have 10 subscribers for a given timeframe, who undertake 100 trips, each trip would generate 1/10 of the users membership fee in the given timeframe
# Membership numbers for 2015: https://web.archive.org/web/20160208155519/http://www.thehubway.com/mediakit -> anual members: 13.248, Causal Passes: 102.445 (24 & 72 Hours) with a total of 1,1319,310 Trips (???)

no_trips_from_dataset_2015 = 11319310
no_trips_from_website_2015 = len(df_boston)
print(f'We only have {no_trips_from_website_2015/no_trips_from_dataset_2015} of the data!')

# Big Open Question: How should we deal with the missing data? (we only have 1/10 of the trip data apparently) ? -> https://s3.amazonaws.com/hubway-data/index.html this contains the raw data, it appears that there are a lot more than 112k rides..
# Open Question: How should we include the membership fees and casual passes costs in the revenue modell?


# get proprtion all the rides who might incur overtime fees (trip_length >30 min) to understand if this usage pattern is relevant for the revenue model
proportion_overtime_rides = len(df_boston[df_boston['trip_length'] >= (30*60)]) / len(df_boston) 
print(proportion_overtime_rides)

# Conclusion: Around 7% of rides were overtime -> we should include this in the revenue modell
# TODO: Engineer the following features for the revenue modell: revenue_through_overtime (calculated on the basis of trip_duration)


user_type
Customer      370585
Subscriber    751972
dtype: int64
We only have 0.09917185764856692 of the data!
                 start_time            end_time  start_station_id  \
0       2015-01-01 00:21:44 2015-01-01 00:30:47               115   
1       2015-01-01 00:27:03 2015-01-01 00:34:21                80   
2       2015-01-01 00:31:31 2015-01-01 00:35:46                91   
3       2015-01-01 00:53:46 2015-01-01 01:00:58               115   
4       2015-01-01 01:07:06 2015-01-01 01:19:21               105   
...                     ...                 ...               ...   
1122548 2015-12-31 23:20:21 2015-12-31 23:36:19                38   
1122549 2015-12-31 23:26:28 2016-01-01 00:16:48                24   
1122550 2015-12-31 23:26:49 2016-01-01 00:17:05                24   
1122551 2015-12-31 23:28:33 2015-12-31 23:37:51                41   
1122552 2015-12-31 23:35:03 2015-12-31 23:43:20                73   

         end_station_id                                start

### merging of boston station dataset and the df_boston data

In [2]:
#boston blue bikes stations data
bbb_stations_data = pd.read_csv('./data/current_bluebikes_stations.csv')


In [5]:
bbb_stations_data.head()

Unnamed: 0,Last Updated,12/5/2022,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,Number,Name,Latitude,Longitude,District,Public,Total docks,Deployment Year
1,K32015,1200 Beacon St,42.34414899,-71.11467361,Brookline,Yes,15,2021
2,W32006,160 Arsenal,42.36466403,-71.17569387,Watertown,Yes,11,2021
3,A32019,175 N Harvard St,42.363796,-71.129164,Boston,Yes,17,2014
4,S32035,191 Beacon St,42.38032335,-71.10878613,Somerville,Yes,19,2018


In [3]:
#don't know why but the column labels were not in the first row, which is why i had to rename them (same with u?)
bbb_stations_data.rename(columns={'12/5/2022': 'start_station_name',
                         'Last Updated': 'Number',
                         'Unnamed: 2': 'Latitude',
                         'Unnamed: 4': 'District',
                         'Unnamed: 5': 'Public',
                         'Unnamed: 3': 'Longitude',
                         'Unnamed: 6': 'Total docks',
                         'Unnamed: 7': 'Deployment Year'  }, inplace=True)


In [4]:
#merged data frame for every station that is in our dataset and the boston station dataset (if a station is not in both then its not in this merged set)
df_merged = df_boston.merge(bbb_stations_data)


In [7]:
#nr of unique stations in the merged data set
nr_unique_stations_m = np.unique(df_merged['start_station_name'].values.astype('str'), axis=0)

nr_unique_stations = np.unique(df_boston['start_station_name'].values.astype('str'), axis=0)

nr_unique_stations_s = np.unique(bbb_stations_data['start_station_name'].values.astype('str'), axis=0)


print(f'number of stations in original df {nr_unique_stations.size},\nnumber of stations in bbb-data df {nr_unique_stations_s.size},\nnumber of stations in merged df {nr_unique_stations_m.size}' )


number of stations in original df 156,
number of stations in bbb-data df 449,
number of stations in merged df 52


In [None]:
#over 100 stations are in original df but not in bbb_stations dataset. why? 
#probably slightly different labeled stations in respective sets

#TODO how to add and merge them? first find out which stations are effected. then maybe with character matching function: matching = [s for s in data if "Wilson Square" in s]?
