In [129]:
import pandas as pd
import numpy as np

# Load the 2015 Boston trip export with explicit column dtypes.
# 'end_station_id' is deliberately read as a string here rather than an integer.
df_boston = pd.read_csv(
    './data/boston_2015.csv',
    dtype={
        'start_station_id': np.int64,
        'end_station_id': 'string',
        'end_station_name': 'string',
        'start_station_name': 'string',
        'bike_id': np.int64,
        'user_type': 'string',
    },
)
print(f'Total number of rows: {len(df_boston)}')

Total number of rows: 1122558


In [130]:
# Preview the first five rows to sanity-check the parsed columns.
df_boston.head(n=5)

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,start_station_name,end_station_name,bike_id,user_type
0,2015-01-01 00:21:44,2015-01-01 00:30:47,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,277,Subscriber
1,2015-01-01 00:27:03,2015-01-01 00:34:21,80,95,MIT Stata Center at Vassar St / Main St,Cambridge St - at Columbia St / Webster Ave,648,Subscriber
2,2015-01-01 00:31:31,2015-01-01 00:35:46,91,68,One Kendall Square at Hampshire St / Portland St,Central Square at Mass Ave / Essex St,555,Subscriber
3,2015-01-01 00:53:46,2015-01-01 01:00:58,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,1307,Subscriber
4,2015-01-01 01:07:06,2015-01-01 01:19:21,105,88,Lower Cambridgeport at Magazine St/Riverside Rd,Inman Square at Vellucci Plaza / Hampshire St,177,Customer


# Preprocessing
### Set data type of 'end_station_id'

In [131]:
# List the distinct raw values of 'end_station_id' to spot non-numeric entries.
pd.unique(df_boston['end_station_id'])

<StringArray>
[ '96',  '95',  '68',  '88',  '76', '118',  '75',  '67',  '36',  '23',
 ...
 '169', '174', '175', '159', '171', '178', '176', '179', '180', '177']
Length: 157, dtype: string

In [132]:
# The value '\N' (spelled "\\N" in Python source) seems to be an anomaly in
# 'end_station_id': it occurs only once, so the affected row is dropped.
# The matching row labels are computed once and reused for the count and the drop.
null_marker_rows = df_boston.loc[df_boston["end_station_id"] == "\\N"].index
num_occurences = len(null_marker_rows)
print(f'Number of "\\\\N" occurences in end_station_id column: {num_occurences}')

# Drop the rows by index label, then cast the column to np.int64.
# `axis` is intentionally not passed: pandas ignores it when `index=` is given,
# so the previous `axis=1` only suggested (wrongly) that a column was dropped.
# Rebinding instead of `inplace=True` keeps the step a single expression.
df_boston = (
    df_boston
    .drop(index=null_marker_rows)
    .astype({'end_station_id': np.int64})
)
df_boston.info()

Number of "\\N" occurences in end_station_id column: 1
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1122557 entries, 0 to 1122557
Data columns (total 8 columns):
 #   Column              Non-Null Count    Dtype 
---  ------              --------------    ----- 
 0   start_time          1122557 non-null  object
 1   end_time            1122557 non-null  object
 2   start_station_id    1122557 non-null  int64 
 3   end_station_id      1122557 non-null  int64 
 4   start_station_name  1122557 non-null  string
 5   end_station_name    1122557 non-null  string
 6   bike_id             1122557 non-null  int64 
 7   user_type           1122557 non-null  string
dtypes: int64(3), object(2), string(3)
memory usage: 77.1+ MB


### Set data type of time columns

In [133]:
# Convert both timestamp columns from strings to datetime64.
# The time part is spelled out as %H:%M:%S instead of %X: %X means "the
# locale's time representation", so parsing would depend on the locale the
# kernel runs under. The explicit form matches the 'YYYY-MM-DD HH:MM:SS'
# values in the file regardless of locale.
TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'
for time_col in ('start_time', 'end_time'):
    df_boston[time_col] = pd.to_datetime(df_boston[time_col], format=TIMESTAMP_FORMAT)
df_boston.dtypes

start_time            datetime64[ns]
end_time              datetime64[ns]
start_station_id               int64
end_station_id                 int64
start_station_name            string
end_station_name              string
bike_id                        int64
user_type                     string
dtype: object

#### Check that each station id maps to exactly one station name

In [134]:
# len(df_boston['start_station_name'].unique())
# Spot check: list the start-station name(s) recorded for station id 96.
df_boston.loc[df_boston['start_station_id'] == 96, 'start_station_name'].unique()

<StringArray>
['Cambridge Main Library at Broadway / Trowbridge St']
Length: 1, dtype: string

In [143]:
# Collect the distinct (start_station_id, start_station_name) rows.
# Both columns are cast to str so np.unique can compare whole rows (axis=0).
unique_start_tuples = np.unique(
    df_boston[['start_station_id', 'start_station_name']].to_numpy().astype('str'),
    axis=0,
)
unique_start_tuples

array([['1', '18 Dorrance Warehouse'],
       ['10', 'B.U. Central - 725 Comm. Ave.'],
       ['100', 'Davis Square'],
       ['102', 'Powder House Circle - Nathan Tufts Park'],
       ['103', 'JFK Crossing at Harvard St. / Thorndike St.'],
       ['104',
        'Harvard University Radcliffe Quadrangle at Shepard St / Garden St'],
       ['105', 'Lower Cambridgeport at Magazine St/Riverside Rd'],
       ['106', 'Mt Pleasant Ave / Dudley Town Common'],
       ['107', 'Ames St at Main St'],
       ['108',
        'Harvard University / SEAS Cruft-Pierce Halls at 29 Oxford St'],
       ['109', 'TD Garden - Causeway at Portal Park #1'],
       ['11', 'Longwood Ave / Binney St'],
       ['110', 'Harvard University Gund Hall at Quincy St / Kirkland S'],
       ['111', 'Packard Ave / Powderhouse Blvd'],
       ['112', 'Somerville Hospital at Highland Ave / Crocker St'],
       ['113', 'Andrew Station - Dorchester Ave at Humboldt Pl'],
       ['114', 'Teele Square at 239 Holland St'],
       [

In [149]:
# TODO: check for entire duplicates (also end stations)
# Count how often each start_station_id occurs among the unique (id, name)
# rows. An id counted more than once is paired with more than one name.
# The id column is taken as a 1-D slice; the stray no-op slice expression
# that preceded this computation has been removed.
u, c = np.unique(unique_start_tuples[:, 0], return_counts=True)
dup = u[c > 1]
dup

array([], dtype='<U70')

# Feature Engineering
#### Calculate trip_length (trip duration in seconds)

In [136]:
# Trip duration as a float number of seconds: the end/start difference is a
# timedelta, and dividing it by a one-second timedelta yields plain seconds.
df_boston['trip_length'] = (
    (df_boston['end_time'] - df_boston['start_time'])
    / np.timedelta64(1, 's')
)
df_boston.head()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,start_station_name,end_station_name,bike_id,user_type,trip_length
0,2015-01-01 00:21:44,2015-01-01 00:30:47,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,277,Subscriber,543.0
1,2015-01-01 00:27:03,2015-01-01 00:34:21,80,95,MIT Stata Center at Vassar St / Main St,Cambridge St - at Columbia St / Webster Ave,648,Subscriber,438.0
2,2015-01-01 00:31:31,2015-01-01 00:35:46,91,68,One Kendall Square at Hampshire St / Portland St,Central Square at Mass Ave / Essex St,555,Subscriber,255.0
3,2015-01-01 00:53:46,2015-01-01 01:00:58,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,1307,Subscriber,432.0
4,2015-01-01 01:07:06,2015-01-01 01:19:21,105,88,Lower Cambridgeport at Magazine St/Riverside Rd,Inman Square at Vellucci Plaza / Hampshire St,177,Customer,735.0
