In [109]:
import pandas as pd
import numpy as np

df_boston = pd.read_csv('./data/boston_2015.csv', dtype={'start_station_id': np.int64, 'end_station_id': str, 'end_station_name': str, 'start_staiton_name': str, 'bike_id': np.int64, 'user_type': str})
print(f'Total number of rows: {len(df_boston)}')

Total number of rows: 1122558


In [110]:
df_boston.head()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,start_station_name,end_station_name,bike_id,user_type
0,2015-01-01 00:21:44,2015-01-01 00:30:47,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,277,Subscriber
1,2015-01-01 00:27:03,2015-01-01 00:34:21,80,95,MIT Stata Center at Vassar St / Main St,Cambridge St - at Columbia St / Webster Ave,648,Subscriber
2,2015-01-01 00:31:31,2015-01-01 00:35:46,91,68,One Kendall Square at Hampshire St / Portland St,Central Square at Mass Ave / Essex St,555,Subscriber
3,2015-01-01 00:53:46,2015-01-01 01:00:58,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,1307,Subscriber
4,2015-01-01 01:07:06,2015-01-01 01:19:21,105,88,Lower Cambridgeport at Magazine St/Riverside Rd,Inman Square at Vellucci Plaza / Hampshire St,177,Customer


# Preprocessing
### Set data type of 'end_station_id'

In [111]:
df_boston['end_station_id'].unique()

array(['96', '95', '68', '88', '76', '118', '75', '67', '36', '23', '80',
       '46', '115', '141', '104', '73', '6', '110', '87', '91', '117',
       '74', '107', '116', '70', '143', '97', '72', '84', '105', '51',
       '89', '142', '38', '85', '90', '108', '145', '149', '140', '81',
       '20', '29', '16', '109', '4', '32', '33', '21', '42', '14', '98',
       '17', '15', '54', '1', '59', '11', '\\N', '9', '120', '41', '10',
       '44', '58', '99', '60', '53', '71', '100', '55', '112', '66', '86',
       '78', '122', '22', '40', '114', '125', '123', '39', '45', '48',
       '24', '7', '129', '64', '135', '19', '43', '150', '49', '13',
       '151', '65', '47', '37', '131', '3', '12', '111', '136', '25',
       '57', '119', '5', '124', '94', '121', '27', '30', '63', '31',
       '138', '106', '113', '133', '26', '137', '139', '8', '103', '77',
       '93', '102', '152', '128', '130', '126', '35', '69', '56', '61',
       '134', '50', '52', '82', '132', '79', '92', '160', '161', '1

In [112]:
# Value '\\N' seems to be anomaly -> occurs only once, so drop!
num_occurences = len(df_boston.loc[df_boston["end_station_id"] == "\\N"])
print(f'Number of "\\\\N" occurences in end_station_id column: {num_occurences}')
df_boston.drop(index=df_boston.loc[df_boston["end_station_id"] == "\\N"].index, inplace=True, axis=1)

# now set column to dtype np.int64
df_boston = df_boston.astype({'end_station_id': np.int64})
df_boston.info()

Number of "\\N" occurences in end_station_id column: 1
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1122557 entries, 0 to 1122557
Data columns (total 8 columns):
 #   Column              Non-Null Count    Dtype 
---  ------              --------------    ----- 
 0   start_time          1122557 non-null  object
 1   end_time            1122557 non-null  object
 2   start_station_id    1122557 non-null  int64 
 3   end_station_id      1122557 non-null  int64 
 4   start_station_name  1122557 non-null  object
 5   end_station_name    1122557 non-null  object
 6   bike_id             1122557 non-null  int64 
 7   user_type           1122557 non-null  object
dtypes: int64(3), object(5)
memory usage: 77.1+ MB


### Set data type of time columns

In [113]:
df_boston['start_time'] = pd.to_datetime(df_boston['start_time'], format='%Y-%m-%d %X')
df_boston['end_time'] = pd.to_datetime(df_boston['end_time'], format='%Y-%m-%d %X')
df_boston.dtypes

start_time            datetime64[ns]
end_time              datetime64[ns]
start_station_id               int64
end_station_id                 int64
start_station_name            object
end_station_name              object
bike_id                        int64
user_type                     object
dtype: object

# Feature Engineering
#### Calculate trip_length 

In [117]:
df_boston['trip_length'] = (df_boston['end_time'] - df_boston['start_time'])
df_boston['trip_length'] = df_boston['trip_length'] / np.timedelta64(1, 's')
df_boston.head()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,start_station_name,end_station_name,bike_id,user_type,trip_length
0,2015-01-01 00:21:44,2015-01-01 00:30:47,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,277,Subscriber,543.0
1,2015-01-01 00:27:03,2015-01-01 00:34:21,80,95,MIT Stata Center at Vassar St / Main St,Cambridge St - at Columbia St / Webster Ave,648,Subscriber,438.0
2,2015-01-01 00:31:31,2015-01-01 00:35:46,91,68,One Kendall Square at Hampshire St / Portland St,Central Square at Mass Ave / Essex St,555,Subscriber,255.0
3,2015-01-01 00:53:46,2015-01-01 01:00:58,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,1307,Subscriber,432.0
4,2015-01-01 01:07:06,2015-01-01 01:19:21,105,88,Lower Cambridgeport at Magazine St/Riverside Rd,Inman Square at Vellucci Plaza / Hampshire St,177,Customer,735.0


In [115]:
# len(df_boston['start_station_name'].unique())
df_boston.loc[df_boston['start_station_id'] == 96]['start_station_name'].unique()

array(['Cambridge Main Library at Broadway / Trowbridge St'], dtype=object)

In [116]:
# Check if station id and name always match
unique_start_tuples = np.unique(df_boston[['start_station_id', 'start_station_name']].values, axis=0)
unique_start_tuples

TypeError: The axis argument to unique is not supported for dtype object