# Import packages we need

Download the data file from:
[Citi Bike data](https://www.kaggle.com/datasets/hassanabsar/nyc-citi-bike-ride-share-system-data-2023)

In [1]:
import numpy as np
import pandas as pd

# Basic data check

We need to perform some basic data checks: set up the index, drop columns that we have no information about, etc.

In [2]:
# Read both station id columns as text to fix their inconsistent types.
station_id_dtypes = {'start_station_id': str, 'end_station_id': str}
csv_path = '../files/sample_citibike_2023.csv'
df_citibike = pd.read_csv(csv_path, dtype=station_id_dtypes)
df_citibike.head()

Unnamed: 0.1,Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,12536574,6A1558E5EB8E9B3B,classic_bike,2023-06-25 06:44:47,2023-06-25 07:02:43,Walton Ave & E 168 St,8179.03,Icahn Stadium,7514.01,40.836655,-73.918324,40.79346,-73.92389,member
1,20766033,F5DAABF8EAD39B32,classic_bike,2023-08-15 18:05:37,2023-08-15 18:12:06,Grand Army Plaza & Central Park S,6839.1,2 Ave & E 72 St,6925.09,40.764004,-73.973974,40.768762,-73.958408,member
2,17246346,CC4D3C1C193EBCDD,classic_bike,2023-07-05 15:00:28,2023-07-05 15:06:34,Knickerbocker Ave & Thames St,5018.06,Melrose St & Broadway,4801.04,40.705446,-73.929975,40.697481,-73.935877,casual
3,3860641,541D7A53817AF238,classic_bike,2023-03-05 11:29:02,2023-03-05 11:36:36,DeKalb Ave & Vanderbilt Ave,4461.04,Pacific St & Classon Ave,4148.07,40.689425,-73.968898,40.679194,-73.95879,member
4,27285265,EACA2B802BFED979,classic_bike,2023-10-17 15:02:23,2023-10-17 15:06:40,Liberty St & Broadway,5105.01,West Thames St,5114.06,40.708164,-74.010369,40.708347,-74.017134,member


In [3]:
# 'Unnamed: 0' is not described in the documentation, so it is dropped;
# ride_id identifies each ride/observation, so it becomes the index.
df_citibike = (
    df_citibike
    .drop(columns='Unnamed: 0')
    .set_index('ride_id')
)

## Check dtypes of columns and convert them

In [4]:
# List the dtype of every column to see which ones need converting.
df_citibike.dtypes

rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

In [5]:
# Convert the timestamp columns to second-resolution datetimes and the two
# label columns to categoricals in a single astype call.
df_citibike = df_citibike.astype({
    'started_at': 'datetime64[s]',
    'ended_at': 'datetime64[s]',
    'rideable_type': 'category',
    'member_casual': 'category',
})
df_citibike.head()

Unnamed: 0_level_0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
ride_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
6A1558E5EB8E9B3B,classic_bike,2023-06-25 06:44:47,2023-06-25 07:02:43,Walton Ave & E 168 St,8179.03,Icahn Stadium,7514.01,40.836655,-73.918324,40.79346,-73.92389,member
F5DAABF8EAD39B32,classic_bike,2023-08-15 18:05:37,2023-08-15 18:12:06,Grand Army Plaza & Central Park S,6839.1,2 Ave & E 72 St,6925.09,40.764004,-73.973974,40.768762,-73.958408,member
CC4D3C1C193EBCDD,classic_bike,2023-07-05 15:00:28,2023-07-05 15:06:34,Knickerbocker Ave & Thames St,5018.06,Melrose St & Broadway,4801.04,40.705446,-73.929975,40.697481,-73.935877,casual
541D7A53817AF238,classic_bike,2023-03-05 11:29:02,2023-03-05 11:36:36,DeKalb Ave & Vanderbilt Ave,4461.04,Pacific St & Classon Ave,4148.07,40.689425,-73.968898,40.679194,-73.95879,member
EACA2B802BFED979,classic_bike,2023-10-17 15:02:23,2023-10-17 15:06:40,Liberty St & Broadway,5105.01,West Thames St,5114.06,40.708164,-74.010369,40.708347,-74.017134,member


In [6]:
# Re-check the dtypes to confirm the conversions took effect.
df_citibike.dtypes

rideable_type              category
started_at            datetime64[s]
ended_at              datetime64[s]
start_station_name           object
start_station_id             object
end_station_name             object
end_station_id               object
start_lat                   float64
start_lng                   float64
end_lat                     float64
end_lng                     float64
member_casual              category
dtype: object

# Feature engineering

We add two new features to the dataset:  
1. `ride_duration[s]`: the time elapsed between `started_at` and `ended_at`, in seconds
2. `ride_time_of_day`: the part of the day (night, morning, afternoon or evening) in which the ride started, derived from `started_at`

In [7]:
# Ride length in seconds: subtract the two timestamps, then divide the
# resulting timedelta by one second to obtain a plain number.
ride_span = df_citibike['ended_at'] - df_citibike['started_at']
df_citibike['ride_duration[s]'] = ride_span / pd.Timedelta(1, 's')
df_citibike.head()

Unnamed: 0_level_0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,ride_duration[s]
ride_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
6A1558E5EB8E9B3B,classic_bike,2023-06-25 06:44:47,2023-06-25 07:02:43,Walton Ave & E 168 St,8179.03,Icahn Stadium,7514.01,40.836655,-73.918324,40.79346,-73.92389,member,1076.0
F5DAABF8EAD39B32,classic_bike,2023-08-15 18:05:37,2023-08-15 18:12:06,Grand Army Plaza & Central Park S,6839.1,2 Ave & E 72 St,6925.09,40.764004,-73.973974,40.768762,-73.958408,member,389.0
CC4D3C1C193EBCDD,classic_bike,2023-07-05 15:00:28,2023-07-05 15:06:34,Knickerbocker Ave & Thames St,5018.06,Melrose St & Broadway,4801.04,40.705446,-73.929975,40.697481,-73.935877,casual,366.0
541D7A53817AF238,classic_bike,2023-03-05 11:29:02,2023-03-05 11:36:36,DeKalb Ave & Vanderbilt Ave,4461.04,Pacific St & Classon Ave,4148.07,40.689425,-73.968898,40.679194,-73.95879,member,454.0
EACA2B802BFED979,classic_bike,2023-10-17 15:02:23,2023-10-17 15:06:40,Liberty St & Broadway,5105.01,West Thames St,5114.06,40.708164,-74.010369,40.708347,-74.017134,member,257.0


In [8]:
def calculate_time_of_day(started_at):
    """Return the part of the day in which a ride started.

    Only ``started_at.hour`` is read, so any object exposing an ``hour``
    attribute works (e.g. the timestamps stored in ``started_at``).

    Hour buckets:
        hour < 6        -> 'night'
        6 <= hour < 12  -> 'morning'
        12 <= hour < 18 -> 'afternoon'
        hour >= 18      -> 'evening'

    Parameters
    ----------
    started_at : object with an ``hour`` attribute
        The moment the ride started.

    Returns
    -------
    str or None
        One of 'night', 'morning', 'afternoon', 'evening'; ``None`` when the
        hour satisfies none of the comparisons (e.g. a NaN hour).
    """
    hour = started_at.hour
    # Ascending guard clauses: reaching a check already implies the hour is at
    # or above the previous threshold, so only the upper bound is tested.
    if hour < 6:
        return 'night'
    if hour < 12:
        return 'morning'
    if hour < 18:
        return 'afternoon'
    if hour >= 18:
        return 'evening'
    # No comparison held (e.g. hour is NaN): return None, as before.
    return None

# Label every ride with its part of the day by running
# calculate_time_of_day on each start timestamp.
start_times = df_citibike['started_at']
df_citibike['ride_time_of_day'] = start_times.map(calculate_time_of_day)

In [9]:
# Show the frame with the newly added ride_duration[s] and ride_time_of_day columns.
df_citibike

Unnamed: 0_level_0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,ride_duration[s],ride_time_of_day
ride_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
6A1558E5EB8E9B3B,classic_bike,2023-06-25 06:44:47,2023-06-25 07:02:43,Walton Ave & E 168 St,8179.03,Icahn Stadium,7514.01,40.836655,-73.918324,40.793460,-73.923890,member,1076.0,morning
F5DAABF8EAD39B32,classic_bike,2023-08-15 18:05:37,2023-08-15 18:12:06,Grand Army Plaza & Central Park S,6839.10,2 Ave & E 72 St,6925.09,40.764004,-73.973974,40.768762,-73.958408,member,389.0,evening
CC4D3C1C193EBCDD,classic_bike,2023-07-05 15:00:28,2023-07-05 15:06:34,Knickerbocker Ave & Thames St,5018.06,Melrose St & Broadway,4801.04,40.705446,-73.929975,40.697481,-73.935877,casual,366.0,afternoon
541D7A53817AF238,classic_bike,2023-03-05 11:29:02,2023-03-05 11:36:36,DeKalb Ave & Vanderbilt Ave,4461.04,Pacific St & Classon Ave,4148.07,40.689425,-73.968898,40.679194,-73.958790,member,454.0,morning
EACA2B802BFED979,classic_bike,2023-10-17 15:02:23,2023-10-17 15:06:40,Liberty St & Broadway,5105.01,West Thames St,5114.06,40.708164,-74.010369,40.708347,-74.017134,member,257.0,afternoon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
015D961E96545D85,electric_bike,2023-03-31 17:57:48,2023-03-31 18:02:55,Graham Ave & Conselyea St,5291.05,Nassau Ave & Russell St,5581.01,40.715143,-73.944507,40.725570,-73.944340,member,307.0,afternoon
B4957865ED6E94CF,classic_bike,2023-03-08 13:40:27,2023-03-08 14:00:20,E 48 St & 5 Ave,6626.01,11 Ave & W 27 St,6425.04,40.757270,-73.977792,40.751396,-74.005226,casual,1193.0,afternoon
2892D1DA3F9483A1,classic_bike,2023-02-04 15:04:44,2023-02-04 15:07:40,Murray St & Greenwich St,5288.12,Vesey Pl & River Terrace,5297.02,40.714852,-74.011223,40.715338,-74.016584,member,176.0,afternoon
5FE9361CFBB85827,classic_bike,2023-08-16 18:06:38,2023-08-16 18:22:09,Prospect Ave & E 151 St,7830.03,2 Ave & E 122 St,7622.12,40.814413,-73.903985,40.800672,-73.934900,member,931.0,evening


# Finding missing values

We need to find the missing values in order to understand the data better, to fill in the missing data, and to decide which features should be taken into consideration in the analysis.

In [34]:
# check if we have any columns with empty data
columns_with_empty_values = [x for x in df_citibike.columns if len(df_citibike[df_citibike[x].isna()]) > 0]
columns_with_empty_values

['start_station_name',
 'start_station_id',
 'end_station_name',
 'end_station_id',
 'end_lat',
 'end_lng']

In [61]:
# Count the missing entries in each affected column with one vectorised pass
# (instead of filtering the full frame per column); tolist() yields plain
# Python ints in the same order as columns_with_empty_values.
number_of_empty_values_in_columns = df_citibike[columns_with_empty_values].isna().sum().tolist()

In [64]:
# let's check the length of na values in each column
dict_columns_len_of_null = { key: value for key, value in zip(columns_with_empty_values, number_of_empty_values_in_columns)}
dict_columns_len_of_null

{'start_station_name': 541,
 'start_station_id': 541,
 'end_station_name': 2808,
 'end_station_id': 2808,
 'end_lat': 713,
 'end_lng': 713}