# Import packages we need

Download file from:
[citi bike data] (https://www.kaggle.com/datasets/hassanabsar/nyc-citi-bike-ride-share-system-data-2023)

In [2]:
import numpy as np
import pandas as pd

# Basic data check

We need to perform some basic data check, set up the index, drop column that we have no informations about etc..

In [3]:
csv_path = '../files/sample_citibike_2023.csv'
df_citibike = pd.read_csv(csv_path, dtype={'start_station_id': str, 'end_station_id': str }) # fix inconsitent type
df_citibike.head()

Unnamed: 0.1,Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,12536574,6A1558E5EB8E9B3B,classic_bike,2023-06-25 06:44:47,2023-06-25 07:02:43,Walton Ave & E 168 St,8179.03,Icahn Stadium,7514.01,40.836655,-73.918324,40.79346,-73.92389,member
1,20766033,F5DAABF8EAD39B32,classic_bike,2023-08-15 18:05:37,2023-08-15 18:12:06,Grand Army Plaza & Central Park S,6839.1,2 Ave & E 72 St,6925.09,40.764004,-73.973974,40.768762,-73.958408,member
2,17246346,CC4D3C1C193EBCDD,classic_bike,2023-07-05 15:00:28,2023-07-05 15:06:34,Knickerbocker Ave & Thames St,5018.06,Melrose St & Broadway,4801.04,40.705446,-73.929975,40.697481,-73.935877,casual
3,3860641,541D7A53817AF238,classic_bike,2023-03-05 11:29:02,2023-03-05 11:36:36,DeKalb Ave & Vanderbilt Ave,4461.04,Pacific St & Classon Ave,4148.07,40.689425,-73.968898,40.679194,-73.95879,member
4,27285265,EACA2B802BFED979,classic_bike,2023-10-17 15:02:23,2023-10-17 15:06:40,Liberty St & Broadway,5105.01,West Thames St,5114.06,40.708164,-74.010369,40.708347,-74.017134,member


In [4]:
df_citibike = df_citibike.drop('Unnamed: 0', axis='columns') # there is no information in documentation about this column, drop it
df_citibike = df_citibike.set_index('ride_id') # ride_id identifies each ride/observation

## Check dtypes of columns and convert them

In [5]:
df_citibike.dtypes

rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

In [6]:
df_citibike['started_at'] = df_citibike['started_at'].astype('datetime64[s]')
df_citibike['ended_at'] = df_citibike['ended_at'].astype('datetime64[s]')
df_citibike['rideable_type'] = df_citibike['rideable_type'].astype('category')
df_citibike['member_casual'] = df_citibike['member_casual'].astype('category')
df_citibike.head()

Unnamed: 0_level_0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
ride_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
6A1558E5EB8E9B3B,classic_bike,2023-06-25 06:44:47,2023-06-25 07:02:43,Walton Ave & E 168 St,8179.03,Icahn Stadium,7514.01,40.836655,-73.918324,40.79346,-73.92389,member
F5DAABF8EAD39B32,classic_bike,2023-08-15 18:05:37,2023-08-15 18:12:06,Grand Army Plaza & Central Park S,6839.1,2 Ave & E 72 St,6925.09,40.764004,-73.973974,40.768762,-73.958408,member
CC4D3C1C193EBCDD,classic_bike,2023-07-05 15:00:28,2023-07-05 15:06:34,Knickerbocker Ave & Thames St,5018.06,Melrose St & Broadway,4801.04,40.705446,-73.929975,40.697481,-73.935877,casual
541D7A53817AF238,classic_bike,2023-03-05 11:29:02,2023-03-05 11:36:36,DeKalb Ave & Vanderbilt Ave,4461.04,Pacific St & Classon Ave,4148.07,40.689425,-73.968898,40.679194,-73.95879,member
EACA2B802BFED979,classic_bike,2023-10-17 15:02:23,2023-10-17 15:06:40,Liberty St & Broadway,5105.01,West Thames St,5114.06,40.708164,-74.010369,40.708347,-74.017134,member


In [7]:
df_citibike.dtypes

rideable_type              category
started_at            datetime64[s]
ended_at              datetime64[s]
start_station_name           object
start_station_id             object
end_station_name             object
end_station_id               object
start_lat                   float64
start_lng                   float64
end_lat                     float64
end_lng                     float64
member_casual              category
dtype: object

# Feature engineering

Adding some new features to the dataset:  
1. ride_duration: how much time passed between started_at and ended_at
2. ride_time_of_day - based on started_at value, we calculate in which time of day the ride started

In [8]:
df_citibike['ride_duration[s]'] = (df_citibike['ended_at'] - df_citibike['started_at'])/pd.Timedelta(1, 's')
df_citibike.head()

Unnamed: 0_level_0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,ride_duration[s]
ride_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
6A1558E5EB8E9B3B,classic_bike,2023-06-25 06:44:47,2023-06-25 07:02:43,Walton Ave & E 168 St,8179.03,Icahn Stadium,7514.01,40.836655,-73.918324,40.79346,-73.92389,member,1076.0
F5DAABF8EAD39B32,classic_bike,2023-08-15 18:05:37,2023-08-15 18:12:06,Grand Army Plaza & Central Park S,6839.1,2 Ave & E 72 St,6925.09,40.764004,-73.973974,40.768762,-73.958408,member,389.0
CC4D3C1C193EBCDD,classic_bike,2023-07-05 15:00:28,2023-07-05 15:06:34,Knickerbocker Ave & Thames St,5018.06,Melrose St & Broadway,4801.04,40.705446,-73.929975,40.697481,-73.935877,casual,366.0
541D7A53817AF238,classic_bike,2023-03-05 11:29:02,2023-03-05 11:36:36,DeKalb Ave & Vanderbilt Ave,4461.04,Pacific St & Classon Ave,4148.07,40.689425,-73.968898,40.679194,-73.95879,member,454.0
EACA2B802BFED979,classic_bike,2023-10-17 15:02:23,2023-10-17 15:06:40,Liberty St & Broadway,5105.01,West Thames St,5114.06,40.708164,-74.010369,40.708347,-74.017134,member,257.0


In [9]:
def calculate_time_of_day(started_at):
    match started_at.hour:
        case started_at.hour if started_at.hour < 6:
            return 'night'
        case started_at.hour if started_at.hour >= 6 and started_at.hour < 12:
            return 'morning'
        case started_at.hour if started_at.hour >= 12 and started_at.hour < 18:
            return 'afternoon'
        case started_at.hour if started_at.hour >= 18:
            return 'evening'

df_citibike['ride_time_of_day'] = df_citibike['started_at'].apply(calculate_time_of_day)

In [10]:
df_citibike

Unnamed: 0_level_0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,ride_duration[s],ride_time_of_day
ride_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
6A1558E5EB8E9B3B,classic_bike,2023-06-25 06:44:47,2023-06-25 07:02:43,Walton Ave & E 168 St,8179.03,Icahn Stadium,7514.01,40.836655,-73.918324,40.793460,-73.923890,member,1076.0,morning
F5DAABF8EAD39B32,classic_bike,2023-08-15 18:05:37,2023-08-15 18:12:06,Grand Army Plaza & Central Park S,6839.10,2 Ave & E 72 St,6925.09,40.764004,-73.973974,40.768762,-73.958408,member,389.0,evening
CC4D3C1C193EBCDD,classic_bike,2023-07-05 15:00:28,2023-07-05 15:06:34,Knickerbocker Ave & Thames St,5018.06,Melrose St & Broadway,4801.04,40.705446,-73.929975,40.697481,-73.935877,casual,366.0,afternoon
541D7A53817AF238,classic_bike,2023-03-05 11:29:02,2023-03-05 11:36:36,DeKalb Ave & Vanderbilt Ave,4461.04,Pacific St & Classon Ave,4148.07,40.689425,-73.968898,40.679194,-73.958790,member,454.0,morning
EACA2B802BFED979,classic_bike,2023-10-17 15:02:23,2023-10-17 15:06:40,Liberty St & Broadway,5105.01,West Thames St,5114.06,40.708164,-74.010369,40.708347,-74.017134,member,257.0,afternoon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
015D961E96545D85,electric_bike,2023-03-31 17:57:48,2023-03-31 18:02:55,Graham Ave & Conselyea St,5291.05,Nassau Ave & Russell St,5581.01,40.715143,-73.944507,40.725570,-73.944340,member,307.0,afternoon
B4957865ED6E94CF,classic_bike,2023-03-08 13:40:27,2023-03-08 14:00:20,E 48 St & 5 Ave,6626.01,11 Ave & W 27 St,6425.04,40.757270,-73.977792,40.751396,-74.005226,casual,1193.0,afternoon
2892D1DA3F9483A1,classic_bike,2023-02-04 15:04:44,2023-02-04 15:07:40,Murray St & Greenwich St,5288.12,Vesey Pl & River Terrace,5297.02,40.714852,-74.011223,40.715338,-74.016584,member,176.0,afternoon
5FE9361CFBB85827,classic_bike,2023-08-16 18:06:38,2023-08-16 18:22:09,Prospect Ave & E 151 St,7830.03,2 Ave & E 122 St,7622.12,40.814413,-73.903985,40.800672,-73.934900,member,931.0,evening


# Findiung missing values

We need to find missing values in order to better understand the data better, to fill the missing data and to decide which features shoould be taken into consideration in analysis process

In [11]:
# check if we have any columns with empty data
columns_with_empty_values = [x for x in df_citibike.columns if len(df_citibike[df_citibike[x].isna()]) > 0]
columns_with_empty_values

['start_station_name',
 'start_station_id',
 'end_station_name',
 'end_station_id',
 'end_lat',
 'end_lng']

In [12]:
number_of_empty_values_in_columns = [len(df_citibike[df_citibike[x].isna()]) for x in columns_with_empty_values]

In [13]:
# let's check the length of na values in each column
dict_columns_len_of_null = { key: value for key, value in zip(columns_with_empty_values, number_of_empty_values_in_columns)}
dict_columns_len_of_null

{'start_station_name': 541,
 'start_station_id': 541,
 'end_station_name': 2808,
 'end_station_id': 2808,
 'end_lat': 713,
 'end_lng': 713}

In [14]:
# let's see how records which have na value in start_station_name column look like
missing_start_stations = df_citibike[df_citibike['start_station_name'].isna()]
missing_start_stations

Unnamed: 0_level_0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,ride_duration[s],ride_time_of_day
ride_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
92B9560530AFAEBD,classic_bike,2023-09-20 07:18:17,2023-09-20 07:22:41,,,W 31 St & 7 Ave,6331.01,40.74,-73.98,40.749156,-73.991600,member,264.0,morning
95AA4A564C359F0A,classic_bike,2023-10-24 12:36:10,2023-10-24 13:39:05,,,,,40.79,-73.97,40.790000,-73.970000,member,3775.0,afternoon
7A36729774665509,classic_bike,2023-07-20 18:38:08,2023-07-20 19:40:00,,,,,40.74,-73.98,40.740000,-73.990000,member,3712.0,evening
A1E44B3AD4C4DEC7,classic_bike,2023-06-16 20:15:42,2023-06-16 21:31:02,,,,,40.79,-73.97,40.790000,-73.970000,casual,4520.0,evening
F7AF936BB45C1760,classic_bike,2023-06-10 19:27:07,2023-06-10 20:40:01,,,,,40.77,-73.98,40.770000,-73.980000,member,4374.0,evening
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0B8D703F6E21045C,classic_bike,2023-10-26 22:12:49,2023-10-26 23:15:24,,,,,40.86,-73.89,40.860000,-73.890000,member,3755.0,evening
1E9AB6D6DEF1AED4,classic_bike,2023-09-22 18:05:02,2023-09-22 18:09:59,,,W 30 St & 10 Ave,6459.07,40.76,-73.99,40.752694,-74.002353,member,297.0,evening
4576223B81C789E8,classic_bike,2023-10-15 11:48:46,2023-10-15 12:50:32,,,,,40.79,-73.97,40.790000,-73.970000,casual,3706.0,morning
84F1A217AC770C3F,classic_bike,2023-09-18 15:20:34,2023-09-18 15:41:31,,,Frederick Douglass Blvd & W 117 St,7688.12,40.80,-73.94,40.805159,-73.954692,member,1257.0,afternoon


In [15]:
# we don't have start station name, but it seems like we have start_lat and start_lng, so we can try to figure out the name of the station by taking lat/long properties
# and matching them to a record in our dataframe which have start_station_name not-na
df_start_latlng_merged = df_citibike.merge(missing_start_stations, on=['start_lat', 'start_lng'], how='inner')

# as we can see, we can identify station names/ids at least for some missing values only by using start_lat and start_lng
df_start_latlng_merged

Unnamed: 0,rideable_type_x,started_at_x,ended_at_x,start_station_name_x,start_station_id_x,end_station_name_x,end_station_id_x,start_lat,start_lng,end_lat_x,...,ended_at_y,start_station_name_y,start_station_id_y,end_station_name_y,end_station_id_y,end_lat_y,end_lng_y,member_casual_y,ride_duration[s]_y,ride_time_of_day_y
0,classic_bike,2023-04-16 20:56:34,2023-04-16 21:06:25,Driggs Ave & S 2 St,5235.1,McKibbin St & Manhattan Ave,4996.08,40.71,-73.96,40.705109,...,2023-11-05 10:03:51,,,Peck Slip & South St,5096.12,40.707519,-74.001081,casual,683.0,morning
1,classic_bike,2023-04-16 20:56:34,2023-04-16 21:06:25,Driggs Ave & S 2 St,5235.1,McKibbin St & Manhattan Ave,4996.08,40.71,-73.96,40.705109,...,2023-09-17 18:42:21,,,,,40.710000,-73.960000,member,7298.0,afternoon
2,classic_bike,2023-03-23 20:24:18,2023-03-23 20:33:10,Driggs Ave & S 2 St,5235.1,Leonard St & Nassau Ave,5550.09,40.71,-73.96,40.723957,...,2023-11-05 10:03:51,,,Peck Slip & South St,5096.12,40.707519,-74.001081,casual,683.0,morning
3,classic_bike,2023-03-23 20:24:18,2023-03-23 20:33:10,Driggs Ave & S 2 St,5235.1,Leonard St & Nassau Ave,5550.09,40.71,-73.96,40.723957,...,2023-09-17 18:42:21,,,,,40.710000,-73.960000,member,7298.0,afternoon
4,electric_bike,2023-05-06 06:51:18,2023-05-06 06:57:03,S 4 St & Roebling St,5195.06,N 12 St & Bedford Ave,5450.04,40.71,-73.96,40.720798,...,2023-11-05 10:03:51,,,Peck Slip & South St,5096.12,40.707519,-74.001081,casual,683.0,morning
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12526,electric_bike,2023-04-03 14:15:04,2023-04-03 14:21:25,N 5 St & Wythe Ave,5419.04,Nassau Ave & Newell St,5623.03,40.72,-73.96,40.724813,...,2023-10-23 16:48:57,,,,,40.690000,-73.920000,member,5873.0,afternoon
12527,classic_bike,2023-05-03 17:11:42,2023-05-03 17:32:49,N 5 St & Northside Piers,5458.02,E 68 St & 3 Ave,6896.16,40.72,-73.96,40.767128,...,2023-09-09 10:30:23,,,Broadway & Kent Ave,5134.09,40.710888,-73.968329,casual,282.0,morning
12528,classic_bike,2023-05-03 17:11:42,2023-05-03 17:32:49,N 5 St & Northside Piers,5458.02,E 68 St & 3 Ave,6896.16,40.72,-73.96,40.767128,...,2023-10-11 13:55:13,,,Greenpoint Ave & West St,5752.09,40.729803,-73.959099,member,2804.0,afternoon
12529,classic_bike,2023-05-03 17:11:42,2023-05-03 17:32:49,N 5 St & Northside Piers,5458.02,E 68 St & 3 Ave,6896.16,40.72,-73.96,40.767128,...,2023-10-21 16:26:15,,,N 11 St & Kent Ave,5489.04,40.722482,-73.959219,member,1328.0,afternoon


In [16]:
def fill_missing_start_station_data(ride_missing_station_data_datarow):
    """
    This function fills missing start_station_name and start_station_id in observations

    we are creating a mask which will find the records which have a station name not na
    we are applying this mask on our dataframe

    if a loc returns a dataframe with any data, update NaN in df)citibike dataframe
    """
    mask = (
                (df_citibike['start_lat'] == ride_missing_station_data_datarow['start_lat']) &
                (df_citibike['start_lng'] == ride_missing_station_data_datarow['start_lng']) &
                (df_citibike['start_station_name'].isna() == False)
            )
    
    station_data = df_citibike.loc[mask, ['start_station_id', 'start_station_name']]
    if len(station_data) > 0:
        df_citibike.loc[ride_missing_station_data_datarow.name, 'start_station_id'] = station_data['start_station_id'].iloc[0]
        df_citibike.loc[ride_missing_station_data_datarow.name, 'start_station_name'] = station_data['start_station_name'].iloc[0]
    

missing_start_stations.apply(fill_missing_start_station_data, axis=1)

ride_id
92B9560530AFAEBD    None
95AA4A564C359F0A    None
7A36729774665509    None
A1E44B3AD4C4DEC7    None
F7AF936BB45C1760    None
                    ... 
0B8D703F6E21045C    None
1E9AB6D6DEF1AED4    None
4576223B81C789E8    None
84F1A217AC770C3F    None
33E6A81627DA46DE    None
Length: 541, dtype: object

In [17]:
# As we can see, there are not too many stations we could fill, but at least we have some more
missing_start_stations_after_filling = df_citibike[df_citibike['start_station_name'].isna()]
print(len(missing_start_stations_after_filling), len(missing_start_stations))

458 541


In [18]:
# let's drop all the rows/observations which have na in start_station_name and start_station_id
df_citibike.dropna(subset=['start_station_name', 'start_station_id'], inplace=True)

In [19]:
# let's do the same with end_station_name and end_station_id

def fill_missing_end_station_data(ride_missing_station_data_datarow):
    """
    This function is very similar to fill_missing_start_station_data

    My goal here is not to show I now Do Not Repeat yourself rule, but my way of thinking
    on a path to solve a problem
    """
    mask = (
                (df_citibike['end_lat'] == ride_missing_station_data_datarow['end_lat']) &
                (df_citibike['end_lng'] == ride_missing_station_data_datarow['end_lng']) &
                (df_citibike['end_station_name'].isna() == False)
            )
    
    station_data = df_citibike.loc[mask, ['end_station_id', 'end_station_name']]
    if len(station_data) > 0:
        df_citibike.loc[ride_missing_station_data_datarow.name, 'end_station_id'] = station_data['end_station_id'].iloc[0]
        df_citibike.loc[ride_missing_station_data_datarow.name, 'end_station_name'] = station_data['end_station_name'].iloc[0]

missing_end_stations = df_citibike[df_citibike['end_station_name'].isna()]

missing_end_stations.apply(fill_missing_end_station_data, axis=1)

ride_id
F4D5CA208EBA3571    None
819A91CE7F968B1F    None
BFED0D44EBF62C4F    None
99F16EB31DE81385    None
57900D0568DF6B50    None
                    ... 
92A35DBFB748EF6A    None
380F54AF5863CDCA    None
DF6D331860AE7CF0    None
FBD795881855C26D    None
C58D35E2995793A0    None
Length: 2488, dtype: object

In [20]:
# As we can see, there are not too many stations we could fill, but at least we have some more
missing_end_stations_after_filling = df_citibike[df_citibike['end_station_name'].isna()]
print(len(missing_end_stations_after_filling), len(missing_end_stations))

2205 2488


In [21]:
df_citibike.dropna(subset=['end_station_name', 'end_station_id'], inplace=True)

In [65]:
df_citibike

Unnamed: 0_level_0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,ride_duration[s],ride_time_of_day
ride_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
6A1558E5EB8E9B3B,classic_bike,2023-06-25 06:44:47,2023-06-25 07:02:43,Walton Ave & E 168 St,8179.03,Icahn Stadium,7514.01,40.836655,-73.918324,40.793460,-73.923890,member,1076.0,morning
F5DAABF8EAD39B32,classic_bike,2023-08-15 18:05:37,2023-08-15 18:12:06,Grand Army Plaza & Central Park S,6839.10,2 Ave & E 72 St,6925.09,40.764004,-73.973974,40.768762,-73.958408,member,389.0,evening
CC4D3C1C193EBCDD,classic_bike,2023-07-05 15:00:28,2023-07-05 15:06:34,Knickerbocker Ave & Thames St,5018.06,Melrose St & Broadway,4801.04,40.705446,-73.929975,40.697481,-73.935877,casual,366.0,afternoon
541D7A53817AF238,classic_bike,2023-03-05 11:29:02,2023-03-05 11:36:36,DeKalb Ave & Vanderbilt Ave,4461.04,Pacific St & Classon Ave,4148.07,40.689425,-73.968898,40.679194,-73.958790,member,454.0,morning
EACA2B802BFED979,classic_bike,2023-10-17 15:02:23,2023-10-17 15:06:40,Liberty St & Broadway,5105.01,West Thames St,5114.06,40.708164,-74.010369,40.708347,-74.017134,member,257.0,afternoon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
015D961E96545D85,electric_bike,2023-03-31 17:57:48,2023-03-31 18:02:55,Graham Ave & Conselyea St,5291.05,Nassau Ave & Russell St,5581.01,40.715143,-73.944507,40.725570,-73.944340,member,307.0,afternoon
B4957865ED6E94CF,classic_bike,2023-03-08 13:40:27,2023-03-08 14:00:20,E 48 St & 5 Ave,6626.01,11 Ave & W 27 St,6425.04,40.757270,-73.977792,40.751396,-74.005226,casual,1193.0,afternoon
2892D1DA3F9483A1,classic_bike,2023-02-04 15:04:44,2023-02-04 15:07:40,Murray St & Greenwich St,5288.12,Vesey Pl & River Terrace,5297.02,40.714852,-74.011223,40.715338,-74.016584,member,176.0,afternoon
5FE9361CFBB85827,classic_bike,2023-08-16 18:06:38,2023-08-16 18:22:09,Prospect Ave & E 151 St,7830.03,2 Ave & E 122 St,7622.12,40.814413,-73.903985,40.800672,-73.934900,member,931.0,evening


# A closer look into the data

In this section we will take a closer look into the data

We will find:
1. Most common start station
2. Most common end station
3. Where people from most common start station end their ride
4. Where people from most common end station start their ride
5. Additional metrics for ride_duration (the property we created) for a better insight
6. Some descriptions dfor categorical properties : member_casual, ride_time_of_day and rideable_type

## Ad.1: Most common start station, end_station, rideable_type, time of day and member type (casual or member)

In [84]:
start_station_groups = df_citibike.groupby(['start_station_name']).size().sort_values(ascending=False)
end_station_groups = df_citibike.groupby(['end_station_name']).size().sort_values(ascending=False)
rideable_type_groups = df_citibike.groupby(['rideable_type'], observed=False).size().sort_values(ascending=False)
time_of_day_groups = df_citibike.groupby(['ride_time_of_day']).size().sort_values(ascending=False)
member_casual_groups = df_citibike.groupby(['member_casual'], observed=False).size().sort_values(ascending=False)

top_frequencies = pd.DataFrame({
    'start_station_name': [start_station_groups.index[0], start_station_groups.iloc[0]],
    'end_station_name': [end_station_groups.index[0], end_station_groups.iloc[0]],
    'rideable_type': [rideable_type_groups.index[0], rideable_type_groups.iloc[0]],
    'time_of_day': [time_of_day_groups.index[0], time_of_day_groups.iloc[0]],
    'member_casual': [member_casual_groups.index[0], member_casual_groups.iloc[0]]
}, index=['Most frequent value', 'Number of frequencies'])

top_frequencies


Unnamed: 0,start_station_name,end_station_name,rideable_type,time_of_day,member_casual
Most frequent value,W 21 St & 6 Ave,W 21 St & 6 Ave,classic_bike,afternoon,member
Number of frequencies,4037,3997,894682,410960,812858
