# Import and Clean New York Citi Bike Data

- Monthly Data: July 2023 - December 2023

In [1]:
import numpy as np
import pandas as pd


In [2]:
cb_aug23_df = pd.read_csv('Resources/JC-202308-citibike-tripdata.csv')

cb_aug23_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,E2E964A161F786AB,classic_bike,2023-08-07 19:37:47,2023-08-07 19:41:14,6 St & Grand St,HB302,Madison St & 10 St,HB503,40.744398,-74.034501,40.749943,-74.035865,member
1,0660F2E48E3BB87F,classic_bike,2023-08-01 13:16:22,2023-08-01 13:26:02,6 St & Grand St,HB302,6 St & Grand St,HB302,40.744398,-74.034501,40.744398,-74.034501,member
2,940FC7C675232897,classic_bike,2023-08-15 17:28:23,2023-08-15 17:50:35,Heights Elevator,JC059,Heights Elevator,JC059,40.748721,-74.04048,40.748716,-74.040443,member
3,E967660CC5CD585B,classic_bike,2023-08-01 12:44:24,2023-08-01 12:49:45,Hoboken Ave at Monmouth St,JC105,Hoboken Terminal - River St & Hudson Pl,HB102,40.735279,-74.04683,40.736068,-74.029127,member
4,D997CB0B855FE2D6,classic_bike,2023-08-08 12:31:16,2023-08-08 12:40:18,Hoboken Ave at Monmouth St,JC105,Hoboken Terminal - River St & Hudson Pl,HB102,40.735208,-74.046964,40.736068,-74.029127,member


## Inspect the Data

In [3]:
cb_aug23_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112027 entries, 0 to 112026
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ride_id             112027 non-null  object 
 1   rideable_type       112027 non-null  object 
 2   started_at          112027 non-null  object 
 3   ended_at            112027 non-null  object 
 4   start_station_name  112027 non-null  object 
 5   start_station_id    112027 non-null  object 
 6   end_station_name    111701 non-null  object 
 7   end_station_id      111701 non-null  object 
 8   start_lat           112027 non-null  float64
 9   start_lng           112027 non-null  float64
 10  end_lat             111907 non-null  float64
 11  end_lng             111907 non-null  float64
 12  member_casual       112027 non-null  object 
dtypes: float64(4), object(9)
memory usage: 11.1+ MB


In [5]:
# Two types of rides -> Electric or Classic
cb_aug23_df['rideable_type'].value_counts()


rideable_type
classic_bike     101386
electric_bike     10641
Name: count, dtype: int64

In [6]:
cb_aug23_df['start_station_id'].value_counts()

start_station_id
JC115      5204
HB102      3968
HB103      3870
HB101      2956
JC066      2931
           ... 
6450.05       1
6847.02       1
6847.05       1
5207.01       1
6718.02       1
Name: count, Length: 112, dtype: int64

In [7]:
# More End Stations than Start Stations 112 vs 197
cb_aug23_df['end_station_id'].value_counts()

end_station_id
JC115      5798
HB102      3975
HB103      3867
HB101      2964
JC066      2951
           ... 
4713.01       1
5359.10       1
4510.04       1
5105.01       1
8123.02       1
Name: count, Length: 197, dtype: int64

In [8]:
cb_aug23_df['start_station_name'].value_counts()

start_station_name
Grove St PATH                                   5204
Hoboken Terminal - River St & Hudson Pl         3968
South Waterfront Walkway - Sinatra Dr & 1 St    3870
Hoboken Terminal - Hudson St & Hudson Pl        2956
Newport PATH                                    2931
                                                ... 
8 Ave & W 31 St                                    1
Broadway & W 56 St                                 1
7 Ave & W 55 St                                    1
Centre St & Chambers St                            1
77 St & 31 Ave                                     1
Name: count, Length: 112, dtype: int64

In [9]:
cb_aug23_df['end_station_name'].value_counts()

end_station_name
Grove St PATH                                   5798
Hoboken Terminal - River St & Hudson Pl         3975
South Waterfront Walkway - Sinatra Dr & 1 St    3867
Hoboken Terminal - Hudson St & Hudson Pl        2964
Newport PATH                                    2951
                                                ... 
Central Ave & Himrod St                            1
Hudson St & Reade St                               1
Greene Ave & Throop Ave                            1
Liberty St & Broadway                              1
Broadway & W 155 St                                1
Name: count, Length: 197, dtype: int64

In [10]:
# Make a test sample of 500 rows to see how the data will work in tableau
aug_test_sample = cb_aug23_df.sample(500)

In [11]:
aug_test_sample.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
34545,C25749C7089F0372,classic_bike,2023-08-02 11:53:36,2023-08-02 12:06:07,Bloomfield St & 15 St,HB203,South Waterfront Walkway - Sinatra Dr & 1 St,HB103,40.75453,-74.02658,40.736982,-74.027781,casual
52969,E4C0D0928346D65E,classic_bike,2023-08-29 06:32:51,2023-08-29 06:38:28,Pershing Field,JC024,Hoboken Terminal - Hudson St & Hudson Pl,HB101,40.742448,-74.051975,40.735938,-74.030305,member
105407,EC858B601C736F5F,classic_bike,2023-08-05 11:39:11,2023-08-05 11:41:16,Hoboken Terminal - Hudson St & Hudson Pl,HB101,Hoboken Terminal - River St & Hudson Pl,HB102,40.735938,-74.030305,40.736068,-74.029127,member
8794,C046189017C7224B,classic_bike,2023-08-24 12:40:54,2023-08-24 13:00:24,Communipaw & Berry Lane,JC084,Jackson Square,JC063,40.714358,-74.066611,40.71113,-74.0789,casual
77453,55BE1CECD80C382F,classic_bike,2023-08-04 17:56:13,2023-08-04 17:59:48,Newport PATH,JC066,Warren St,JC006,40.727009,-74.033955,40.721124,-74.038051,member


In [20]:
# Export Test File to look at in Tableau
# aug_test_sample.to_csv('Resources/cb_test_data.csv', index=False)

In [19]:
# Sampled in Tableau, result looks good

## Pull in the remaining Months July, Sept-Dec

In [15]:
# Read Citi Bike files into dataframes

cb_jul23_df = pd.read_csv('Resources/JC-202307-citibike-tripdata.csv')
cb_sep23_df = pd.read_csv('Resources/JC-202309-citibike-tripdata.csv')
cb_oct23_df = pd.read_csv('Resources/JC-202310-citibike-tripdata.csv')
cb_nov23_df = pd.read_csv('Resources/JC-202311-citibike-tripdata.csv')
cb_dec23_df = pd.read_csv('Resources/JC-202312-citibike-tripdata.csv')


In [16]:
# list of dataframes
dframes = [cb_jul23_df, cb_aug23_df, cb_sep23_df, cb_oct23_df, cb_nov23_df, cb_dec23_df]

In [17]:
cb_data_df = pd.concat(dframes)

cb_data_df.sample(50)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
50609,4A7193653604C2D3,classic_bike,2023-08-14 09:53:36,2023-08-14 10:01:18,Morris Canal,JC072,Hamilton Park,JC009,40.712286,-74.038195,40.727596,-74.044247,member
55744,4E673CECF659CFFF,classic_bike,2023-10-09 06:27:27,2023-10-09 06:30:18,Bergen Ave,JC095,Bergen Ave & Sip Ave,JC109,40.722108,-74.071405,40.731009,-74.064437,member
49390,2160002169DDFF09,classic_bike,2023-12-19 19:02:59,2023-12-19 19:07:22,Grove St PATH,JC115,Brunswick St,JC023,40.71941,-74.04309,40.724176,-74.050656,member
43054,AEF0F7492EE410B4,classic_bike,2023-08-03 08:23:15,2023-08-03 08:29:09,Hilltop,JC019,Newark Ave,JC032,40.731169,-74.057574,40.721525,-74.046305,member
69253,F799637AB74BBECC,classic_bike,2023-10-05 18:39:56,2023-10-05 18:47:50,South Waterfront Walkway - Sinatra Dr & 1 St,HB103,Southwest Park - Jackson St & Observer Hwy,HB401,40.736982,-74.027781,40.737551,-74.041664,member
37081,54FD4B064C1D681C,classic_bike,2023-12-05 18:17:01,2023-12-05 18:23:17,7 St & Monroe St,HB304,12 St & Sinatra Dr N,HB201,40.746515,-74.038094,40.750604,-74.02402,member
49431,B108576C59FB8A8D,electric_bike,2023-11-02 16:57:07,2023-11-02 17:15:35,Hoboken Ave at Monmouth St,JC105,12 St & Sinatra Dr N,HB201,40.735208,-74.046964,40.750604,-74.02402,casual
43908,4F9CECD9D441198F,classic_bike,2023-10-12 17:41:50,2023-10-12 17:44:16,South Waterfront Walkway - Sinatra Dr & 1 St,HB103,2 St & Park Ave,HB608,40.73707,-74.027824,40.739153,-74.033082,member
52458,9777863EDE30DBD6,classic_bike,2023-09-09 12:35:04,2023-09-09 12:39:03,Columbus Drive,JC014,Marin Light Rail,JC013,40.718702,-74.038701,40.714584,-74.042817,member
75929,7FD15AB8301986BC,classic_bike,2023-07-25 17:46:11,2023-07-25 17:59:55,Leonard Gordon Park,JC080,Monmouth and 6th,JC075,40.74591,-74.057271,40.725685,-74.04879,casual


In [18]:
cb_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 544778 entries, 0 to 58679
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ride_id             544778 non-null  object 
 1   rideable_type       544778 non-null  object 
 2   started_at          544778 non-null  object 
 3   ended_at            544778 non-null  object 
 4   start_station_name  544751 non-null  object 
 5   start_station_id    544751 non-null  object 
 6   end_station_name    542851 non-null  object 
 7   end_station_id      542851 non-null  object 
 8   start_lat           544778 non-null  float64
 9   start_lng           544778 non-null  float64
 10  end_lat             544274 non-null  float64
 11  end_lng             544274 non-null  float64
 12  member_casual       544778 non-null  object 
dtypes: float64(4), object(9)
memory usage: 58.2+ MB


## Export the DataFrame as File

In [22]:
cb_data_df.to_csv('Resources/cb_data.csv', index=False)