In [1]:
# Set up and dependencies
import pandas as pd

In [2]:
# Load Citibike dataset CSV files
bike_sep = pd.read_csv('202309-citibike-tripdata.csv', low_memory = False)
bike_dec = pd.read_csv('202312-citibike-tripdata.csv', low_memory = False)

In [3]:
bike_data = pd.concat([bike_sep, bike_dec])

In [4]:
bike_data.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,B0A0F1DEFA4B72FC,electric_bike,2023-09-03 10:20:41,2023-09-03 10:24:16,E 1 St & Bowery,5636.13,E 10 St & 2 Ave,5746.02,40.724861,-73.992131,40.729708,-73.986598,member
1,2B26AB15647BF4EE,classic_bike,2023-09-27 15:44:23,2023-09-27 15:53:25,Pearl St & Hanover Square,4993.02,Allen St & Rivington St,5414.06,40.70465,-74.009133,40.720196,-73.989978,member
2,9D2B5971CA4E513F,classic_bike,2023-09-19 13:40:48,2023-09-19 13:48:11,E 1 St & Bowery,5636.13,E 10 St & 2 Ave,5746.02,40.724753,-73.992116,40.729708,-73.986598,member
3,17E6760596DC3ABE,classic_bike,2023-09-30 16:27:50,2023-09-30 16:56:35,Central Ave & Himrod St,4713.01,Mott St & Prince St,5561.04,40.696706,-73.922935,40.72318,-73.9948,member
4,97EFF376A7E2DC70,classic_bike,2023-09-21 16:59:53,2023-09-21 17:07:36,St Marks Pl & 2 Ave,5669.1,Mott St & Prince St,5561.04,40.728419,-73.98714,40.72318,-73.9948,member


In [5]:
bike_data.count()

ride_id               5848827
rideable_type         5848827
started_at            5848827
ended_at              5848827
start_station_name    5844673
start_station_id      5844673
end_station_name      5828760
end_station_id        5828760
start_lat             5848827
start_lng             5848827
end_lat               5844110
end_lng               5844110
member_casual         5848827
dtype: int64

In [6]:
bike_data = bike_data.dropna(how="any")

In [7]:
bike_data.count()

ride_id               5826739
rideable_type         5826739
started_at            5826739
ended_at              5826739
start_station_name    5826739
start_station_id      5826739
end_station_name      5826739
end_station_id        5826739
start_lat             5826739
start_lng             5826739
end_lat               5826739
end_lng               5826739
member_casual         5826739
dtype: int64

In [8]:
bike_data.dtypes

ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

In [9]:
# Convert 'started_at' and 'ended_at' columns to datetime format
bike_data["started_at"] = pd.to_datetime(bike_data["started_at"])
bike_data["ended_at"] = pd.to_datetime(bike_data["ended_at"])

In [10]:
# Calculate trip duration in minutes and create a new column
bike_data["trip_duration"] = (bike_data["ended_at"] - bike_data["started_at"]).dt.total_seconds() / 60

In [11]:
bike_data.dtypes

ride_id                       object
rideable_type                 object
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name            object
start_station_id              object
end_station_name              object
end_station_id                object
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual                 object
trip_duration                float64
dtype: object

In [12]:
# Filter short rides based on trip_duration less than 2 minutes
short_ride = bike_data[bike_data["trip_duration"] < 2]

# Drop short rides from the DataFrame
bike_data.drop(short_ride.index, inplace=True)

In [13]:
bike_data.shape

(5196626, 14)

In [16]:
# Rename columns
bike_data = bike_data.rename(columns={"ride_id": "Ride ID", "rideable_type": "Bike Type", "started_at": "Start Time",
                                      "ended_at": "End Time", "trip_duration": "Trip Duration (minutes)",
                                      "start_station_name": "Start station Name", "start_station_id": "Start Station ID",
                                      "end_station_name": "End Station Name", "end_station_id": "End Station ID",
                                      "start_lat": "Start Latitude", "start_lng": "Start Longitude",
                                      "end_lat": "End Latitude", "end_lng": "End Longitude", "member_casual": "Rider Type"})

In [17]:
bike_data.head(2)

Unnamed: 0,Ride ID,Bike Type,Start Time,End Time,Start station Name,Start Station ID,End Station Name,End Station ID,Start Latitude,Start Longitude,End Latitude,End Longitude,Rider Type,Trip Duration (minutes)
0,B0A0F1DEFA4B72FC,electric_bike,2023-09-03 10:20:41,2023-09-03 10:24:16,E 1 St & Bowery,5636.13,E 10 St & 2 Ave,5746.02,40.724861,-73.992131,40.729708,-73.986598,member,3.583333
1,2B26AB15647BF4EE,classic_bike,2023-09-27 15:44:23,2023-09-27 15:53:25,Pearl St & Hanover Square,4993.02,Allen St & Rivington St,5414.06,40.70465,-74.009133,40.720196,-73.989978,member,9.033333


In [18]:
# Round trip_duration column to the nearest whole numbers
bike_data["Trip Duration (minutes)"] = bike_data["Trip Duration (minutes)"].round().astype(int)

In [19]:
bike_data.head(2)

Unnamed: 0,Ride ID,Bike Type,Start Time,End Time,Start station Name,Start Station ID,End Station Name,End Station ID,Start Latitude,Start Longitude,End Latitude,End Longitude,Rider Type,Trip Duration (minutes)
0,B0A0F1DEFA4B72FC,electric_bike,2023-09-03 10:20:41,2023-09-03 10:24:16,E 1 St & Bowery,5636.13,E 10 St & 2 Ave,5746.02,40.724861,-73.992131,40.729708,-73.986598,member,4
1,2B26AB15647BF4EE,classic_bike,2023-09-27 15:44:23,2023-09-27 15:53:25,Pearl St & Hanover Square,4993.02,Allen St & Rivington St,5414.06,40.70465,-74.009133,40.720196,-73.989978,member,9


In [20]:
# Save the DataFrame to a new CSV file
bike_data.to_csv('Citibike-Sep-Dec-2023.csv', index=False)