# City Bike Data YOY July trends

In [1]:
# Import the various dependencies and setup
import pandas as pd
import numpy as np
import datetime as dt

### Import CSV data into DataFrame

In [2]:
# Map the path where the input csv file is located
YOY201307 = "201307.csv"
YOY201407 = "201407.csv"
YOY201507 = "201507.csv"
YOY201607 = "201607.csv"
YOY201707 = "201707.csv"
YOY201807 = "201807.csv"
YOY201907 = "201907.csv"
YOY202007 = "202007.csv"

In [3]:
# Specify the delimiter to read and store the csv file into a Pandas DataFrame
YOY201307_df = pd.read_csv(YOY201307, encoding = "UTF-8")
YOY201407_df = pd.read_csv(YOY201407, encoding = "UTF-8")
YOY201507_df = pd.read_csv(YOY201507, encoding = "UTF-8")
YOY201607_df = pd.read_csv(YOY201607, encoding = "UTF-8")
YOY201707_df = pd.read_csv(YOY201707, encoding = "UTF-8")
YOY201807_df = pd.read_csv(YOY201807, encoding = "UTF-8")
YOY201907_df = pd.read_csv(YOY201907, encoding = "UTF-8")
YOY202007_df = pd.read_csv(YOY202007, encoding = "UTF-8")

In [4]:
# Combine the multiple DataFrames into a single DataFrame
YOY_citi_bike_df = YOY201307_df.append([YOY201407_df, YOY201507_df, YOY201307_df, YOY201607_df, YOY201707_df,
                                        YOY201807_df, YOY201907_df, YOY202007_df], ignore_index = True)
YOY_citi_bike_df.head(2)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,634,2013-07-01 00:00:00,2013-07-01 00:10:34,164.0,E 47 St & 2 Ave,40.753231,-73.970325,504.0,1 Ave & E 15 St,40.732219,-73.981656,16950,Customer,\N,0
1,1547,2013-07-01 00:00:02,2013-07-01 00:25:49,388.0,W 26 St & 10 Ave,40.749718,-74.00295,459.0,W 20 St & 11 Ave,40.746745,-74.007756,19816,Customer,\N,0


In [5]:
# Rename the DataFrame columns
YOY_citi_bike_df = YOY_citi_bike_df.rename(columns={"tripduration": "Trip Duration",
                                                    "starttime": "Start Time",
                                                    "stoptime": "Stop Time",
                                                    "start station id": "Start Station ID",
                                                    "start station name": "Start Station Name",
                                                    "start station latitude": "Start Station Latitude",
                                                    "start station longitude": "Start Station Longitude",
                                                    "end station id": "End Station ID",
                                                    "end station name": "End Station Name",
                                                    "end station latitude": "End Station Latitude",
                                                    "end station longitude": "End Station Longitude",
                                                    "bikeid": "Bike ID",
                                                    "usertype": "User Type",
                                                    "birth year": "Birth Year",
                                                    "gender": "Gender"
                                                   })
YOY_citi_bike_df.head(2)

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender
0,634,2013-07-01 00:00:00,2013-07-01 00:10:34,164.0,E 47 St & 2 Ave,40.753231,-73.970325,504.0,1 Ave & E 15 St,40.732219,-73.981656,16950,Customer,\N,0
1,1547,2013-07-01 00:00:02,2013-07-01 00:25:49,388.0,W 26 St & 10 Ave,40.749718,-74.00295,459.0,W 20 St & 11 Ave,40.746745,-74.007756,19816,Customer,\N,0


In [6]:
# Replace the numeric gender values with corresponding descriptions
YOY_citi_bike_df["Gender"] = YOY_citi_bike_df["Gender"].replace([0, 1, 2], ["Unknown", "Male", "Female"])
YOY_citi_bike_df.head(2)

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender
0,634,2013-07-01 00:00:00,2013-07-01 00:10:34,164.0,E 47 St & 2 Ave,40.753231,-73.970325,504.0,1 Ave & E 15 St,40.732219,-73.981656,16950,Customer,\N,Unknown
1,1547,2013-07-01 00:00:02,2013-07-01 00:25:49,388.0,W 26 St & 10 Ave,40.749718,-74.00295,459.0,W 20 St & 11 Ave,40.746745,-74.007756,19816,Customer,\N,Unknown


In [7]:
# Add a "Dataset" column to the DataFrame to be able to distinguish between YTD and YOY data
YOY_citi_bike_df["Dataset"] = "YOY"
YOY_citi_bike_df.head(2)

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender,Dataset
0,634,2013-07-01 00:00:00,2013-07-01 00:10:34,164.0,E 47 St & 2 Ave,40.753231,-73.970325,504.0,1 Ave & E 15 St,40.732219,-73.981656,16950,Customer,\N,Unknown,YOY
1,1547,2013-07-01 00:00:02,2013-07-01 00:25:49,388.0,W 26 St & 10 Ave,40.749718,-74.00295,459.0,W 20 St & 11 Ave,40.746745,-74.007756,19816,Customer,\N,Unknown,YOY


In [8]:
# Convert the date times to "%Y-%m-%d %H:%M:%S" format
YOY_citi_bike_df["Start Time"] = pd.to_datetime(YOY_citi_bike_df["Start Time"])
YOY_citi_bike_df["Stop Time"] = pd.to_datetime(YOY_citi_bike_df["Stop Time"])
YOY_citi_bike_df.head(2)

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender,Dataset
0,634,2013-07-01 00:00:00,2013-07-01 00:10:34,164.0,E 47 St & 2 Ave,40.753231,-73.970325,504.0,1 Ave & E 15 St,40.732219,-73.981656,16950,Customer,\N,Unknown,YOY
1,1547,2013-07-01 00:00:02,2013-07-01 00:25:49,388.0,W 26 St & 10 Ave,40.749718,-74.00295,459.0,W 20 St & 11 Ave,40.746745,-74.007756,19816,Customer,\N,Unknown,YOY


In [9]:
# Display the data types of the DataFrame
YOY_citi_bike_df.dtypes

Trip Duration                       int64
Start Time                 datetime64[ns]
Stop Time                  datetime64[ns]
Start Station ID                  float64
Start Station Name                 object
Start Station Latitude            float64
Start Station Longitude           float64
End Station ID                    float64
End Station Name                   object
End Station Latitude              float64
End Station Longitude             float64
Bike ID                             int64
User Type                          object
Birth Year                         object
Gender                             object
Dataset                            object
dtype: object

In [10]:
# Drop data with missing values and display the shape of the DataFrame
YOY_citi_bike_df.dropna(how = "any")
YOY_citi_bike_df.shape

(13057556, 16)

In [11]:
# Save the combined DataFrame to CSV
YOY_citi_bike_df.to_csv("Citi_Bike_Data_Analytics_YOY_July.csv", index = False, header = True)