<img src="images/citibike-logo.png">

## Citi Bike Program Data
### Preparing Data for Analysis in Tableau
- Main Data Source: https://www.citibikenyc.com/system-data
- 2019 Trip History Data Source: https://s3.amazonaws.com/tripdata/index.html
- 2020 Daily Ridership and Membership Data Source:
    - Q1: https://datawrapper.dwcdn.net/cZMp8/5/
    - Q2: https://datawrapper.dwcdn.net/MM5kM/1/
- Final Visualizations: https://public.tableau.com/views/CitiTripData2019/Story?:language=en&:display_count=y&publish=yes&:origin=viz_share_link


In [1]:
# DEPENDENCIES
import pandas as pd
import numpy as np
import datetime

### 2019 Trip History Data

In [2]:
# SOURCE FILES: one Citi Bike trip-history CSV per month, January through June 2019
trip_data_19_1 = "Data/citibike-trip-data_2019_01.csv"
trip_data_19_2 = "Data/citibike-trip-data_2019_02.csv"
trip_data_19_3 = "Data/citibike-trip-data_2019_03.csv"
trip_data_19_4 = "Data/citibike-trip-data_2019_04.csv"
trip_data_19_5 = "Data/citibike-trip-data_2019_05.csv"
trip_data_19_6 = "Data/citibike-trip-data_2019_06.csv"


# LOAD EACH MONTHLY CSV INTO ITS OWN DATAFRAME (same order as the paths above)
(
    trip_data_19_1_df,
    trip_data_19_2_df,
    trip_data_19_3_df,
    trip_data_19_4_df,
    trip_data_19_5_df,
    trip_data_19_6_df,
) = map(
    pd.read_csv,
    (
        trip_data_19_1,
        trip_data_19_2,
        trip_data_19_3,
        trip_data_19_4,
        trip_data_19_5,
        trip_data_19_6,
    ),
)


# Preview the first two rows of the January file
trip_data_19_1_df.head(2)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,201,2019-01-01 03:09:09.7110,2019-01-01 03:12:30.8790,3183,Exchange Place,40.716247,-74.033459,3214,Essex Light Rail,40.712774,-74.036486,29612,Subscriber,1993,1
1,505,2019-01-01 05:18:00.1060,2019-01-01 05:26:25.9050,3183,Exchange Place,40.716247,-74.033459,3638,Washington St,40.724294,-74.035483,29213,Subscriber,1972,2


In [3]:
# STACK THE SIX MONTHLY 2019 FRAMES ROW-WISE INTO A SINGLE FRAME
# Row labels are not renumbered here, so each month's labels are carried over as-is.
monthly_frames_19 = [
    trip_data_19_1_df,
    trip_data_19_2_df,
    trip_data_19_3_df,
    trip_data_19_4_df,
    trip_data_19_5_df,
    trip_data_19_6_df,
]
combined_trip_data_19_df = pd.concat(monthly_frames_19, axis="index")
combined_trip_data_19_df.head(2)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,201,2019-01-01 03:09:09.7110,2019-01-01 03:12:30.8790,3183,Exchange Place,40.716247,-74.033459,3214,Essex Light Rail,40.712774,-74.036486,29612,Subscriber,1993,1
1,505,2019-01-01 05:18:00.1060,2019-01-01 05:26:25.9050,3183,Exchange Place,40.716247,-74.033459,3638,Washington St,40.724294,-74.035483,29213,Subscriber,1972,2


In [4]:
# Row count of the stacked 2019 frame, printed with thousands separators
print(f" Number of rows in combined DF: {combined_trip_data_19_df['tripduration'].size:,d}")

 Number of rows in combined DF: 170,468


In [5]:
# RESET INDEX AND DROP UNNECESSARY COLUMNS
#
# After the vertical concat, each monthly file's own row labels are still in place.
# reset_index(drop=True) renumbers the rows 0..n-1 and discards the old labels in a
# single step, so no temporary "index" column is created (and then dropped) and the
# frame is not mutated in place.
combined_trip_data_19_df = combined_trip_data_19_df.reset_index(drop=True)
combined_trip_data_19_df.head(2)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,201,2019-01-01 03:09:09.7110,2019-01-01 03:12:30.8790,3183,Exchange Place,40.716247,-74.033459,3214,Essex Light Rail,40.712774,-74.036486,29612,Subscriber,1993,1
1,505,2019-01-01 05:18:00.1060,2019-01-01 05:26:25.9050,3183,Exchange Place,40.716247,-74.033459,3638,Washington St,40.724294,-74.035483,29213,Subscriber,1972,2


In [6]:
# Re-check the row count after the index reset
print(f" Number of rows in combined DF: {combined_trip_data_19_df['tripduration'].size:,d}")

 Number of rows in combined DF: 170,468


In [7]:
# Optional check (left disabled): show the dtype of every column in the combined 2019 frame.
# Uncomment the lines below to run it.
# print(f"DATA TYPES COMBINED TRIP DF" "\n"
#      "---------------------------")
# combined_trip_data_19_df.dtypes

In [8]:
# RENAMING COLUMNS
# List the current column names as a reference for the rename in the next cell.
combined_trip_data_19_df.columns.tolist()

['tripduration',
 'starttime',
 'stoptime',
 'start station id',
 'start station name',
 'start station latitude',
 'start station longitude',
 'end station id',
 'end station name',
 'end station latitude',
 'end station longitude',
 'bikeid',
 'usertype',
 'birth year',
 'gender']

In [9]:
# Map each raw column header to its title-cased display label
column_labels_19 = {
    'tripduration': 'Trip Duration',
    'starttime': 'Start Time',
    'stoptime': 'Stop Time',
    'start station id': 'Start Station ID',
    'start station name': 'Start Station Name',
    'start station latitude': 'Start Station Lat',
    'start station longitude': 'Start Station Long',
    'end station id': 'End Station ID',
    'end station name': 'End Station Name',
    'end station latitude': 'End Station Lat',
    'end station longitude': 'End Station Long',
    'bikeid': 'Bike ID',
    'usertype': 'User Type',
    'birth year': 'Birth Year',
    'gender': 'Gender',
}

# Rebind the name to the renamed frame
combined_trip_data_19_df = combined_trip_data_19_df.rename(columns=column_labels_19)


combined_trip_data_19_df.head(2)

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Lat,Start Station Long,End Station ID,End Station Name,End Station Lat,End Station Long,Bike ID,User Type,Birth Year,Gender
0,201,2019-01-01 03:09:09.7110,2019-01-01 03:12:30.8790,3183,Exchange Place,40.716247,-74.033459,3214,Essex Light Rail,40.712774,-74.036486,29612,Subscriber,1993,1
1,505,2019-01-01 05:18:00.1060,2019-01-01 05:26:25.9050,3183,Exchange Place,40.716247,-74.033459,3638,Washington St,40.724294,-74.035483,29213,Subscriber,1972,2


In [10]:
# EXPORTING CLEAN DATA
# index=False keeps the row index out of the file. By default pandas writes it as a
# header-less first column, which shows up as "Unnamed: 0" when the CSV is read back.
combined_trip_data_19_df.to_csv("Data/combined_trip_data_2019.csv", index=False)

### 2020 Trip History Data

In [19]:
# SOURCE FILES: one Citi Bike trip-history CSV per month, January through June 2020
trip_data_20_1 = "Data/citibike-trip-data_2020_01.csv"
trip_data_20_2 = "Data/citibike-trip-data_2020_02.csv"
trip_data_20_3 = "Data/citibike-trip-data_2020_03.csv"
trip_data_20_4 = "Data/citibike-trip-data_2020_04.csv"
trip_data_20_5 = "Data/citibike-trip-data_2020_05.csv"
trip_data_20_6 = "Data/citibike-trip-data_2020_06.csv"


# LOAD EACH MONTHLY CSV INTO ITS OWN DATAFRAME (same order as the paths above)
(
    trip_data_20_1_df,
    trip_data_20_2_df,
    trip_data_20_3_df,
    trip_data_20_4_df,
    trip_data_20_5_df,
    trip_data_20_6_df,
) = map(
    pd.read_csv,
    (
        trip_data_20_1,
        trip_data_20_2,
        trip_data_20_3,
        trip_data_20_4,
        trip_data_20_5,
        trip_data_20_6,
    ),
)


# Preview the first two rows of the January file
trip_data_20_1_df.head(2)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,226,2020-01-01 00:04:50.1920,2020-01-01 00:08:37.0370,3186,Grove St PATH,40.719586,-74.043117,3211,Newark Ave,40.721525,-74.046305,29444,Subscriber,1984,2
1,377,2020-01-01 00:16:01.6700,2020-01-01 00:22:19.0800,3186,Grove St PATH,40.719586,-74.043117,3269,Brunswick & 6th,40.726012,-74.050389,26305,Subscriber,1989,2


In [20]:
# STACK THE SIX MONTHLY 2020 FRAMES ROW-WISE INTO A SINGLE FRAME
# Row labels are not renumbered here, so each month's labels are carried over as-is.
monthly_frames_20 = [
    trip_data_20_1_df,
    trip_data_20_2_df,
    trip_data_20_3_df,
    trip_data_20_4_df,
    trip_data_20_5_df,
    trip_data_20_6_df,
]
combined_trip_data_20_df = pd.concat(monthly_frames_20, axis="index")

combined_trip_data_20_df.head(2)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,226,2020-01-01 00:04:50.1920,2020-01-01 00:08:37.0370,3186,Grove St PATH,40.719586,-74.043117,3211,Newark Ave,40.721525,-74.046305,29444,Subscriber,1984,2
1,377,2020-01-01 00:16:01.6700,2020-01-01 00:22:19.0800,3186,Grove St PATH,40.719586,-74.043117,3269,Brunswick & 6th,40.726012,-74.050389,26305,Subscriber,1989,2


In [21]:
# Row count of the stacked 2020 frame, printed with thousands separators
print(f" Number of rows in combined DF: {combined_trip_data_20_df['tripduration'].size:,d}")

 Number of rows in combined DF: 137,967


In [22]:
# RESET INDEX AND DROP UNNECESSARY COLUMNS
#
# After the vertical concat, each monthly file's own row labels are still in place.
# reset_index(drop=True) renumbers the rows 0..n-1 and discards the old labels in a
# single step, so no temporary "index" column is created (and then dropped) and the
# frame is not mutated in place.
combined_trip_data_20_df = combined_trip_data_20_df.reset_index(drop=True)
combined_trip_data_20_df.head(2)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,226,2020-01-01 00:04:50.1920,2020-01-01 00:08:37.0370,3186,Grove St PATH,40.719586,-74.043117,3211,Newark Ave,40.721525,-74.046305,29444,Subscriber,1984,2
1,377,2020-01-01 00:16:01.6700,2020-01-01 00:22:19.0800,3186,Grove St PATH,40.719586,-74.043117,3269,Brunswick & 6th,40.726012,-74.050389,26305,Subscriber,1989,2


In [23]:
# Re-check the row count after the index reset
print(f" Number of rows in combined DF: {combined_trip_data_20_df['tripduration'].size:,d}")

 Number of rows in combined DF: 137,967


In [24]:
# RENAMING COLUMNS
# List the current column names as a reference for the rename in the next cell.
combined_trip_data_20_df.columns.tolist()

['tripduration',
 'starttime',
 'stoptime',
 'start station id',
 'start station name',
 'start station latitude',
 'start station longitude',
 'end station id',
 'end station name',
 'end station latitude',
 'end station longitude',
 'bikeid',
 'usertype',
 'birth year',
 'gender']

In [25]:
# Map each raw column header to its title-cased display label
column_labels_20 = {
    'tripduration': 'Trip Duration',
    'starttime': 'Start Time',
    'stoptime': 'Stop Time',
    'start station id': 'Start Station ID',
    'start station name': 'Start Station Name',
    'start station latitude': 'Start Station Lat',
    'start station longitude': 'Start Station Long',
    'end station id': 'End Station ID',
    'end station name': 'End Station Name',
    'end station latitude': 'End Station Lat',
    'end station longitude': 'End Station Long',
    'bikeid': 'Bike ID',
    'usertype': 'User Type',
    'birth year': 'Birth Year',
    'gender': 'Gender',
}

# Rebind the name to the renamed frame
combined_trip_data_20_df = combined_trip_data_20_df.rename(columns=column_labels_20)


combined_trip_data_20_df.head(2)

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Lat,Start Station Long,End Station ID,End Station Name,End Station Lat,End Station Long,Bike ID,User Type,Birth Year,Gender
0,226,2020-01-01 00:04:50.1920,2020-01-01 00:08:37.0370,3186,Grove St PATH,40.719586,-74.043117,3211,Newark Ave,40.721525,-74.046305,29444,Subscriber,1984,2
1,377,2020-01-01 00:16:01.6700,2020-01-01 00:22:19.0800,3186,Grove St PATH,40.719586,-74.043117,3269,Brunswick & 6th,40.726012,-74.050389,26305,Subscriber,1989,2


In [26]:
# EXPORTING CLEAN DATA
# index=False keeps the row index out of the file. By default pandas writes it as a
# header-less first column, which shows up as "Unnamed: 0" when the CSV is read back.
combined_trip_data_20_df.to_csv("Data/combined_trip_data_2020.csv", index=False)

### Additional Work
After some additional processing in Tableau, the two resulting tables (2019 and 2020) are loaded below and merged into a single dataset to continue the analysis.

In [2]:
# PATHS TO THE PER-YEAR CSVs
# NOTE(review): these are the same paths the export cells earlier in this notebook
# write to, yet the previews below show columns (Year, Month, User Age, ...) that those
# cells never create. Presumably the files were replaced by Tableau-processed versions;
# a fresh "Run All" would likely overwrite them. TODO confirm, and consider separate
# file names.
trip_data_2019 = "Data/combined_trip_data_2019.csv"
trip_data_2020 = "Data/combined_trip_data_2020.csv"


# LOAD BOTH YEARS INTO DATAFRAMES (2019 first, then 2020)
trip_data_2019_df, trip_data_2020_df = (
    pd.read_csv(csv_path) for csv_path in (trip_data_2019, trip_data_2020)
)


# Preview the first two 2020 rows
trip_data_2020_df.head(2)

Unnamed: 0,Year,Trip Duration Sec,Trip Duration Min,Month,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Lat,Start Station Long,End Station ID,End Station Name,End Station Lat,End Station Long,Bike ID,User Type,User Age,Gender
0,2020,226,3.8,January,04:50.2,08:37.0,3186,Grove St PATH,40.719586,-74.043117,3211,Newark Ave,40.721525,-74.046305,29444,Subscriber,36,Female
1,2020,377,6.3,January,16:01.7,22:19.1,3186,Grove St PATH,40.719586,-74.043117,3269,Brunswick & 6th,40.726012,-74.050389,26305,Subscriber,31,Female


In [3]:
# Preview the first two 2019 rows to compare against the 2020 preview
trip_data_2019_df.head(n=2)

Unnamed: 0,Year,Trip Duration Sec,Trip Duration Min,Month,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Lat,Start Station Long,End Station ID,End Station Name,End Station Lat,End Station Long,Bike ID,User Type,User Age,Gender
0,2019,201,3.4,January,09:09.7,12:30.9,3183,Exchange Place,40.716247,-74.033459,3214,Essex Light Rail,40.712774,-74.036486,29612,Subscriber,26,Male
1,2019,505,8.4,January,18:00.1,26:25.9,3183,Exchange Place,40.716247,-74.033459,3638,Washington St,40.724294,-74.035483,29213,Subscriber,47,Female


In [5]:
# MERGE AND STACK DFs VERTICALLY
# Both yearly frames are read with their own default row numbering starting at 0.
# ignore_index=True renumbers the stacked rows 0..n-1; without it the 2020 rows would
# reuse labels already held by 2019 rows, leaving duplicate index values.
combined_trip_data_df = pd.concat(
    [trip_data_2019_df, trip_data_2020_df],
    axis=0,
    ignore_index=True,
)
combined_trip_data_df.head(2)

Unnamed: 0,Year,Trip Duration Sec,Trip Duration Min,Month,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Lat,Start Station Long,End Station ID,End Station Name,End Station Lat,End Station Long,Bike ID,User Type,User Age,Gender
0,2019,201,3.4,January,09:09.7,12:30.9,3183,Exchange Place,40.716247,-74.033459,3214,Essex Light Rail,40.712774,-74.036486,29612,Subscriber,26,Male
1,2019,505,8.4,January,18:00.1,26:25.9,3183,Exchange Place,40.716247,-74.033459,3638,Washington St,40.724294,-74.035483,29213,Subscriber,47,Female


In [9]:
# Sanity check: the combined row count should equal the sum of the two yearly counts
print(f" Number of rows in 2019 DF: {trip_data_2019_df['Year'].size:,d}")
print(f" Number of rows in 2020 DF: {trip_data_2020_df['Year'].size:,d}")
print(f" Number of rows in combined DF: {combined_trip_data_df['Year'].size:,d}")

 Number of rows in 2019 DF: 170,468
 Number of rows in 2020 DF: 137,967
 Number of rows in combined DF: 308,435


In [10]:
# EXPORTING CLEAN DATA
# index=False keeps the row index out of the file. By default pandas writes it as a
# header-less first column, which shows up as "Unnamed: 0" when the CSV is read back.
combined_trip_data_df.to_csv("Data/combined_trip_data.csv", index=False)