In [1]:
# Import dependencies
import pandas as pd
import random
import datetime as dt

In [2]:
# Jupyter Notebook raises warnings about the data types in a couple columns of the later data sets. These columns get dropped, 
# so the warnings aren't important. This suppresses the warnings as the code runs.

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Create lists of raw data files. The structure of the files changes after January 2021, and need to be processed slightly differently.

raw_data_early = ["raw_data/201809-citibike-tripdata.csv", "raw_data/201810-citibike-tripdata.csv", "raw_data/201811-citibike-tripdata.csv", "raw_data/201812-citibike-tripdata.csv", "raw_data/201901-citibike-tripdata.csv", "raw_data/201902-citibike-tripdata.csv", "raw_data/201903-citibike-tripdata.csv", "raw_data/201904-citibike-tripdata.csv", "raw_data/201905-citibike-tripdata.csv", "raw_data/201906-citibike-tripdata.csv", "raw_data/201907-citibike-tripdata.csv", "raw_data/201908-citibike-tripdata.csv", "raw_data/201909-citibike-tripdata.csv", "raw_data/201910-citibike-tripdata.csv", "raw_data/201911-citibike-tripdata.csv", "raw_data/201912-citibike-tripdata.csv", "raw_data/202001-citibike-tripdata.csv", "raw_data/202002-citibike-tripdata.csv", "raw_data/202003-citibike-tripdata.csv", "raw_data/202004-citibike-tripdata.csv", "raw_data/202005-citibike-tripdata.csv", "raw_data/202006-citibike-tripdata.csv", "raw_data/202007-citibike-tripdata.csv", "raw_data/202008-citibike-tripdata.csv", "raw_data/202009-citibike-tripdata.csv", "raw_data/202010-citibike-tripdata.csv", "raw_data/202011-citibike-tripdata.csv", "raw_data/202012-citibike-tripdata.csv", "raw_data/202101-citibike-tripdata.csv"]

raw_data_later = ["raw_data/202102-citibike-tripdata.csv", "raw_data/202103-citibike-tripdata.csv", "raw_data/202104-citibike-tripdata.csv", "raw_data/202105-citibike-tripdata.csv", "raw_data/202106-citibike-tripdata.csv", "raw_data/202107-citibike-tripdata.csv", "raw_data/202108-citibike-tripdata.csv", "raw_data/202109-citibike-tripdata.csv", "raw_data/202110-citibike-tripdata.csv", "raw_data/202111-citibike-tripdata.csv", "raw_data/202112-citibike-tripdata.csv", "raw_data/202201-citibike-tripdata.csv", "raw_data/202202-citibike-tripdata.csv", "raw_data/202203-citibike-tripdata.csv", "raw_data/202204-citibike-tripdata.csv", "raw_data/202205-citibike-tripdata.csv", "raw_data/202206-citibike-tripdata.csv", "raw_data/202207-citibike-tripdata.csv", "raw_data/202208-citibike-tripdata.csv", "raw_data/202209-citibike-tripdata.csv"]

In [4]:
# This function cleans the data from data files from February, 2021 and later.

def clean_data_later(csv):
    # Read the CSV into a dataframe
    raw_data = pd.read_csv(csv)
    # Drop the columns I'm not interested in
    dropped_columns = raw_data.drop(columns=["ride_id", "start_station_id", "end_station_id", "rideable_type"])
    # Drop any entries with missing data
    dropped_nas = dropped_columns.dropna()
    # Drop any duplicated entries
    dropped_dups = dropped_nas.drop_duplicates(ignore_index=True)
    # Reindex the cleaned dataframe
    reindexed_df = dropped_dups.reset_index(drop=True)
    
    return reindexed_df

In [5]:
# This function is an adjusted version of my clean_data function. This function has the additional steps of masking the titles
# of the columns to match the columns of the data files after January, 2021. This function is only used for data files from
# January, 2021 and earlier.

def clean_data_early(csv):
    # Read the raw CSV into a dataframe
    raw_data = pd.read_csv(csv)
    # Mask the column titles so they match with the later data sets
    masked_cols = raw_data.rename(columns = {"tripduration": "ride_time", "starttime": "started_at", "stoptime": "ended_at", "start station id": "start_station_id", "start station name": "start_station_name", "start station latitude": "start_lat", "start station longitude": "start_lng", "end station id": "end_station_id", "end station name": "end_station_name", "end station latitude": "end_lat", "end station longitude": "end_lng", "usertype":"member_casual"})
    # Mask the values of the column to match later data sets
    masked_cols["member_casual"] = masked_cols["member_casual"].map({"Subscriber": "member", "Customer": "casual"})
    # Drop columns I'm not interested in
    dropped_cols = masked_cols.drop(columns=["start_station_id", "end_station_id", "bikeid", "birth year", "gender"])
    # Rearrange the columns to match the later data sets
    rearranged = dropped_cols[["started_at", "ended_at", "start_station_name", "start_lat", "start_lng", "end_station_name", "end_lat", "end_lng", "member_casual", "ride_time"]]
    # Drop any entries with incomplete data
    dropped_nas = rearranged.dropna()
    # Drop any duplicated entries
    dropped_dups = dropped_nas.drop_duplicates(ignore_index = True)
    # Reindex the dataframe
    reindexed_df = dropped_dups.reset_index(drop = True)
    
    return reindexed_df

In [6]:
# Takes a random sample of 5,000 entries for visualization with Tableau. This keeps the amount of data Tableau has to work 
# with to a more reasonable level, and it keeps the resulting CSV file under GitHub's maximum file size.
def take_random_sample(df):
    # Make a random list of 5,000 numbers from the range of indices
    random_list = random.sample(range(0, len(df)), 5000)
    # Use the list of random numbers to pull out entries with the corresponding index
    sample_df = df[df.index.isin(random_list)]
    # Reset the index
    sample_df = sample_df.reset_index(drop=True)
    
    return sample_df

In [7]:
# Round the geographic coordinates of each entry to 3 decimal points. This helps trips at the same stations to get grouped 
#together.

def round_coords(df):
    df["start_lat"] = df["start_lat"].round(3)
    df["start_lng"] = df["start_lng"].round(3)
    df["end_lat"] = df["end_lat"].round(3)
    df["end_lng"] = df["end_lng"].round(3)
    
    return df

In [8]:
# This calculates how long each trip took in seconds and adds it to the dataframe. This is only needed for data *after*
# January 2021.

def calculate_ride_time(df):
    ride_time = []

    for event in range(0, len(df)):
        # Convert started_at to datetime object, save it
        started_dt = dt.datetime.strptime(df["started_at"][event], "%Y-%m-%d %H:%M:%S")
        # Convert ended_at to datetime object, save it
        ended_dt = dt.datetime.strptime(df["ended_at"][event], "%Y-%m-%d %H:%M:%S")
        # Calculate elapsed time in seconds
        time_change = ended_dt - started_dt
        elapsed_seconds = time_change.seconds        
        # Append string to series
        ride_time.append(elapsed_seconds)
        
    df["ride_time"] = ride_time
    
    return df

In [9]:
ridership_dict = {}

In [10]:
def extract_month_year(cleaned_data):
    first_date = cleaned_data["started_at"][0].split("-")
    month_year = first_date[1] + "-" + first_date[0]
    return month_year

In [11]:
def total_ridership(cleaned_data):
    # Extract the month and year from the data
    month_year = extract_month_year(cleaned_data)
    # Count the number of riders in the full, cleaned data set
    ridership = len(cleaned_data)
    
    ridership_dict.update({month_year: ridership})

In [12]:
def process_early(raw_data):
    cleaned_data = clean_data_early(raw_data)
    total_ridership(cleaned_data)
    sample = take_random_sample(cleaned_data)
    rounded_sample = round_coords(sample)
    
    return rounded_sample

In [13]:
def process_later(raw_data):
    cleaned_data = clean_data_later(raw_data)
    sample = take_random_sample(cleaned_data)
    total_ridership(cleaned_data)
    rounded_sample = round_coords(sample)
    with_ride_time = calculate_ride_time(rounded_sample)
    
    return with_ride_time

In [14]:
processed_early_samples_df = pd.DataFrame()

In [15]:
for month in raw_data_early:
    processed_month = process_early(month)
    
    processed_early_samples_df = pd.concat(objs = [processed_early_samples_df, processed_month])
    processed_early_samples_df = processed_early_samples_df.reset_index(drop=True)

In [16]:
processed_later_samples_df = pd.DataFrame()

In [17]:
for month in raw_data_later:
    processed_later_month = process_later(month)
    
    processed_later_samples_df = pd.concat(objs = [processed_later_samples_df, processed_later_month])

In [18]:
processed_total_samples_df = pd.concat(objs=[processed_early_samples_df, processed_later_samples_df])

processed_total_samples_df

Unnamed: 0,started_at,ended_at,start_station_name,start_lat,start_lng,end_station_name,end_lat,end_lng,member_casual,ride_time
0,2018-09-01 01:31:16.9930,2018-09-01 01:49:28.4260,Reade St & Broadway,40.715,-74.006,Cadman Plaza E & Tillary St,40.696,-73.990,casual,1091
1,2018-09-01 02:14:07.5830,2018-09-01 02:28:05.8060,Lexington Ave & E 24 St,40.740,-73.984,Great Jones St,40.727,-73.994,casual,838
2,2018-09-01 03:50:58.5750,2018-09-01 04:15:21.7540,St Marks Pl & 1 Ave,40.728,-73.986,St Marks Pl & 2 Ave,40.728,-73.987,casual,1463
3,2018-09-01 05:49:04.5950,2018-09-01 06:03:51.4590,W 106 St & Central Park West,40.798,-73.961,Central Park West & W 72 St,40.776,-73.976,member,886
4,2018-09-01 07:13:23.4100,2018-09-01 07:55:01.5930,Central Park West & W 76 St,40.779,-73.974,Central Park West & W 76 St,40.779,-73.974,member,2498
...,...,...,...,...,...,...,...,...,...,...
4995,2022-09-24 12:28:27,2022-09-24 12:57:42,Columbia Heights & Cranberry St,40.700,-73.995,Van Brunt St & Van Dyke St,40.676,-74.015,casual,1755
4996,2022-09-14 15:00:05,2022-09-14 15:15:00,3 Ave & Wakeman Pl,40.638,-74.025,24 St & 5 Ave,40.660,-73.995,casual,895
4997,2022-09-10 07:49:20,2022-09-10 08:12:11,Irving Ave & Halsey St,40.695,-73.907,Market St & Henry St,40.713,-73.994,casual,1371
4998,2022-09-20 18:36:25,2022-09-20 18:42:11,Clinton St & Tillary St,40.696,-73.991,Bridge St & York St,40.701,-73.985,member,346


In [19]:
processed_total_samples_df.to_csv("data_for_analysis/sep_18_to_sep_22.csv")

In [20]:
ridership_df = pd.DataFrame.from_dict(ridership_dict, orient="index")

ridership_df.columns = ["Total_Rides"]

ridership_df

Unnamed: 0,Total_Rides
09-2018,1877168
10-2018,1878433
11-2018,1260275
12-2018,1016416
01-2019,967269
02-2019,943735
03-2019,1327950
04-2019,1766094
05-2019,1924563
06-2019,2125370


In [21]:
ridership_df.to_csv("data_for_analysis/monthly_ridership.csv")