In [1]:
# Import dependencies
import pandas as pd
import random
import datetime as dt

In [2]:
# Jupyter Notebook raises warnings about the data types in a couple columns of the later data sets. These columns get dropped, 
# so the warnings aren't important. This suppresses the warnings as the code runs.

# Install a blanket "ignore" filter for the rest of the kernel session.
# NOTE(review): this hides every warning category, not only the dtype warnings
# described above — consider narrowing the filter if other warnings matter.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Create lists of raw data files. The structure of the files changes after January 2021, so the two groups
# need to be processed slightly differently. One file exists per calendar month, named
# "raw_data/<YYYYMM>-citibike-tripdata.csv", so the paths are generated from month ranges.

# Early format: September 2018 through January 2021 (inclusive).
raw_data_early = [
    f"raw_data/{year_month}-citibike-tripdata.csv"
    for year_month in pd.period_range("2018-09", "2021-01", freq="M").strftime("%Y%m")
]

# Later format: February 2021 through September 2022 (inclusive).
raw_data_later = [
    f"raw_data/{year_month}-citibike-tripdata.csv"
    for year_month in pd.period_range("2021-02", "2022-09", freq="M").strftime("%Y%m")
]

In [4]:
# This function cleans the data from data files from February, 2021 and later.

def clean_data_later(sample):
    """Clean a sample taken from a later-format data file.

    The identifier columns that are not used downstream are removed, rows with
    any missing value are discarded, exact duplicate rows are collapsed, and
    the result is renumbered from zero. The input frame is not modified.

    Parameters
    ----------
    sample : pandas.DataFrame
        Must contain the columns "ride_id", "start_station_id",
        "end_station_id" and "rideable_type" (a KeyError is raised otherwise).

    Returns
    -------
    pandas.DataFrame
        A new, cleaned dataframe with a fresh 0..n-1 index.
    """
    unused_columns = ["ride_id", "start_station_id", "end_station_id", "rideable_type"]

    return (
        sample
        .drop(columns=unused_columns)          # columns I'm not interested in
        .dropna()                              # entries with missing data
        .drop_duplicates(ignore_index=True)    # duplicated entries
        .reset_index(drop=True)                # renumber the surviving rows
    )

In [5]:
# In addition to the cleaning steps, this function renames the columns (and the rider-type values) so they match the
# data files from after January, 2021. It is only used for data files from January, 2021 and earlier.

def clean_data_early(sample):
    """Clean an early-format sample and reshape it to the later-format naming.

    Columns are renamed to the later-format names, the rider type is recoded
    ("Subscriber" -> "member", "Customer" -> "casual"; any other value becomes
    missing and is therefore removed with the incomplete rows), unused columns
    are dropped, and the remaining ten columns are put in a fixed order.
    Incomplete and duplicated rows are then removed and the index is renumbered
    from zero. The input frame is not modified.

    Parameters
    ----------
    sample : pandas.DataFrame
        Early-format trip data using the original column titles
        ("tripduration", "starttime", "start station name", ...).

    Returns
    -------
    pandas.DataFrame
        A new, cleaned dataframe with a fresh 0..n-1 index.
    """
    # Early-format column title -> later-format column title
    column_aliases = {
        "tripduration": "ride_time",
        "starttime": "started_at",
        "stoptime": "ended_at",
        "start station id": "start_station_id",
        "start station name": "start_station_name",
        "start station latitude": "start_lat",
        "start station longitude": "start_lng",
        "end station id": "end_station_id",
        "end station name": "end_station_name",
        "end station latitude": "end_lat",
        "end station longitude": "end_lng",
        "usertype": "member_casual",
    }
    # Early-format rider type -> later-format rider type
    rider_type_aliases = {"Subscriber": "member", "Customer": "casual"}
    unused_columns = ["start_station_id", "end_station_id", "bikeid", "birth year", "gender"]
    column_order = [
        "started_at", "ended_at",
        "start_station_name", "start_lat", "start_lng",
        "end_station_name", "end_lat", "end_lng",
        "member_casual", "ride_time",
    ]

    return (
        sample
        .rename(columns=column_aliases)
        .assign(member_casual=lambda frame: frame["member_casual"].map(rider_type_aliases))
        .drop(columns=unused_columns)
        .loc[:, column_order]
        .dropna()                               # entries with incomplete data
        .drop_duplicates(ignore_index=True)     # duplicated entries
        .reset_index(drop=True)                 # renumber the surviving rows
    )

In [6]:
# Takes a random sample of 5,000 entries for visualization with Tableau. This keeps the amount of data Tableau has to work 
# with to a more reasonable level, and it keeps the resulting CSV file under GitHub's maximum file size.

def take_random_sample(df, sample_size=5000, seed=None):
    """Return a random subset of the rows of ``df``, kept in their original order.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to sample from. Rows are chosen by position, so the frame
        does not need a default 0..n-1 index.
    sample_size : int, optional
        Number of rows to draw (default 5000). When ``df`` has fewer rows than
        this, every row is returned instead of raising an error.
    seed : int, optional
        When given, a private ``random.Random(seed)`` generator is used so the
        draw is reproducible. When omitted, the module-level ``random``
        generator is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        A new dataframe containing the sampled rows with a fresh 0..n-1 index.
    """
    rng = random if seed is None else random.Random(seed)
    row_count = len(df)
    # random.sample raises ValueError when asked for more items than exist,
    # so never request more rows than the frame holds.
    draw_count = min(sample_size, row_count)
    # Sorting the drawn positions keeps the sampled rows in file order.
    positions = sorted(rng.sample(range(row_count), draw_count))

    return df.iloc[positions].reset_index(drop=True)

In [7]:
# Round the geographic coordinates of each entry to 3 decimal places. This helps trips at the same stations to get grouped 
# together.

def round_coords(df):
    """Round the four coordinate columns of ``df`` to 3 decimal places.

    The columns "start_lat", "start_lng", "end_lat" and "end_lng" are
    overwritten on the dataframe that was passed in, and that same dataframe
    object is returned.
    """
    for coord_column in ("start_lat", "start_lng", "end_lat", "end_lng"):
        df[coord_column] = df[coord_column].round(3)

    return df

In [8]:
# This extracts the month and year from the data set for sets before or including January 2021.

def extract_month_year_early(raw_data):
    """Return the month and year of the first trip as a "MM-YYYY" string.

    The first value of the "starttime" column is split on "-"; its second
    piece is used as the month and its first piece as the year.
    Assumes the values are strings that begin with "YYYY-MM-" — verify against
    the raw files.

    The first row is taken by position (``.iloc[0]``) rather than by the
    label 0, so the function also works when the dataframe's index does not
    contain the label 0 (for example after filtering or re-indexing).
    """
    date_parts = raw_data["starttime"].iloc[0].split("-")
    return date_parts[1] + "-" + date_parts[0]

In [9]:
# This extracts the month and year from the data set for sets on or after February 2021.

def extract_month_year(cleaned_data):
    """Return the month and year of the first trip as a "MM-YYYY" string.

    The first value of the "started_at" column is split on "-"; its second
    piece is used as the month and its first piece as the year.
    Assumes the values are strings that begin with "YYYY-MM-" — verify against
    the raw files.

    The first row is taken by position (``.iloc[0]``) rather than by the
    label 0, so the function also works when the dataframe's index does not
    contain the label 0 (for example after filtering or re-indexing).
    """
    date_parts = cleaned_data["started_at"].iloc[0].split("-")
    return date_parts[1] + "-" + date_parts[0]

In [10]:
# This captures the total number of rides for each month.

def total_ridership(raw_data):
    """Return the number of rows in ``raw_data``, i.e. the month's ride count."""
    return len(raw_data)

In [11]:
# This function processes the early data sets (before and including January 2021).

def process_early_better(csv_path):
    """Turn one early-format monthly CSV into a cleaned, annotated sample.

    The file is read in full, a random sample is drawn from it, and the sample
    is cleaned and has its coordinates rounded. Two columns are then added
    whose values come from the full month rather than the sample:
    "month_year" (from the first row of the file) and "total_ridership"
    (the number of rows in the file).

    Parameters
    ----------
    csv_path : str
        Path of the monthly CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The processed sample for that month.
    """
    full_month = pd.read_csv(csv_path)

    # sample -> clean -> round the latitudes and longitudes
    processed = round_coords(clean_data_early(take_random_sample(full_month)))

    # The cleaned early frame has exactly ten columns, so positions 10 and 11
    # put the two new columns at the end.
    processed.insert(loc=10, column="month_year", value=extract_month_year_early(full_month))
    processed.insert(loc=11, column="total_ridership", value=total_ridership(full_month))

    return processed

In [12]:
# This calculates how long each trip took in seconds and adds it to the dataframe. This is only needed for data *after*
# January 2021.

def calculate_ride_time(df):
    """Add a "ride_time" column holding each trip's duration in whole seconds.

    The "started_at" and "ended_at" columns are parsed with the format
    "%Y-%m-%d %H:%M:%S" and their difference is stored as an integer number
    of seconds. The column is added to the dataframe that was passed in, and
    that same dataframe object is returned.

    The full elapsed time is used (``total_seconds``), so trips lasting longer
    than 24 hours are reported correctly; ``timedelta.seconds`` alone would
    silently drop the whole-day part of the duration.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "started_at" and "ended_at" columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the "ride_time" column added.

    Raises
    ------
    ValueError
        If a timestamp does not match the expected format.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    started = pd.to_datetime(df["started_at"], format=timestamp_format)
    ended = pd.to_datetime(df["ended_at"], format=timestamp_format)

    # Vectorised subtraction is aligned on the index, so this works for any
    # index labels, not only the default 0..n-1 range.
    df["ride_time"] = (ended - started).dt.total_seconds().astype("int64")

    return df

In [13]:
# This function processes the later data sets (February 2021 and later). 

def process_later_better(csv_path):
    """Turn one later-format monthly CSV into a cleaned, annotated sample.

    The file is read in full, a random sample is drawn from it, and the sample
    is cleaned, has its coordinates rounded and gets a "ride_time" column.
    Two more columns are then added: "month_year" (taken from the first row of
    the cleaned sample) and "total_ridership" (the number of rows in the full
    file).

    Parameters
    ----------
    csv_path : str
        Path of the monthly CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The processed sample for that month.
    """
    full_month = pd.read_csv(csv_path)

    # sample -> clean
    cleaned = clean_data_later(take_random_sample(full_month))
    # round the latitudes and longitudes -> add the ride_time column
    processed = calculate_ride_time(round_coords(cleaned))

    # Fixed insert positions: presumably the frame has ten columns at this
    # point, making these the last two — TODO confirm against the CSV schema.
    processed.insert(loc=10, column="month_year", value=extract_month_year(cleaned))
    processed.insert(loc=11, column="total_ridership", value=total_ridership(full_month))

    return processed

In [14]:
# Build an empty dataframe that will hold the processed early samples
# (the data sets up to and including January 2021).

processed_early_samples_df = pd.DataFrame()

In [15]:
# Process the early data sets.
# Every month is processed first and the results are concatenated once at the end;
# concatenating inside the loop would re-copy all previously collected rows on
# every iteration.

early_month_frames = [process_early_better(month) for month in raw_data_early]

if early_month_frames:
    # ignore_index=True renumbers the combined rows 0..n-1.
    processed_early_samples_df = pd.concat(objs = early_month_frames, ignore_index = True)
else:
    # pd.concat refuses an empty list, so fall back to an empty dataframe.
    processed_early_samples_df = pd.DataFrame()

In [16]:
# Build an empty dataframe that will hold the processed later samples
# (the data sets from February 2021 onwards).

processed_later_samples_df = pd.DataFrame()

In [17]:
# Process the later data sets.
# Every month is processed first and the results are concatenated once at the end;
# concatenating inside the loop would re-copy all previously collected rows on
# every iteration.

later_month_frames = [process_later_better(month) for month in raw_data_later]

if later_month_frames:
    # ignore_index=True renumbers the combined rows 0..n-1.
    processed_later_samples_df = pd.concat(objs = later_month_frames, ignore_index = True)
else:
    # pd.concat refuses an empty list, so fall back to an empty dataframe.
    processed_later_samples_df = pd.DataFrame()

In [18]:
# Concatenate the data sets together to make one big data set.
# ignore_index=True renumbers the rows 0..n-1; without it the row labels of the
# early and later frames would both start at 0 and repeat in the combined frame.

total_data_df = pd.concat(objs = [processed_early_samples_df, processed_later_samples_df], ignore_index = True)

In [19]:
# Export the whole data set as a CSV. No `index` argument is passed, so pandas'
# default applies and the row index is written as the first, unnamed column.
# NOTE(review): the "data_for_analysis" directory presumably has to exist
# already — confirm before running on a fresh checkout.

total_data_df.to_csv("data_for_analysis/sep_18_to_sep_22.csv")