In [1]:
# Import dependencies
import pandas as pd
import random
import datetime as dt

In [2]:
def clean_data(csv):
    """Load a monthly trip CSV and return a cleaned DataFrame.

    Per the original author's note, this only works for data *after*
    January, 2021 (older files need clean_data_early instead).

    Steps: drop the ride_id / start_station_id / end_station_id columns,
    remove rows containing any NaN, remove duplicate rows, and return the
    result with a fresh 0-based index.

    Parameters
    ----------
    csv : path or file-like accepted by pandas.read_csv

    Returns
    -------
    pandas.DataFrame
    """
    unwanted_columns = ["ride_id", "start_station_id", "end_station_id"]
    return (
        pd.read_csv(csv)
        .drop(columns=unwanted_columns)
        .dropna()
        # ignore_index=True already renumbers the surviving rows 0..n-1.
        .drop_duplicates(ignore_index=True)
    )

In [3]:
def take_random_sample(df, n=5000, seed=None):
    """Return a random sample of up to ``n`` rows of ``df`` (default 5,000).

    The sample is meant for visualization with Tableau: it keeps the amount
    of data Tableau has to work with reasonable and keeps the resulting CSV
    file under GitHub's maximum file size.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from; it is not modified.
    n : int, optional
        Maximum number of rows to return. If ``df`` has fewer rows, all of
        them are returned instead of raising ValueError.
    seed : optional
        When given, the sample is drawn from a private ``random.Random(seed)``
        so the result is reproducible. When None, the module-level ``random``
        state is used, as before.

    Returns
    -------
    pandas.DataFrame
        The sampled rows in their original relative order, with a fresh
        0-based index.
    """
    sample_size = min(n, len(df))
    rng = random if seed is None else random.Random(seed)
    # Sample row *positions* (not index labels) so frames whose index is not
    # 0..len-1 are handled correctly; sorting keeps the original row order.
    positions = sorted(rng.sample(range(len(df)), sample_size))
    return df.iloc[positions].reset_index(drop=True)

In [4]:
def round_coords(df):
    """Round each trip's start/end coordinates to 3 decimal places.

    The stated goal is to help trips at the same stations get grouped
    together. The four coordinate columns are overwritten on ``df`` itself,
    and that same frame is returned.
    """
    for coord_column in ("start_lat", "start_lng", "end_lat", "end_lng"):
        df[coord_column] = df[coord_column].round(3)

    return df

In [5]:
def calculate_ride_time(df):
    """Add a ``ride_time`` column holding each trip's duration in whole seconds.

    ``started_at`` and ``ended_at`` must be strings in
    ``"%Y-%m-%d %H:%M:%S"`` format. The column is added to ``df`` itself and
    the same frame is returned.

    The duration is the full elapsed time (``total_seconds``), so trips that
    last 24 hours or longer keep their day component; ``timedelta.seconds``
    alone would silently drop it.

    Raises
    ------
    ValueError
        If a timestamp does not match the expected format.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    # Vectorised parsing works on whole columns, so it does not depend on the
    # frame having a 0..len-1 index the way per-row label lookups would.
    started = pd.to_datetime(df["started_at"], format=timestamp_format)
    ended = pd.to_datetime(df["ended_at"], format=timestamp_format)

    df["ride_time"] = (ended - started).dt.total_seconds().astype("int64")

    return df

In [6]:
# September 2022: clean the month's trips, then sample, round coordinates and add ride_time.
raw_file = "raw_data/202209-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
total_ridership_sep_2022 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form_sep_22 = calculate_ride_time(rounded)

  raw_data = pd.read_csv(csv)


In [7]:
# August 2022: clean the month's trips, then sample, round coordinates and add ride_time.
raw_file = "raw_data/202208-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
total_ridership_aug_2022 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form_aug_22 = calculate_ride_time(rounded)

  raw_data = pd.read_csv(csv)


In [8]:
# July 2022: clean the month's trips, then sample, round coordinates and add ride_time.
raw_file = "raw_data/202207-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
total_ridership_jul_2022 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form_jul_22 = calculate_ride_time(rounded)

  raw_data = pd.read_csv(csv)


In [9]:
# June 2022: clean the month's trips, then sample, round coordinates and add ride_time.
raw_file = "raw_data/202206-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
total_ridership_jun_2022 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form_jun_22 = calculate_ride_time(rounded)

  raw_data = pd.read_csv(csv)


In [10]:
# May 2022: clean the month's trips, then sample, round coordinates and add ride_time.
raw_file = "raw_data/202205-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
total_ridership_may_2022 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form_may_22 = calculate_ride_time(rounded)

  raw_data = pd.read_csv(csv)


In [11]:
# April 2022: clean the month's trips, then sample, round coordinates and add ride_time.
raw_file = "raw_data/202204-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
total_ridership_apr_2022 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form_apr_22 = calculate_ride_time(rounded)

  raw_data = pd.read_csv(csv)


In [12]:
# March 2022: clean the month's trips, then sample, round coordinates and add ride_time.
raw_file = "raw_data/202203-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
total_ridership_mar_2022 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form_mar_22 = calculate_ride_time(rounded)

  raw_data = pd.read_csv(csv)


In [13]:
# February 2022: clean the month's trips, then sample, round coordinates and add ride_time.
raw_file = "raw_data/202202-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
total_ridership_feb_2022 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form_feb_22 = calculate_ride_time(rounded)

  raw_data = pd.read_csv(csv)


In [14]:
# January 2022: clean the month's trips, then sample, round coordinates and add ride_time.
raw_file = "raw_data/202201-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
total_ridership_jan_2022 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form_jan_22 = calculate_ride_time(rounded)

  raw_data = pd.read_csv(csv)


In [15]:
# December 2021: clean the month's trips, then sample, round coordinates and add ride_time.
raw_file = "raw_data/202112-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
total_ridership_dec_2021 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form_dec_21 = calculate_ride_time(rounded)

  raw_data = pd.read_csv(csv)


In [16]:
# November 2021: clean the month's trips, then sample, round coordinates and add ride_time.
raw_file = "raw_data/202111-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
total_ridership_nov_2021 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form_nov_21 = calculate_ride_time(rounded)

  raw_data = pd.read_csv(csv)


In [17]:
# October 2021: clean the month's trips, then sample, round coordinates and add ride_time.
raw_file = "raw_data/202110-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
total_ridership_oct_2021 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form_oct_21 = calculate_ride_time(rounded)

  raw_data = pd.read_csv(csv)


In [18]:
# September 2021: clean the month's trips, then sample, round coordinates and add ride_time.
raw_file = "raw_data/202109-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
total_ridership_sep_2021 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form_sep_21 = calculate_ride_time(rounded)

  raw_data = pd.read_csv(csv)


In [19]:
# August 2021: clean the month's trips, then sample, round coordinates and add ride_time.
raw_file = "raw_data/202108-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
total_ridership_aug_2021 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form_aug_21 = calculate_ride_time(rounded)

  raw_data = pd.read_csv(csv)


In [20]:
# July 2021: clean the month's trips, then sample, round coordinates and add ride_time.
raw_file = "raw_data/202107-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
total_ridership_jul_2021 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form_jul_21 = calculate_ride_time(rounded)

  raw_data = pd.read_csv(csv)


In [21]:
# June 2021: clean the month's trips, then sample, round coordinates and add ride_time.
raw_file = "raw_data/202106-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
total_ridership_jun_2021 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form_jun_21 = calculate_ride_time(rounded)

  raw_data = pd.read_csv(csv)


In [22]:
# May 2021: clean the month's trips, then sample, round coordinates and add ride_time.
raw_file = "raw_data/202105-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
total_ridership_may_2021 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form_may_21 = calculate_ride_time(rounded)

  raw_data = pd.read_csv(csv)


In [23]:
# April 2021: clean the month's trips, then sample, round coordinates and add ride_time.
raw_file = "raw_data/202104-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
total_ridership_apr_2021 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form_apr_21 = calculate_ride_time(rounded)

  raw_data = pd.read_csv(csv)


In [24]:
# March 2021: clean the month's trips, then sample, round coordinates and add ride_time.
raw_file = "raw_data/202103-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
total_ridership_mar_2021 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form_mar_21 = calculate_ride_time(rounded)

  raw_data = pd.read_csv(csv)


In [25]:
# February 2021: clean the month's trips, then sample, round coordinates and add ride_time.
raw_file = "raw_data/202102-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
total_ridership_feb_2021 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form_feb_21 = calculate_ride_time(rounded)

  raw_data = pd.read_csv(csv)


In [26]:
def clean_data_early(csv):
    """Load a trip CSV that uses the older column layout and clean it.

    An adjusted version of clean_data: the older column titles are first
    renamed to the names used elsewhere in this notebook, and the usertype
    values "Subscriber"/"Customer" are mapped to "member"/"casual" (any other
    value becomes NaN and is removed by the NaN filter below).

    The station id, bikeid, birth year and gender columns are dropped, the
    remaining columns are put in a fixed order, rows with NaN values and
    duplicate rows are removed, and the result gets a fresh 0-based index.

    Parameters
    ----------
    csv : path or file-like accepted by pandas.read_csv

    Returns
    -------
    pandas.DataFrame
    """
    column_names = {
        "tripduration": "ride_time",
        "starttime": "started_at",
        "stoptime": "ended_at",
        "start station id": "start_station_id",
        "start station name": "start_station_name",
        "start station latitude": "start_lat",
        "start station longitude": "start_lng",
        "end station id": "end_station_id",
        "end station name": "end_station_name",
        "end station latitude": "end_lat",
        "end station longitude": "end_lng",
        "usertype": "member_casual",
    }
    rider_labels = {"Subscriber": "member", "Customer": "casual"}
    discarded = ["start_station_id", "end_station_id", "bikeid", "birth year", "gender"]
    output_order = [
        "started_at", "ended_at",
        "start_station_name", "start_lat", "start_lng",
        "end_station_name", "end_lat", "end_lng",
        "member_casual", "ride_time",
    ]

    trips = pd.read_csv(csv).rename(columns=column_names)
    trips["member_casual"] = trips["member_casual"].map(rider_labels)
    trips = trips.drop(columns=discarded)[output_order]
    # ignore_index=True already renumbers the surviving rows 0..n-1.
    return trips.dropna().drop_duplicates(ignore_index=True)

In [27]:
# January 2021 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/202101-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_jan_2021 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_jan_21 = round_coords(sample)

In [28]:
# December 2020 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/202012-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_dec_20 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_dec_20 = round_coords(sample)

In [29]:
# November 2020 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/202011-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_nov_20 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_nov_20 = round_coords(sample)

In [30]:
# October 2020 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/202010-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_oct_20 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_oct_20 = round_coords(sample)

In [31]:
# September 2020 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/202009-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_sep_20 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_sep_20 = round_coords(sample)

In [32]:
# August 2020 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/202008-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_aug_20 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_aug_20 = round_coords(sample)

In [33]:
# July 2020 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/202007-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_jul_20 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_jul_20 = round_coords(sample)

In [34]:
# June 2020 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/202006-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_jun_20 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_jun_20 = round_coords(sample)

In [35]:
# May 2020 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/202005-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_may_20 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_may_20 = round_coords(sample)

In [36]:
# April 2020 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/202004-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_apr_20 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_apr_20 = round_coords(sample)

In [37]:
# March 2020 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/202003-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_mar_20 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_mar_20 = round_coords(sample)

In [38]:
# February 2020 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/202002-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_feb_20 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_feb_20 = round_coords(sample)

In [39]:
# January 2020 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/202001-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_jan_20 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_jan_20 = round_coords(sample)

In [40]:
# December 2019 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/201912-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_dec_19 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_dec_19 = round_coords(sample)

In [41]:
# November 2019 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/201911-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_nov_19 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_nov_19 = round_coords(sample)

In [42]:
# October 2019 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/201910-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_oct_19 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_oct_19 = round_coords(sample)

In [43]:
# September 2019 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/201909-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_sep_19 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_sep_19 = round_coords(sample)

In [44]:
# August 2019 (older CSV layout): clean the month's trips, then sample and round coordinates.
# The path must point at the 201908 file; it previously pointed at 201912 (December 2019),
# which made the "August 2019" sample and ridership total a copy of December's data.
raw_file = "raw_data/201908-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
sample = take_random_sample(cleaned)
final_form_aug_19 = round_coords(sample)

total_ridership_aug_19 = len(cleaned)  # row count of the cleaned month, before sampling

In [45]:
# July 2019 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/201907-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_jul_19 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_jul_19 = round_coords(sample)

In [46]:
# June 2019 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/201906-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_jun_19 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_jun_19 = round_coords(sample)

In [47]:
# May 2019 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/201905-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_may_19 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_may_19 = round_coords(sample)

In [48]:
# April 2019 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/201904-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_apr_19 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_apr_19 = round_coords(sample)

In [49]:
# March 2019 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/201903-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_mar_19 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_mar_19 = round_coords(sample)

In [50]:
# February 2019 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/201902-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_feb_19 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_feb_19 = round_coords(sample)

In [51]:
# January 2019 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/201901-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_jan_19 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_jan_19 = round_coords(sample)

In [52]:
# December 2018 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/201812-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_dec_18 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_dec_18 = round_coords(sample)

In [53]:
# November 2018 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/201811-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_nov_18 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_nov_18 = round_coords(sample)

In [54]:
# October 2018 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/201810-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_oct_18 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_oct_18 = round_coords(sample)

In [55]:
# September 2018 (older CSV layout): clean the month's trips, then sample and round coordinates.
raw_file = "raw_data/201809-citibike-tripdata.csv"
cleaned = clean_data_early(raw_file)
total_ridership_sep_18 = len(cleaned)  # row count of the cleaned month, before sampling

sample = take_random_sample(cleaned)
final_form_sep_18 = round_coords(sample)

In [56]:
# Combine every monthly sample (newest first) into one frame and export it for analysis.
months = [
    final_form_sep_22, final_form_aug_22, final_form_jul_22, final_form_jun_22,
    final_form_may_22, final_form_apr_22, final_form_mar_22, final_form_feb_22,
    final_form_jan_22,
    final_form_dec_21, final_form_nov_21, final_form_oct_21, final_form_sep_21,
    final_form_aug_21, final_form_jul_21, final_form_jun_21, final_form_may_21,
    final_form_apr_21, final_form_mar_21, final_form_feb_21, final_form_jan_21,
    final_form_dec_20, final_form_nov_20, final_form_oct_20, final_form_sep_20,
    final_form_aug_20, final_form_jul_20, final_form_jun_20, final_form_may_20,
    final_form_apr_20, final_form_mar_20, final_form_feb_20, final_form_jan_20,
    final_form_dec_19, final_form_nov_19, final_form_oct_19, final_form_sep_19,
    final_form_aug_19, final_form_jul_19, final_form_jun_19, final_form_may_19,
    final_form_apr_19, final_form_mar_19, final_form_feb_19, final_form_jan_19,
    final_form_dec_18, final_form_nov_18, final_form_oct_18, final_form_sep_18,
]

# ignore_index=True gives the combined frame one continuous 0..N-1 index. Without it,
# every monthly sample's own 0-based index is repeated in the CSV's index column.
year = pd.concat(months, ignore_index=True)

year.to_csv("data_for_analysis/sep_18_through_sep_22.csv")

In [57]:
# Capture the total ridership for each month.
# `month` and `riders` are parallel lists, both ordered newest month first.
month = [
    "09_2022", "08_2022", "07_2022", "06_2022", "05_2022", "04_2022", "03_2022", "02_2022", "01_2022",
    "12_2021", "11_2021", "10_2021", "09_2021", "08_2021", "07_2021", "06_2021",
    "05_2021", "04_2021", "03_2021", "02_2021", "01_2021",
    "12_2020", "11_2020", "10_2020", "09_2020", "08_2020", "07_2020", "06_2020",
    "05_2020", "04_2020", "03_2020", "02_2020", "01_2020",
    "12_2019", "11_2019", "10_2019", "09_2019", "08_2019", "07_2019", "06_2019",
    "05_2019", "04_2019", "03_2019", "02_2019", "01_2019",
    "12_2018", "11_2018", "10_2018", "09_2018",
]
riders = [
    total_ridership_sep_2022, total_ridership_aug_2022, total_ridership_jul_2022,
    total_ridership_jun_2022, total_ridership_may_2022, total_ridership_apr_2022,
    total_ridership_mar_2022, total_ridership_feb_2022, total_ridership_jan_2022,
    total_ridership_dec_2021, total_ridership_nov_2021, total_ridership_oct_2021,
    total_ridership_sep_2021, total_ridership_aug_2021, total_ridership_jul_2021,
    total_ridership_jun_2021, total_ridership_may_2021, total_ridership_apr_2021,
    total_ridership_mar_2021, total_ridership_feb_2021, total_ridership_jan_2021,
    total_ridership_dec_20, total_ridership_nov_20, total_ridership_oct_20,
    total_ridership_sep_20, total_ridership_aug_20, total_ridership_jul_20,
    total_ridership_jun_20, total_ridership_may_20, total_ridership_apr_20,
    total_ridership_mar_20, total_ridership_feb_20, total_ridership_jan_20,
    total_ridership_dec_19, total_ridership_nov_19, total_ridership_oct_19,
    total_ridership_sep_19, total_ridership_aug_19, total_ridership_jul_19,
    total_ridership_jun_19, total_ridership_may_19, total_ridership_apr_19,
    total_ridership_mar_19, total_ridership_feb_19, total_ridership_jan_19,
    total_ridership_dec_18, total_ridership_nov_18, total_ridership_oct_18,
    total_ridership_sep_18,
]

ridership_dict = {"Month": month, "Total_Ridership": riders}
total_riders = pd.DataFrame(ridership_dict)

total_riders.to_csv("data_for_analysis/total_riders.csv")