In [None]:
# Import dependencies
import pandas as pd
import random
import datetime as dt

In [None]:
# Cleans the data by dropping the columns I'm not interested in, removing entries with NaN values, and removing duplicates.
def clean_data(csv):
    """Load a trip-data CSV and return a tidied DataFrame.

    Parameters
    ----------
    csv : str, path object, or file-like
        Anything ``pandas.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        The data without the ``ride_id``, ``start_station_id`` and
        ``end_station_id`` columns, without rows containing any NaN, without
        duplicate rows, and with a fresh 0-based index.
    """
    unwanted_columns = ["ride_id", "start_station_id", "end_station_id"]

    return (
        pd.read_csv(csv)
        .drop(columns=unwanted_columns)
        .dropna()
        .drop_duplicates(ignore_index=True)
        .reset_index(drop=True)
    )

In [None]:
# Takes a random sample of 100,000 entries for visualization with Tableau. This keeps the amount of data Tableau has to work 
# with to a more reasonable level, and it keeps the resulting CSV file under GitHub's maximum file size.
def take_random_sample(df, n=100000, seed=None):
    """Return a random sample of up to ``n`` rows of ``df``, kept in their original order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from. Its index may be anything; rows are chosen by
        position, not by index label.
    n : int, default 100000
        Number of rows to draw. If ``df`` has fewer than ``n`` rows, all rows
        are returned rather than raising ``ValueError``.
    seed : int or None, default None
        When given, a private ``random.Random(seed)`` is used so the draw is
        repeatable. When ``None``, the module-level ``random`` generator is
        used (so a prior ``random.seed(...)`` call still applies).

    Returns
    -------
    pandas.DataFrame
        A new frame holding the sampled rows with a fresh 0-based index.
    """
    rng = random if seed is None else random.Random(seed)
    sample_size = min(n, len(df))

    # Sample row positions and sort them so the output preserves the input
    # order, exactly as a boolean mask over the frame would.
    positions = sorted(rng.sample(range(len(df)), sample_size))

    return df.iloc[positions].reset_index(drop=True)

In [None]:
# Round the geographic coordinates of each entry to 3 decimal places. This helps trips at the same stations get grouped together.
def round_coords(df):
    """Round the start/end latitude and longitude columns to 3 decimal places.

    The four coordinate columns (``start_lat``, ``start_lng``, ``end_lat``,
    ``end_lng``) are overwritten on ``df`` itself; the same frame is returned
    for convenient chaining.
    """
    for coord_column in ("start_lat", "start_lng", "end_lat", "end_lng"):
        df[coord_column] = df[coord_column].round(3)

    return df

In [None]:
def _format_elapsed(total_seconds):
    """Format a whole number of seconds as an ``H:MM:SS`` string (hours are not capped at 24)."""
    sign = "-" if total_seconds < 0 else ""
    minutes, seconds = divmod(abs(total_seconds), 60)
    hours, minutes = divmod(minutes, 60)
    return f"{sign}{hours}:{minutes:02d}:{seconds:02d}"


def calculate_ride_time(df):
    """Add a ``ride_time`` column holding each trip's duration as an ``H:MM:SS`` string.

    ``started_at`` and ``ended_at`` are parsed with the format
    ``%Y-%m-%d %H:%M:%S``; a value that does not match raises ``ValueError``.

    The full elapsed time is used, including whole days, so a 25-hour trip is
    reported as ``25:00:00`` rather than wrapping around to ``1:00:00``. Trips
    shorter than 24 hours are formatted exactly as ``str(datetime.timedelta)``
    would format them (e.g. ``0:05:30``). A trip whose end precedes its start
    gets a leading ``-``.

    The column is added to ``df`` itself, and the same frame is returned.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"

    # Parse whole columns at once instead of calling strptime row by row.
    started = pd.to_datetime(df["started_at"], format=timestamp_format)
    ended = pd.to_datetime(df["ended_at"], format=timestamp_format)

    # total_seconds() keeps the days component that timedelta.seconds drops.
    elapsed_seconds = (ended - started).dt.total_seconds().astype("int64")

    # Assigning a plain list is positional, so this works for any index.
    df["ride_time"] = [_format_elapsed(seconds) for seconds in elapsed_seconds.tolist()]

    return df

In [None]:
# Process every month from September 2022 back to September 2021.
#
# Each month goes through the same pipeline: clean the raw CSV, take a random
# sample, round the coordinates, compute ride times, and write the sample to
# data_for_analysis/. The size of each month's cleaned data set is recorded as
# that month's total ridership.
#
# NOTE: take_random_sample draws from Python's `random` module, so call
# random.seed(...) before running this cell if the samples need to be repeatable.

# Months to process, newest first, written as "MM_YYYY".
month = [
    "09_2022", "08_2022", "07_2022", "06_2022", "05_2022", "04_2022", "03_2022",
    "02_2022", "01_2022", "12_2021", "11_2021", "10_2021", "09_2021",
]


def process_month(month_tag):
    """Run the clean -> sample -> round -> ride-time pipeline for one month.

    Parameters
    ----------
    month_tag : str
        Month in ``"MM_YYYY"`` form, e.g. ``"09_2022"``.

    Returns
    -------
    int
        Number of rows in the month's cleaned data (its total ridership).

    Side effects
    ------------
    Reads ``raw_data/<YYYY><MM>-citibike-tripdata.csv`` and writes the sampled
    result to ``data_for_analysis/<MM>_<YYYY>_random_sample.csv``.
    """
    mm, yyyy = month_tag.split("_")
    raw_file = f"raw_data/{yyyy}{mm}-citibike-tripdata.csv"

    cleaned = clean_data(raw_file)
    sample = take_random_sample(cleaned)
    rounded = round_coords(sample)
    final_form = calculate_ride_time(rounded)

    final_form.to_csv(f"data_for_analysis/{month_tag}_random_sample.csv")

    return len(cleaned)


# Capture the total ridership for each month (same order as `month`).
riders = [process_month(month_tag) for month_tag in month]

ridership_dict = {"Month": month, "Total_Ridership": riders}

total_riders = pd.DataFrame(ridership_dict)

total_riders.to_csv("data_for_analysis/total_riders.csv")