In [1]:
# Import dependencies
import pandas as pd
import random
import datetime as dt

In [2]:
def clean_data(csv):
    """Load a trip-data CSV and return a cleaned DataFrame.

    Cleaning steps, in order:
      1. Drop the "ride_id", "start_station_id" and "end_station_id" columns.
      2. Drop every row that still contains a missing value.
      3. Drop exact duplicate rows.
      4. Renumber the index 0..n-1.

    Args:
        csv: Anything accepted by ``pd.read_csv`` (a file path or a
            file-like object).

    Returns:
        A new DataFrame with the remaining columns and a fresh integer index.

    Raises:
        KeyError: If any of the three dropped columns is absent from the file.
    """
    unwanted_columns = ["ride_id", "start_station_id", "end_station_id"]

    return (
        pd.read_csv(csv)
        .drop(columns=unwanted_columns)
        # NaNs are checked only after the ID columns are gone, so a missing
        # station ID alone does not discard a row.
        .dropna()
        .drop_duplicates()
        .reset_index(drop=True)
    )

In [3]:
def take_random_sample(df, n=100000, seed=None):
    """Return a random sample of rows from ``df`` for visualization with Tableau.

    Sampling keeps the amount of data Tableau has to work with to a more
    reasonable level, and keeps the resulting CSV file under GitHub's maximum
    file size.

    Rows are chosen by position, so the function works for any index, and the
    sampled rows keep their original relative order. The input frame is not
    modified.

    Args:
        df: The DataFrame to sample from.
        n: Maximum number of rows to return (default 100,000). If ``df`` has
            fewer than ``n`` rows, every row is returned instead of raising.
        seed: Optional seed for a reproducible draw. When ``None``, the
            module-level ``random`` generator is used, so a prior
            ``random.seed(...)`` call still controls the result.

    Returns:
        A new DataFrame with ``min(n, len(df))`` rows and a fresh 0..k-1 index.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")

    # random.sample raises if asked for more items than the population holds.
    sample_size = min(n, len(df))
    rng = random if seed is None else random.Random(seed)

    # Sort the drawn positions so the sample preserves the source row order.
    positions = sorted(rng.sample(range(len(df)), sample_size))

    return df.iloc[positions].reset_index(drop=True)

In [4]:
def round_coords(df):
    """Round each trip's start/end coordinates to 3 decimal places.

    Coarser coordinates help trips at the same stations get grouped together.

    Note: the four coordinate columns are overwritten on ``df`` itself; the
    returned object is that same DataFrame, not a copy.

    Args:
        df: DataFrame with "start_lat", "start_lng", "end_lat" and "end_lng"
            columns.

    Returns:
        The same DataFrame, with the four coordinate columns rounded.
    """
    for coord_col in ("start_lat", "start_lng", "end_lat", "end_lng"):
        df[coord_col] = df[coord_col].round(3)

    return df

In [5]:
def _format_elapsed(total_seconds):
    """Format a duration in whole seconds as an "H:MM:SS" string (hours are not wrapped at 24)."""
    sign = "-" if total_seconds < 0 else ""
    minutes, seconds = divmod(abs(int(total_seconds)), 60)
    hours, minutes = divmod(minutes, 60)
    return f"{sign}{hours}:{minutes:02d}:{seconds:02d}"


def calculate_ride_time(df):
    """Add a "ride_time" column holding each trip's duration as an "H:MM:SS" string.

    The duration is ``ended_at - started_at``, both parsed with the format
    "%Y-%m-%d %H:%M:%S". The full elapsed time is used, so a ride longer than
    a day is reported with more than 24 hours (e.g. "25:00:00") rather than
    wrapping around, and a trip whose end precedes its start is reported with
    a leading "-". For rides shorter than 24 hours the string is the same as
    ``str(datetime.timedelta(...))`` would give, e.g. "0:10:30".

    Note: the column is added to ``df`` itself; the returned object is that
    same DataFrame, not a copy.

    Args:
        df: DataFrame with "started_at" and "ended_at" columns. Any index is
            accepted; rows are processed by position.

    Returns:
        The same DataFrame with the new "ride_time" column.

    Raises:
        ValueError: If a timestamp does not match the expected format.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"

    # Parse whole columns at once instead of calling strptime row by row.
    started = pd.to_datetime(df["started_at"], format=timestamp_format)
    ended = pd.to_datetime(df["ended_at"], format=timestamp_format)

    # total_seconds() includes the days component, unlike timedelta.seconds.
    elapsed_seconds = (ended - started).dt.total_seconds()

    # Assign a plain list so values are matched to rows by position, not label.
    df["ride_time"] = [_format_elapsed(seconds) for seconds in elapsed_seconds]

    return df

In [6]:
# September 2022 data
# Clean the month's raw file, record how many rows survive cleaning, then
# sample, round coordinates, add ride times, and write the sample to CSV.

raw_file = "raw_data/202209-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
# Counted on the full cleaned frame, not on the sample.
total_ridership_sep_2022 = len(cleaned)

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form = calculate_ride_time(rounded)
final_form.to_csv("data_for_analysis/09_2022_random_sample.csv")

  raw_data = pd.read_csv(csv)


In [7]:
# August 2022 data
# Clean the month's raw file, record how many rows survive cleaning, then
# sample, round coordinates, add ride times, and write the sample to CSV.

raw_file = "raw_data/202208-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
# Counted on the full cleaned frame, not on the sample.
total_ridership_aug_2022 = len(cleaned)

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form = calculate_ride_time(rounded)
final_form.to_csv("data_for_analysis/08_2022_random_sample.csv")

  raw_data = pd.read_csv(csv)


In [8]:
# July 2022 data
# Clean the month's raw file, record how many rows survive cleaning, then
# sample, round coordinates, add ride times, and write the sample to CSV.

raw_file = "raw_data/202207-citibike-tripdata.csv"
cleaned = clean_data(raw_file)
# Counted on the full cleaned frame, not on the sample.
total_ridership_jul_2022 = len(cleaned)

sample = take_random_sample(cleaned)
rounded = round_coords(sample)
final_form = calculate_ride_time(rounded)
final_form.to_csv("data_for_analysis/07_2022_random_sample.csv")

  raw_data = pd.read_csv(csv)


In [11]:
# Capture the total ridership for each month
# One row per month; the two lists are paired by position, newest month first.

month = ["09_2022", "08_2022", "07_2022"]
riders = [
    total_ridership_sep_2022,
    total_ridership_aug_2022,
    total_ridership_jul_2022,
]

ridership_dict = dict(Month=month, Total_Ridership=riders)
total_riders = pd.DataFrame(data=ridership_dict)

In [12]:
# Save the per-month totals table next to the monthly samples.
total_riders.to_csv(path_or_buf="data_for_analysis/total_riders.csv")