This notebook reads the raw CSV data and extracts the subset we are interested in: inbound trips on the Route 1 bus. Each raw file is deleted once it has been processed, to save disk space.

In [7]:
import pandas as pd
import os

In [20]:
def process_file(path):
    """Extract one row per inbound Route 1 half-trip from a raw arrival/departure CSV.

    Parameters
    ----------
    path : str
        Location of a raw CSV. It must contain the columns ``scheduled``,
        ``actual``, ``service_date``, ``route_id``, ``direction`` and
        ``half_trip_id``, plus the time point name under either ``stop_name``
        or ``time_point_id``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``half_trip_id``. Columns are ``service_date``, ``duration``
        (seconds between the ``hhgat`` and ``Dudly`` actual times), followed by
        the actual time at each of the nine time points in route order.

    Raises
    ------
    KeyError
        If any of the nine expected time points is absent from the filtered data.
    ValueError
        If a half-trip has more than one record for the same time point
        (raised by ``pivot`` / ``set_index(verify_integrity=True)``).

    Side effects: prints the number of trips before and after cleaning.
    """
    # route_id is forced to text. Left to type inference, a column (or parser
    # chunk) holding only numeric-looking ids turns "01" into the integer 1,
    # and the string comparison in the filter below silently matches nothing.
    full_df = pd.read_csv(
        path,
        parse_dates=["scheduled", "actual", "service_date"],
        dtype={"route_id": str},
    )

    # Some files label the time point columns stop_name / stop_sequence;
    # normalise to one naming. Columns already using the target names are untouched.
    full_df = full_df.rename(
        columns={
            "stop_name": "time_point_id",
            "stop_sequence": "time_point_order",
        }
    )

    # Get subset of stops involving the one bus; rows without a trip id
    # cannot be assigned to a trip and are discarded.
    one_inbound = (
        full_df
        .query("route_id=='01' and direction=='Inbound'")
        .dropna(subset=["half_trip_id"])
    )

    # Pivot so there's one row per trip and one column per time point
    one_trips = one_inbound.pivot(index="half_trip_id", columns="time_point_id", values="actual")

    # Reorder columns into route order (pivot leaves them sorted alphabetically)
    time_point_ids = ["hhgat", "maput", "cntsq", "mit", "hynes", "masta", "Wasma", "Melwa", "Dudly"]
    one_trips = one_trips[time_point_ids]

    print(f"Before cleaning, {len(one_trips)} trips.")

    # Data cleaning: keep trips that have a time at every time point and whose
    # first and last legs are in chronological order. The explicit copy gives
    # an independent frame for the column inserts below.
    one_trips = (
        one_trips
        .dropna()
        .query("hhgat < maput and Melwa < Dudly")
        .copy()
    )
    print(f"After cleaning, {len(one_trips)} trips.")

    # From the unpivoted table, extract trip metadata (service date, total duration).
    # verify_integrity guarantees exactly one hhgat record per half-trip.
    hhgat = (
        one_inbound[one_inbound.time_point_id == "hhgat"]
        .set_index("half_trip_id", verify_integrity=True)
    )
    # Assignment aligns on half_trip_id, so only the surviving trips are filled.
    one_trips.insert(0, "service_date", hhgat.service_date)
    # .dt.seconds is the within-day seconds component of the difference
    # (whole days are ignored).
    one_trips.insert(1, "duration", (one_trips["Dudly"] - one_trips["hhgat"]).dt.seconds)

    return one_trips

In [14]:
# Try the extraction on a single quarter and display the resulting table.
# This needs the raw file to still be on disk: the batch cell that loops over
# ../raw_data/ removes each CSV after processing it.
process_file(path="../raw_data/Bus Arrival Departure Times Jan-Mar 2020.csv")

Before cleaning, 9891 trips.
After cleaning, 9078 trips.


time_point_id,service_date,duration,hhgat,maput,cntsq,mit,hynes,masta,Wasma,Melwa,Dudly
half_trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
46202752.0,2020-01-01,1913,1900-01-01 06:49:49,1900-01-01 06:53:22,1900-01-01 06:58:01,1900-01-01 07:01:52,1900-01-01 07:05:57,1900-01-01 07:12:25,1900-01-01 07:14:42,1900-01-01 07:20:57,1900-01-01 07:21:42
46202754.0,2020-01-01,1294,1900-01-01 07:49:34,1900-01-01 07:53:04,1900-01-01 07:55:01,1900-01-01 07:57:18,1900-01-01 07:59:36,1900-01-01 08:03:12,1900-01-01 08:06:34,1900-01-01 08:10:23,1900-01-01 08:11:08
46202756.0,2020-01-01,1625,1900-01-01 08:57:33,1900-01-01 09:01:42,1900-01-01 09:04:40,1900-01-01 09:09:07,1900-01-01 09:12:35,1900-01-01 09:14:51,1900-01-01 09:18:13,1900-01-01 09:23:52,1900-01-01 09:24:38
46202758.0,2020-01-01,1862,1900-01-01 10:19:20,1900-01-01 10:22:57,1900-01-01 10:26:10,1900-01-01 10:29:26,1900-01-01 10:33:48,1900-01-01 10:38:45,1900-01-01 10:44:08,1900-01-01 10:49:34,1900-01-01 10:50:22
46202760.0,2020-01-01,1935,1900-01-01 11:54:27,1900-01-01 11:59:07,1900-01-01 12:02:49,1900-01-01 12:07:55,1900-01-01 12:13:00,1900-01-01 12:17:51,1900-01-01 12:21:24,1900-01-01 12:26:08,1900-01-01 12:26:42
...,...,...,...,...,...,...,...,...,...,...,...
47214043.0,2020-03-31,1544,1900-01-01 18:42:25,1900-01-01 18:45:12,1900-01-01 18:48:43,1900-01-01 18:52:28,1900-01-01 18:55:51,1900-01-01 19:00:21,1900-01-01 19:04:14,1900-01-01 19:07:41,1900-01-01 19:08:09
47215034.0,2020-03-31,2580,1900-01-01 18:01:23,1900-01-01 18:24:11,1900-01-01 18:25:53,1900-01-01 18:28:47,1900-01-01 18:30:41,1900-01-01 18:34:39,1900-01-01 18:40:15,1900-01-01 18:43:50,1900-01-01 18:44:23
47215458.0,2020-03-31,1688,1900-01-01 09:46:36,1900-01-01 09:48:57,1900-01-01 09:53:11,1900-01-01 09:55:44,1900-01-01 09:59:17,1900-01-01 10:03:56,1900-01-01 10:09:08,1900-01-01 10:14:10,1900-01-01 10:14:44
47215721.0,2020-03-31,1308,1900-01-01 09:08:55,1900-01-01 09:10:13,1900-01-01 09:12:49,1900-01-01 09:16:22,1900-01-01 09:19:47,1900-01-01 09:23:03,1900-01-01 09:26:39,1900-01-01 09:30:07,1900-01-01 09:30:43


In [25]:
# Process every raw CSV in `directory`, accumulate the extracted trips in
# `output`, and delete each raw file once its trips have been saved.
directory =  "../raw_data/"
output = "../data/one_trips.csv"

# Make sure the destination folder exists before the first write.
os.makedirs(os.path.dirname(output), exist_ok=True)

# sorted() makes the processing order deterministic (os.listdir order is arbitrary).
for file in sorted(os.listdir(directory)):
    if not file.lower().endswith('.csv'):
        continue

    raw_path = os.path.join(directory, file)
    print(f"Processing file {raw_path}")
    processed = process_file(raw_path)

    if os.path.exists(output):
        old_data = pd.read_csv(output, index_col="half_trip_id")
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        # verify_integrity raises ValueError if a half_trip_id is already in
        # the output, which guards against processing the same data twice.
        merged_data = pd.concat([old_data, processed], verify_integrity=True)
        merged_data.to_csv(output)
    else:
        processed.to_csv(output)

    # Only reached when the write above succeeded, so a failure never costs
    # the raw file.
    os.remove(raw_path)
    print("File processed!\n")

Processing file ../raw_data/MBTA Bus Arrival Departure Jan-Mar 2019.csv
Before cleaning, 8741 trips.
After cleaning, 8090 trips.
File processed!

Processing file ../raw_data/MBTA Bus Arrival Departure Apr-June 2019.csv
Before cleaning, 8855 trips.
After cleaning, 7888 trips.
File processed!

Processing file ../raw_data/MBTA Bus Arrival Departure Jul-Sept 2019.csv
Before cleaning, 9462 trips.
After cleaning, 8632 trips.
File processed!

Processing file ../raw_data/MBTA Bus Arrival Departure Oct-Dec 2019.csv
Before cleaning, 10415 trips.
After cleaning, 9566 trips.
File processed!

