In [1]:
import pandas as pd
from pandas import Timestamp
import datetime as datetime

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

In [2]:
# GPS timestamps of buses recorded at the Amsterdam & 125th St stop on the M100 line going uptown.
busArrivals = pd.read_csv("../../data/Arrivals_M100.csv")

# loopTime is the minimum amount of time, in minutes, that it takes a bus to complete the route
# and arrive back at this stop to start the circuit once again.
loopTime = datetime.timedelta(minutes=105)

# Keep only the columns used below, parse the timestamps, then order by VehicleRef (vehicle
# identifier) and RecordedAtTime. The timestamps are parsed BEFORE sorting so that the ordering is
# chronological instead of a lexicographic ordering of the raw strings. The index is rebuilt so
# that the positional shifts further down line up with the sorted rows.
busArrivals = (
    busArrivals.loc[:, ['RecordedAtTime', 'VehicleRef']]
    .assign(RecordedAtTime=lambda frame: pd.to_datetime(frame['RecordedAtTime']))
    .sort_values(['VehicleRef', 'RecordedAtTime'])
    .reset_index(drop=True)
)

# Difference between the CURRENT and the PREVIOUS timestamp of the SAME vehicle.
# The first record of every vehicle (including the very first row) has no predecessor, so its
# delta is set to a zero Timedelta rather than being measured against another vehicle's record.
# Using pd.Timedelta(0) (not the integer 0) keeps the column's timedelta dtype intact.
is_same_vehicle_as_previous = busArrivals['VehicleRef'] == busArrivals['VehicleRef'].shift(1)
busArrivals['timeDelta'] = (
    busArrivals['RecordedAtTime'].diff().where(is_same_vehicle_as_previous, pd.Timedelta(0))
)

# We want every timestamp at which a bus pulls away from this stop.
# The departure time is when a passenger is no longer waiting for their journey to start, hence
# bus idling at the stop counts as part of the passenger's experienced wait time.

# Wherever the gap between two consecutive timestamps of a vehicle is greater than loopTime, the
# bus has finished its route and come back to this stop; it was not idling in between.
busArrivals['hasLooped'] = busArrivals['timeDelta'] > loopTime

# Wherever the NEXT record is a loop-around, the current timestamp is a departure from the stop.
# fill_value=True marks the last row of the frame as a departure (nothing follows it) and keeps
# the column boolean instead of object dtype with a trailing NaN.
busArrivals['isDeparting'] = busArrivals['hasLooped'].shift(-1, fill_value=True)

# If the next record belongs to a different vehicle, the current record is that vehicle's last
# sighting and must also be counted as a departure.
busArrivals['NextVehicleRef'] = busArrivals['VehicleRef'].shift(-1).fillna("")
mask = busArrivals['VehicleRef'] != busArrivals['NextVehicleRef']
busArrivals.loc[mask, 'isDeparting'] = True

# Keep only the rows where a bus is departing.
busArrivals = busArrivals[busArrivals['isDeparting']]

(5398, 18)

In [3]:
# Display the (rows, columns) shape of busArrivals, which at this point holds only the departing rows.
busArrivals.shape

(2490, 6)