In [1]:
import pandas as pd
from pandas import Timestamp
import datetime as datetime

In [27]:
#this is a dataset for the timestamps of busses at the Amsterdam & 125th st stop on M100 line going uptown 
busArrivals = pd.read_csv("./data/Arrivals_M100.csv")

# loopTime is the minimum amount of time, in minutes, that it takes a bus to complete the bus route and 
    # arrive at this stop to complete the circuit once again
loopTime=datetime.timedelta(minutes=120)

# Ensure ordering by timestamp
busArrivals = busArrivals.loc[:,['RecordedAtTime','VehicleRef']].sort_values('RecordedAtTime')

#==== Test Sample of busArrivals
#busArrivals = busArrivals.sort_values(['VehicleRef','RecordedAtTime']).head(10)
#====

# Resetting Index and deleting resulting index column after ordering for shift later on.
busArrivals = busArrivals.reset_index()
busArrivals.drop(columns=['index',],inplace=True)

# Ensure that RecordedAtTime is of correct data type
busArrivals['RecordedAtTime'] = pd.to_datetime(busArrivals['RecordedAtTime'])

# group by individual bus (not the line, but the vehicle itself)
busArrivals_Grouped = busArrivals.groupby('VehicleRef')

# find difference between CURRENT timestamp and PREVIOUS for each gps-timestamp
busArrivals['timeDelta'] = busArrivals_Grouped['RecordedAtTime'].diff()

# we want to find all the timestamps where busses pull away from this one stop. 
    # the departure time is when we consider that a passenger is no longer waiting for their journey to start. 
    # hence, we count bus idleing as part of the passengers experienced wait time

# wherever the difference between two consecutive timestamps is greater than the loopTime, 
    # the bus has finished it's route and come back to the same stop it started at.
    # The bus is not idleing.  
busArrivals['hasLooped'] = busArrivals['timeDelta'] > loopTime

#fixing some edge cases e.g. the first datapoint has no timedeta because no other time precedes it
busArrivals.loc[0,'timeDelta'] = 0
busArrivals.loc[busArrivals['timeDelta'].isnull(),'hasLooped'] = True

# wherever the next arrival is a Looparound, the current timestamp is considered a departure from the stop
busArrivals['isDeparting'] = busArrivals['hasLooped'].shift(-1)

busArrivals.head(20)
busArrivals[busArrivals['isDeparting'] != False].head(10)

Unnamed: 0,RecordedAtTime,VehicleRef,timeDelta,hasLooped,isDeparting
0,2017-08-01 00:11:39,NYCT_4349,0,False,True
1,2017-08-01 00:21:06,NYCT_4368,NaT,True,True
4,2017-08-01 01:01:11,NYCT_8375,0 days 00:09:19,False,True
13,2017-08-01 06:01:49,NYCT_8368,0 days 00:00:00,False,True
16,2017-08-01 06:11:35,NYCT_4348,0 days 00:00:00,False,True
17,2017-08-01 06:41:31,NYCT_4363,NaT,True,True
18,2017-08-01 06:51:52,NYCT_8366,NaT,True,True
19,2017-08-01 07:02:15,NYCT_4372,NaT,True,True
20,2017-08-01 07:11:14,NYCT_8391,NaT,True,True
24,2017-08-01 07:22:23,NYCT_8368,0 days 00:00:00,False,True
