In [1]:
import pandas as pd

# Load the transit feed tables used throughout the notebook (the file names
# follow the GTFS layout).  The date columns of calendar.txt and
# calendar_dates.txt are stored as YYYYMMDD and are parsed into datetimes as
# part of the load.
trips = pd.read_csv("data/trips.txt")
calendar = pd.read_csv("data/calendar.txt").assign(
    start_date=lambda frame: pd.to_datetime(frame['start_date'], format='%Y%m%d'),
    end_date=lambda frame: pd.to_datetime(frame['end_date'], format='%Y%m%d'),
)
stoptimes = pd.read_csv("data/stop_times.txt")
calendar_dates = pd.read_csv("data/calendar_dates.txt").assign(
    date=lambda frame: pd.to_datetime(frame['date'], format='%Y%m%d'),
)
stops = pd.read_csv("data/stops.txt")

# Quick look at the first rows of the trips table.
print(trips.head())

   route_id  service_id    trip_id                       trip_headsign  \
0    100133       86832  347468088             Bellevue Transit Center   
1    102548       23519  347619778  Bellevue Transit Center Crossroads   
2    102548       23519  347619868  Bellevue Transit Center Crossroads   
3    102548       23519  347619998  Bellevue Transit Center Crossroads   
4    102548       23519  347620378  Bellevue Transit Center Crossroads   

                    tts_trip_headsign  trip_short_name  direction_id  \
0             bellevue transit center              NaN             0   
1  bellevue transit center crossroads              NaN             1   
2  bellevue transit center crossroads              NaN             1   
3  bellevue transit center crossroads              NaN             1   
4  bellevue transit center crossroads              NaN             1   

   block_id  shape_id  peak_flag  fare_id  wheelchair_accessible  \
0   7633168  11240002          0      101             

In [2]:
def getActiveDaysForTrip(trip_id):
    """Return the weekly service pattern of the service a trip runs under.

    The trip is looked up in the global ``trips`` frame to find its
    ``service_id``; the matching row of the global ``calendar`` frame then
    supplies the weekday flags and the date range.

    Parameters
    ----------
    trip_id
        Value matched against ``trips['trip_id']``.

    Returns
    -------
    dict or None
        ``None`` when no trip has this id.  Otherwise a dict with the keys
        ``'days'`` (capitalised weekday names whose calendar flag is truthy,
        Monday first), ``'start_date'``, ``'end_date'`` and ``'service_id'``.
        When the service has no row in ``calendar``, ``'days'`` is empty and
        both dates are ``None``.

    Notes
    -----
    Entries in ``calendar_dates`` are NOT folded into the result; only the
    weekly pattern from ``calendar`` is reported.
    """
    weekday_columns = ['monday', 'tuesday', 'wednesday', 'thursday',
                       'friday', 'saturday', 'sunday']

    trip = trips[trips['trip_id'] == trip_id]
    if trip.empty:
        # Unknown trip: nothing to report (previously an IndexError).
        return None
    service_id = trip['service_id'].values[0]
    print(f"Service ID for trip {trip_id} is {service_id}")

    calendar_service = calendar[calendar['service_id'] == service_id]
    if calendar_service.empty:
        # The service has no weekly pattern in ``calendar`` (previously an
        # IndexError); report an empty pattern instead of crashing.
        return {"days": [],
                'start_date': None,
                'end_date': None,
                'service_id': service_id,
                }

    # Translate the 0/1 weekday flags of the first matching row into names.
    flags = calendar_service[weekday_columns].values[0]
    active_days = [column.capitalize()
                   for column, active in zip(weekday_columns, flags)
                   if active]

    return {"days": active_days,
            'start_date': calendar_service['start_date'].values[0],
            'end_date': calendar_service['end_date'].values[0],
            'service_id': service_id,
            }

In [3]:
def getTripsForBlock(block_id):
    """Return the rows of the global ``trips`` frame whose ``block_id`` equals ``block_id``.

    The original row index is kept; an unknown block yields an empty frame.
    """
    in_block = trips['block_id'] == block_id
    return trips.loc[in_block]

In [4]:
def getTripIdsForBlock(block_id):
    """Return the ``trip_id`` values of the trips in a block as a plain list.

    Delegates the row selection to ``getTripsForBlock``; the list is empty
    when that selection is empty.
    """
    trip_id_column = getTripsForBlock(block_id)['trip_id']
    return trip_id_column.tolist()

In [5]:
def getExceptionsForService(service_id):
    """Return the ``calendar_dates`` exceptions of one service as a dict.

    Parameters
    ----------
    service_id
        Value matched against ``calendar_dates['service_id']``.

    Returns
    -------
    dict
        Maps each exception date to its ``exception_type`` as a Python
        ``int`` (1 for added service, 2 for removed service).  Empty when the
        service has no exceptions.  If a date appears more than once, the
        first row wins and later rows for that date are ignored.

    Notes
    -----
    The global ``calendar_dates`` frame is only read: the integer cast is
    applied to the selected rows, not written back to the shared frame.
    """
    matches = calendar_dates.loc[calendar_dates['service_id'] == service_id,
                                 ['date', 'exception_type']]
    dates = matches['date'].tolist()
    exception_types = matches['exception_type'].astype(int).tolist()

    exceptions_dict = {}
    for date, exception_type in zip(dates, exception_types):
        # Only one exception type is expected per date; keep the first seen.
        exceptions_dict.setdefault(date, exception_type)
    return exceptions_dict

In [6]:
def isTripActiveOnDate(trip_id, date):
    """Tell whether a trip's service runs on a given calendar date.

    Resolution order:

    1. An exception recorded for the date decides on its own: type 1 (added
       service) means active even when the weekday is not part of the weekly
       pattern, type 2 (removed service) means inactive.
    2. Otherwise the date must fall inside the service's
       ``start_date``..``end_date`` range (inclusive) and its weekday flag in
       ``calendar`` must be set.

    Parameters
    ----------
    trip_id
        Value matched against ``trips['trip_id']``.
    date
        Anything ``pd.Timestamp`` accepts (Timestamp, ``datetime``, ISO
        string).  Any time-of-day part is dropped before comparing.

    Returns
    -------
    bool
        ``False`` for an unknown trip, or for a service that has neither a
        matching exception nor a row in ``calendar``.
    """
    trip = trips[trips['trip_id'] == trip_id]
    if trip.empty:
        return False
    service_id = trip['service_id'].values[0]

    # Exceptions are keyed by midnight timestamps, so compare at day level.
    date = pd.Timestamp(date).normalize()

    # Explicit exceptions take precedence over the weekly pattern.
    exception_type = getExceptionsForService(service_id).get(date)
    if exception_type == 1:  # added service
        return True
    if exception_type == 2:  # removed service
        return False

    calendar_service = calendar[calendar['service_id'] == service_id]
    if calendar_service.empty:
        # No weekly pattern and no exception for this date.
        return False
    pattern = calendar_service.iloc[0]

    if date < pattern['start_date'] or date > pattern['end_date']:
        return False

    # The weekday flag columns are the lower-case English day names.
    return bool(pattern[date.day_name().lower()])


In [7]:
# Count the stop_times rows of each trip; the result has the columns
# trip_id and count.
tripScheduleCount = stoptimes.groupby('trip_id').size().reset_index(name='count')

In [8]:
# Show the trips that have exactly 6 stop_times rows.
tripScheduleCount[tripScheduleCount['count'] == 6]

Unnamed: 0,trip_id,count
13483,669642098,6
20038,694538678,6
29591,765831658,6
29594,765831688,6
29597,765831718,6
29598,765831728,6
29610,765831848,6
29613,765831878,6
29630,765843508,6
29635,765843558,6


In [9]:
def getTrip(trip_id):
    """Return the first ``trips`` row with the given ``trip_id`` as a Series.

    Returns ``None`` when no row of the global ``trips`` frame matches.
    """
    matches = trips.loc[trips['trip_id'] == trip_id]
    return None if matches.empty else matches.iloc[0]

In [10]:
# Stop-time rows of a single trip.
def getSchedule(trip_id):
    """Return the ``stoptimes`` rows of one trip, ordered by ``stop_sequence``.

    The original row index is kept; an unknown trip yields an empty frame.
    """
    trip_rows = stoptimes.loc[stoptimes['trip_id'] == trip_id]
    return trip_rows.sort_values('stop_sequence')


In [11]:
def getServiceID(tripid):
    """Return the ``service_id`` of a trip, or ``None`` if the trip is unknown.

    Parameters
    ----------
    tripid
        Value matched against ``trips['trip_id']``.

    Returns
    -------
    The ``service_id`` of the first matching row of the global ``trips``
    frame, or ``None`` when no row matches.
    """
    trip = trips[trips['trip_id'] == tripid]
    # Filtering always yields a DataFrame (never None), so "not found" has to
    # be detected through emptiness.
    if trip.empty:
        return None
    return trip['service_id'].values[0]

In [12]:
def getCompleteBlockSchedule(block_id):
    """Return every stop-time row of a block, ordered by ``arrival_time``.

    The schedules of all trips in the block (as returned by ``getSchedule``,
    i.e. ordered by ``stop_sequence``) are concatenated and then sorted by
    ``arrival_time``.  The sort is stable, so rows sharing an arrival time
    keep their per-trip ``stop_sequence`` order.

    Parameters
    ----------
    block_id
        Block whose trips are looked up through ``getTripIdsForBlock``.

    Returns
    -------
    pandas.DataFrame
        Rows with a fresh 0..n-1 index.  For a block without trips an empty
        frame with the columns of the global ``stoptimes`` frame is returned.

    Notes
    -----
    ``arrival_time`` is compared as stored in ``stoptimes``; if the column
    holds text, that ordering is lexicographic.
    """
    block_schedule = [getSchedule(trip_id) for trip_id in getTripIdsForBlock(block_id)]
    if not block_schedule:
        # pd.concat refuses an empty list; hand back an empty, correctly
        # shaped frame instead.
        return stoptimes.iloc[0:0].reset_index(drop=True)
    return (
        pd.concat(block_schedule)
        .sort_values(by='arrival_time', kind='stable')
        .reset_index(drop=True)
    )

In [13]:
def getServiceIdsForBlock(block_id):
    """Return the ``trip_id`` and ``service_id`` columns of a block's trips.

    The rows come from ``getTripsForBlock`` and keep their original index.
    """
    block_trips = getTripsForBlock(block_id)
    wanted_columns = ['trip_id', 'service_id']
    return block_trips[wanted_columns]

In [14]:
# Print the stop_times rows of trip 765895068 in stop_sequence order.
print(getSchedule(765895068))


           trip_id arrival_time departure_time  stop_id  stop_sequence  \
1037236  765895068     16:42:00       16:42:00    64592              1   
1037237  765895068     16:49:00       16:49:00    64476            140   
1037238  765895068     17:00:00       17:00:00    64477            286   
1037239  765895068     17:07:00       17:07:00    64478            477   
1037240  765895068     17:11:28       17:11:28    64484            535   
1037241  765895068     17:15:00       17:15:00    65140            590   

                      stop_headsign  pickup_type  drop_off_type  \
1037236                         NaN            0              0   
1037237                         NaN            0              0   
1037238                         NaN            0              0   
1037239                         NaN            0              0   
1037240                         NaN            0              0   
1037241  Mount Baker Transit Center            0              0   

         sh

{
    st1{
        arrival 1751672580 4:43
        departure 1751672580 4:43
        stop id 64592
    }
    crnt 4:44
    st2{
        arrival 1751673300 4:45
        departure 1751673300 4:45
        stop id 64476            
    }
    st2{
        arrival 1751673900 5:05
        departure 1751673900 5:05
        stop id 64592
    }

    delay 30
}

In [15]:
# Print the active weekdays, date range and service id of the service that
# trip 765895068 runs under.
print(getActiveDaysForTrip(765895068))

Service ID for trip 765895068 is 789
{'days': ['Friday'], 'start_date': np.datetime64('2025-07-04T00:00:00.000000000'), 'end_date': np.datetime64('2025-07-04T00:00:00.000000000'), 'service_id': np.int64(789)}


In [16]:
# Print the calendar_dates exceptions of service 789 (date -> exception_type).
print(getExceptionsForService(789))

{}


In [17]:
# Print the trips row of trip 765895068 (None would be printed for an unknown id).
print(getTrip(765895068))

route_id                        102646
service_id                         789
trip_id                      765895068
trip_headsign            Issaquah Alps
tts_trip_headsign        Issaquah Alps
trip_short_name                    NaN
direction_id                         0
block_id                       7660204
shape_id                      41634002
peak_flag                            0
fare_id                            101
wheelchair_accessible                1
bikes_allowed                        1
Name: 27563, dtype: object


In [18]:
# Print the ids of all trips assigned to block 7660204.
print(getTripIdsForBlock(7660204))

[765895128, 765895158, 765895118, 765895068, 765895098, 765895178, 765895318, 765895348, 765895468]


In [19]:
# Show the stops row for stop_id 64592.
stops[stops['stop_id'] == 64592]

Unnamed: 0,stop_id,stop_code,stop_name,tts_stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
4400,64592,64592,Issaquah Transit Center - Bay 5,Issaquah Transit Center Bay Five,,47.542248,-122.062225,19,,0,,America/Los_Angeles,1


In [20]:
# Build the complete schedule of block 7660204 and write all of its rows to
# CSV (without the index column), so the whole table can be inspected rather
# than a truncated preview.
schedule = getCompleteBlockSchedule(7660204)
schedule.to_csv("complete_block_schedule.csv", index=False)

In [21]:
# Print the trip_id / service_id pairs of the trips in block 7660204.
print(getServiceIdsForBlock(7660204))

         trip_id  service_id
25938  765895128         789
25939  765895158         789
26471  765895118         789
27563  765895068         789
27577  765895098         789
27579  765895178         789
27580  765895318         789
27767  765895348         789
29811  765895468         789


In [22]:
# Stop-time sequence of a block, restricted to one service.
def getTripsSequenceForBlock(block_id , service_id, verbose=True):
    """Return the stop-time rows of a block's trips that run under one service.

    The schedules of the matching trips (as returned by ``getSchedule``) are
    concatenated and sorted by ``arrival_time``.  The sort is stable, so rows
    sharing an arrival time keep their per-trip ``stop_sequence`` order.

    Parameters
    ----------
    block_id
        Block whose trips are looked up through ``getTripsForBlock``.
    service_id
        Only trips with this ``service_id`` are included.
    verbose : bool, default True
        When true, print the columns of the block's trips and the distinct
        service ids found in the block before filtering (the diagnostic
        output this function has always produced).  Pass ``False`` to
        silence it.

    Returns
    -------
    pandas.DataFrame
        Rows with a fresh 0..n-1 index.  When no trip of the block runs under
        ``service_id``, an empty frame with the columns of the global
        ``stoptimes`` frame is returned.
    """
    block_trips = getTripsForBlock(block_id)
    if verbose:
        print(block_trips.columns)
        # lets see how many service ids are there
        print(block_trips['service_id'].unique())

    service_trips = block_trips[block_trips['service_id'] == service_id]
    trip_sequence = [getSchedule(trip_id) for trip_id in service_trips['trip_id']]
    if not trip_sequence:
        # pd.concat refuses an empty list; hand back an empty, correctly
        # shaped frame instead.
        return stoptimes.iloc[0:0].reset_index(drop=True)
    return (
        pd.concat(trip_sequence)
        .sort_values(by='arrival_time', kind='stable')
        .reset_index(drop=True)
    )

In [23]:
# Write the service-789 stop times of block 7660204 to CSV.
# NOTE: getTripsSequenceForBlock already sorts by arrival_time and resets the
# index before returning, so the sort_values/reset_index chained here repeats
# that step.
getTripsSequenceForBlock(7660204 , 789).sort_values(by="arrival_time").reset_index(drop=True).to_csv("trips_sequence_for_block7660204.csv", index=False)

Index(['route_id', 'service_id', 'trip_id', 'trip_headsign',
       'tts_trip_headsign', 'trip_short_name', 'direction_id', 'block_id',
       'shape_id', 'peak_flag', 'fare_id', 'wheelchair_accessible',
       'bikes_allowed'],
      dtype='object')
[789]


In [24]:
# Re-read trips.txt, rebinding the global `trips` that the helper functions
# read, then show the trips that run under service 789.
trips = pd.read_csv("data/trips.txt")
trips[trips["service_id"] == 789]


Unnamed: 0,route_id,service_id,trip_id,trip_headsign,tts_trip_headsign,trip_short_name,direction_id,block_id,shape_id,peak_flag,fare_id,wheelchair_accessible,bikes_allowed
25283,100336,789,762848898,Downtown Seattle Water Taxi,Downtown Seattle Water Taxi,,1,7653694,40973001,0,700,1,1
25284,100336,789,762848968,West Seattle Water Taxi,West Seattle Water Taxi,,0,7653694,31973002,0,700,1,1
25285,100336,789,762849028,West Seattle Water Taxi,West Seattle Water Taxi,,0,7653694,31973002,0,700,1,1
25286,100272,789,762849108,Seacrest Marina,Seacrest Marina,,0,7653695,11773004,0,300,1,1
25287,100481,789,762849118,Alki,Alkeye,,0,7653695,11775001,0,300,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29807,102646,789,765895038,Issaquah Alps,Issaquah Alps,,0,7660202,41634002,0,101,1,1
29808,102650,789,765895088,Mount Si/Teneriffe,Mount Sigh slash Teneriffe,,0,7660200,41636007,0,101,1,1
29809,102646,789,765895288,Issaquah Alps,Issaquah Alps,,0,7660206,41634007,0,101,1,1
29810,102646,789,765895418,Issaquah Alps,Issaquah Alps,,0,7660208,41634002,0,101,1,1


In [25]:
# Trips of service 789, then narrowed to those assigned to block 7653694.
tripsForService789 = trips[trips["service_id"] == 789]
trip789b7653694 =  tripsForService789[tripsForService789["block_id"] == 7653694]

In [26]:
# Re-read stop_times.txt, rebinding the global `stoptimes`, and keep only the
# rows whose trip_id is among the trips in `trip789b7653694`.
stoptimes = pd.read_csv("data/stop_times.txt")
stopsForReqTrips = stoptimes[stoptimes['trip_id'].isin(trip789b7653694['trip_id'].tolist())]

In [27]:
# Export the selected stop times to CSV, ordered by arrival_time.
stopsForReqTrips.sort_values(by='arrival_time').to_csv("stops_for_req_trips.csv", index=False)

In [28]:
# Show the trips row for trip 694056178.
trips[trips['trip_id'] == 694056178]

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,tts_trip_headsign,trip_short_name,direction_id,block_id,shape_id,peak_flag,fare_id,wheelchair_accessible,bikes_allowed
15029,102736,86832,694056178,Burien Transit Center Westwood Village,burien transit center westwood village,,0,7631599,21678004,0,101,1,1


In [29]:
# Show the trip_id / service_id pairs of the trips in block 7631599.
getServiceIdsForBlock(7631599)

Unnamed: 0,trip_id,service_id
15029,694056178,86832
15031,694056348,86832
16337,694056038,86832
16338,694056048,86832
16700,694055998,86832
16873,694056188,86832
17061,694057118,86832
17070,694057698,86832
17352,694055988,86832
19168,694057048,86832


In [30]:
def getServiceInfo(service_id):
    """Summarise one service: date range, active weekdays and exceptions.

    Reads the first row of the global ``calendar`` frame whose ``service_id``
    matches and combines it with the result of ``getExceptionsForService``.

    Returns ``None`` when ``calendar`` has no row for the service; otherwise
    a dict with the keys ``'service_id'``, ``'start_date'``, ``'end_date'``,
    ``'active_days'`` (capitalised weekday names whose flag is truthy, Monday
    first) and ``'exceptions'``.
    """
    matches = calendar.loc[calendar['service_id'] == service_id]
    if matches.empty:
        return None
    row = matches.iloc[0]

    # Flag columns in week order; their capitalised names are the labels.
    weekday_columns = ('monday', 'tuesday', 'wednesday', 'thursday',
                       'friday', 'saturday', 'sunday')
    return {
        'service_id': service_id,
        'start_date': row['start_date'],
        'end_date': row['end_date'],
        'active_days': [column.capitalize() for column in weekday_columns if row[column]],
        'exceptions': getExceptionsForService(service_id)
    }

In [31]:
# print(getTripsSequenceForBlock(7631599,86832))
# Print the ids of all trips assigned to block 7631599.
print(getTripIdsForBlock(7631599))

[694056178, 694056348, 694056038, 694056048, 694055998, 694056188, 694057118, 694057698, 694055988, 694057048, 694056028, 694057208, 694057418, 694057598, 694056888, 732459508, 732459518, 732459438]


In [32]:
# Print the date range, active weekdays and exceptions of service 86832.
print(getServiceInfo(86832))

{'service_id': 86832, 'start_date': Timestamp('2025-05-01 00:00:00'), 'end_date': Timestamp('2025-08-29 00:00:00'), 'active_days': ['Tuesday', 'Wednesday', 'Thursday'], 'exceptions': {Timestamp('2025-05-26 00:00:00'): 2, Timestamp('2025-08-04 00:00:00'): 1, Timestamp('2025-08-11 00:00:00'): 1, Timestamp('2025-06-02 00:00:00'): 1, Timestamp('2025-07-28 00:00:00'): 1, Timestamp('2025-05-19 00:00:00'): 1, Timestamp('2025-07-21 00:00:00'): 1, Timestamp('2025-05-12 00:00:00'): 1, Timestamp('2025-07-07 00:00:00'): 1, Timestamp('2025-07-14 00:00:00'): 1, Timestamp('2025-05-05 00:00:00'): 1, Timestamp('2025-06-30 00:00:00'): 1, Timestamp('2025-08-25 00:00:00'): 1, Timestamp('2025-06-16 00:00:00'): 1, Timestamp('2025-06-23 00:00:00'): 1, Timestamp('2025-08-18 00:00:00'): 1, Timestamp('2025-06-09 00:00:00'): 1, Timestamp('2025-07-04 00:00:00'): 2, Timestamp('2025-06-20 00:00:00'): 1, Timestamp('2025-08-15 00:00:00'): 1, Timestamp('2025-06-06 00:00:00'): 1, Timestamp('2025-08-01 00:00:00'): 1, Ti

In [33]:
# Export the complete schedule of block 7631599 to CSV (index column omitted).
getCompleteBlockSchedule(7631599).to_csv("complete_block_schedule_7631599.csv", index=False)

In [34]:
# Show the trips row for trip 694608358.
trips[trips['trip_id'] == 694608358]

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,tts_trip_headsign,trip_short_name,direction_id,block_id,shape_id,peak_flag,fare_id,wheelchair_accessible,bikes_allowed
19061,100202,17361,694608358,Northgate Station NW Hospital,northgate station northwest hospital,,1,7633720,20345002,0,101,1,1


In [35]:
# Export the complete schedule of block 7633720 to CSV (index column omitted).
getCompleteBlockSchedule(7633720).to_csv("complete_block_schedule_7633720_sorted.csv", index=False)

In [36]:
def generateBlockScheduleForTrip(trip_id):
    """Return ``(schedule, block_id)`` for the block a trip belongs to.

    The block id is taken from the first row of the global ``trips`` frame
    matching ``trip_id``; the schedule comes from ``getCompleteBlockSchedule``.
    """
    block_id = trips.loc[trips['trip_id'] == trip_id, 'block_id'].values[0]
    return getCompleteBlockSchedule(block_id), block_id

In [37]:
def getRouteForTrip(trip_id):
    """Return the ``route_id`` of the first ``trips`` row matching ``trip_id``."""
    matching_routes = trips.loc[trips['trip_id'] == trip_id, 'route_id']
    return matching_routes.values[0]

In [38]:
# Export the complete schedule of the block containing trip 630541968; the
# file name carries that block's id.
sche , block =  generateBlockScheduleForTrip(630541968)
sche.to_csv(f"complete_block_schedule_{block}.csv", index=False)

In [39]:

# Export the complete schedule of the block containing trip 721361888; the
# file name carries that block's id.
sche , block =  generateBlockScheduleForTrip(721361888)
sche.to_csv(f"complete_block_schedule_{block}.csv", index=False)

In [40]:
# Export the complete schedule of the block containing trip 686405558; the
# file name carries that block's id.
sche , block =  generateBlockScheduleForTrip(686405558)
sche.to_csv(f"complete_block_schedule_{block}.csv", index=False)

In [41]:
# Show the route id of trip 724948068.
getRouteForTrip(724948068)

np.int64(100254)

In [42]:
# Show the date range, active weekdays and exceptions of service 86832.
getServiceInfo(86832)

{'service_id': 86832,
 'start_date': Timestamp('2025-05-01 00:00:00'),
 'end_date': Timestamp('2025-08-29 00:00:00'),
 'active_days': ['Tuesday', 'Wednesday', 'Thursday'],
 'exceptions': {Timestamp('2025-05-26 00:00:00'): 2,
  Timestamp('2025-08-04 00:00:00'): 1,
  Timestamp('2025-08-11 00:00:00'): 1,
  Timestamp('2025-06-02 00:00:00'): 1,
  Timestamp('2025-07-28 00:00:00'): 1,
  Timestamp('2025-05-19 00:00:00'): 1,
  Timestamp('2025-07-21 00:00:00'): 1,
  Timestamp('2025-05-12 00:00:00'): 1,
  Timestamp('2025-07-07 00:00:00'): 1,
  Timestamp('2025-07-14 00:00:00'): 1,
  Timestamp('2025-05-05 00:00:00'): 1,
  Timestamp('2025-06-30 00:00:00'): 1,
  Timestamp('2025-08-25 00:00:00'): 1,
  Timestamp('2025-06-16 00:00:00'): 1,
  Timestamp('2025-06-23 00:00:00'): 1,
  Timestamp('2025-08-18 00:00:00'): 1,
  Timestamp('2025-06-09 00:00:00'): 1,
  Timestamp('2025-07-04 00:00:00'): 2,
  Timestamp('2025-06-20 00:00:00'): 1,
  Timestamp('2025-08-15 00:00:00'): 1,
  Timestamp('2025-06-06 00:00:00')

In [43]:
# Show the service id of trip 686405558.
getServiceID(686405558)

np.int64(23519)

In [44]:
# Show the date range, active weekdays and exceptions of service 23519.
getServiceInfo(23519)

{'service_id': 23519,
 'start_date': Timestamp('2025-05-03 00:00:00'),
 'end_date': Timestamp('2025-08-23 00:00:00'),
 'active_days': ['Saturday'],
 'exceptions': {}}

In [45]:
# Export the complete schedule of the block containing trip 686404378.
# The file label is written as plain text: inside f-string braces,
# 1_686404378 is an integer literal with a digit separator and would be
# rendered as 1686404378.
# NOTE(review): if the block id was meant here (as in the sibling export
# cells), use f"complete_block_schedule_{block}.csv" instead.
sche , block =  generateBlockScheduleForTrip(686404378)
sche.to_csv("complete_block_schedule_1_686404378.csv", index=False)

In [46]:
# Show the service id of trip 534531868.
getServiceID(534531868)

np.int64(23519)

In [47]:
# Build one complete schedule per distinct block_id found in trips.
blockIds = trips['block_id'].unique()
schedules = list(map(getCompleteBlockSchedule, blockIds))
# Order the schedules by their largest arrival_time, smallest first.
schedules_sorted = sorted(
    schedules,
    key=lambda block_schedule: block_schedule['arrival_time'].max(),
)

In [50]:
# Last element of the sorted list: the schedule with the greatest maximum
# arrival_time.
schedules_sorted[-1]

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint
0,686482188,08:49:00,08:49:00,41902,1,,0,0,0.0,1
1,686482188,08:50:31,08:50:31,41904,18,,0,0,1494.2,0
2,686482188,08:52:13,08:52:13,41908,31,,0,0,3150.6,0
3,686482188,08:53:18,08:53:18,41965,37,,0,0,4211.8,0
4,686482188,08:54:08,08:54:08,41970,40,,0,0,5040.1,0
...,...,...,...,...,...,...,...,...,...,...
686,727421178,29:18:59,29:18:59,41980,159,Capitol Hill,0,0,23486.2,0
687,727421178,29:19:40,29:19:40,41982,162,Capitol Hill,0,0,24008.4,0
688,727421178,29:21:04,29:21:04,41986,168,Capitol Hill,0,0,25090.7,0
689,727421178,29:23:03,29:23:03,41987,181,Capitol Hill,0,0,26611.0,0


In [52]:
# Show the date range, active weekdays and exceptions of the service that
# trip 727421178 runs under.
getServiceInfo(getServiceID(727421178))

{'service_id': np.int64(23519),
 'start_date': Timestamp('2025-05-03 00:00:00'),
 'end_date': Timestamp('2025-08-23 00:00:00'),
 'active_days': ['Saturday'],
 'exceptions': {}}