In [1]:
import numpy as np
import pandas as pd 
import os
import datetime

# Load stop_times.txt: one row per (trip, stop) with the scheduled arrival/departure times.
# stop_id is the feed's unique stop identifier; it is NOT the public stop number shown on the
# TTC website (stops.txt provides that mapping). stop_sequence is the position of the stop
# within the trip, counted from the trip's first stop.
data_folder = "./data/sampleDataset/gtfs2024-06-23To2024-07-27/"
stop_times_path = os.path.join(data_folder, "stop_times.txt")
stop_times_df = pd.read_csv(stop_times_path)
stop_times_df.head(10)

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
0,47366437,18:45:00,18:45:00,14155,1,,0,0,
1,47366437,18:46:06,18:46:06,3807,2,,0,0,0.3546
2,47366437,18:46:50,18:46:50,6904,3,,0,0,0.5903
3,47366437,18:47:59,18:47:59,1163,4,,0,0,0.9613
4,47366437,18:49:00,18:49:00,7723,5,,0,0,1.2849
5,47366437,18:50:08,18:50:08,2498,6,,0,0,1.6424
6,47366437,18:50:40,18:50:40,805,7,,0,0,1.8512
7,47366437,18:51:09,18:51:09,1047,8,,0,0,2.0416
8,47366437,18:51:40,18:51:40,8848,9,,0,0,2.2421
9,47366437,18:52:58,18:52:58,4206,10,,0,0,2.7426


In [2]:
# Load stops.txt, which maps the feed's stop_id to the public stop_code
# shown on https://www.ttc.ca/routes-and-schedules
stops_path = os.path.join(data_folder, "stops.txt")
stops_df = pd.read_csv(stops_path)
stops_df.head(10)

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
0,262,662,Danforth Rd at Kennedy Rd,,43.714379,-79.260939,,,,,,1
1,263,929,Davenport Rd at Bedford Rd,,43.674448,-79.399659,,,,,,1
2,264,940,Davenport Rd at Dupont St,,43.675511,-79.401938,,,,,,2
3,265,1871,Davisville Ave at Cleveland St,,43.702088,-79.378112,,,,,,1
4,266,11700,Disco Rd at Attwell Dr,,43.701362,-79.594843,,,,,,1
5,267,3478,Disco Rd at Attwell Dr,,43.701043,-79.595806,,,,,,1
6,268,3479,Disco Rd at Carlingview Dr East Side,,43.69981,-79.600769,,,,,,1
7,269,3480,Disco Rd at Carlingview Dr,,43.699929,-79.601176,,,,,,1
8,270,14258,Don Mills Rd at Eglinton Ave East,,43.720674,-79.339121,,,,,,1
9,271,9125,Don Mills Rd at Eglinton Ave East North Side,,43.721373,-79.338926,,,,,,1


In [3]:
# Load trips.txt: each trip_id maps to a route_id (and to the service_id it runs under)
trips_path = os.path.join(data_folder, "trips.txt")
trips_df = pd.read_csv(trips_path)
trips_df.head(10)

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible,bikes_allowed
0,72011,1,47366443,EAST - 10 VAN HORNE towards VICTORIA PARK,,0,2059558,1018209,1,1
1,72011,1,47366444,EAST - 10 VAN HORNE towards VICTORIA PARK,,0,2059558,1018210,1,1
2,72011,1,47366450,EAST - 10 VAN HORNE towards VICTORIA PARK,,0,2059558,1018210,1,1
3,72011,1,47366449,EAST - 10 VAN HORNE towards VICTORIA PARK,,0,2059558,1018210,1,1
4,72011,1,47366448,EAST - 10 VAN HORNE towards VICTORIA PARK,,0,2059558,1018210,1,1
5,72011,1,47366447,EAST - 10 VAN HORNE towards VICTORIA PARK,,0,2059558,1018210,1,1
6,72011,1,47366446,EAST - 10 VAN HORNE towards VICTORIA PARK,,0,2059558,1018210,1,1
7,72011,1,47366455,EAST - 10 VAN HORNE towards VICTORIA PARK,,0,2059558,1018210,1,1
8,72011,1,47366445,EAST - 10 VAN HORNE towards VICTORIA PARK,,0,2059558,1018210,1,1
9,72011,1,47366440,EAST - 10 VAN HORNE towards VICTORIA PARK,,0,2059557,1018210,1,1


In [4]:
# Load routes.txt, which maps route_id to the public bus number (route_short_name)
routes_path = os.path.join(data_folder, "routes.txt")
routes_df = pd.read_csv(routes_path)
routes_df.head(10)

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
0,72234,1,1,LINE 1 (YONGE-UNIVERSITY),,1,,D5C82B,000000
1,72011,1,10,VAN HORNE,,3,,FF0000,FFFFFF
2,72012,1,100,FLEMINGDON PARK,,3,,FF0000,FFFFFF
3,72013,1,101,DOWNSVIEW PARK,,3,,FF0000,FFFFFF
4,72014,1,102,MARKHAM RD.,,3,,FF0000,FFFFFF
5,72015,1,104,FAYWOOD,,3,,FF0000,FFFFFF
6,72016,1,105,DUFFERIN NORTH,,3,,FF0000,FFFFFF
7,72017,1,106,SENTINEL,,3,,FF0000,FFFFFF
8,72018,1,107,YORK UNIVERSITY HEIGHTS,,3,,FF0000,FFFFFF
9,72019,1,108,DRIFTWOOD,,3,,FF0000,FFFFFF


In [5]:
# Load calendar.txt: for every service_id, the days of the week it runs on and the
# start/end date of the schedule period
calendar_df = pd.read_csv(os.path.join(data_folder, "calendar.txt"))

# start_date and end_date are stored as YYYYMMDD; parse both into datetime values
for date_column in ("start_date", "end_date"):
    calendar_df[date_column] = pd.to_datetime(calendar_df[date_column], format="%Y%m%d")
calendar_df.head(10)

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
0,1,1,1,1,1,1,0,0,2024-06-23,2024-07-27
1,2,0,0,0,0,0,1,0,2024-06-23,2024-07-27
2,3,0,0,0,0,0,0,1,2024-06-23,2024-07-27
3,4,0,0,0,0,0,0,0,2024-06-23,2024-07-27
4,501,0,0,0,0,0,0,0,2024-06-23,2024-07-27
5,4401,0,0,0,0,0,0,0,2024-06-23,2024-07-27
6,4501,0,0,0,0,0,0,0,2024-06-23,2024-07-27
7,2901,0,0,0,0,0,0,0,2024-06-23,2024-07-27
8,502,0,0,0,0,0,0,0,2024-06-23,2024-07-27


In [6]:
# filter for service_id != 1, 2, 3

From the above, we can see that `service_id == 1` is the Monday-to-Friday schedule, `service_id == 2` is the Saturday schedule, and `service_id == 3` is the Sunday schedule.

In [7]:
# Load calendar_dates.txt, which lists special service days on which a service is
# added (exception_type 1) or cancelled (exception_type 2) -- for example, the normal
# service may be replaced on a holiday such as Canada Day.
# The date column (YYYYMMDD) is parsed to datetime, then the dates are collected into
# one list per (exception_type, service_id) pair.
calendar_dates_df = (
    pd.read_csv(os.path.join(data_folder, "calendar_dates.txt"))
    .assign(date=lambda raw: pd.to_datetime(raw["date"], format="%Y%m%d"))
    .groupby(["exception_type", "service_id"])["date"]
    .apply(list)
    .reset_index()
)

calendar_dates_df.head(10)

Unnamed: 0,exception_type,service_id,date
0,1,4,[2024-07-01 00:00:00]
1,1,501,"[2024-06-24 00:00:00, 2024-06-25 00:00:00, 202..."
2,1,4401,"[2024-06-24 00:00:00, 2024-06-26 00:00:00, 202..."
3,1,4501,"[2024-06-25 00:00:00, 2024-06-28 00:00:00, 202..."
4,2,1,[2024-07-01 00:00:00]


In [8]:
# services that are ADDED on specific dates (exception_type 1)
is_added = calendar_dates_df["exception_type"] == 1
add_schedule = calendar_dates_df[is_added]

# services that are REMOVED on specific dates (exception_type 2);
# these dates are deleted from the regular schedule lists later
is_removed = calendar_dates_df["exception_type"] == 2
replace_schedule = calendar_dates_df[is_removed]

In [9]:
# Preview the exception_type == 1 rows (one row per service_id, with its list of dates).
add_schedule.head(10)

Unnamed: 0,exception_type,service_id,date
0,1,4,[2024-07-01 00:00:00]
1,1,501,"[2024-06-24 00:00:00, 2024-06-25 00:00:00, 202..."
2,1,4401,"[2024-06-24 00:00:00, 2024-06-26 00:00:00, 202..."
3,1,4501,"[2024-06-25 00:00:00, 2024-06-28 00:00:00, 202..."


In [10]:
# Preview the exception_type == 2 rows (one row per service_id, with its list of dates).
replace_schedule.head(10)

Unnamed: 0,exception_type,service_id,date
4,2,1,[2024-07-01 00:00:00]


In [11]:
# Build one wide table by joining the GTFS files (all joins use pandas' default inner join,
# so rows without a match in the right-hand table are dropped):
#   stop_times -> stops    on stop_id
#              -> trips    on trip_id
#              -> routes   on route_id   (route_short_name is the bus number,
#                                         route_long_name the official route name)
#              -> calendar on service_id (start and end date of the schedule)
stop_time_columns = ["trip_id", "arrival_time", "departure_time", "stop_id", "stop_sequence", "shape_dist_traveled"]
stop_columns = ["stop_id", "stop_code", "stop_name", "stop_lat", "stop_lon", "wheelchair_boarding"]
trip_columns = ["trip_id", "route_id", "service_id", "trip_headsign", "direction_id", "shape_id"]
route_columns = ["route_id", "route_short_name", "route_long_name", "route_type"]
calendar_columns = ["service_id", "start_date", "end_date"]

overall_df = (
    stop_times_df[stop_time_columns]
    .merge(stops_df[stop_columns], on=["stop_id"])
    .merge(trips_df[trip_columns], on=["trip_id"])
    .merge(routes_df[route_columns], on=["route_id"])
    .merge(calendar_df[calendar_columns], on=["service_id"])
)

# filter for buses only
# route_type = 0: Tram and Streetcar, 1: Subway, 3: Bus
is_bus = overall_df["route_type"] == 3
overall_df = overall_df[is_bus]

# keep the columns of interest, in presentation order (route_type is no longer needed)
output_columns = [
    "route_short_name", "route_long_name", "trip_headsign", "route_id", "trip_id",
    "arrival_time", "departure_time", "stop_id", "stop_code", "stop_name",
    "stop_sequence", "stop_lat", "stop_lon", "direction_id", "service_id",
    "wheelchair_boarding", "start_date", "end_date", "shape_id", "shape_dist_traveled",
]
overall_df = overall_df[output_columns]

overall_df.head(10)

Unnamed: 0,route_short_name,route_long_name,trip_headsign,route_id,trip_id,arrival_time,departure_time,stop_id,stop_code,stop_name,stop_sequence,stop_lat,stop_lon,direction_id,service_id,wheelchair_boarding,start_date,end_date,shape_id,shape_dist_traveled
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:45:00,18:45:00,14155,14633,Don Mills Station,1,43.775988,-79.346987,0,1,1,2024-06-23,2024-07-27,1018210,
1,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:46:06,18:46:06,3807,1949,Don Mills Rd at Leith Hill Rd North Side,2,43.777534,-79.347811,0,1,1,2024-06-23,2024-07-27,1018210,0.3546
2,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:46:50,18:46:50,6904,1929,Don Mills Rd at Fairview Mall Dr North Side,3,43.77953,-79.348701,0,1,1,2024-06-23,2024-07-27,1018210,0.5903
3,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:47:59,18:47:59,1163,1938,Don Mills Rd at Godstone Rd,4,43.782682,-79.348922,0,1,1,2024-06-23,2024-07-27,1018210,0.9613
4,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:49:00,18:49:00,7723,1919,Don Mills Rd at Deerford Rd,5,43.785281,-79.35057,0,1,1,2024-06-23,2024-07-27,1018210,1.2849
5,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:50:08,18:50:08,2498,7384,Van Horne Ave at Don Mills Rd East Side,6,43.787416,-79.352355,0,1,1,2024-06-23,2024-07-27,1018210,1.6424
6,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:50:40,18:50:40,805,7387,Van Horne Ave at Hobart Dr,7,43.787936,-79.349884,0,1,1,2024-06-23,2024-07-27,1018210,1.8512
7,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:51:09,18:51:09,1047,7390,Van Horne Ave at Houston Cres (West),8,43.787625,-79.347637,0,1,1,2024-06-23,2024-07-27,1018210,2.0416
8,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:51:40,18:51:40,8848,7391,Van Horne Ave at Kingslake Rd,9,43.786639,-79.345567,0,1,1,2024-06-23,2024-07-27,1018210,2.2421
9,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:52:58,18:52:58,4206,7385,Van Horne Ave at Edmonton Rd East Side,10,43.786993,-79.339497,0,1,1,2024-06-23,2024-07-27,1018210,2.7426


In [12]:
# check whether arrival time and departure time are always identical:
# collect the rows where the two differ and look at how many there are
times_differ = overall_df["arrival_time"] != overall_df["departure_time"]
diff_arrival_departure = overall_df.loc[times_differ]
diff_arrival_departure.shape

(8, 20)

In [13]:
# GTFS encodes trips that run past midnight with hours >= 24 (24H, 25H, ... correspond to
# 12 AM, 1 AM, ... of the following day). These need extra processing before they can be
# parsed as clock times (might also be a way to distinguish night buses if no other way exists?)
def standardize_time(time_str):
    """
    Convert a GTFS "H:MM:SS" / "HH:MM:SS" time string to a zero-padded "HH:MM" clock time.

    The seconds field is dropped and the hour is wrapped into 0-23, so "24:15:00"
    becomes "00:15" and "9:05:30" becomes "09:05".

    NOTE: wrapping the hour discards the information that the time belongs to the day
    after the service date; this function returns the clock time only.

    Parameters
    ----------
    time_str : str
        Time in GTFS notation; surrounding whitespace is ignored.

    Returns
    -------
    str
        The time as "HH:MM".

    Raises
    ------
    TypeError
        If time_str is not a string (for example a missing value).
    ValueError
        If the hour field is not an integer or the minutes field is missing.
    """
    if not isinstance(time_str, str):
        raise TypeError(f"expected a time string, got {type(time_str).__name__}")
    fields = time_str.strip().split(":")
    if len(fields) < 2:
        raise ValueError(f"not a valid GTFS time: {time_str!r}")
    # modulo (rather than a single "- 24") keeps the hour two digits wide for any hour value
    wrapped_hour = int(fields[0]) % 24
    return f"{wrapped_hour:02d}:{fields[1]}"
    
# normalise both time columns to "HH:MM" strings (hours wrapped into 0-23, seconds dropped)
arrival_time_standard = overall_df["arrival_time"].apply(standardize_time)
departure_time_standard = overall_df["departure_time"].apply(standardize_time)

# then replace each column with the parsed datetime.time objects
standardized_columns = (
    ("arrival_time", arrival_time_standard),
    ("departure_time", departure_time_standard),
)
for column_name, hhmm_strings in standardized_columns:
    overall_df[column_name] = pd.to_datetime(hhmm_strings, format="%H:%M").dt.time


In [14]:
def get_schedule(bus, stop_code, day):
    """
    Return the schedule rows of one bus at one stop for a given day of the week.

    bus is the bus number (route_short_name), stop_code is the stop code shown on the
    TTC website, and day is the day of the week as 1-7 (Monday to Sunday).
    The rows are taken from the global overall_df and sorted by arrival_time ascending.

    NOTE: this relies on the arrival_time column, which the In [25] cell later removes
    from overall_df; get_stop_schedule is the date-based counterpart defined further down.
    """
    # Saturday (6) and Sunday (7) have their own service_id; any other day uses the
    # Monday-to-Friday schedule
    service_id = 3 if day == 7 else (2 if day == 6 else 1)

    on_route = overall_df["route_short_name"] == bus
    at_stop = overall_df["stop_code"] == stop_code
    in_service = overall_df["service_id"] == service_id
    return overall_df[on_route & at_stop & in_service].sort_values(by="arrival_time")
        

In [15]:
def get_route_schedule(bus, day):
    """
    Return every schedule row of one bus route for a given day of the week.

    bus is the bus number (route_short_name) and day is the day of the week as 1-7
    (Monday to Sunday). The rows are taken from the global overall_df and sorted by
    arrival_time ascending.

    NOTE: a function with the same name but a date argument is defined again in the
    In [29] cell and replaces this one once that cell has run. This version also relies
    on the arrival_time column, which the In [25] cell removes from overall_df.
    """
    # Saturday (6) and Sunday (7) have their own service_id; any other day uses the
    # Monday-to-Friday schedule
    service_id = 3 if day == 7 else (2 if day == 6 else 1)

    on_route = overall_df["route_short_name"] == bus
    in_service = overall_df["service_id"] == service_id
    return overall_df[on_route & in_service].sort_values(by="arrival_time")

In [16]:
# for each schedule row, map it to all specific calendar dates it represents and the exact time 
# find all the calendar dates within the current time range given 

def day_of_week(input_date):
    """
    Returns the weekday from Monday (1) to Sunday (7)

    input_date may be anything numpy.datetime64 can be constructed from: a
    numpy.datetime64 of any resolution, a datetime.date / datetime.datetime, or a
    pandas Timestamp. Any time-of-day component is ignored.

    The result is a numpy integer, as before.
    """
    # Going through the np.datetime64 constructor (instead of calling .astype on the
    # argument) is what allows non-numpy date objects to be passed in.
    days_since_epoch = np.datetime64(input_date, 'D').astype('int64')
    # day 0 of the epoch (1970-01-01) was a Thursday, i.e. weekday 4 in the 1-7 scheme
    return (days_since_epoch - 4) % 7 + 1

# the schedule period: the first distinct start/end date found in calendar_df
# (every service in the calendar output above shares the same range)
current_start_date = calendar_df["start_date"].unique()[0]
current_end_date = calendar_df["end_date"].unique()[0]
print(current_start_date)
print(current_end_date)
print(day_of_week(current_start_date))
print(day_of_week(current_end_date))

# create a dictionary with all the schedule types: service_id -> list of calendar dates
# (every list starts out empty and is filled in below)
service_ids = calendar_df["service_id"].unique()
schedule_dict = {service_id: [] for service_id in service_ids}

# compute dates for service_id 1, 2, 3
def compute_dates(start_date, end_date):
    """
    Append every date from start_date to end_date (inclusive) to the global schedule_dict.

    Saturdays go to service_id 2, Sundays to service_id 3 and all other days to
    service_id 1; this mapping is hard-coded here rather than read from calendar_df.
    """
    one_day = np.timedelta64(1, 'D')
    cur_date = start_date
    while cur_date <= end_date:
        weekday = day_of_week(cur_date)
        target_service = 2 if weekday == 6 else (3 if weekday == 7 else 1)
        schedule_dict[target_service].append(cur_date)
        cur_date = cur_date + one_day

# fill in the regular weekday / Saturday / Sunday dates for the schedule period
compute_dates(current_start_date, current_end_date)

# populate the dates for add_schedule
# The added dates are appended to whatever the service already has, so a service that
# has both regular dates and added dates keeps its regular ones. setdefault also covers
# a service_id that has no entry in schedule_dict yet.
for _, added_row in add_schedule.iterrows():
    service_dates = schedule_dict.setdefault(added_row["service_id"], [])
    for added_date in added_row["date"]:
        added_date = added_date.to_numpy()
        if added_date not in service_dates:
            service_dates.append(added_date)

# remove any affiliated dates listed in replace_schedule
# (a service_id with no entry in schedule_dict simply has nothing to remove)
for _, removed_row in replace_schedule.iterrows():
    service_dates = schedule_dict.get(removed_row["service_id"], [])
    for removed_date in removed_row["date"]:
        if removed_date in service_dates:
            service_dates.remove(removed_date)

print(schedule_dict)

2024-06-23T00:00:00.000000000
2024-07-27T00:00:00.000000000
7
6
{1: [numpy.datetime64('2024-06-24T00:00:00.000000000'), numpy.datetime64('2024-06-25T00:00:00.000000000'), numpy.datetime64('2024-06-26T00:00:00.000000000'), numpy.datetime64('2024-06-27T00:00:00.000000000'), numpy.datetime64('2024-06-28T00:00:00.000000000'), numpy.datetime64('2024-07-02T00:00:00.000000000'), numpy.datetime64('2024-07-03T00:00:00.000000000'), numpy.datetime64('2024-07-04T00:00:00.000000000'), numpy.datetime64('2024-07-05T00:00:00.000000000'), numpy.datetime64('2024-07-08T00:00:00.000000000'), numpy.datetime64('2024-07-09T00:00:00.000000000'), numpy.datetime64('2024-07-10T00:00:00.000000000'), numpy.datetime64('2024-07-11T00:00:00.000000000'), numpy.datetime64('2024-07-12T00:00:00.000000000'), numpy.datetime64('2024-07-15T00:00:00.000000000'), numpy.datetime64('2024-07-16T00:00:00.000000000'), numpy.datetime64('2024-07-17T00:00:00.000000000'), numpy.datetime64('2024-07-18T00:00:00.000000000'), numpy.datetim

In [17]:
# for each row of df, check which service_id it has, and populate the
# days_list column with the corresponding days from days_dict
def add_days_list(df, days_dict):
    """
    Add a "days_list" column to df holding, for every row, days_dict[row's service_id].

    df is modified in place and nothing is returned. Rows with the same service_id
    reference the same list object from days_dict (the lists are not copied).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "service_id" column.
    days_dict : dict
        Maps service_id to the list of days for that service.

    Raises
    ------
    KeyError
        If a service_id in df has no entry in days_dict.
    """
    service_ids = df["service_id"].to_numpy()
    # Fill an object array by position: this avoids building a Series per row
    # (iterrows) and label-based writes, which misbehave on duplicate index labels.
    days_per_row = np.empty(len(service_ids), dtype=object)
    for position, service_id in enumerate(service_ids):
        days_per_row[position] = days_dict[service_id]
    df["days_list"] = days_per_row
    
# Attach to every schedule row the list of dates of its service_id
# (add_days_list adds the days_list column to overall_df in place).
add_days_list(overall_df, schedule_dict)
overall_df.head(10)

Unnamed: 0,route_short_name,route_long_name,trip_headsign,route_id,trip_id,arrival_time,departure_time,stop_id,stop_code,stop_name,...,stop_lat,stop_lon,direction_id,service_id,wheelchair_boarding,start_date,end_date,shape_id,shape_dist_traveled,days_list
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:45:00,18:45:00,14155,14633,Don Mills Station,...,43.775988,-79.346987,0,1,1,2024-06-23,2024-07-27,1018210,,"[2024-06-24T00:00:00.000000000, 2024-06-25T00:..."
1,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:46:00,18:46:00,3807,1949,Don Mills Rd at Leith Hill Rd North Side,...,43.777534,-79.347811,0,1,1,2024-06-23,2024-07-27,1018210,0.3546,"[2024-06-24T00:00:00.000000000, 2024-06-25T00:..."
2,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:46:00,18:46:00,6904,1929,Don Mills Rd at Fairview Mall Dr North Side,...,43.77953,-79.348701,0,1,1,2024-06-23,2024-07-27,1018210,0.5903,"[2024-06-24T00:00:00.000000000, 2024-06-25T00:..."
3,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:47:00,18:47:00,1163,1938,Don Mills Rd at Godstone Rd,...,43.782682,-79.348922,0,1,1,2024-06-23,2024-07-27,1018210,0.9613,"[2024-06-24T00:00:00.000000000, 2024-06-25T00:..."
4,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:49:00,18:49:00,7723,1919,Don Mills Rd at Deerford Rd,...,43.785281,-79.35057,0,1,1,2024-06-23,2024-07-27,1018210,1.2849,"[2024-06-24T00:00:00.000000000, 2024-06-25T00:..."
5,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:50:00,18:50:00,2498,7384,Van Horne Ave at Don Mills Rd East Side,...,43.787416,-79.352355,0,1,1,2024-06-23,2024-07-27,1018210,1.6424,"[2024-06-24T00:00:00.000000000, 2024-06-25T00:..."
6,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:50:00,18:50:00,805,7387,Van Horne Ave at Hobart Dr,...,43.787936,-79.349884,0,1,1,2024-06-23,2024-07-27,1018210,1.8512,"[2024-06-24T00:00:00.000000000, 2024-06-25T00:..."
7,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:51:00,18:51:00,1047,7390,Van Horne Ave at Houston Cres (West),...,43.787625,-79.347637,0,1,1,2024-06-23,2024-07-27,1018210,2.0416,"[2024-06-24T00:00:00.000000000, 2024-06-25T00:..."
8,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:51:00,18:51:00,8848,7391,Van Horne Ave at Kingslake Rd,...,43.786639,-79.345567,0,1,1,2024-06-23,2024-07-27,1018210,2.2421,"[2024-06-24T00:00:00.000000000, 2024-06-25T00:..."
9,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:52:00,18:52:00,4206,7385,Van Horne Ave at Edmonton Rd East Side,...,43.786993,-79.339497,0,1,1,2024-06-23,2024-07-27,1018210,2.7426,"[2024-06-24T00:00:00.000000000, 2024-06-25T00:..."


In [18]:
# Size of the schedule table while it still has one row per (trip, stop).
overall_df.shape

(3682047, 21)

In [19]:
def duplicate_by_date(df):
    """
    Return a copy of df with one row per element of each row's days_list.

    The other columns are repeated for every element and the original index labels
    are kept, so a label appears once per element of its list.
    """
    exploded = df.explode(column="days_list")
    return exploded

In [20]:
# Expand overall_df to one row per date in days_list (days_list then holds a single date per row).
overall_df = duplicate_by_date(overall_df)

In [21]:
def combine_date_and_time(df):
    """
    Add "expected_arrival_time" and "expected_departure_time" columns to df in place.

    Each new column is the date in "days_list" combined with the clock time in
    "arrival_time" / "departure_time" (datetime.time values). Only the date part of
    days_list is used. A row whose days_list is missing gets NaT in both new columns.

    The combination is computed column-wise (date + time-as-timedelta) instead of with
    a Python call per row, because the frame holds one row per stop, trip and date.
    Nothing is returned.
    """
    # midnight of each service date, as a plain datetime64 array (no index alignment needed)
    service_dates = pd.to_datetime(df["days_list"]).dt.normalize().to_numpy()
    column_pairs = (
        ("arrival_time", "expected_arrival_time"),
        ("departure_time", "expected_departure_time"),
    )
    for time_column, combined_column in column_pairs:
        # str(datetime.time) is "HH:MM:SS", which to_timedelta parses as an offset from midnight
        offsets = pd.to_timedelta(df[time_column].astype(str)).to_numpy()
        df[combined_column] = service_dates + offsets

In [22]:
# Add the expected_arrival_time / expected_departure_time columns to overall_df (in place).
combine_date_and_time(overall_df)

In [23]:
# Preview the expanded table with the two combined date-time columns.
overall_df.head(10)

Unnamed: 0,route_short_name,route_long_name,trip_headsign,route_id,trip_id,arrival_time,departure_time,stop_id,stop_code,stop_name,...,direction_id,service_id,wheelchair_boarding,start_date,end_date,shape_id,shape_dist_traveled,days_list,expected_arrival_time,expected_departure_time
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:45:00,18:45:00,14155,14633,Don Mills Station,...,0,1,1,2024-06-23,2024-07-27,1018210,,2024-06-24,2024-06-24 18:45:00,2024-06-24 18:45:00
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:45:00,18:45:00,14155,14633,Don Mills Station,...,0,1,1,2024-06-23,2024-07-27,1018210,,2024-06-25,2024-06-25 18:45:00,2024-06-25 18:45:00
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:45:00,18:45:00,14155,14633,Don Mills Station,...,0,1,1,2024-06-23,2024-07-27,1018210,,2024-06-26,2024-06-26 18:45:00,2024-06-26 18:45:00
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:45:00,18:45:00,14155,14633,Don Mills Station,...,0,1,1,2024-06-23,2024-07-27,1018210,,2024-06-27,2024-06-27 18:45:00,2024-06-27 18:45:00
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:45:00,18:45:00,14155,14633,Don Mills Station,...,0,1,1,2024-06-23,2024-07-27,1018210,,2024-06-28,2024-06-28 18:45:00,2024-06-28 18:45:00
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:45:00,18:45:00,14155,14633,Don Mills Station,...,0,1,1,2024-06-23,2024-07-27,1018210,,2024-07-02,2024-07-02 18:45:00,2024-07-02 18:45:00
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:45:00,18:45:00,14155,14633,Don Mills Station,...,0,1,1,2024-06-23,2024-07-27,1018210,,2024-07-03,2024-07-03 18:45:00,2024-07-03 18:45:00
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:45:00,18:45:00,14155,14633,Don Mills Station,...,0,1,1,2024-06-23,2024-07-27,1018210,,2024-07-04,2024-07-04 18:45:00,2024-07-04 18:45:00
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:45:00,18:45:00,14155,14633,Don Mills Station,...,0,1,1,2024-06-23,2024-07-27,1018210,,2024-07-05,2024-07-05 18:45:00,2024-07-05 18:45:00
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,18:45:00,18:45:00,14155,14633,Don Mills Station,...,0,1,1,2024-06-23,2024-07-27,1018210,,2024-07-08,2024-07-08 18:45:00,2024-07-08 18:45:00


In [24]:
# find all unique bus routes
# (distinct route_short_name values, i.e. bus numbers, in ascending order)
unique_bus_routes = sorted(overall_df["route_short_name"].unique())
print(unique_bus_routes)

[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 160, 161, 162, 165, 167, 168, 169, 171, 176, 184, 189, 200, 201, 202, 203, 300, 302, 307, 310, 315, 320, 322, 324, 325, 329, 332, 334, 335, 336, 337, 339, 340, 341, 343, 352, 353, 354, 363, 365, 384, 385, 395, 396, 400, 402, 403, 404, 405, 507, 510, 900, 902, 903, 905, 924, 925, 927, 929, 935, 937, 939, 941, 945, 952, 953, 954, 960, 968, 984, 985, 986, 989, 995, 996]


In [25]:
# Keep only the final columns: arrival_time / departure_time and start_date / end_date are
# dropped (the combined expected_* columns replace them), and days_list, which now holds a
# single date per row, is renamed to date.
final_columns = [
    "route_short_name", "route_long_name", "trip_headsign", "route_id", "trip_id",
    "stop_id", "stop_code", "stop_name", "stop_sequence", "stop_lat", "stop_lon",
    "direction_id", "service_id", "wheelchair_boarding", "shape_id", "shape_dist_traveled",
    "days_list", "expected_arrival_time", "expected_departure_time",
]
overall_df = overall_df[final_columns].rename(columns={"days_list": "date"})

overall_df.head(10)

Unnamed: 0,route_short_name,route_long_name,trip_headsign,route_id,trip_id,stop_id,stop_code,stop_name,stop_sequence,stop_lat,stop_lon,direction_id,service_id,wheelchair_boarding,shape_id,shape_dist_traveled,date,expected_arrival_time,expected_departure_time
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,14155,14633,Don Mills Station,1,43.775988,-79.346987,0,1,1,1018210,,2024-06-24,2024-06-24 18:45:00,2024-06-24 18:45:00
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,14155,14633,Don Mills Station,1,43.775988,-79.346987,0,1,1,1018210,,2024-06-25,2024-06-25 18:45:00,2024-06-25 18:45:00
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,14155,14633,Don Mills Station,1,43.775988,-79.346987,0,1,1,1018210,,2024-06-26,2024-06-26 18:45:00,2024-06-26 18:45:00
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,14155,14633,Don Mills Station,1,43.775988,-79.346987,0,1,1,1018210,,2024-06-27,2024-06-27 18:45:00,2024-06-27 18:45:00
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,14155,14633,Don Mills Station,1,43.775988,-79.346987,0,1,1,1018210,,2024-06-28,2024-06-28 18:45:00,2024-06-28 18:45:00
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,14155,14633,Don Mills Station,1,43.775988,-79.346987,0,1,1,1018210,,2024-07-02,2024-07-02 18:45:00,2024-07-02 18:45:00
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,14155,14633,Don Mills Station,1,43.775988,-79.346987,0,1,1,1018210,,2024-07-03,2024-07-03 18:45:00,2024-07-03 18:45:00
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,14155,14633,Don Mills Station,1,43.775988,-79.346987,0,1,1,1018210,,2024-07-04,2024-07-04 18:45:00,2024-07-04 18:45:00
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,14155,14633,Don Mills Station,1,43.775988,-79.346987,0,1,1,1018210,,2024-07-05,2024-07-05 18:45:00,2024-07-05 18:45:00
0,10,VAN HORNE,EAST - 10 VAN HORNE towards VICTORIA PARK,72011,47366437,14155,14633,Don Mills Station,1,43.775988,-79.346987,0,1,1,1018210,,2024-07-08,2024-07-08 18:45:00,2024-07-08 18:45:00


In [30]:
# Spot-check service_id 4501, whose dates come only from the calendar_dates additions
# (all of its weekday flags in calendar_df are 0).
test_df = overall_df[overall_df["service_id"] == 4501]
test_df.head(30)

Unnamed: 0,route_short_name,route_long_name,trip_headsign,route_id,trip_id,stop_id,stop_code,stop_name,stop_sequence,stop_lat,stop_lon,direction_id,service_id,wheelchair_boarding,shape_id,shape_dist_traveled,date,expected_arrival_time,expected_departure_time
4246017,402,PARKDALE,SOUTH - 402 PARKDALE COMMUNITY BUS towards QUE...,72123,47380881,8785,4649,Symington Ave at Bloor St West,8,43.657415,-79.447871,0,4501,1,1019186,2.0859,2024-06-25,2024-06-25 11:20:00,2024-06-25 11:20:00
4246017,402,PARKDALE,SOUTH - 402 PARKDALE COMMUNITY BUS towards QUE...,72123,47380881,8785,4649,Symington Ave at Bloor St West,8,43.657415,-79.447871,0,4501,1,1019186,2.0859,2024-06-28,2024-06-28 11:20:00,2024-06-28 11:20:00
4246017,402,PARKDALE,SOUTH - 402 PARKDALE COMMUNITY BUS towards QUE...,72123,47380881,8785,4649,Symington Ave at Bloor St West,8,43.657415,-79.447871,0,4501,1,1019186,2.0859,2024-07-02,2024-07-02 11:20:00,2024-07-02 11:20:00
4246017,402,PARKDALE,SOUTH - 402 PARKDALE COMMUNITY BUS towards QUE...,72123,47380881,8785,4649,Symington Ave at Bloor St West,8,43.657415,-79.447871,0,4501,1,1019186,2.0859,2024-07-05,2024-07-05 11:20:00,2024-07-05 11:20:00
4246017,402,PARKDALE,SOUTH - 402 PARKDALE COMMUNITY BUS towards QUE...,72123,47380881,8785,4649,Symington Ave at Bloor St West,8,43.657415,-79.447871,0,4501,1,1019186,2.0859,2024-07-09,2024-07-09 11:20:00,2024-07-09 11:20:00
4246017,402,PARKDALE,SOUTH - 402 PARKDALE COMMUNITY BUS towards QUE...,72123,47380881,8785,4649,Symington Ave at Bloor St West,8,43.657415,-79.447871,0,4501,1,1019186,2.0859,2024-07-12,2024-07-12 11:20:00,2024-07-12 11:20:00
4246017,402,PARKDALE,SOUTH - 402 PARKDALE COMMUNITY BUS towards QUE...,72123,47380881,8785,4649,Symington Ave at Bloor St West,8,43.657415,-79.447871,0,4501,1,1019186,2.0859,2024-07-16,2024-07-16 11:20:00,2024-07-16 11:20:00
4246017,402,PARKDALE,SOUTH - 402 PARKDALE COMMUNITY BUS towards QUE...,72123,47380881,8785,4649,Symington Ave at Bloor St West,8,43.657415,-79.447871,0,4501,1,1019186,2.0859,2024-07-19,2024-07-19 11:20:00,2024-07-19 11:20:00
4246017,402,PARKDALE,SOUTH - 402 PARKDALE COMMUNITY BUS towards QUE...,72123,47380881,8785,4649,Symington Ave at Bloor St West,8,43.657415,-79.447871,0,4501,1,1019186,2.0859,2024-07-23,2024-07-23 11:20:00,2024-07-23 11:20:00
4246017,402,PARKDALE,SOUTH - 402 PARKDALE COMMUNITY BUS towards QUE...,72123,47380881,8785,4649,Symington Ave at Bloor St West,8,43.657415,-79.447871,0,4501,1,1019186,2.0859,2024-07-26,2024-07-26 11:20:00,2024-07-26 11:20:00


In [27]:
# save overall_df as a csv file next to the GTFS input files it was built from
# (the path is derived from data_folder so it cannot drift from the dataset being read;
# the DataFrame index is not written)
overall_df.to_csv(data_folder + "routes_schedule_data.csv", index=False)

In [28]:
def get_stop_schedule(bus, stop_code, date):
    """
    Return the schedule rows of one bus at one stop on a specific date.

    bus is the bus number (route_short_name), stop_code is the stop code shown on the
    TTC website, and date is the day to look up (a datetime object).
    The rows are taken from the global overall_df and sorted by expected_arrival_time
    ascending.
    """
    on_route = overall_df["route_short_name"] == bus
    at_stop = overall_df["stop_code"] == stop_code
    on_date = overall_df["date"] == date
    return overall_df[on_route & at_stop & on_date].sort_values(by="expected_arrival_time")

In [29]:
def get_route_schedule(bus, date):
    """
    Return every schedule row of one bus route on a specific date.

    bus is the bus number (route_short_name) and date is the day to look up
    (a datetime object). The rows are taken from the global overall_df and sorted by
    expected_arrival_time ascending.

    NOTE: this redefines the day-of-week based get_route_schedule from the In [15] cell.
    """
    on_route = overall_df["route_short_name"] == bus
    on_date = overall_df["date"] == date
    return overall_df[on_route & on_date].sort_values(by="expected_arrival_time")

In [32]:
# Example: first 10 scheduled arrivals of bus 402 at stop_code 4649 on 2024-07-02.
get_stop_schedule(402, 4649, datetime.datetime(2024, 7, 2)).head(10)

Unnamed: 0,route_short_name,route_long_name,trip_headsign,route_id,trip_id,stop_id,stop_code,stop_name,stop_sequence,stop_lat,stop_lon,direction_id,service_id,wheelchair_boarding,shape_id,shape_dist_traveled,date,expected_arrival_time,expected_departure_time
4246100,402,PARKDALE,SOUTH - 402 PARKDALE COMMUNITY BUS towards QUE...,72123,47380884,8785,4649,Symington Ave at Bloor St West,8,43.657415,-79.447871,0,4501,1,1019186,2.0859,2024-07-02,2024-07-02 10:20:00,2024-07-02 10:20:00
4246017,402,PARKDALE,SOUTH - 402 PARKDALE COMMUNITY BUS towards QUE...,72123,47380881,8785,4649,Symington Ave at Bloor St West,8,43.657415,-79.447871,0,4501,1,1019186,2.0859,2024-07-02,2024-07-02 11:20:00,2024-07-02 11:20:00
4246199,402,PARKDALE,SOUTH - 402 PARKDALE COMMUNITY BUS towards QUE...,72123,47380887,8785,4649,Symington Ave at Bloor St West,8,43.657415,-79.447871,0,4501,1,1019186,2.0859,2024-07-02,2024-07-02 12:50:00,2024-07-02 12:50:00
4246166,402,PARKDALE,SOUTH - 402 PARKDALE COMMUNITY BUS towards QUE...,72123,47380886,8785,4649,Symington Ave at Bloor St West,8,43.657415,-79.447871,0,4501,1,1019186,2.0859,2024-07-02,2024-07-02 13:50:00,2024-07-02 13:50:00
4246133,402,PARKDALE,SOUTH - 402 PARKDALE COMMUNITY BUS towards QUE...,72123,47380885,8785,4649,Symington Ave at Bloor St West,8,43.657415,-79.447871,0,4501,1,1019186,2.0859,2024-07-02,2024-07-02 14:50:00,2024-07-02 14:50:00
4246232,402,PARKDALE,SOUTH - 402 PARKDALE COMMUNITY BUS towards QUE...,72123,47380888,8785,4649,Symington Ave at Bloor St West,8,43.657415,-79.447871,0,4501,1,1019186,2.0859,2024-07-02,2024-07-02 15:50:00,2024-07-02 15:50:00
4246075,402,PARKDALE,SOUTH - 402 PARKDALE COMMUNITY BUS towards QUE...,72123,47380883,8785,4649,Symington Ave at Bloor St West,8,43.657415,-79.447871,0,4501,1,1019185,2.0859,2024-07-02,2024-07-02 16:50:00,2024-07-02 16:50:00
4246050,402,PARKDALE,SOUTH - 402 PARKDALE COMMUNITY BUS towards QUE...,72123,47380882,8785,4649,Symington Ave at Bloor St West,8,43.657415,-79.447871,0,4501,1,1019185,2.0859,2024-07-02,2024-07-02 17:50:00,2024-07-02 17:50:00


In [33]:
# Example: first 10 scheduled stops of bus 402 on 2024-07-02
# (uses the date-based get_route_schedule defined in the In [29] cell).
get_route_schedule(402, datetime.datetime(2024, 7, 2)).head(10)

Unnamed: 0,route_short_name,route_long_name,trip_headsign,route_id,trip_id,stop_id,stop_code,stop_name,stop_sequence,stop_lat,stop_lon,direction_id,service_id,wheelchair_boarding,shape_id,shape_dist_traveled,date,expected_arrival_time,expected_departure_time
4246435,402,PARKDALE,NORTH - 402 PARKDALE COMMUNITY BUS towards DUF...,72123,47380895,488,11635,Toronto Rehab - Bickle Centre at 130 Dunn Ave,1,43.635202,-79.432965,1,4501,2,1019187,,2024-07-02,2024-07-02 09:00:00,2024-07-02 09:00:00
4246434,402,PARKDALE,NORTH - 402 PARKDALE COMMUNITY BUS towards DUF...,72123,47380895,14768,11723,West Lodge Ave at May Robinson Apartments,2,43.641943,-79.435951,1,4501,2,1019187,0.8858,2024-07-02,2024-07-02 09:10:00,2024-07-02 09:10:00
4246422,402,PARKDALE,NORTH - 402 PARKDALE COMMUNITY BUS towards DUF...,72123,47380895,5294,6842,Queen St West at Lansdowne Ave,3,43.640685,-79.436463,1,4501,1,1019187,1.2068,2024-07-02,2024-07-02 09:12:00,2024-07-02 09:12:00
4246423,402,PARKDALE,NORTH - 402 PARKDALE COMMUNITY BUS towards DUF...,72123,47380895,3798,6852,Queen St West at Sorauren Ave,4,43.639824,-79.440785,1,4501,1,1019187,1.5666,2024-07-02,2024-07-02 09:14:00,2024-07-02 09:14:00
4246424,402,PARKDALE,NORTH - 402 PARKDALE COMMUNITY BUS towards DUF...,72123,47380895,4692,6857,Queen St West at Triller Ave,5,43.639319,-79.44338,1,4501,1,1019187,1.783,2024-07-02,2024-07-02 09:16:00,2024-07-02 09:16:00
4246425,402,PARKDALE,NORTH - 402 PARKDALE COMMUNITY BUS towards DUF...,72123,47380895,9459,6849,Queen St West at Roncesvalles Ave,6,43.638864,-79.445773,1,4501,1,1019187,1.9819,2024-07-02,2024-07-02 09:17:00,2024-07-02 09:17:00
4246433,402,PARKDALE,NORTH - 402 PARKDALE COMMUNITY BUS towards DUF...,72123,47380895,14769,13479,St Joseph'S Health Centre at 30 The Queensway,7,43.639439,-79.450798,1,4501,2,1019187,2.2963,2024-07-02,2024-07-02 09:19:00,2024-07-02 09:19:00
4246419,402,PARKDALE,NORTH - 402 PARKDALE COMMUNITY BUS towards DUF...,72123,47380895,24707,16429,Queen St West at Roncesvalles Ave East Side,8,43.638812,-79.445427,1,4501,1,1019187,3.4276,2024-07-02,2024-07-02 09:24:00,2024-07-02 09:24:00
4246420,402,PARKDALE,NORTH - 402 PARKDALE COMMUNITY BUS towards DUF...,72123,47380895,15132,15341,Queen St West at Triller Ave,9,43.639092,-79.443874,1,4501,1,1019187,3.5552,2024-07-02,2024-07-02 09:24:00,2024-07-02 09:24:00
4246421,402,PARKDALE,NORTH - 402 PARKDALE COMMUNITY BUS towards DUF...,72123,47380895,7830,10152,Queen St West at Beaty Ave,10,43.639599,-79.441272,1,4501,1,1019187,3.7726,2024-07-02,2024-07-02 09:25:00,2024-07-02 09:25:00
