In [1]:
from sqlalchemy import create_engine
import pandas as pd
import re

from datetime import datetime, timedelta
import matplotlib

import numpy as np
import math

# For the sake of progress display.
from IPython.display import display, update_display

A quick helper class for displaying progress info during a long operation.

In [2]:
class Progress:
    
    def __init__(self, name, count):
        self.name = name
        self.count = count
        display("Starting '" + name + "'", display_id=name)
        self.last_update = 0
        self.step = math.ceil(self.count/100) # Update when about 1% has elapsed.
    
    def update(self, value):
        if value != self.count and int(value - self.last_update) % self.step > 0:
            return # Don't update too often since that will slow down loops a lot!
        self.last_update = value
        p = (value+1)/self.count
        message = f"{self.name}: {int(100*p)}% complete."
        update_display(message, display_id=self.name)
        
    def complete(self):
        self.update(self.count)

Quick test:

In [3]:
N = 1000000
progress = Progress("Test", N)
count = 0
for i in range(N):
    count += 1
    progress.update(i)
progress.complete()
print(count)

'Test: 100% complete.'

1000000


Another helper method for sorting tables and resetting the index. This shouldn't be necessary for every table, but we've had awkwardness with this in the past and there is no harm in tagging it on.

In [4]:
def sort(table, *fields):
    
    if len(fields) == 0:
        fields = table.columns
    
    # The arguments here are important:
    #  - an unstable sort would defeat the purpose, we're trying to keep foreign keys stable
    #  - making the index a part of the sort would have no affect at all in the db
    
    table.sort_values(by=list(fields), inplace=True, kind="stable", ignore_index=True)

Load GTFS data
==

This can take a minute - stop_times is 500Mb.

The csv table files are expected to be in the same directory as this notebook.

In [44]:
files = [
    "agency",
    "calendar",
    "calendar_dates",
    "routes",
    "shapes",
    "stop_times",
    "stops",
    #"transfers", We don't need this table.
    "trips",
]

# This one is derived from the dublin bus historical
# data, it is not a table in the GTFS spec.
files.append("boundaries")

for name in files:
    print("Reading " + name)
    globals()[name] = pd.read_csv(name + ".txt")
    
    
print("Parsing dates")

def parse_dates(table, field):
    table[field] = pd.to_datetime(table[field], format="%Y%m%d")

parse_dates(calendar, "start_date")
parse_dates(calendar, "end_date")
parse_dates(calendar_dates, "date")
    
print("Complete")

Reading agency
Reading calendar
Reading calendar_dates
Reading routes
Reading shapes
Reading stop_times
Reading stops
Reading trips
Reading boundaries
Parsing dates
Complete


Minor note: directions is store as numbers sometimes

In [6]:
DIRECTION_OUTBOUND = 0
DIRECTION_INBOUND = 1
directions = [DIRECTION_OUTBOUND, DIRECTION_INBOUND]

We could also make a table for it?

Connect to Database
==

Test the database connection.

In [7]:
USER = "admin"
PASSWORD = "12345678"
HOST = "dublinbus.cwaixvtk8gyq.us-east-1.rds.amazonaws.com"
PORT = 3306
SCHEMA = "dublin_bus"
CONNECTION_STRING = f"mysql://{USER}:{PASSWORD}@{HOST}:{PORT}/{SCHEMA}"

engine = create_engine(CONNECTION_STRING)
with engine.connect():
    print("Successfully tested DB connection")

Successfully tested DB connection


Functions for pandas sql and foreign keys.

In [108]:
PREFIX = "api_"

foreign_sources = globals().get("foreign_sources", dict())

# Enable or disable transactions
ENABLE_SQL = True

def commit(data, table_name, index_label="id"):
    
    foreign_sources[table_name] = data
    
    table_name = PREFIX + table_name
    
    if ENABLE_SQL:
        data.to_sql(table_name, engine, if_exists="append", 
                    index=(index_label is not None), index_label=index_label,
                   chunksize=5000, method="multi")
    else:
        print("(Database transactions disabled)")
        
        
        
def use_foreign_key(table, field_name, foreign_name, foreign_field):
        
    try:
        foreign_table = foreign_sources[foreign_name]
    except:
        raise ValueError("No table called '" + str(foreign_name) + "' has been comitted.")
        
    # This process can take a while for large tables, so show progress
    N = len(foreign_table)
    M = len(table)
    progress = Progress(f"Converting {field_name}", N+M)
    
    conversion = dict()
    for i in foreign_table.index:
        conversion[foreign_table[foreign_field][i]] = i
        progress.update(i)

    # The standard dataframe replace consumed lots and lots of memory and was way too slow, 
    # I don't know why. So a list is used here instead.
    
    new_values = list()
    for i, old_value in enumerate(table[field_name]):
        new_values.append(conversion[old_value])
        progress.update(N + i)

    table[field_name] = new_values
    
    progress.complete()

Minor note: a simple numerical ID is included in every table, for the sake of easy and performant inner joins. I try to include the original char based IDs these entities have for the sake of interoperation with external services, and for debugging.

Send GTFS Data to Database
==

Names
--

This is not part of GTFS, it's derived from the routes table.

From the user point of view, routes are identified by name, not route ID. This means that many of the queries from the frontend will be using the (non-unique) name of a route, and multiple routes with the same name should be treated as a single route when aggregating results.

Since Django works well with foreign keys, and MySQL works well with integer primary keys, it helps to have an index table of names.

In [9]:
def shorten_name(name):
    match = re.match("(\w+)(?:\-\w+)?", name)
    if match is None:
        raise ValueError(f"Unrecognized name format: '{name}'")
    return match.groups()[0]

def shorten_names(table, field_name="name"):
    table[field_name] = table[field_name].apply(shorten_name)
    
names = set(routes.route_short_name.apply(shorten_name))
records = [[name] for name in names]
route_names = pd.DataFrame(records, columns=["name"])
sort(route_names, "name")

display(route_names.head(5))
commit(route_names, "routenames")

Unnamed: 0,name
0,1
1,100
2,101
3,102
4,103


(Database transactions disabled)


Function for the sake of inspecting the names and seeing if they are what we expect as users.

In [10]:
def show_names():

    current = 0
    for i in range(1000):
        for j in range(10):
            if current >= len(route_names):
                continue
            print(route_names.name[route_names.index[current]], end="")
            if j != 9:
                print(", ", end="")
            current += 1
        print()

        if current >= len(route_names):
                break
                
#show_names()

Agencies
--

In [11]:
agency

Unnamed: 0,agency_id,agency_name,agency_url,agency_timezone,agency_lang
0,03C,GoAhead Commuter,https://www.transportforireland.ie,Europe/Dublin,EN
1,978,Dublin Bus,https://www.transportforireland.ie,Europe/Dublin,EN
2,01,Bus Éireann,https://www.transportforireland.ie,Europe/Dublin,EN
3,01X,Expressway,https://www.transportforireland.ie,Europe/Dublin,EN
4,03,Go-Ahead,https://www.transportforireland.ie,Europe/Dublin,EN


In [12]:
agency_v2 = agency.copy()[["agency_id", "agency_name"]]
agency_v2.columns = ["external_id", "name"]
sort(agency_v2, "external_id")

display(agency_v2)
commit(agency_v2, "agency")

Unnamed: 0,external_id,name
0,01,Bus Éireann
1,01X,Expressway
2,03,Go-Ahead
3,03C,GoAhead Commuter
4,978,Dublin Bus


(Database transactions disabled)


Calendar and Calendar Exceptions
==

Each entry in the main calendar is a 'service', which runs from a start date to an end date on certain days of the week.

In [13]:
calendar.head(2)

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
0,1,0,0,0,0,0,0,1,2021-06-23,2021-12-18
1,1#1,1,0,0,0,0,0,1,2021-06-23,2021-12-04


In [14]:
# This is for the sake of the cells after this, to keep naming consistant
services = calendar.copy()

# This is for the db.
services_v2 = services.copy()
services_v2.rename(columns={"service_id": "external_id"}, inplace=True)
sort(services_v2, "external_id")

display(services_v2.head(2))
commit(services_v2, "services")

Unnamed: 0,external_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
0,1,0,0,0,0,0,0,1,2021-06-23,2021-12-18
1,1#1,1,0,0,0,0,0,1,2021-06-23,2021-12-04


(Database transactions disabled)


Exceptions to the main calendar are given in a second table.

In [15]:
calendar_dates.head(2)

Unnamed: 0,service_id,date,exception_type
0,1,2021-08-01,2
1,1,2021-08-08,2


In [16]:
service_exceptions = calendar_dates.copy()
service_exceptions_v2 = service_exceptions.copy()
use_foreign_key(service_exceptions_v2, "service_id", "services", "external_id")
sort(service_exceptions_v2)

display(service_exceptions_v2.head(2))
commit(service_exceptions_v2, "serviceexceptions")

'Converting service_id: 100% complete.'

Unnamed: 0,service_id,date,exception_type
0,0,2021-08-01,2
1,0,2021-08-08,2


(Database transactions disabled)


Routes
--

This table is just an index of route names, it does not contain information r.e. paths or stops or times.

Also, 'route' here refers to the GTFS version of route. The short name '46A' is associated with 3 such routes.

In [17]:
routes.head(2)

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_type
0,10-100-e20-1,1,100,Drogheda Bus Station -,3
1,10-101-e20-1,1,101,Busáras - Drogheda Bus Station,3


`route_type` is ignored here, it's always 3 as of writing this.

In [18]:
routes_v2 = routes.copy().drop("route_type", axis=1)
routes_v2.columns = [
    "external_id", "agency", "name", "long_name"
]
sort(routes_v2, "external_id")

shorten_names(routes_v2)
display(routes_v2.head(2))

routes_v3 = routes_v2.copy()
routes_v3.columns = ["external_id", "agency_id", "name_id", "long_name"]
use_foreign_key(routes_v3, "agency_id", "agency", "external_id")
use_foreign_key(routes_v3, "name_id", "routenames", "name")

display(routes_v3.head(2))
commit(routes_v3, "routes")

Unnamed: 0,external_id,agency,name,long_name
0,10-100-e20-1,1,100,Drogheda Bus Station -
1,10-101-e20-1,1,101,Busáras - Drogheda Bus Station


'Converting agency_id: 100% complete.'

'Converting name_id: 100% complete.'

Unnamed: 0,external_id,agency_id,name_id,long_name
0,10-100-e20-1,0,1,Drogheda Bus Station -
1,10-101-e20-1,0,2,Busáras - Drogheda Bus Station


(Database transactions disabled)


**Note!** - the short name is not a unique ID

Shapes
--

Shapes describe the path the bus takes, not just the stops. So the line a shape describes follows the road exactly.

I'm sure if we'll need this table. It might come in useful with real time icons.

In [19]:
shapes.head(2)

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
0,10-100-e20-1.229.O,53.71161,-6.353111,1,0.0
1,10-100-e20-1.229.O,53.711849,-6.352845,2,31.85


In [20]:
shapes_v2 = shapes.copy()
shapes_v2.columns = ["external_id", "lat", "lon", "sequence", "dist_traveled"]
sort(shapes_v2, "external_id")

display(shapes_v2.head(10))
commit(shapes_v2, "shapes")

Unnamed: 0,external_id,lat,lon,sequence,dist_traveled
0,10-100-e20-1.229.O,53.71161,-6.353111,1,0.0
1,10-100-e20-1.229.O,53.711849,-6.352845,2,31.85
2,10-100-e20-1.229.O,53.712178,-6.352604,3,71.73
3,10-100-e20-1.229.O,53.712258,-6.352571,4,80.94
4,10-100-e20-1.229.O,53.712468,-6.352745,5,106.85
5,10-100-e20-1.229.O,53.712566,-6.353347,6,147.95
6,10-100-e20-1.229.O,53.713637,-6.357411,7,440.72
7,10-100-e20-1.229.O,53.713797,-6.357904,8,477.74
8,10-100-e20-1.229.O,53.713908,-6.358173,9,499.32
9,10-100-e20-1.229.O,53.714136,-6.358437,10,530.08


(Database transactions disabled)


Stops
--

This is just an index of stops names & locations, it does not describe the involvement of stops in routes or trips.

That's now ready to go to the db.

In [21]:
stops.head(2)

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon
0,7010B158131,"Ulsterbus Depot, stop 158131",54.996629,-7.317866
1,7010B158241,"Magee Campus, stop 158241",55.004476,-7.321782


Separating the name into the actual name and the number seems like something handy for styling

In [92]:
stops_v2 = stops.copy()
stops_v2["external_name"] = stops_v2.stop_name.copy()

names = list()
numbers = list()
for i in stops_v2.index:
    stop_name = stops_v2.external_name[i]
    match = re.match("(.*?),?\s*(?:[Ss]top\s+)?(?:[Nn]o\.?\s*)?(\d+)", stop_name)
    if match is None:
        names.append(stop_name)
        numbers.append(None)
    else:
        name, number = match.groups()
        names.append(name)
        numbers.append(number)
        
stops_v2["name"] = names
stops_v2["number"] = numbers

def convert(value):
    return -1 if value is None else value
stops_v2.number = stops_v2.number.apply(convert).astype(int)

stops_v2 = stops_v2[["stop_id", "name", "number",  "stop_lon", "stop_lat", "external_name"]]
stops_v2.columns = ["external_id", "name", "number", "lon", "lat", "external_name"]

sort(stops_v2, "external_id")

display(stops_v2.head(2))
commit(stops_v2, "stops")

Unnamed: 0,external_id,name,number,lon,lat,external_name
0,7010B158131,Ulsterbus Depot,158131,-7.317866,54.996629,"Ulsterbus Depot, stop 158131"
1,7010B158241,Magee Campus,158241,-7.321782,55.004476,"Magee Campus, stop 158241"


(Database transactions disabled)


Trips
--

This is an index of trips. A trip is linked to a route and a service. The service dictates the days it operates on. The route does not uniquely identify the path - `stop_times` or `shapes` is needed for that.

In [23]:
trips.head(2)

Unnamed: 0,route_id,service_id,trip_id,shape_id,trip_headsign,direction_id
0,10-100-e20-1,1,1954979.1.10-100-e20-1.232.I,10-100-e20-1.232.I,- Drogheda Bus Station,1
1,10-100-e20-1,1,1954981.1.10-100-e20-1.231.O,10-100-e20-1.231.O,Drogheda Bus Station -,0


In [24]:
trips_v2 = trips.copy()
use_foreign_key(trips_v2, "route_id", "routes", "external_id")
use_foreign_key(trips_v2, "service_id", "services", "external_id")
use_foreign_key(trips_v2, "shape_id", "shapes", "external_id")

trips_v2.rename(columns={
    "trip_id": "external_id",
    "trip_headsign": "headsign",
    "direction_id": "direction"
}, inplace=True)

display(trips_v2.head(2))

trips_v2 = trips_v2[[
    "external_id", "route_id", "service_id", 
    "shape_id", "headsign", "direction"
]]

sort(trips_v2, "external_id")

display(trips_v2.head(2))

commit(trips_v2, "trips")

'Converting route_id: 100% complete.'

'Converting service_id: 100% complete.'

'Converting shape_id: 100% complete.'

Unnamed: 0,route_id,service_id,external_id,shape_id,headsign,direction
0,0,0,1954979.1.10-100-e20-1.232.I,1147,- Drogheda Bus Station,1
1,0,0,1954981.1.10-100-e20-1.231.O,839,Drogheda Bus Station -,0


Unnamed: 0,external_id,route_id,service_id,shape_id,headsign,direction
0,1.10432.2-18-ga2-1.15.I,293,21,718337,Sandymount - Palmerstown,1
1,1.Mo-Fr.2-197-y11-3.6.O,299,42,724962,"Ashbourne Retail Pk, stop 101001",0


(Database transactions disabled)


Stop Times
--

This is a big table, with 6 million rows. It links trips and stops together via a time and seq. number, and gives some handy metadata.

In [25]:
stop_times.head(2)

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
0,1143.y1006.17-130-cm1-1.82.I,06:10:00,06:10:00,8260B136421,1,,0,0,0.0
1,1143.y1006.17-130-cm1-1.82.I,06:18:00,06:18:00,8260B1324001,2,,0,0,7572.44


In [26]:
stop_times_v2 = stop_times.copy()

def shift_time(table, field):
    new_values = list()
    count = 0
    for item in table[field].values:
        if item == "24:00:00": 
            item = "23:59:00"
            count += 1
        new_values.append(item)
    table[field] = new_values
    print(f"Shifted 24:00 on {count} rows for field '{field}'")
    

shift_time(stop_times_v2, "arrival_time")
shift_time(stop_times_v2, "departure_time")

use_foreign_key(stop_times_v2, "trip_id", "trips", "external_id")
use_foreign_key(stop_times_v2, "stop_id", "stops", "external_id")
sort(stop_times_v2, "trip_id", "stop_sequence")

display(stop_times_v2.head(2))
commit(stop_times_v2, "stoptimes")

Shifted 24:00 on 1314 rows for field 'arrival_time'
Shifted 24:00 on 1296 rows for field 'departure_time'


'Converting trip_id: 100% complete.'

'Converting stop_id: 100% complete.'

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
0,0,06:00:00,06:00:00,377,1,,0,0,0.0
1,0,06:02:00,06:02:00,1528,2,,0,0,496.44


(Database transactions disabled)


Boundaries
--

A table derived from UCD Dublin Bus historical data as opposed to the GTFS data.

It gives a suggested bound on delays expected for trips.

In [46]:
boundaries.head(2)

Unnamed: 0,Route,Direction,Stop,Max,Min
0,61,1,1035,472.0,-111.0
1,70D,2,3340,42.0,-75.0


In [None]:
boundaries_v2 = boundaries.copy()
boundaries_v2.columns = ["route", "direction", "stop", "max", "min"]

# The historical convention is 1, 2 for outbound, inbound
# The GTFS convention used by the rest of the tables is 0, 1 for oubound, inbound
boundaries_v2.direction = boundaries_v2.direction - 1

# Some stops and routes don't exist in the GTFS database, remove them
boundaries_v2 = boundaries_v2[boundaries_v2.route.isin(route_names.name)]
boundaries_v2 = boundaries_v2[boundaries_v2.stop.isin(stops_v2.number)]


use_foreign_key(boundaries_v2, "stop", "stops", "number")
use_foreign_key(boundaries_v2, "route", "routenames", "name")

boundaries_v2.columns = ["route_name_id", "direction", "stop_id", "max", "min"]
boundaries_v2.head(2)
commit(boundaries_v2, "predictionbounds")

'Converting stop: 100% complete.'

'Converting route: 100% complete.'

Additional Derived Tables
==

Named Route Stops
--

Each route has multiple paths and multiple possible sets of stops. There is no explicit list of stops for a route (much less a shortname) in GTFS, it's implied implicitly by the stop times table and the trips table.

However, we'd like a representative subset - the stops associated with the most common shape_id for a short name should do there.

We also note the remaining stops, and tag them with a "variant" flag.

The utility functions below are intended to make linking tables together easier. (There are probably built-in pandas ways of doing this, but I didn't find any from a quick internet search.)

In [27]:
def lookup(table, key_fields, value_fields):
    
    # Group sets of distinct values in a table by a 
    # common key, and store them in a dictionary 
    # (Kind of like .groupby, but more direct?)
    
    # Iterables can be passed for the keys or values
    # to make multikeys / group lists of values.
    
    multikey = isinstance(key_fields, list) or isinstance(key_fields, tuple)
    multivalue = isinstance(value_fields, list) or isinstance(value_fields, tuple)
        
    keys = set()
    for key in table[key_fields].values:
        if multikey:
            key = tuple(key)
        keys.add(key)
        
    result = dict()
    for key in keys:
        result[key] = set()
        
    keys = table[key_fields].values
    values = table[value_fields].values
        
    for i in range(len(table)):
        
        key = keys[i]
        value = values[i]
        
        if multikey: key = tuple(key)
        if multivalue: value = tuple(value)
        
        result[key].add(value)
    
    return result

def invert(lookup_ab):
     
    # Invert a lookup as created by the previous function.
    # (Lookup the keys by the values they correspond to)
        
    lookup_ba = dict()
    for a, b_set in lookup_ab.items():
        for b in b_set:
            a_set = lookup_ba.get(b, None)
            if a_set is None:
                a_set = set()
                lookup_ba[b] = a_set
            a_set.add(a)
            
    return lookup_ba
            
def chain(lookup_ab, lookup_bc):
    
    # For two lookups where the values of
    # the first are keys in the second, make a
    # new lookup linking keys of the first to
    # values of the second.
    
    lookup_ba = invert(lookup_ab)
    
    lookup_ac = dict()
    for a in lookup_ab:
        lookup_ac[a] = set()
    
    for b, c_set in lookup_bc.items():
        a_set = lookup_ba[b]
        for a in a_set:
            for c in c_set:
                lookup_ac[a].add(c)
        
    return lookup_ac

Now, Link name/agency pairs to shapes, and shapes to stops.

This takes quite a while - the code above is written for concision rather than efficiency and is slow, and the stop_times table that is used has 6 million rows.

In [28]:
print("Linking names amd agencies to shapes")

name_agency_routes = lookup(routes_v2, ["name", "agency"], "external_id")
route_shapes = lookup(trips, "route_id", "shape_id")
name_shapes = chain(name_agency_routes, route_shapes)

print("Linking shapes to stops")

shape_trips = lookup(trips, "shape_id", "trip_id")
trip_stops = lookup(stop_times, "trip_id", ["stop_id", "stop_sequence"])
shape_stops = chain(shape_trips, trip_stops)

print("Complete")

Linking names amd agencies to shapes
Linking shapes to stops
Complete


It turns out that stop sequences aren't perfect. For example, trip '10176.y1006.60-41B-d12-1.225.I' is missing stop_sequence 10, it just goes from 9 to 11. And there are lots of examples of that.

It seems like no two stops ever have the same sequence in a trip, which is good at least

In [29]:
for shape, stop_sequence in shape_stops.items():
    numbers = list()
    for stop, number in stop_sequence:
        numbers.append(number)
    if len(numbers) != len(set(numbers)):
        print("Duplicates detected for shape:", shape)
        break
else:
    print("No duplicates found")

No duplicates found


Step two is to find a representative shape for each route.

In [30]:
# I'd like to do this by trip frequency. The one thing to be careful about is that each 
# trip is part of a service, and different services cover different numbers of days. 
# So the first thing is to weight each service. 
# (just from the base calendar, won't bother with exceptions here)

days_of_week = [
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"
]

print("Weighting services")

service_weights = dict()
for i in calendar.index:
    weekdays = 0
    for day in days_of_week:
        weekdays += calendar[day][i]
    period = calendar.end_date[i] - calendar.start_date[i]
    weight = period.days*(weekdays/7)
    service_id = calendar.service_id[i]
    service_weights[service_id] = weight

print("Weighting shapes")

route_shape_weights = dict()

for route_id in routes.route_id:
    shape_weights = dict()
    for route_shape in route_shapes[route_id]:
        shape_weights[route_shape] = 0
    route_shape_weights[route_id] = shape_weights
    
for i in trips.index:
    route_id = trips.route_id[i]
    shape_id = trips.shape_id[i]
    service_id = trips.service_id[i]
    weight = service_weights[service_id]
    route_shape_weights[route_id][shape_id] += weight
    
print("Selecting shapes by weight")
    
directions = [DIRECTION_OUTBOUND, DIRECTION_INBOUND]

name_agency_shapes = dict()
name_agency_weights = dict()
for key in name_agency_routes:
    current_shapes = dict()
    current_weights = dict()
    for direction in directions:
        current_shapes[direction] = None
        current_weights[direction] = 0
    name_agency_shapes[key] = current_shapes
    name_agency_weights[key] = current_weights

for i in routes_v2.index:
    
    route_id = routes_v2.external_id[i]
    agency_id = routes_v2.agency[i]
    name = routes_v2.name[i]
    shape_weights = route_shape_weights[route_id]
    
    current_shapes = name_agency_shapes[name, agency_id]
    current_weights = name_agency_weights[name, agency_id]
    
    for shape_id, weight in shape_weights.items():
        
        if shape_id[-1] == 'O':
            direction = DIRECTION_OUTBOUND
        else:
            direction = DIRECTION_INBOUND
        
        if weight > current_weights[direction]:
            current_weights[direction] = weight
            current_shapes[direction] = shape_id
            
    name_agency_shapes[name, agency_id] = current_shapes
    name_agency_weights[name, agency_id] = current_weights
    
print("Complete") 

Weighting services
Weighting shapes
Selecting shapes by weight
Complete


Step 3 is to collect stops for each route using the shape lookups. By using the representative shape selected and the shape lookup, we can identify which stops are representative, and give those stops a sequence number.

In [31]:
names = routes.route_short_name.unique()

records = list()

for name, agency_id in name_agency_routes:
    for direction in directions:
        
        all_shapes = set()
        for route in name_agency_routes[name, agency_id]:
            all_shapes.update(route_shapes[route])
                
        main_shape = name_agency_shapes[name, agency_id][direction]
        if main_shape is None:
            # Some routes are one direction only
            continue
        
        all_stops = set()
        for shape in all_shapes:
            if (shape[-1] == 'O') == (direction == DIRECTION_OUTBOUND):
                for stop, number in shape_stops[shape]:
                    all_stops.add(stop)
                    
        main_stop_sequence = shape_stops[main_shape]
        main_stops = dict()
        for stop, number in main_stop_sequence:
            main_stops[stop] = number
        
        for stop in all_stops:
            if stop in main_stops:
                main = True
                sequence = main_stops[stop]
            else:
                main = False
                sequence = None
            records.append([name, agency_id, direction, stop, main, sequence, main_shape])

route_stops = pd.DataFrame(records, columns=["name", "agency", "direction", "stop_id", "main", "sequence", "shape_id"])
use_foreign_key(route_stops, "shape_id", "shapes", "external_id")

# Pandas has support for nullable integers, which works well here
route_stops.sequence = route_stops.sequence.astype("Int64")

route_stops.head(3)

'Converting shape_id: 100% complete.'

Unnamed: 0,name,agency,direction,stop_id,main,sequence,shape_id
0,84X,978,0,8350DB004210,True,22,1009750
1,84X,978,0,8220DB000848,True,5,1009750
2,84X,978,0,8350DB004305,True,34,1009750


Finally, the routes stops data can be sent to the database

In [32]:
route_stops_v2 = route_stops.copy()

use_foreign_key(route_stops_v2, "stop_id", "stops", "external_id")
shorten_names(route_stops_v2)
use_foreign_key(route_stops_v2, "name", "routenames", "name")
use_foreign_key(route_stops_v2, "agency", "agency", "external_id")

route_stops_v2.rename(columns={
    "name": "name_id",
    "agency": "agency_id"
}, inplace=True)

sort(route_stops_v2)

display(route_stops_v2.head(2))
commit(route_stops_v2, "routestops")

'Converting stop_id: 100% complete.'

'Converting name: 100% complete.'

'Converting agency: 100% complete.'

Unnamed: 0,name_id,agency_id,direction,stop_id,main,sequence,shape_id
0,0,4,0,152,True,13,765016
1,0,4,0,153,True,15,765016


(Database transactions disabled)
