# Routes and Mileage

## Extract routes and endpoints from static MTA data

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
import numpy as np

In [3]:
# GTFS feed folders, one per borough, each containing a routes.txt file.
boroughs = [
    "google_transit_bronx",
    "google_transit_brooklyn",
    "google_transit_manhattan",
    "google_transit_queens",
    "google_transit_staten_island",
]

# Read every borough's routes.txt and stack them once. Series.append was
# removed in pandas 2.0, and growing a Series inside a loop re-copies the
# accumulated data on every iteration; a single pd.concat avoids both problems.
all_routes = pd.concat(
    [
        pd.read_csv(
            f"../../data/{borough}/routes.txt",
            usecols=["route_short_name", "route_long_name"],
        )
        for borough in boroughs
    ],
    ignore_index=True,
)

routes = all_routes["route_short_name"]

# The long name is split on " - "; the first two pieces are kept as the end
# points. reindex guarantees both columns exist even when no name contains
# the separator, so a missing second piece shows up as an empty cell instead
# of raising KeyError.
start_finish_df = (
    all_routes["route_long_name"]
    .str.split(" - ", expand=True)
    .reindex(columns=[0, 1])
)
endpoints_1 = start_finish_df[0]
endpoints_2 = start_finish_df[1]

# One row per route, indexed by the route's short name.
routes_df = pd.DataFrame(
    {
        "Route": routes,
        "End point 1": endpoints_1,
        "End point 2": endpoints_2,
    }
).set_index("Route")

In [4]:
# Preview the first rows of routes_df (indexed by route, with both end points).
routes_df.head()

Unnamed: 0_level_0,End point 1,End point 2
Route,Unnamed: 1_level_1,Unnamed: 2_level_1
B1,Bay Ridge,Manhattan Beach
B11,Sunset Park,Midwood
B12,Lefferts Gardens,East New York
B13,Spring Creek,Wyckoff Hospital
B14,Spring Creek,Crown Heights


## Extract mileage from bus route profiles

In [33]:
# Load the route profile table and index it by route name.
raw_profiles = pd.read_csv("mta_bus_data/bus_route_profiles.csv")
profiles_df = raw_profiles.set_index("Route")

# Normalise the index so it can be matched against lower-cased route names:
# " SBS" becomes "-SBS", everything is lower-cased, and trailing "*"
# characters are removed.
normalised_route_names = (
    profiles_df.index
    .str.replace(" SBS", "-SBS")
    .str.lower()
    .str.rstrip("*")
)
profiles_df.index = normalised_route_names
profiles_df.head(20)

Unnamed: 0_level_0,"Average Weekday Ridership, 2011","Average Weekday Ridership, 2016","Ridership Change, 2011-2016",On-Time Performance 2016,Wait Assessment 2016,Average Speed (mph),Length of Route (in miles),"Frequency during AM Rush, Buses per Hour (7:30am to 8:30am)","Frequency in the Evening, Buses per Hour (8:30pm to 10:30pm)",# of Stops along Route,...,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26
Route,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bx1,38891,36487,-6%,69%,85%,7.0,8.8,7.0,4.0,47.0,...,,,,,,,,,,
bx2,38891,36487,-6%,60%,75%,6.0,8.0,7.0,3.0,50.0,...,,,,,,,,,,
bx3,15800,14921,-6%,65%,79%,7.0,4.5,11.0,7.0,28.0,...,,,,,,,,,,
bx4,12141,11411,-6%,63%,79%,6.0,4.7,5.0,3.0,32.0,...,,,,,,,,,,
bx4a,12141,11411,-6%,59%,77%,6.0,4.9,6.0,3.0,35.0,...,,,,,,,,,,
bx5,12186,11194,-8%,69%,82%,8.0,7.7,10.0,6.0,36.0,...,,,,,,,,,,
bx6,22146,23800,7%,66%,81%,6.0,5.8,17.0,9.0,32.0,...,,,,,,,,,,
bx7,14771,13720,-7%,72%,82%,7.0,6.0,10.0,5.0,43.0,...,,,,,,,,,,
bx8,6763,7282,8%,69%,82%,8.0,8.7,8.0,2.0,55.0,...,,,,,,,,,,
bx9,26868,27175,1%,67%,81%,6.0,6.4,15.0,8.0,37.0,...,,,,,,,,,,


In [37]:
LENGTH_COLUMN = "Length of Route (in miles)"

# One length per profile key. A repeated key would make a label lookup return
# several values instead of a scalar, so only the first occurrence is kept.
profile_miles = profiles_df[LENGTH_COLUMN]
profile_miles = profile_miles[~profile_miles.index.duplicated(keep="first")]

# Two candidate lookup keys per route: the lower-cased name itself, and the
# lower-cased name with any "-SBS" suffix removed.
route_names = routes_df.index.to_series()
exact_keys = route_names.str.lower()
base_keys = route_names.str.replace("-SBS", "", regex=False).str.lower()

# Prefer the exact key, fall back to the key without "-SBS", and record 0 when
# the profiles contain neither. Plain arrays are used so the result is
# assigned by position and never re-aligned on (possibly repeated) labels.
# NOTE(review): 0 therefore means "no profile found", not a zero-length route.
routes_df["Length (miles)"] = np.where(
    exact_keys.isin(profile_miles.index).to_numpy(),
    exact_keys.map(profile_miles).to_numpy(),
    np.where(
        base_keys.isin(profile_miles.index).to_numpy(),
        base_keys.map(profile_miles).to_numpy(),
        0,
    ),
)
routes_df.head(20)

Unnamed: 0_level_0,End point 1,End point 2,Length (miles)
Route,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bx1,Riverdale,Mott Haven,8.8
Bx10,Riverdale,Norwood,7.4
Bx11,Parkchester,Southern Blvd & GW Bridge,4.9
Bx12,Pelham Pkwy,Fordham Rd,8.1
Bx12-SBS,Pelham Pkwy,Fordham Rd,7.6
Bx13,George Washington Bridge,Yankee Stadium,4.1
Bx15,Fordham Plaza,The Hub,7.6
Bx16,Pelham,Norwood,7.4
Bx17,Fordham Plaza,Port Morris,5.7
Bx18,Morris Heights/High Bridge Circulator,,2.4


In [39]:
# Relative path of the CSV file that receives routes_df.
OUTPUT_FILE_ROUTES = "my_data/bus_route_endpoint_mileage.csv"
# Write routes_df (index included, as per the to_csv default) to that file.
routes_df.to_csv(OUTPUT_FILE_ROUTES)

## Get bus stops on every route

In [5]:
BUSTIME_URL = "https://bustime.mta.info"
REQUEST_TIMEOUT_SECONDS = 30

# Stop names per (route, direction), in the order the page lists them.
# Collecting plain lists and building the DataFrame once avoids growing the
# frame one cell at a time, which re-allocates on every new row or column.
stops_by_direction = {}

# A single Session reuses the HTTP connection across all route queries.
with requests.Session() as session:
    for queried_route in tqdm(routes):
        # `params` URL-encodes the route name; the timeout keeps one stalled
        # request from hanging the whole run.
        response = session.get(
            f"{BUSTIME_URL}/m/index",
            params={"q": queried_route},
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        # Fail loudly on an HTTP error instead of parsing an error page and
        # silently recording no stops for the route.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html5lib")

        for div in soup.find_all("div", {"class": "directionForRoute"}):
            direction_title = div.find("p", {"class": "directionTitle"}).get_text()
            # The title is split into three whitespace-separated parts: the
            # route name, one token that is discarded, and the rest, which is
            # kept as the direction. The route name comes from the page, so it
            # gets its own variable rather than rebinding the loop variable.
            listed_route, _, direction = direction_title.split(maxsplit=2)
            stop_names = [
                stop.find("a").get_text()
                for stop in div.find_all("li", {"class": ["start", "middle", "end"]})
            ]
            # A direction without any stops contributes no row.
            if stop_names:
                stops_by_direction[(listed_route, direction)] = stop_names

# Pad every stop list with NaN up to the longest one so the rows are rectangular.
max_stops = max((len(names) for names in stops_by_direction.values()), default=0)
stops_df = pd.DataFrame(
    [
        names + [np.nan] * (max_stops - len(names))
        for names in stops_by_direction.values()
    ],
    index=pd.MultiIndex.from_tuples(
        list(stops_by_direction), names=["Route", "Direction"]
    ),
    columns=[f"Stop {stop_num}" for stop_num in range(1, max_stops + 1)],
)

100%|██████████| 242/242 [02:21<00:00,  1.71it/s]


In [12]:
# Drop stop columns that hold no data at all *before* blanking the gaps: once
# NaN has been replaced by "", no column counts as all-missing any more and
# the dropna can never remove anything. Rebinding instead of mutating in
# place keeps the cell safe to re-run.
stops_df = stops_df.dropna(axis=1, how="all").fillna("")
stops_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Stop 1,Stop 2,Stop 3,Stop 4,Stop 5,Stop 6,Stop 7,Stop 8,Stop 9,Stop 10,...,Stop 108,Stop 109,Stop 110,Stop 111,Stop 112,Stop 113,Stop 114,Stop 115,Stop 116,Stop 117
Route,Direction,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Bx1,MOTT HAVEN 138 ST via CONCOURSE,RIVERDALE AV/W 231 ST,W 231 ST/CORLEAR AV,W 231 ST/BROADWAY,W 231 ST/BAILEY AV,HEATH AV/ALBANY CRES,HEATH AV/SUMMIT PL,SEDGWICK AV/FT INDEPENDENCE ST,SEDGWICK AV/GILES PL,SEDGWICK AV/STEVENSON PL 1,SEDGWICK AV/STEVENSON PL 2,...,,,,,,,,,,
Bx1,RIVERDALE 231 ST via CONCOURSE,LINCOLN AV/E 137 ST,GRAND CONCOURSE/E 138 ST,GRAND CONCOURSE/E 144 ST,GRAND CONCOURSE/E 149 ST,GRAND CONCOURSE/E 153 ST,GRAND CONCOURSE/E 156 ST,GRAND CONCOURSE/E 161 ST,GRAND CONCOURSE/E 163 ST,GRAND CONCOURSE/E 165 ST,GRAND CONCOURSE/MC CLELLAN ST,...,,,,,,,,,,
Bx10,NORWOOD 205 ST STA,RIVERDALE AV/W 263 ST,RIVERDALE AV/W 261 ST,RIVERDALE AV/W 260 ST,RIVERDALE AV/W 259 ST,RIVERDALE AV/W 256 ST,RIVERDALE AV/W 254 ST,HENRY HUDSON PKY W/W 252 ST,HENRY HUDSON PKY W/W 249 ST,HENRY HUDSON PKY W/W 246 ST,HENRY HUDSON PKY W/W 239 ST,...,,,,,,,,,,
Bx10,RIVERDALE 263 ST,E 206 ST/ROCHAMBEAU AVE,E MOSHOLU PY N/VAN CORTLANDT AV E,VAN CORTLANDT AV E/ROCHAMBEAU AV,BAINBRIDGE AV/RESERVOIR OVAL E,BAINBRIDGE AV/E 210 ST,E GUN HILL RD/BAINBRIDGE AV,E GUN HILL RD/DE KALB AV,JEROME AV/E MOSHOLU PY N,JEROME AV/E 205 ST,BEDFORD PK/JEROME AV,...,,,,,,,,,,
Bx11,WASHINGTON HEIGHTS G W BRIDGE,W FARMS RD/WESTCHESTER AV,W FARMS RD/E 167 ST,W FARMS RD/LONGFELLOW AV,E 172 ST/LONGFELLOW AV,E 172 ST/VYSE AV,E 172 ST/SOUTHERN BL,LOUIS NINE BL/E 170 ST,LOUIS NINE BL/BOSTON RD,CLAREMONT PKY/CROTONA PARK E,CLAREMONT PKY/CROTONA AV,...,,,,,,,,,,


In [13]:
# Relative path of the CSV file that receives stops_df.
OUTPUT_FILE_STOPS = "../../data/stops_by_route.csv"
# Write stops_df (index included, as per the to_csv default) to that file.
stops_df.to_csv(OUTPUT_FILE_STOPS)