## 1. Data Acquisition via TfL API

In [12]:
import requests
import pandas as pd

# Download every bus stop point from the TfL API one page at a time, stopping
# at the first page that contains no stop points, then save the result to CSV.

# requests waits indefinitely unless a timeout is given, so a stalled
# connection would otherwise hang this cell forever.
REQUEST_TIMEOUT_SECONDS = 30

all_stops = []
page = 1

while True:
    url = f'https://api.tfl.gov.uk/StopPoint/Mode/bus?page={page}'
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on HTTP errors. Without this, an error payload that lacks
    # 'stopPoints' would look like the last page, end the loop early and
    # leave a silently truncated dataset to be written to disk.
    response.raise_for_status()
    data = response.json()

    # A missing or empty 'stopPoints' list marks the end of the results.
    if 'stopPoints' not in data or not data['stopPoints']:
        break

    # Keep only the fields used later; list-valued fields are flattened into
    # comma-separated strings so that each stop fits in a single CSV row.
    for stop in data['stopPoints']:
        all_stops.append({
            'id': stop.get('id'),
            'naptanId': stop.get('naptanId'),
            'commonName': stop.get('commonName'),
            'lat': stop.get('lat'),
            'lon': stop.get('lon'),
            'modes': ', '.join(stop.get('modes', [])),
            'lines': ', '.join([l['name'] for l in stop.get('lines', [])])
        })

    print(f'Page {page} fetched — {len(all_stops)} bus stops in total.')
    page += 1

df = pd.DataFrame(all_stops)
display(df.head())
df.to_csv('latest_bus_stops.csv', index=False)

Page 1 fetched — 1000 bus stops in total.
Page 2 fetched — 2000 bus stops in total.
Page 3 fetched — 3000 bus stops in total.
Page 4 fetched — 4000 bus stops in total.
Page 5 fetched — 5000 bus stops in total.
Page 6 fetched — 6000 bus stops in total.
Page 7 fetched — 7000 bus stops in total.
Page 8 fetched — 8000 bus stops in total.
Page 9 fetched — 9000 bus stops in total.
Page 10 fetched — 10000 bus stops in total.
Page 11 fetched — 11000 bus stops in total.
Page 12 fetched — 12000 bus stops in total.
Page 13 fetched — 13000 bus stops in total.
Page 14 fetched — 14000 bus stops in total.
Page 15 fetched — 15000 bus stops in total.
Page 16 fetched — 16000 bus stops in total.
Page 17 fetched — 17000 bus stops in total.
Page 18 fetched — 18000 bus stops in total.
Page 19 fetched — 19000 bus stops in total.
Page 20 fetched — 20000 bus stops in total.
Page 21 fetched — 21000 bus stops in total.
Page 22 fetched — 22000 bus stops in total.
Page 23 fetched — 23000 bus stops in total.
Page 2

Unnamed: 0,id,naptanId,commonName,lat,lon,modes,lines
0,4900080165,4900080165,.Heathrow Central,51.47096,-0.45384,bus,
1,490008016N2,490008016N2,.Heathrow Central,51.47084,-0.45231,bus,
2,490000200E,490000200E,.Russell Square,51.52233,-0.12713,bus,"14, SL6"
3,490004889E,490004889E,Abbeville Road,51.45264,-0.14249,bus,"50, 355"
4,490004889W,490004889W,Abbeville Road,51.45273,-0.1429,bus,"355, 50"


## 2. Data Preprocessing

In [13]:
# Count stops whose 'lines' value is NaN.
missing_count = df['lines'].isna().sum()
print(f'Total missing lines: {missing_count}')

# 'lines' is assembled with ', '.join(...) during acquisition, so a stop that
# serves no lines holds an empty string rather than NaN. Count those as well,
# otherwise the check above reports 0 even when some stops have no lines.
empty_count = df['lines'].fillna('').str.strip().eq('').sum() - missing_count
print(f'Total stops with an empty lines string: {empty_count}')

Total missing lines: 0


In [14]:
# Flag stops served by buses only: 1 when the trimmed 'modes' string is
# exactly 'bus', otherwise 0.
bus_only_mask = df['modes'].apply(lambda modes_text: modes_text.strip() == 'bus')
df['is_pure_bus'] = bus_only_mask.map({True: 1, False: 0})

In [None]:
# Clean: normalise the 'lines' strings and strip leading dots from stop names.
def clean_lines(text):
    """Normalise a comma-separated string of line names.

    Each name is trimmed of surrounding whitespace, blank entries are
    discarded, duplicates are removed, and the remaining names are sorted
    as strings (so '355' sorts before '50').

    Parameters
    ----------
    text : str or missing value
        Comma-separated line names, e.g. '50, 355'. Anything for which
        ``pd.isna`` is true is treated as "no lines".

    Returns
    -------
    str
        The cleaned names joined with ', ', or '' when there are none.
    """
    if pd.isna(text):
        return ''
    # Drop blank tokens (from '', 'a,,b' or a trailing comma) so they cannot
    # produce a stray leading ', ' in the result.
    names = {token.strip() for token in text.split(',') if token.strip()}
    return ', '.join(sorted(names))

# Normalise every stop's line list (trimmed, de-duplicated, sorted). One pass
# is enough: clean_lines returns the same string when fed its own output.
df['lines'] = df['lines'].apply(clean_lines)

# Some stop names start with a '.', e.g. '.Heathrow Central'; remove the
# leading dots and then any surrounding whitespace.
df['commonName'] = df['commonName'].str.lstrip('.').str.strip()

In [16]:
# Merge: collapse rows sharing the same name, coordinates and bus-only flag, combining their line lists.
def merge_lines(series):
    """Combine several comma-separated line strings into one.

    Every string in ``series`` is split on commas; the pieces are trimmed,
    blank pieces are ignored, duplicates are removed, and the unique names
    are returned sorted as strings and joined with ', '.
    """
    unique_names = {
        piece.strip()
        for joined_names in series
        for piece in joined_names.split(',')
        if piece.strip()
    }
    return ', '.join(sorted(unique_names))

# Rows that agree on all of these columns are collapsed into a single row.
group_keys = ['commonName', 'lat', 'lon', 'is_pure_bus']

# Keep the first id seen in each group and merge the groups' line lists.
aggregations = {
    'id': 'first',
    'lines': merge_lines,
}

grouped_df = df.groupby(group_keys).agg(aggregations).reset_index()

display(grouped_df.head())
grouped_df.to_csv('processed_bus_stops.csv', index=False)

Unnamed: 0,commonName,lat,lon,is_pure_bus,id,lines
0,Abbeville Road,51.45264,-0.14249,1,490004889E,"355, 50"
1,Abbeville Road,51.45273,-0.1429,1,490004889W,"355, 50"
2,Abbey Lane,51.53378,-0.00718,1,490003025W,"108, 25, 276, 425, D8, N205, N25, N8"
3,Abbey Road,51.41598,-0.18575,1,490007938E,"131, 152, 200, 219, 57, 655, N155"
4,Abbey Road,51.41611,-0.18536,1,490007938F,"131, 152, 200, 219, 57, 655, N155"
