In [7]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup 
import re

In [8]:
url = "https://www.scotrail.co.uk/plan-your-journey/stations-and-facilities"

In [9]:
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

In [10]:
links = []
re_string = r"^\/plan-your-journey\/stations-and-facilities\/([a-z][a-z][a-z])$"
for link in soup.find_all('a'):
    link_text = link.get('href')
    if link_text is not None:
        if re.search(re_string, link_text) is not None:
            links.append(link_text)

In [11]:
station_codes = list(map(lambda x: x[-3:].upper(), links))

In [12]:
addresses = []
urls = [f"https://www.scotrail.co.uk/plan-your-journey/stations-and-facilities/{crs.lower()}" for crs in station_codes]
for url in urls:
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    address = soup.find("div", {"class": "address"})
    addresses.append(address)

In [13]:
def strip_postcode(address):
    postcode = " ".join(re.split(r"<br/>", str(address))[-1].split(" ")[:2])
    if len(postcode) > 8:
        postcode = ''.join([c for c in postcode if not c.islower() and not c in ('\n', '\\', '<', '>', '/')])
    return postcode
list(map(strip_postcode, addresses))

['AB11 6LX',
 'KY3 0SN',
 'IV23 2QD',
 'IV22 2EJ',
 'IV54 8YH',
 'EH55 8NQ',
 'ML1 2SX',
 'ML6 9EX',
 'G31 3JN',
 'G83 0UL',
 'FK10 1BA',
 'IV17 0SE',
 'KW12 6UR',
 'G3 8RR',
 'DG12 6AS',
 'G12 0AY',
 'DD11 1RQ',
 'IV24 3AQ',
 'G83 7DT',
 'KA22 8BH',
 'KA22 8AU',
 'KA22 8AU',
 'G2 8DL',
 'PH39 4NJ',
 'EH48 3LP',
 'G83 7DB',
 'G22 6LR',
 'IV54 8YX',
 'KA18 2BH',
 'PH22 1PD',
 'KA7 1TH',
 'G69 7RN',
 'G83 8SS',
 'DD5 4QH',
 'PH33 7JF',
 'KA10 6SA',
 'G69 7TS',
 'G21 4NB',
 'G78 1AA',
 'KA26 0QF',
 'DD11 1PR',
 'EH48 1BA',
 'G61 4AN',
 'PH39 4NR',
 'IV4 7EF',
 'G31 1SG',
 'ML4 1RJ',
 'G64 1PE',
 'PA7 5AD',
 'EH48 3BW',
 'PH18 5SL',
 'ML5 1NP',
 'G72 9BB',
 'PA15 2TF',
 'G60 5AH',
 'PA16 9HA',
 'EH55 8JH',
 'FK9 4PH',
 'PA36 4AD',
 'G40 1BN',
 'KW9 6PY',
 'DD5 2DX',
 'EH15 2NG',
 'G73 3SA',
 'KY3 9DR',
 'G76 8JB',
 'ML6 7RJ',
 'G72 7EL',
 'FK1 4JW',
 'KY5 0BP',
 'G52 2DE',
 'G82 5NL',
 'ML1 5AL',
 'ML8 5AA',
 'G32 8YS',
 'DD7 6AY',
 'G32 6AW',
 'PH23 3AJ',
 'ML11 8PR',
 'PA15 2TG',
 'G44 4

In [14]:
scottish_stations = list(zip(station_codes, map(strip_postcode, addresses)))

In [19]:
scottish_stations = list(map(lambda x: [x[0], x[1]], scottish_stations))
scottish_stations

[['ABD', 'AB11 6LX'],
 ['AUR', 'KY3 0SN'],
 ['AAT', 'IV23 2QD'],
 ['ACN', 'IV22 2EJ'],
 ['ACH', 'IV54 8YH'],
 ['ADW', 'EH55 8NQ'],
 ['AIR', 'ML1 2SX'],
 ['ADR', 'ML6 9EX'],
 ['AXP', 'G31 3JN'],
 ['ALX', 'G83 0UL'],
 ['ALO', 'FK10 1BA'],
 ['ASS', 'IV17 0SE'],
 ['ABC', 'KW12 6UR'],
 ['AND', 'G3 8RR'],
 ['ANN', 'DG12 6AS'],
 ['ANL', 'G12 0AY'],
 ['ARB', 'DD11 1RQ'],
 ['ARD', 'IV24 3AQ'],
 ['AUI', 'G83 7DT'],
 ['ADS', 'KA22 8BH'],
 ['ASB', 'KA22 8AU'],
 ['ADN', 'KA22 8AU'],
 ['AGS', 'G2 8DL'],
 ['ARG', 'PH39 4NJ'],
 ['ARM', 'EH48 3LP'],
 ['ART', 'G83 7DB'],
 ['ASF', 'G22 6LR'],
 ['ATT', 'IV54 8YX'],
 ['AUK', 'KA18 2BH'],
 ['AVM', 'PH22 1PD'],
 ['AYR', 'KA7 1TH'],
 ['BIO', 'G69 7RN'],
 ['BHC', 'G83 8SS'],
 ['BSI', 'DD5 4QH'],
 ['BNV', 'PH33 7JF'],
 ['BSS', 'KA10 6SA'],
 ['BGI', 'G69 7TS'],
 ['BNL', 'G21 4NB'],
 ['BRR', 'G78 1AA'],
 ['BRL', 'KA26 0QF'],
 ['BYL', 'DD11 1PR'],
 ['BHG', 'EH48 1BA'],
 ['BRN', 'G61 4AN'],
 ['BSL', 'PH39 4NR'],
 ['BEL', 'IV4 7EF'],
 ['BLG', 'G31 1SG'],
 ['BLH', 'M

In [20]:
def get_coords(postcode):
    try:
        result = requests.get(f"https://findthatpostcode.uk/postcodes/{postcode.replace(' ', '+')}.json")
        location =  result.json()['data']['attributes']['location']
        return location['lat'], location['lon']
    except Exception:
        print(f"Failed on {postcode}")
        return "FAILED", "FAILED"

In [21]:
for station in scottish_stations:
    lat, lon = get_coords(station[0])
    station.append(lat)
    station.append(lon)

Failed on ABD
Failed on AUR
Failed on AAT
Failed on ACN
Failed on ACH
Failed on ADW
Failed on AIR
Failed on ADR
Failed on AXP
Failed on ALX
Failed on ALO
Failed on ASS
Failed on ABC
Failed on AND
Failed on ANN
Failed on ANL
Failed on ARB
Failed on ARD
Failed on AUI
Failed on ADS
Failed on ASB
Failed on ADN
Failed on AGS
Failed on ARG
Failed on ARM
Failed on ART
Failed on ASF
Failed on ATT
Failed on AUK
Failed on AVM
Failed on AYR
Failed on BIO
Failed on BHC
Failed on BSI
Failed on BNV
Failed on BSS
Failed on BGI
Failed on BNL
Failed on BRR
Failed on BRL
Failed on BYL
Failed on BHG
Failed on BRN
Failed on BSL
Failed on BEL
Failed on BLG
Failed on BLH
Failed on BBG
Failed on BPT
Failed on BKR
Failed on BLA
Failed on BAI
Failed on BLT
Failed on BGS
Failed on BWG
Failed on BCN
Failed on BRC
Failed on BEA
Failed on BRO
Failed on BDG
Failed on BRA
Failed on BYF
Failed on BSU
Failed on BUI
Failed on BTS
Failed on BUS
Failed on CAC
Failed on CBL
Failed on CMO
Failed on CDD
Failed on CDO
Failed

In [None]:
df_station = pd.DataFrame([s[:4] for s in scottish_stations], columns = ["Postcode", "CRS", "lat", "lon"])
df_station = pd.DataFrame(station_codes, columns = ["CRS"]).merge(df_station, how = 'left', on = 'CRS')
df_station.head()

In [None]:
nlc_codes = pd.read_csv("C:\\Users\\User\\Documents\\4. Fourth Year\\Project\\Railway-Fares\\Data\\nlc_codes.csv", dtype = str)
nlc_codes.head()

In [None]:
station_master = df_station.merge(nlc_codes, on='CRS', how = 'left')

In [None]:
station_master = station_master[["Postcode", "CRS", "lat", "lon", "Location", "NLC Clipped"]]
station_master = station_master.rename(columns = {"Location": "Name", "NLC Clipped":"NLC"})

In [15]:
station_master.to_csv("C:\\Users\\User\\Documents\\4. Fourth Year\\Project\\Railway-Fares\\Data\\Cleansed Data\\station.csv", index = False)