In [1]:
import os
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Address of the ScotRail "stations and facilities" page requested by this notebook.
url = "https://www.scotrail.co.uk/plan-your-journey/stations-and-facilities"

In [3]:
# Download and parse the page at `url`.
# - timeout: without one, requests waits indefinitely on a stalled connection.
# - raise_for_status(): stop here on an HTTP error rather than parsing an error page
#   as if it were the real content.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

In [4]:
# Keep the href of every anchor that matches the station-page pattern: the
# stations-and-facilities path followed by exactly three lowercase letters.
re_string = r"^\/plan-your-journey\/stations-and-facilities\/([a-z][a-z][a-z])$"
links = [
    href
    for href in (anchor.get('href') for anchor in soup.find_all('a'))
    # Anchors without an href attribute yield None and are skipped.
    if href is not None and re.search(re_string, href) is not None
]

In [5]:
# The station code is the last three characters of each matched link, upper-cased.
station_codes = [href[-3:].upper() for href in links]

In [6]:
# Fetch each station page and keep its <div class="address"> element (BeautifulSoup
# returns None when a page has no such element). `addresses` is built in the same
# order as `station_codes`.
addresses = []
urls = [f"https://www.scotrail.co.uk/plan-your-journey/stations-and-facilities/{crs.lower()}" for crs in station_codes]
# The loop uses its own names: reusing url/page/soup here would overwrite the
# index-page objects created in the earlier cells, so re-running those cells
# afterwards would operate on the last station page instead.
with requests.Session() as session:  # reuse one connection for all station requests
    for station_url in urls:
        # timeout: a stalled request raises instead of hanging the whole scrape.
        station_page = session.get(station_url, timeout=30)
        station_soup = BeautifulSoup(station_page.content, 'html.parser')
        addresses.append(station_soup.find("div", {"class": "address"}))

In [7]:
def strip_postcode(address):
    """Extract the postcode from a scraped address block.

    The address is converted to its string form, split on ``<br/>``, and the first
    two space-separated tokens of the last segment are taken as the postcode. When
    that candidate is longer than 8 characters, lowercase letters and the
    characters newline, backslash, ``<``, ``>`` and ``/`` are removed from it.

    Parameters
    ----------
    address : object or None
        Anything whose ``str()`` is the address markup (for example a parsed
        ``<div>`` element or a plain string), or ``None`` when no address
        block was found.

    Returns
    -------
    str
        The extracted postcode, or an empty string when ``address`` is ``None``.
    """
    # A missing address block would otherwise be stringified and returned as the
    # literal postcode "None".
    if address is None:
        return ""
    last_segment = re.split(r"<br/>", str(address))[-1]
    postcode = " ".join(last_segment.split(" ")[:2])
    if len(postcode) > 8:
        # The second token still carries trailing markup; drop the lowercase
        # letters and markup punctuation.
        postcode = "".join(
            c for c in postcode
            if not c.islower() and c not in ('\n', '\\', '<', '>', '/')
        )
    return postcode
# Show the postcode extracted from every scraped address.
[strip_postcode(address) for address in addresses]

['AB11 6LX',
 'KY3 0SN',
 'IV23 2QD',
 'IV22 2EJ',
 'IV54 8YH',
 'EH55 8NQ',
 'ML1 2SX',
 'ML6 9EX',
 'G31 3JN',
 'G83 0UL',
 'FK10 1BA',
 'IV17 0SE',
 'KW12 6UR',
 'G3 8RR',
 'DG12 6AS',
 'G12 0AY',
 'DD11 1RQ',
 'IV24 3AQ',
 'G83 7DT',
 'KA22 8BH',
 'KA22 8AU',
 'KA22 8AU',
 'G2 8DL',
 'PH39 4NJ',
 'EH48 3LP',
 'G83 7DB',
 'G22 6LR',
 'IV54 8YX',
 'KA18 2BH',
 'PH22 1PD',
 'KA7 1TH',
 'G69 7RN',
 'G83 8SS',
 'DD5 4QH',
 'PH33 7JF',
 'KA10 6SA',
 'G69 7TS',
 'G21 4NB',
 'G78 1AA',
 'KA26 0QF',
 'DD11 1PR',
 'EH48 1BA',
 'G61 4AN',
 'PH39 4NR',
 'IV4 7EF',
 'G31 1SG',
 'ML4 1RJ',
 'G64 1PE',
 'PA7 5AD',
 'EH48 3BW',
 'PH18 5SL',
 'ML5 1NP',
 'G72 9BB',
 'PA15 2TF',
 'G60 5AH',
 'PA16 9HA',
 'EH55 8JH',
 'FK9 4PH',
 'PA36 4AD',
 'G40 1BN',
 'KW9 6PY',
 'DD5 2DX',
 'EH15 2NG',
 'G73 3SA',
 'KY3 9DR',
 'G76 8JB',
 'ML6 7RJ',
 'G72 7EL',
 'FK1 4JW',
 'KY5 0BP',
 'G52 2DE',
 'G82 5NL',
 'ML1 5AL',
 'ML8 5AA',
 'G32 8YS',
 'DD7 6AY',
 'G32 6AW',
 'PH23 3AJ',
 'ML11 8PR',
 'PA15 2TG',
 'G44 4

In [8]:
# Pair each station code with the postcode extracted from the address at the
# same position in `addresses`.
scottish_stations = [
    (crs, strip_postcode(address))
    for crs, address in zip(station_codes, addresses)
]

In [9]:
# Two-column table built from the (code, postcode) pairs.
correct_postcodes = pd.DataFrame(
    data=scottish_stations,
    columns=["CRS", "Scotrail Postcode"],
)

In [10]:
# Folder that holds the project's data files. The default is the original
# machine-specific location; set the RAILWAY_FARES_DATA_DIR environment variable
# to run the notebook against a different folder without editing this cell.
data_path = os.environ.get(
    "RAILWAY_FARES_DATA_DIR",
    "C:\\Users\\User\\Documents\\4. Fourth Year\\Project\\Railway-Fares\\Data\\Cleansed Data\\",
)
# File names are appended to data_path by string concatenation, so make sure an
# overridden value ends with a path separator.
if not data_path.endswith(("\\", "/")):
    data_path += os.sep

# dtype=str: every column is read as text.
stations = pd.read_csv(data_path + "station.csv", dtype = str)

In [15]:
# Default (inner) merge on CRS: only codes present in both tables are kept.
joined = correct_postcodes.merge(stations[["CRS", "Postcode"]], on = "CRS")

# match = 1 when both postcodes share their first four characters, else 0.
# Columns are addressed by name: x[1] / x[2] on a string-labelled row relied on the
# positional fallback of Series.__getitem__, which newer pandas deprecates.
# The vectorised .str slice gives NaN for a missing postcode, which compares
# unequal (match = 0), where the row-wise version raised on it.
# NOTE(review): four characters is not always the whole outward code, e.g.
# "G3 8RR"[:4] is "G3 8" - confirm that this prefix length is intended.
joined["match"] = (
    joined["Scotrail Postcode"].str[:4] == joined["Postcode"].str[:4]
).astype("int64")

In [16]:
# Display the joined comparison table.
joined

Unnamed: 0,CRS,Scotrail Postcode,Postcode,match
0,ABD,AB11 6LX,AB11 6RT,1
1,AUR,KY3 0SN,KY3 0TQ,1
2,AAT,IV23 2QD,IV23 2QD,1
3,ACN,IV22 2EJ,IV22 2EE,1
4,ACH,IV54 8YH,IV54 8YU,1
...,...,...,...,...
358,WCK,KW1 4QT,KW1 5LB,1
359,WLM,G76 7HQ,G76 7NP,1
360,WSH,ML2 0EX,ML2 0EX,1
361,WDL,PA14 5PP,PA14 6QX,1


In [18]:
# Write the comparison table to postcode_check.csv under data_path, without the index.
joined.to_csv(data_path + "postcode_check.csv", index = False)

In [14]:
"EH8 9SW"[:5]

'EH8 9'