# Scrape Leeds City Council election results into a CSV file

The data will be published on the Leeds Data Mill, but if you need it sooner than two weeks, you can use the following code!

In [None]:
# import library for querying website
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

In [None]:
# Since 2022 Leeds City Council has listed all results on a single page of
# its website. Keep that address in one variable for the cells below.

main_page = "https://www.leeds.gov.uk/your-council/elections/leeds-city-council-election-results"

In [None]:
# Set up Selenium to drive Chrome in headless mode (no visible browser window),
# load the results page in that session and keep a copy of its HTML.
# `webdriver` is already imported in the first cell; importing it again here is
# harmless and lets this cell be read on its own.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Chrome options object; "--headless=new" is the flag that requests headless mode.
options = Options()
options.add_argument("--headless=new")
# Start the Chrome session. `wd` stays open until it is shut down in a later cell.
wd = webdriver.Chrome(options=options)

# Navigate the browser session to the results page address held in `main_page`.
wd.get(main_page)

# Capture the page's HTML as a string; the parsing cell below reads this variable.
html_page = wd.page_source


In [None]:
# Parse the HTML captured from the Selenium session with BeautifulSoup.
import re

soup = BeautifulSoup(html_page, 'html.parser')

# The page is organised as a series of accordion HTML elements.
# Select the first element whose id matches "acc_" followed by four digits;
# this is the main container holding the accordion elements.
accordion_layer = soup.find(id=re.compile(r"acc_[0-9]{4}"))

# find() returns None when nothing matches. Fail here with a clear message
# rather than with an AttributeError in a later cell, e.g. if the page layout
# has changed or the page had not finished rendering when it was captured.
if accordion_layer is None:
    raise ValueError(
        "Could not find the results accordion (an element with an id matching "
        "'acc_[0-9]{4}') in the downloaded page."
    )

In [None]:
# Each result is contained in its own accordion section, with one element
# holding the ward name and another holding the main results table.
# Collect the id of every tag inside the accordion container that has an id
# attribute, as a Python list in document order.
# find_all is the supported spelling; findAll is a deprecated alias.
full_accordion_id_names = [
    tag['id'] for tag in accordion_layer.find_all(True, attrs={'id': True})
]

In [None]:
# Main scraping step.
#
# Iterate through the accordion id names and look each one up in the
# BeautifulSoup object:
#   * if the id contains "trigger", its content is just the ward name, so
#     remember it for the results section that follows;
#   * otherwise read the element's HTML with pandas (prettified so pandas
#     parses it correctly). This yields a list of two dataframes: the main
#     results table, and a table with turnout, spoilt ballots and electorate.
#     The second table is reshaped and attached to the first as extra columns,
#     and the result is appended to `frame_stack`.
# Finally all the per-ward dataframes are concatenated into one dataframe.
from io import StringIO

frame_stack = []

for layer in full_accordion_id_names:

    element = accordion_layer.find(id=layer)

    if "trigger" in layer:
        # Trigger elements only carry the ward name as their first child.
        ward = element.contents[0].strip()
        continue

    # Passing a literal HTML string to read_html is deprecated in recent
    # pandas releases, so wrap the markup in a file-like StringIO object.
    tbl_list = pd.read_html(StringIO(element.prettify()))

    main_tbl = tbl_list[0]

    # Transpose the metadata table so each metadata item becomes a column.
    meta_tbl = tbl_list[1].T

    # pandas misreads this table: the real column names end up in the row
    # labelled 0, so promote that row to be the header...
    meta_tbl.columns = meta_tbl.loc[0, :]

    # ...strip the colon from those column names (literal replacement)...
    meta_tbl.columns = meta_tbl.columns.str.replace(":", "", regex=False)

    # ...and drop the row that was used to assign the column names.
    meta_tbl.drop(0, axis=0, inplace=True)

    # Add every metadata column to the main results table as a new column,
    # repeating the single metadata value on every row.
    for col in meta_tbl.columns:
        main_tbl[col] = meta_tbl[col].values[0]

    main_tbl['Ward'] = ward

    # Vote share: each candidate's votes as a percentage of the ward total,
    # rounded to one decimal place.
    main_tbl['vote_share'] = (main_tbl['Votes'] / main_tbl['Votes'].sum() * 100).round(1)

    frame_stack.append(main_tbl)

results_frame = pd.concat(frame_stack)

# Convert the turnout column from strings such as "35.2%" to floats.
results_frame.Turnout = results_frame.Turnout.str.replace("%", '', regex=False).astype(float)
        


In [None]:
# Shut down the webdriver.
# quit() ends the whole session (all windows plus the chromedriver process),
# whereas close() only closes the current window and can leave the driver
# process running in the background.
wd.quit()

In [None]:
# Write the combined results to a CSV file, leaving out the dataframe index.
output_path = "../data/Leeds_LE2023_results.csv"
results_frame.to_csv(output_path, index=False)