# Elections PEI - 2019 Provincial Election

We scrape data from the [Elections PEI](https://www.electionspei.ca/2019-election-results) website and build simple datasets for use in our web app's visualizations and in future analysis.

In [1]:
# Dependencies.
import io
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Load the previously transformed per-district counts; the CSV's first column
# becomes the index, and the index is left unnamed.
df = pd.read_csv(
    'elections_transformed/02b_2019_provincial_election_district_counts_by_dist.csv',
    index_col=0,
).rename_axis(None)
# Preview the first rows.
df.head()

Unnamed: 0,Green,Ind,Liberal,NDP,PC
1,804,0,861,0,1347
2,865,0,663,49,1493
3,675,0,785,124,1373
4,781,0,615,0,1545
5,1152,0,902,38,934


In [3]:
# Get data from webpage and create bs4 soup for parsing.
# The response is opened in a `with` block so the HTTP connection is closed
# as soon as the body has been read, rather than being left open.
with urllib.request.urlopen('https://www.electionspei.ca/2019-election-results') as response:
    source = response.read()
soup = BeautifulSoup(source, 'html.parser')

In [4]:
# Collect every <a> tag on the page whose data-entity-type attribute is 'node'.
urls = soup.find_all('a', attrs={'data-entity-type': 'node'})

In [5]:
# Create list to store actual district urls.
district_urls = []
# Loop through the candidate <a> tags and keep only the district links.
for url in urls:
    try:
        # If the text in the <a> tag cannot be turned into an int, it is not a district <a>.
        int(url.text)
        href = url['href']
    except (ValueError, KeyError):
        # Only the two expected failures are handled: non-numeric link text
        # (ValueError) or a tag without an href attribute (KeyError).
        # Anything else (e.g. KeyboardInterrupt) propagates instead of being
        # silently reported as "not a district url".
        print(f"{url.text} - not a district url.")
    else:
        district_urls.append(href)

2019 Referendum Results - not a district url.
Careers - not a district url.


In [6]:
# Check list of district data urls (relative hrefs collected from the results page).
district_urls

['/district-1-results-souris-elmira',
 '/district-2-results-georgetown-pownal',
 '/district-3-results-montague-kilmuir',
 '/district-4-results-belfast-murray-river',
 '/district-5-results-mermaid-stratford',
 '/district-6-results-stratford-keppoch',
 '/district-7-results-morell-donagh',
 '/district-8-results-stanhope-marshfield',
 '/district-9-results-charlottetown-hillsborough-park',
 '/district-10-results-charlottetown-winsloe',
 '/district-11-results-charlottetown-belvedere',
 '/district-12-results-charlottetown-victoria-park',
 '/district-13-results-charlottetown-brighton',
 '/district-14-results-charlottetown-west-royalty',
 '/district-15-results-brackley-hunter-river',
 '/district-16-results-cornwall-meadowbank',
 '/district-17-results-new-haven-rocky-point',
 '/district-18-results-rustico-emerald',
 '/district-19-results-borden-kinkora',
 '/district-20-results-kensington-malpeque',
 '/district-21-results-summerside-wilmot',
 '/district-22-results-summerside-south-drive',
 '/dist

In [7]:
# Temporarily only run on the first url while developing.
# Set MAX_DISTRICTS to None for the final run instead of deleting this cell:
# slicing with None keeps the whole list. Slicing also tolerates an empty
# list, where indexing with [0] would raise IndexError.
MAX_DISTRICTS = 1
district_urls = district_urls[:MAX_DISTRICTS]
district_urls

['/district-1-results-souris-elmira']

In [8]:
# Create dictionary to store information returned.
district_dict = {}

# Loop through each district url, download the page once, and extract both the
# district name (via bs4) and the first results table (via pandas).
# NOTE(review): the key `i` is the 1-based position in district_urls; this
# assumes the list is ordered by district number - confirm against the page.
for i, district_url in enumerate(district_urls, 1):

    # Download the page a single time; the `with` block closes the connection.
    with urllib.request.urlopen(f'https://www.electionspei.ca{district_url}') as response:
        district_html = response.read()
    district_soup = BeautifulSoup(district_html, 'html.parser')

    # Get district name from soup: text of the first <span class="field">,
    # keeping what follows the 'Results: ' prefix.
    district_name = district_soup.find_all('span', class_='field')[0].text.split('Results: ')[1]

    # Get the first table as a dataframe, parsed from the bytes already
    # downloaded so the page is not requested from the server a second time.
    district_df = pd.read_html(io.BytesIO(district_html))[0]

    # Store both in dict with district number as key.
    district_dict[i] = {'district_name': district_name, 'district_df': district_df}

In [9]:
# Loop through all district dataframes and fix them.