## PEI Election Data

Using the data from [Elections PEI - Open Data](https://www.electionspei.ca/resources/open-data), and more specifically [Yearly Political Party Contributions - Open Data](https://www.electionspei.ca/yearly-political-party-contributions-open-data), we clean and standardize the contribution records so that they can be used to create visualizations of the data.

In [1]:
# Dependencies.
import pandas as pd
import numpy as np

In [2]:
# Load the raw yearly contribution records into a DataFrame.
contributions_csv = '../resources/election_contributions.csv'
df = pd.read_csv(contributions_csv)
df

Unnamed: 0,Year,Party,Last_Name,First_Name,Business_Name,Address,Community,Postal_Code,Province,Country,Amount
0,2011,Green Party of Prince Edward Island,Lanthier,Peter,,,Mermaid,,,,350
1,2011,Green Party of Prince Edward Island,Lanthier,Darcie,,,Mermaid,,,,400
2,2011,Green Party of Prince Edward Island,Munves,Barbara,,,Charlottetown,,,,500
3,2011,Green Party of Prince Edward Island,Green Party of Canada,,,,Malpeque Association,,,,5300
4,2011,The Island Party of P.E.I.,Ferguson,George,,,Murray River,,,,400
...,...,...,...,...,...,...,...,...,...,...,...
7746,2020,Progressive Conservative Association of Prince...,Walsh,Margaret Ann,,,Watervale,,PE,,"$1,500.00"
7747,2020,Progressive Conservative Association of Prince...,Wellner,William,,,Charlottetown,,PE,,"$3,000.00"
7748,2020,Progressive Conservative Association of Prince...,Wheatley,Ross,,,Stratford,,PE,,$310.26
7749,2020,Progressive Conservative Association of Prince...,Wheeler,Sean,,,Charlottetown,,PE,,$500.00


In [3]:
# Give every column a readable name (spaces instead of underscores).
# The assignment is positional, so the list must follow the CSV column order.
readable_columns = [
    'Year',
    'Party',
    'Last Name',
    'First Name',
    'Business Name',
    'Address',
    'Community',
    'Postal Code',
    'Province',
    'Country',
    'Amount',
]
df.columns = readable_columns
df.head()

Unnamed: 0,Year,Party,Last Name,First Name,Business Name,Address,Community,Postal Code,Province,Country,Amount
0,2011,Green Party of Prince Edward Island,Lanthier,Peter,,,Mermaid,,,,350
1,2011,Green Party of Prince Edward Island,Lanthier,Darcie,,,Mermaid,,,,400
2,2011,Green Party of Prince Edward Island,Munves,Barbara,,,Charlottetown,,,,500
3,2011,Green Party of Prince Edward Island,Green Party of Canada,,,,Malpeque Association,,,,5300
4,2011,The Island Party of P.E.I.,Ferguson,George,,,Murray River,,,,400


In [4]:
# List the distinct party names that appear in the data.
pd.unique(df['Party'])

array(['Green Party of Prince Edward Island',
       'The Island Party of P.E.I.',
       'New Democratic Party of Prince Edward Island',
       'Prince Edward Island Liberal Association',
       'Progressive Conservative Association of Prince Edward Island'],
      dtype=object)

In [5]:
# Swap each official party name for a shorter label.
short_party_names = {
    'Green Party of Prince Edward Island': 'Green Party',
    'The Island Party of P.E.I.': 'Island Party',
    'New Democratic Party of Prince Edward Island': 'New Democratic Party',
    'Prince Edward Island Liberal Association': 'Liberal Party',
    'Progressive Conservative Association of Prince Edward Island': 'Conservative Party',
}
df['Party'] = df['Party'].replace(short_party_names)
df['Party'].unique()

array(['Green Party', 'Island Party', 'New Democratic Party',
       'Liberal Party', 'Conservative Party'], dtype=object)

In [6]:
# Replace missing name parts with empty strings so they can be joined later.
name_columns = ['Last Name', 'First Name', 'Business Name']
df[name_columns] = df[name_columns].fillna('')
df.head(10)

Unnamed: 0,Year,Party,Last Name,First Name,Business Name,Address,Community,Postal Code,Province,Country,Amount
0,2011,Green Party,Lanthier,Peter,,,Mermaid,,,,350
1,2011,Green Party,Lanthier,Darcie,,,Mermaid,,,,400
2,2011,Green Party,Munves,Barbara,,,Charlottetown,,,,500
3,2011,Green Party,Green Party of Canada,,,,Malpeque Association,,,,5300
4,2011,Island Party,Ferguson,George,,,Murray River,,,,400
5,2011,Island Party,Murphy,Winnifred,,,Peakes,,,,400
6,2011,Island Party,Cann,Billy,,,Gaspereaux,,,,1500
7,2011,Island Party,Donahue,Joanne,,,Cornwall,,,,1540
8,2011,Island Party,Smitz,Paul,,,Brookvale,,,,1700
9,2011,New Democratic Party,Byrne,Joseph,,,Charlottetown,,,,265


In [7]:
# Combine the Name columns into one Entity column, and drop the Names.
# Joining with ' ' leaves stray spaces wherever a name part is empty
# (e.g. a business with no first/last name), so collapse runs of whitespace
# and trim the ends to keep Entity values clean and comparable.
df['Entity'] = (
    df[['First Name', 'Last Name', 'Business Name']]
    .agg(' '.join, axis=1)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
# Selecting the columns to keep both drops the three Name columns and places
# Entity right after Party.
df = df[['Year', 'Party', 'Entity', 'Address', 'Community', 'Postal Code', 'Province', 'Country', 'Amount']]
df.head(10)

Unnamed: 0,Year,Party,Entity,Address,Community,Postal Code,Province,Country,Amount
0,2011,Green Party,Peter Lanthier,,Mermaid,,,,350
1,2011,Green Party,Darcie Lanthier,,Mermaid,,,,400
2,2011,Green Party,Barbara Munves,,Charlottetown,,,,500
3,2011,Green Party,Green Party of Canada,,Malpeque Association,,,,5300
4,2011,Island Party,George Ferguson,,Murray River,,,,400
5,2011,Island Party,Winnifred Murphy,,Peakes,,,,400
6,2011,Island Party,Billy Cann,,Gaspereaux,,,,1500
7,2011,Island Party,Joanne Donahue,,Cornwall,,,,1540
8,2011,Island Party,Paul Smitz,,Brookvale,,,,1700
9,2011,New Democratic Party,Joseph Byrne,,Charlottetown,,,,265


In [8]:
# Report what share of rows has a non-null value in each location column.
for column in ['Address', 'Postal Code', 'Province', 'Country']:
    filled_percent = round(df[column].count() / len(df) * 100)
    print(f"{column} has values in {filled_percent}% of rows.")

Address has values in 5% of rows.
Postal Code has values in 5% of rows.
Province has values in 45% of rows.
Country has values in 5% of rows.


In [9]:
# Drop the columns that are almost entirely empty.
sparse_columns = ['Address', 'Postal Code', 'Country']
df = df.drop(sparse_columns, axis='columns')
df.head()

Unnamed: 0,Year,Party,Entity,Community,Province,Amount
0,2011,Green Party,Peter Lanthier,Mermaid,,350
1,2011,Green Party,Darcie Lanthier,Mermaid,,400
2,2011,Green Party,Barbara Munves,Charlottetown,,500
3,2011,Green Party,Green Party of Canada,Malpeque Association,,5300
4,2011,Island Party,George Ferguson,Murray River,,400


In [10]:
# Replace currency with float value.
# Strip the '$' sign and thousands separators, then convert to float.
# The pattern is a raw string: '\$' in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
df['Amount'] = df['Amount'].replace(r'[\$,]', '', regex=True).astype(float)
df

Unnamed: 0,Year,Party,Entity,Community,Province,Amount
0,2011,Green Party,Peter Lanthier,Mermaid,,350.00
1,2011,Green Party,Darcie Lanthier,Mermaid,,400.00
2,2011,Green Party,Barbara Munves,Charlottetown,,500.00
3,2011,Green Party,Green Party of Canada,Malpeque Association,,5300.00
4,2011,Island Party,George Ferguson,Murray River,,400.00
...,...,...,...,...,...,...
7746,2020,Conservative Party,Margaret Ann Walsh,Watervale,PE,1500.00
7747,2020,Conservative Party,William Wellner,Charlottetown,PE,3000.00
7748,2020,Conservative Party,Ross Wheatley,Stratford,PE,310.26
7749,2020,Conservative Party,Sean Wheeler,Charlottetown,PE,500.00


In [11]:
# List the distinct Province values to see which formats need cleaning.
pd.unique(df['Province'])

array([nan, 'PE  ', 'PE', 'PE ', 'ON', 'NS', 'NB', 'QC', 'AB', 'BC', 'NL',
       'SK', 'PEI'], dtype=object)

In [12]:
# Normalise the Province codes: trim surrounding whitespace, then turn 'PEI' into 'PE'.
trimmed_province = df['Province'].str.strip()
df['Province'] = trimmed_province.str.replace('PEI', 'PE')
df['Province'].unique()

array([nan, 'PE', 'ON', 'NS', 'NB', 'QC', 'AB', 'BC', 'NL', 'SK'],
      dtype=object)

In [13]:
# Begin cleaning Community column.
# Patterns are raw strings so that '\s' and '\d' reach the regex engine
# unchanged instead of being treated as (invalid) Python string escapes.
# Collapse any run of whitespace into a single space.
df['Community'] = df['Community'].str.replace(r'\s+', ' ', regex=True)
# Remove rural-route suffixes such as " RR 2" / " RR2".
df['Community'] = df['Community'].str.replace(r'\sRR\s*\d+', ' ', regex=True)
# Unify the different spellings of O'Leary.
df['Community'] = df['Community'].str.replace("O Leary|OLeary|O'Leary |O�Leary", "O'Leary", regex=True)
# Trim whatever whitespace the substitutions left at the ends.
df['Community'] = df['Community'].str.strip()

In [14]:
# Community values where a stray space follows the first letter.
# (They can be listed with a regex for one word character, a space, then more
# word characters at the start of the value.)
split_first_letter_fixes = {
    'W heatly River': 'Wheatly River',
    'W indsor Junction, NS': 'Windsor Junction, NS',
    'M urray River': 'Murray River',
    'M orell': 'Morell',
    'W insloe': 'Winsloe',
    'M ontague': 'Montague',
    'W ellington': 'Wellington',
    'M iscouche': 'Miscouche',
}
df['Community'] = df['Community'].replace(split_first_letter_fixes)

# Spelling mistakes noticed during analysis.
# NOTE(review): keys such as 'North W Iltshire', 'Charlottetown Pei' and
# "St. Peter'S Bay" look title-cased, but title-casing happens in a later
# cell -- confirm that these entries still match on a fresh run.
spelling_fixes = {
    'Charlotteown': 'Charlottetown',
    'Charolottetown': 'Charlottetown',
    'Charlottettown': 'Charlottetown',
    'Charlottetown Pei': 'Charlottetown',
    "Ch'Town": 'Charlottetown',
    'Halifax`': 'Halifax',
    'NorthWiltshire': 'North Wiltshire',
    'North W Iltshire': 'North Wiltshire',
    'Canoe Cove`': 'Canoe Cove',
    'Stewart': 'Mount Stewart',
    'Mt Stewart': 'Mount Stewart',
    'Mt. Stewart': 'Mount Stewart',
    'Startford': 'Stratford',
    "St. Peter'S Bay": 'St. Peters Bay',
}
df['Community'] = df['Community'].replace(spelling_fixes)

In [15]:
# Community values that contain the Province, adding Province Data.
# Only values of the form "<place>, XX" (a comma followed by a two-letter
# code at the end) are treated as carrying a province. Requiring the comma
# matters: a value that merely ends in a space plus two characters would
# otherwise get a bogus Province and lose its Community entirely.
# When there are several commas, the place is the text before the first one.
community_parts = df['Community'].str.extract(
    r'^(?P<place>[^,]+),(?:.*,)?\s*(?P<code>[A-Za-z]{2})$'
)
has_code = community_parts['code'].notna()
df.loc[has_code, 'Province'] = community_parts.loc[has_code, 'code'].str.upper()
# Changing the Community to not have the Province.
df.loc[has_code, 'Community'] = community_parts.loc[has_code, 'place'].str.strip()

In [16]:
# Put every Community value in Title Case so spellings compare consistently.
df = df.assign(Community=df['Community'].str.title())

In [17]:
# Collect the distinct Community values, in order of first appearance.
coms = pd.unique(df['Community']).tolist()
coms[:5]

['Mermaid', 'Charlottetown', 'Malpeque Association', 'Murray River', 'Peakes']

In [18]:
# Load the PEI place names (the 'Place' column of the placefinder file).
placefinder = pd.read_csv('../resources/pei_gov_placefinder.csv')
muns = placefinder['Place'].tolist()
muns[:5]

['48 Road', 'Abney', 'Abrams Village', 'Afton', 'Afton Road']

In [19]:
# Mark contributions whose Community is a known PEI place as Province 'PE'.
# A set gives fast membership checks against the place list.
pei_places = set(muns)
in_pei = df['Community'].isin(pei_places)
# Only fill in a missing Province: a row that already records another
# province (a same-named place elsewhere) keeps its recorded value.
df.loc[in_pei & df['Province'].isna(), 'Province'] = 'PE'
# Communities that were not found in the PEI place list, in first-seen order.
others = [com for com in coms if com not in pei_places]

In [20]:
# Number of distinct Community values that were not found in the PEI place list.
len(others)

96

In [21]:
# Look at the first 20 rows again now that Province has been filled in.
df.iloc[:20]

Unnamed: 0,Year,Party,Entity,Community,Province,Amount
0,2011,Green Party,Peter Lanthier,Mermaid,PE,350.0
1,2011,Green Party,Darcie Lanthier,Mermaid,PE,400.0
2,2011,Green Party,Barbara Munves,Charlottetown,PE,500.0
3,2011,Green Party,Green Party of Canada,Malpeque Association,,5300.0
4,2011,Island Party,George Ferguson,Murray River,PE,400.0
5,2011,Island Party,Winnifred Murphy,Peakes,PE,400.0
6,2011,Island Party,Billy Cann,Gaspereaux,PE,1500.0
7,2011,Island Party,Joanne Donahue,Cornwall,PE,1540.0
8,2011,Island Party,Paul Smitz,Brookvale,PE,1700.0
9,2011,New Democratic Party,Joseph Byrne,Charlottetown,PE,265.0


In [22]:
# Write the cleaned data to a CSV file, leaving out the index column.
transformed_csv = '../resources/transformed_data/election_contributions_transformed.csv'
df.to_csv(transformed_csv, index=False)