# Setup
Install the requirements

You may need to install Firefox, and add geckodriver to `PATH`

In [1]:
!pip install -r requirements.txt
%load_ext autoreload
%autoreload 2

import numpy as np

[33mYou are using pip version 10.0.1, however version 19.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


# Gather ABC lower-house data

In [2]:
import election2019.abc

# Build initial dataframes
Scrape candidate and electorate data from the ABC's website

## Candidates

In [3]:
candidates = election2019.abc.build_candidates()

In [4]:
candidates

Unnamed: 0,first_name,family_name,party,electorate,sitting,website,twitter,facebook,instagram
Aaron Hammond,Aaron,Hammond,SCP,Sydney,False,,,,
Aaron Harpley-Carr,Aaron,Harpley-Carr,UAP,Dobell,False,,,,
Aaron Santelises,Aaron,Santelises,ALP,McPherson,False,,,,
Aaron Whittaker,Aaron,Whittaker,UAP,Brisbane,False,,,,
Adam Bandt,Adam,Bandt,GRN,Melbourne,True,,,,
Adam Blakester,Adam,Blakester,IND,New England,False,,,,
Adam Holt,Adam,Holt,UAP,Sydney,False,,,,
Adam Pulford,Adam,Pulford,GRN,Wills,False,,,,
Adam Veitch,Adam,Veitch,UAP,Bendigo,False,,,,
Adam Watson,Adam,Watson,UAP,Kingsford Smith,False,,,,


## Electorates

In [5]:
electorates = election2019.abc.build_electorates()

In [6]:
electorates

Unnamed: 0,name,state,url,party,margin
Adelaide,Adelaide,SA,https://www.abc.net.au/news/elections/federal/...,ALP,8.3
Aston,Aston,VIC,https://www.abc.net.au/news/elections/federal/...,LIB,7.4
Ballarat,Ballarat,VIC,https://www.abc.net.au/news/elections/federal/...,ALP,7.4
Banks,Banks,NSW,https://www.abc.net.au/news/elections/federal/...,LIB,1.4
Barker,Barker,SA,https://www.abc.net.au/news/elections/federal/...,LIB,13.9
Barton,Barton,NSW,https://www.abc.net.au/news/elections/federal/...,ALP,8.3
Bass,Bass,TA,https://www.abc.net.au/news/elections/federal/...,ALP,5.4
Bean,Bean,ACT,https://www.abc.net.au/news/elections/federal/...,ALP,8.9
Bendigo,Bendigo,VIC,https://www.abc.net.au/news/elections/federal/...,ALP,3.9
Bennelong,Bennelong,NSW,https://www.abc.net.au/news/elections/federal/...,LIB,9.7


# Gather external candidate data   

## Scrape ABC candidates' websites
Scrape the websites linked to in each of the candidates' profiles

In [7]:
election2019.abc.scrape_candidate_websites(electorates, candidates)

HBox(children=(IntProgress(value=0, max=151), HTML(value='')))




In [8]:
candidates

Unnamed: 0,first_name,family_name,party,electorate,sitting,website,twitter,facebook,instagram
Aaron Hammond,Aaron,Hammond,SCP,Sydney,False,,,,
Aaron Harpley-Carr,Aaron,Harpley-Carr,UAP,Dobell,False,,,,
Aaron Santelises,Aaron,Santelises,ALP,McPherson,False,https://www.alp.org.au/our-people/our-people/a...,,santelises4mcpherson,santelises4mcpherson
Aaron Whittaker,Aaron,Whittaker,UAP,Brisbane,False,,,,
Adam Bandt,Adam,Bandt,GRN,Melbourne,True,https://greens.org.au/vic/person/adam-bandt,adambandt,adam.bandt.mp,
Adam Blakester,Adam,Blakester,IND,New England,False,https://www.adamblakester.vote/,adamblakester,adamblakester,campaignblakester
Adam Holt,Adam,Holt,UAP,Sydney,False,,,,
Adam Pulford,Adam,Pulford,GRN,Wills,False,,,,
Adam Veitch,Adam,Veitch,UAP,Bendigo,False,,,,
Adam Watson,Adam,Watson,UAP,Kingsford Smith,False,,,,


In [9]:
candidates.replace("", np.nan).count()

first_name     707
family_name    707
party          707
electorate     707
sitting        707
website        341
twitter         64
facebook       231
instagram       54
dtype: int64

## Scrape Liberal party candidates website
Use selenium to manipulate page, and then dump the DOM

In [10]:
import election2019.liberals

In [11]:
election2019.liberals.parse_html(candidates)

In [12]:
candidates.replace("", np.nan).count()

first_name     707
family_name    707
party          707
electorate     707
sitting        707
website        341
twitter        120
facebook       311
instagram       92
dtype: int64

## Scrape Labor party candidates website
Scrape the linked pages for links

In [13]:
import election2019.labor

In [14]:
election2019.labor.scrape_candidates_pages(candidates)

HBox(children=(IntProgress(value=0, max=185), HTML(value='')))




In [15]:
candidates.replace("", np.nan).count()

first_name     707
family_name    707
party          707
electorate     707
sitting        707
website        341
twitter        181
facebook       398
instagram      111
dtype: int64

## Scrape Nationals party candidates website
Scrape the page for links

In [16]:
import election2019.nationals

In [17]:
election2019.nationals.scrape_candidates_page(candidates)
election2019.nationals.scrape_members_page(candidates)

In [18]:
candidates.replace("", np.nan).count()

first_name     707
family_name    707
party          707
electorate     707
sitting        707
website        341
twitter        186
facebook       415
instagram      111
dtype: int64

## Scrape Greens candidates website
Scrape the linked pages for links

In [19]:
import election2019.greens

In [20]:
election2019.greens.scrape_candidates_pages(candidates)

In [21]:
candidates.replace("", np.nan).count()

first_name     707
family_name    707
party          707
electorate     707
sitting        707
website        341
twitter        192
facebook       422
instagram      117
dtype: int64

## Scrape United Australia Party candidates website
Get pages from the API, then scrape links from pages

In [22]:
import election2019.uap

In [23]:
election2019.uap.scrape_candidates_pages(candidates)

HBox(children=(IntProgress(value=0, max=151), HTML(value='')))




In [24]:
candidates.replace("", np.nan).count()

first_name     707
family_name    707
party          707
electorate     707
sitting        707
website        341
twitter        192
facebook       528
instagram      117
dtype: int64

## Scrape Pauline Hanson's One Nation candidates website
Use selenium to access page and defeat bot-detection, then dump and scrape DOM

In [25]:
import election2019.phon

In [26]:
election2019.phon.scrape_candidates_page(candidates)

In [27]:
candidates.replace("", np.nan).count()

first_name     707
family_name    707
party          707
electorate     707
sitting        707
website        341
twitter        192
facebook       550
instagram      117
dtype: int64

# Validation

Identify candidates that have the same accounts

### Twitter

In [28]:
twitter_dups = candidates["twitter"].replace("", np.nan).dropna().duplicated()

In [29]:
candidates.loc[twitter_dups[twitter_dups == True].index]

Unnamed: 0,first_name,family_name,party,electorate,sitting,website,twitter,facebook,instagram


### Facebook

In [30]:
facebook_dups = candidates["facebook"].replace("", np.nan).dropna().duplicated()

In [31]:
candidates.loc[facebook_dups[facebook_dups == True].index]

Unnamed: 0,first_name,family_name,party,electorate,sitting,website,twitter,facebook,instagram
Hemant Dave,Hemant,Dave,LIB,Makin,False,https://www.liberal.org.au/member/hemant-dave,,458494798228929,
Patrick Conaghan,Patrick,Conaghan,NAT,Cowper,False,,,patrick-conaghan-candidate-for-cowper-11594598...,
Peter Schuback,Peter,Schuback,AFN,Longman,False,https://australiafirstparty.net/campaigns/2019...,,dr-jim-saleam-for-cootamundra-for-an-ideologic...,
Robert Shearman,Robert,Shearman,LNP,Blair,False,https://www.liberal.org.au/member/robert-shearman,,458494798228929,
Susan Jakobi,Susan,Jakobi,AFN,Lalor,False,https://australiafirstparty.net/campaigns/2019...,,dr-jim-saleam-for-cootamundra-for-an-ideologic...,


### Instagram

In [32]:
instagram_dups = candidates["instagram"].replace("", np.nan).dropna().duplicated()

In [33]:
candidates.loc[instagram_dups[instagram_dups == True].index]

Unnamed: 0,first_name,family_name,party,electorate,sitting,website,twitter,facebook,instagram


# Statistics


Totals for each source:

In [34]:
candidates[["twitter", "facebook", "instagram", "website", "first_name"]].rename(columns={"first_name": "total"}).replace("", np.nan).count()

twitter      192
facebook     550
instagram    117
website      341
total        707
dtype: int64

Totals for each party:

In [35]:
candidates[["party", "twitter", "facebook", "instagram", "website", "first_name"]].rename(columns={"first_name": "total"}).replace("", np.nan).groupby(["party"]).count()

Unnamed: 0_level_0,twitter,facebook,instagram,website,total
party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AFN,0,3,0,3,3
AJP,0,0,0,13,24
ALP,82,143,46,62,152
AUC,0,0,0,0,11
CA,0,0,0,2,3
CEC,0,0,0,0,2
CLP,0,2,1,1,2
GRN,19,110,15,140,143
IND,20,21,13,26,30
KAP,0,1,0,5,6


Incumbents vs candidates:

In [36]:
candidates[["sitting", "twitter", "facebook", "instagram", "website"]].replace("", np.nan).groupby(["sitting"]).count()

Unnamed: 0_level_0,twitter,facebook,instagram,website
sitting,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,67,410,71,308
True,125,140,46,33


# Save data

In [37]:
import ipywidgets as widgets
from IPython.display import display
button = widgets.Button(description="Save")
display(button)
from time import time
import pickle
import os

def on_button_clicked(b):
    save_time = int(time())
    pre_str = os.path.join(os.getcwd(), 'output/')
    candidates.to_json(f"{pre_str}{save_time}_candidates.json")
    candidates.to_csv(f"{pre_str}{save_time}_candidates.csv")
    electorates.to_json(f"{pre_str}{save_time}_electorates.json")
    electorates.to_csv(f"{pre_str}{save_time}_electorates.csv")
    print(f"Saved to {pre_str}")

button.on_click(on_button_clicked)

Button(description='Save', style=ButtonStyle())