### Scrape gubernatorial candidate face urls from ballotpedia

In [1]:
import pandas as pd
import bs4 as bs # pulling data out of HTML and XML files.
import urllib.request # opening and reading URLs
import re # pattern matching

In [2]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [3]:
#Download the website's html
source = urllib.request.urlopen("https://ballotpedia.org/Gubernatorial_elections,_2022").read()

In [4]:
#Parse the html
soup = bs.BeautifulSoup(source)

In [5]:
urls_css = "tbody tr a"

In [6]:
urls = soup.select(urls_css)

In [7]:
extracted_urls = [url["href"] for url in urls[5:185:5]]

In [8]:
len(extracted_urls)

36

In [9]:
extracted_urls[0]

'/Alabama_gubernatorial_election,_2022'

In [10]:
extracted_urls.append('/South_Carolina_gubernatorial_election,_2022')

In [11]:
extracted_urls

['/Alabama_gubernatorial_election,_2022',
 '/Alaska_gubernatorial_and_lieutenant_gubernatorial_election,_2022',
 '/Arizona_gubernatorial_election,_2022',
 '/Arkansas_gubernatorial_election,_2022',
 '/California_gubernatorial_election,_2022',
 '/Colorado_gubernatorial_and_lieutenant_gubernatorial_election,_2022',
 '/Connecticut_gubernatorial_and_lieutenant_gubernatorial_election,_2022',
 '/Florida_gubernatorial_and_lieutenant_gubernatorial_election,_2022',
 '/Georgia_gubernatorial_election,_2022',
 '/Hawaii_gubernatorial_and_lieutenant_gubernatorial_election,_2022',
 '/Idaho_gubernatorial_election,_2022',
 '/Illinois_gubernatorial_and_lieutenant_gubernatorial_election,_2022',
 '/Iowa_gubernatorial_and_lieutenant_gubernatorial_election,_2022',
 '/Kansas_gubernatorial_and_lieutenant_gubernatorial_election,_2022',
 '/Maine_gubernatorial_election,_2022',
 '/Maryland_gubernatorial_and_lieutenant_gubernatorial_election,_2022',
 '/Massachusetts_gubernatorial_and_lieutenant_gubernatorial_electi

In [12]:
full_urls = ["https://ballotpedia.org" + url_suffix for url_suffix in extracted_urls]

In [13]:
full_urls[1]

'https://ballotpedia.org/Alaska_gubernatorial_and_lieutenant_gubernatorial_election,_2022'

Now that we've got all the links, we need to visit each, extract what we need and save it somewhere.

In [91]:
#Now do this in a loop, for the first 5 urls
bp_urls = []
bp_names = []

for i in full_urls[:]:
  source = urllib.request.urlopen(i).read()
  soup = bs.BeautifulSoup(source)
  img_css = "img.image-candidate-thumbnail"
  img = soup.select(img_css)
  img = img[0:5]
  bp_urls.extend(list([i['src'] for i in img]))
  n_css = "td.votebox-results-cell--text"
  nm = soup.select(n_css)
  nm = nm[0:5]
  bp_names.extend(list([url.text for url in nm]))

In [15]:
len(bp_urls)

185

In [16]:
len(bp_names)

185

In [17]:
df = pd.DataFrame(columns=('bp_name_raw', 'bp_url'))

In [18]:
df['bp_name_raw']=bp_names

In [19]:
df['bp_url']=bp_urls

In [20]:
df.tail()

Unnamed: 0,bp_name_raw,bp_url
180,Henry McMaster (R) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...
181,Joe Cunningham (D) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...
182,Morgan Bruce Reeves (L) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...
183,Joe Cunningham\n,https://s3.amazonaws.com/ballotpedia-api4/file...
184,Mia McLeod\n,https://s3.amazonaws.com/ballotpedia-api4/file...


In [21]:
df2 = df['bp_name_raw'].str.rsplit("(", n=1, expand=True)

In [22]:
df2 = df2.rename(columns={0:'name'})

In [23]:
df3 =df2['name'].str.rsplit("(", n=1, expand=True)
df3 = df3.rename(columns={0:'name'})

In [24]:
df3['bp_name'] = df3['name'].str.replace("\n", "").replace("\t", "")

In [25]:
df3['bp_name']

0                 Kay Ivey 
1          Yolanda Flowers 
2              James Blake 
3            Jared Budlong 
4           Yolanda Flowers
               ...         
180         Henry McMaster 
181         Joe Cunningham 
182    Morgan Bruce Reeves 
183          Joe Cunningham
184              Mia McLeod
Name: bp_name, Length: 185, dtype: object

In [26]:
df['bp_name']=df3['bp_name']

In [27]:
df.head()

Unnamed: 0,bp_name_raw,bp_url,bp_name
0,Kay Ivey (R) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Kay Ivey
1,Yolanda Flowers (D) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Yolanda Flowers
2,James Blake (L) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,James Blake
3,Jared Budlong (Independent) (Write-in) \n,https://s3.amazonaws.com/ballotpedia-api4/file...,Jared Budlong
4,Yolanda Flowers\n,https://s3.amazonaws.com/ballotpedia-api4/file...,Yolanda Flowers


In [28]:
df.shape

(185, 3)

In [29]:
df

Unnamed: 0,bp_name_raw,bp_url,bp_name
0,Kay Ivey (R) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Kay Ivey
1,Yolanda Flowers (D) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Yolanda Flowers
2,James Blake (L) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,James Blake
3,Jared Budlong (Independent) (Write-in) \n,https://s3.amazonaws.com/ballotpedia-api4/file...,Jared Budlong
4,Yolanda Flowers\n,https://s3.amazonaws.com/ballotpedia-api4/file...,Yolanda Flowers
...,...,...,...
180,Henry McMaster (R) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Henry McMaster
181,Joe Cunningham (D) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Joe Cunningham
182,Morgan Bruce Reeves (L) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Morgan Bruce Reeves
183,Joe Cunningham\n,https://s3.amazonaws.com/ballotpedia-api4/file...,Joe Cunningham


In [31]:
df['bp_name'] = df['bp_name'].str.strip()

In [32]:
df = df.drop_duplicates(subset=['bp_name'], keep='first')

In [33]:
df.shape

(159, 3)

### Load ABC gubernatorial candidate list 

In [35]:
g = pd.read_excel("./Priors 2022.xlsx")

In [36]:
g.head()

Unnamed: 0,State,Democrat,Prior,Republican,Prior.1,Other,Prior.2,Unnamed: 7,Prior.3,Unnamed: 9,Prior.4
0,Alabama,Yolanda Flowers,35,Kay Ivey,65,,,,,,
1,Alaska,Les Gara,28,Mike Dunleavy,45,Bill Walker (Ind),21.0,Charlie Pierce (Rep),6.0,,
2,Arizona,Katie Hobbs,49,Kari Lake,51,,,,,,
3,Arkansas,Chris Jone,38,Sarah Huckabee Sanders,62,,,,,,
4,California,Gavin Newsom,61,Brian Dahle,39,,,,,,


In [37]:
g.columns

Index(['State', 'Democrat', 'Prior', 'Republican', 'Prior.1', 'Other',
       'Prior.2', 'Unnamed: 7', 'Prior.3', 'Unnamed: 9', 'Prior.4'],
      dtype='object')

Clean up this file

In [39]:
g = g[['State', 'Democrat','Republican', 'Other','Unnamed: 7', 'Unnamed: 9']]
g.head()

Unnamed: 0,State,Democrat,Republican,Other,Unnamed: 7,Unnamed: 9
0,Alabama,Yolanda Flowers,Kay Ivey,,,
1,Alaska,Les Gara,Mike Dunleavy,Bill Walker (Ind),Charlie Pierce (Rep),
2,Arizona,Katie Hobbs,Kari Lake,,,
3,Arkansas,Chris Jone,Sarah Huckabee Sanders,,,
4,California,Gavin Newsom,Brian Dahle,,,


In [40]:
abc = pd.melt(g, id_vars=['State'], value_vars=['Democrat', 'Republican', 'Other','Unnamed: 7', 'Unnamed: 9'],
             value_name='abc_name_raw')

In [41]:
abc = abc.rename(columns={'State':'abc_state'})

In [42]:
abc = abc[['abc_name_raw', 'abc_state']]

In [43]:
abc = abc.loc[~abc['abc_name_raw'].isna()]

In [44]:
abc.shape

(83, 2)

In [45]:
abc['abc_name'] = abc['abc_name_raw'].str.rsplit("(", n=1, expand=True)[0]

In [46]:
abc

Unnamed: 0,abc_name_raw,abc_state,abc_name
0,Yolanda Flowers,Alabama,Yolanda Flowers
1,Les Gara,Alaska,Les Gara
2,Katie Hobbs,Arizona,Katie Hobbs
3,Chris Jone,Arkansas,Chris Jone
4,Gavin Newsom,California,Gavin Newsom
...,...,...,...
105,Peter Duval (Ind),Vermont,Peter Duval
109,Charlie Pierce (Rep),Alaska,Charlie Pierce
133,Ervin Yen (Ind),Oklahoma,Ervin Yen
141,Kevin Hoyt (Ind),Vermont,Kevin Hoyt


In [49]:
abc.loc[abc['abc_name']=='None of these candidates']

Unnamed: 0,abc_name_raw,abc_state,abc_name
92,None of these candidates,Nevada,None of these candidates


In [50]:
abc = abc.drop([92])

In [51]:
abc.loc[abc['abc_name']=='None of these candidates']

Unnamed: 0,abc_name_raw,abc_state,abc_name


### Fuzzy matching names

In [52]:
def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=2):
    """
    :param df_1: the left table to join
    :param df_2: the right table to join
    :param key1: key column of the left table
    :param key2: key column of the right table
    :param threshold: how close the matches should be to return a match, based on Levenshtein distance
    :param limit: the amount of matches that will get returned, these are sorted high to low
    :return: dataframe with boths keys and matches
    """
    s = df_2[key2].tolist()
    
    m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))    
    df_1['matches'] = m
    
    m2 = df_1['matches'].apply(lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
    df_1['matches'] = m2
    
    return df_1

In [53]:
dat = fuzzy_merge(abc, df, 'abc_name', 'bp_name', threshold=90)

In [54]:
dat

Unnamed: 0,abc_name_raw,abc_state,abc_name,matches
0,Yolanda Flowers,Alabama,Yolanda Flowers,Yolanda Flowers
1,Les Gara,Alaska,Les Gara,Les Gara
2,Katie Hobbs,Arizona,Katie Hobbs,Katie Hobbs
3,Chris Jone,Arkansas,Chris Jone,Chris Jones
4,Gavin Newsom,California,Gavin Newsom,Gavin Newsom
...,...,...,...,...
105,Peter Duval (Ind),Vermont,Peter Duval,Peter Duval
109,Charlie Pierce (Rep),Alaska,Charlie Pierce,Charlie Pierce
133,Ervin Yen (Ind),Oklahoma,Ervin Yen,Ervin Yen
141,Kevin Hoyt (Ind),Vermont,Kevin Hoyt,Kevin Hoyt


In [55]:
dat.loc[dat['matches'].isna()]

Unnamed: 0,abc_name_raw,abc_state,abc_name,matches


In [56]:
dat.shape

(82, 4)

In [57]:
gov = dat.merge(df, left_on='matches', right_on='bp_name', indicator=True, how='left')

In [58]:
gov.head()

Unnamed: 0,abc_name_raw,abc_state,abc_name,matches,bp_name_raw,bp_url,bp_name,_merge
0,Yolanda Flowers,Alabama,Yolanda Flowers,Yolanda Flowers,Yolanda Flowers (D) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Yolanda Flowers,both
1,Les Gara,Alaska,Les Gara,Les Gara,Les Gara,https://s3.amazonaws.com/ballotpedia-api4/file...,Les Gara,both
2,Katie Hobbs,Arizona,Katie Hobbs,Katie Hobbs,Katie Hobbs (D) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Katie Hobbs,both
3,Chris Jone,Arkansas,Chris Jone,Chris Jones,Chris Jones (D) \n,https://s3.amazonaws.com/ballotpedia-api4/file...,Chris Jones,both
4,Gavin Newsom,California,Gavin Newsom,Gavin Newsom,Gavin Newsom (D) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Gavin Newsom,both


In [59]:
gov._merge.value_counts()

both          81
left_only      1
right_only     0
Name: _merge, dtype: int64

In [60]:
gov.loc[gov['_merge']=='left_only']

Unnamed: 0,abc_name_raw,abc_state,abc_name,matches,bp_name_raw,bp_url,bp_name,_merge
60,Mike DeWine,Ohio,Mike DeWine,,,,,left_only


In [61]:
# Mannually correct this entry
gov.loc[gov['_merge']=='left_only', 'bp_url']='https://s3.amazonaws.com/ballotpedia-api4/files/thumbs/100/100/MikeDeWine2015.jpg'
gov.loc[gov['_merge']=='left_only', 'bp_name']='Richard Michael DeWine'

In [62]:
gov.loc[gov['_merge']=='left_only']

Unnamed: 0,abc_name_raw,abc_state,abc_name,matches,bp_name_raw,bp_url,bp_name,_merge
60,Mike DeWine,Ohio,Mike DeWine,,,https://s3.amazonaws.com/ballotpedia-api4/file...,Richard Michael DeWine,left_only


In [63]:
gov.columns

Index(['abc_name_raw', 'abc_state', 'abc_name', 'matches', 'bp_name_raw',
       'bp_url', 'bp_name', '_merge'],
      dtype='object')

In [64]:
gov = gov[['abc_name', 'abc_name_raw', 'abc_state', 'bp_name', 'bp_name_raw', 'bp_url']]

In [68]:
gov['bp_url_filename'] = gov['bp_url'].str.replace("https://s3.amazonaws.com/ballotpedia-api4/files/thumbs/100/100/", '')

  gov['bp_url_filename'] = gov['bp_url'].str.replace("https://s3.amazonaws.com/ballotpedia-api4/files/thumbs/100/100/", '')


In [69]:
gov.head(20)

Unnamed: 0,abc_name,abc_name_raw,abc_state,bp_name,bp_name_raw,bp_url,bp_url_filename
0,Yolanda Flowers,Yolanda Flowers,Alabama,Yolanda Flowers,Yolanda Flowers (D) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Yolanda-Flowers.PNG
1,Les Gara,Les Gara,Alaska,Les Gara,Les Gara,https://s3.amazonaws.com/ballotpedia-api4/file...,Les_Gara.JPG
2,Katie Hobbs,Katie Hobbs,Arizona,Katie Hobbs,Katie Hobbs (D) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Katie-Hobbs.PNG
3,Chris Jone,Chris Jone,Arkansas,Chris Jones,Chris Jones (D) \n,https://s3.amazonaws.com/ballotpedia-api4/file...,Chris-Jones.jpg
4,Gavin Newsom,Gavin Newsom,California,Gavin Newsom,Gavin Newsom (D) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,399px-Gavin_Newsom_official_photo.jpg
5,Jared Polis,Jared Polis,Colorado,Jared Polis,Jared Polis (D) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Jared_Polis.jpg
6,Ned Lamont,Ned Lamont,Connecticut,Ned Lamont,Ned Lamont (D / Working Families Party / Grieb...,https://s3.amazonaws.com/ballotpedia-api4/file...,nedlamont.jpg
7,Charlie Crist,Charlie Crist,Florida,Charlie Crist,Charlie Crist (D) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Charlie_Crist_115th_Congress_photo.jpg
8,Stacey Abrams,Stacey Abrams,Georgia,Stacey Abrams,Stacey Abrams (D) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Stacey_Abrams.jpg
9,Josh Green,Josh Green,Hawaii,Joshua Green,Joshua Green (D) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Joshua-Green.PNG


In [70]:
gov.to_csv("./bp2022_gov_scraped_face_verified.csv", index=False)

In [71]:
gov.shape

(82, 7)