### Scrape Senate candidate face image URLs from Ballotpedia

In [1]:
import pandas as pd
import bs4 as bs # pulling data out of HTML and XML files.
import urllib.request # opening and reading URLs
import re # pattern matching

In [2]:
# Download the HTML of the 2022 U.S. Senate elections overview page.
# The context manager closes the HTTP response as soon as the body has been read.
with urllib.request.urlopen("https://ballotpedia.org/United_States_Senate_elections,_2022") as response:
    source = response.read()

In [3]:
# Parse the HTML. The parser is named explicitly so the resulting tree does not
# depend on which optional parsers happen to be installed, and so bs4 does not
# emit a GuessedAtParserWarning.
soup = bs.BeautifulSoup(source, "html.parser")

In [4]:
# CSS selector for the links to follow: every <a> nested inside a <center>
# that is itself nested inside a <small> element.
urls_css = "small center a"

In [5]:
# Collect every tag in the parsed overview page that matches the selector.
urls = soup.select(urls_css)

In [6]:
# Pull the link target (href attribute) out of each selected anchor tag.
extracted_urls = [anchor["href"] for anchor in urls]

In [7]:
# The hrefs are site-relative paths, so prefix each one with the site origin
# to obtain an absolute URL that can be fetched.
full_urls = [
    "https://ballotpedia.org" + href
    for href in extracted_urls
]

In [8]:
# Sanity check: how many election-page links were collected.
len(full_urls)

36

In [9]:
# Display the assembled absolute URLs for a visual check.
full_urls

['https://ballotpedia.org/United_States_Senate_election_in_Alabama,_2022',
 'https://ballotpedia.org/United_States_Senate_election_in_Alaska,_2022',
 'https://ballotpedia.org/United_States_Senate_election_in_Arizona,_2022',
 'https://ballotpedia.org/United_States_Senate_election_in_Arkansas,_2022',
 'https://ballotpedia.org/United_States_Senate_election_in_California,_2022',
 'https://ballotpedia.org/United_States_Senate_special_election_in_California,_2022',
 'https://ballotpedia.org/United_States_Senate_election_in_Colorado,_2022',
 'https://ballotpedia.org/United_States_Senate_election_in_Connecticut,_2022',
 'https://ballotpedia.org/United_States_Senate_election_in_Florida,_2022',
 'https://ballotpedia.org/United_States_Senate_election_in_Georgia,_2022',
 'https://ballotpedia.org/United_States_Senate_election_in_Hawaii,_2022',
 'https://ballotpedia.org/United_States_Senate_election_in_Idaho,_2022',
 'https://ballotpedia.org/United_States_Senate_election_in_Illinois,_2022',
 'https:

Now that we have the links to every state's election page, we visit each page, extract the candidate names and thumbnail image URLs, and collect them in lists.

In [10]:
# Accumulators filled by the scraping loop in the next cell:
# thumbnail image URLs and the raw text of the candidate-name cells.
# The loop visits every entry of full_urls; the "5" is the per-page cap on how
# many images / names are kept, not a limit on the number of URLs.
bp_urls = []
bp_names = []

In [11]:
# Selectors do not change between pages, so define them once outside the loop.
img_css = "img.image-candidate-thumbnail, img[id=placeholder_image]"
n_css = "td.votebox-results-cell--text"

# Visit every state election page and collect up to five
# (thumbnail URL, raw name text) pairs from each.
for page_url in full_urls:
  # Close each HTTP response as soon as its body has been read.
  with urllib.request.urlopen(page_url) as response:
    source = response.read()
  # Explicit parser: same tree regardless of which optional parsers are installed.
  soup = bs.BeautifulSoup(source, "html.parser")

  # Keep at most the first five thumbnails and the first five name cells.
  img = soup.select(img_css)[0:5]
  nm = soup.select(n_css)[0:5]

  # Images and names are paired purely by position. If a page yields different
  # counts, pair only up to the shorter list so bp_urls and bp_names stay the
  # same length and a mismatch on one page cannot shift the pairing on every
  # page that follows.
  # NOTE(review): positional pairing is only as accurate as the page markup;
  # pairing image and name per table row would be more robust — confirm
  # against the live HTML.
  if len(img) != len(nm):
    print(f"warning: {page_url}: {len(img)} images vs {len(nm)} names; "
          f"keeping the first {min(len(img), len(nm))} pairs")
  for img_tag, name_cell in zip(img, nm):
    bp_urls.append(img_tag['src'])
    bp_names.append(name_cell.text)

In [12]:
# Assemble the two parallel lists into one frame:
# one row per scraped (raw name text, thumbnail URL) pair.
df = pd.DataFrame({
    'bp_name_raw': bp_names,
    'bp_url': bp_urls,
})

In [13]:
# Show the last five rows of the assembled frame.
df.tail()

Unnamed: 0,bp_name_raw,bp_url
175,Ronald Harold Johnson (R) \n\t\t\t\t\t\t\t\t\...,https://s3.amazonaws.com/ballotpedia-api4/file...
176,Mandela Barnes (D),https://s3.amazonaws.com/ballotpedia-api4/file...
177,Adam Nicholas Paul (Logic Party) (Write-in) \...,https://cdn.ballotpedia.org/images/thumb/f/fb/...
178,Scott Aubart (American Independent Party) (Wri...,https://cdn.ballotpedia.org/images/thumb/f/fb/...
179,Other/Write-in votes,https://s3.amazonaws.com/ballotpedia-api4/file...


In [14]:
# Split each raw name once at its right-most "(": column 0 holds the text
# before it, column 1 the text after it (e.g. the trailing "R) ..." part).
df2 = df['bp_name_raw'].str.rsplit("(", n=1, expand=True)

In [15]:
# Label the first split column (the text before the last "(") as 'name'.
df2 = df2.rename(columns={0:'name'})

In [16]:
# Second pass: split once more at the right-most remaining "(" so entries with
# two parenthesised suffixes, e.g. "(Logic Party) (Write-in)", lose both.
# Column 0 of the result is again labelled 'name'.
df3 = (
    df2['name']
    .str.rsplit("(", n=1, expand=True)
    .rename(columns={0: 'name'})
)

In [17]:
# Remove embedded newlines and tabs from the name.
# Both characters are handled in a single substring-level .str.replace:
# chaining a plain Series.replace("\t", "") after .str.replace would only match
# cells whose entire value is "\t" and would leave embedded tabs untouched.
df3['bp_name'] = df3['name'].str.replace(r"[\n\t]", "", regex=True)

In [18]:
# Inspect the cleaned names; surrounding spaces have not been trimmed yet at this point.
df3['bp_name']

0                Katie Britt 
1                  Will Boyd 
2            John Sophocleus 
3                 Katie Britt
4                   Mo Brooks
                ...          
175    Ronald Harold Johnson 
176           Mandela Barnes 
177       Adam Nicholas Paul 
178             Scott Aubart 
179      Other/Write-in votes
Name: bp_name, Length: 180, dtype: object

In [19]:
# Copy the cleaned name onto the main frame. The assignment aligns on the row
# index, which df3 inherited from df through the split steps.
df['bp_name']=df3['bp_name']

In [20]:
# Strip the S3 thumbnail prefix so only the image file name remains.
# regex=False treats the prefix as a literal string: the default for `regex`
# differs between pandas versions, and as a regex the dots in the URL would
# match any character.
# URLs that do not contain this prefix (e.g. the cdn.ballotpedia.org
# placeholder image) are left unchanged.
df['bp_url_filename'] = df['bp_url'].str.replace("https://s3.amazonaws.com/ballotpedia-api4/files/thumbs/100/100/", '', regex=False)

In [21]:
# Trim leading/trailing whitespace left behind by the splits
# (e.g. the space that preceded the removed "(").
df['bp_name'] = df['bp_name'].str.strip()

In [22]:
# Keep only the first row for each distinct cleaned name; later rows with the
# same bp_name are dropped. Original index labels are retained, so the index
# is no longer contiguous afterwards.
df = df.drop_duplicates(subset=['bp_name'], keep='first')

In [23]:
# Show the last ten rows after de-duplication.
df.tail(10)

Unnamed: 0,bp_name_raw,bp_url,bp_name,bp_url_filename
166,Gerald Malloy (R),https://s3.amazonaws.com/ballotpedia-api4/file...,Gerald Malloy,Gerald_Malloy.jpeg
167,Dawn Ellis (Independent) \n\t\t\t\t\t\t\t\t\t...,https://s3.amazonaws.com/ballotpedia-api4/file...,Dawn Ellis,DawnEllis.jpg
168,Natasha Diamondstone-Kohout (Green Mountain Pe...,https://cdn.ballotpedia.org/images/thumb/f/fb/...,Natasha Diamondstone-Kohout,https://cdn.ballotpedia.org/images/thumb/f/fb/...
169,Kerry Patrick Raheb (Independent),https://s3.amazonaws.com/ballotpedia-api4/file...,Kerry Patrick Raheb,KerryRahebVT.png
170,Patty Murray (D) \n\t\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Patty Murray,Patty_Murray.jpg
171,Tiffany Smiley (R) \n\t\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Tiffany Smiley,TiffanySmiley.jpeg
175,Ronald Harold Johnson (R) \n\t\t\t\t\t\t\t\t\...,https://s3.amazonaws.com/ballotpedia-api4/file...,Ronald Harold Johnson,Ron_Johnson.jpg
176,Mandela Barnes (D),https://s3.amazonaws.com/ballotpedia-api4/file...,Mandela Barnes,Mandela-Barnes.jpg
177,Adam Nicholas Paul (Logic Party) (Write-in) \...,https://cdn.ballotpedia.org/images/thumb/f/fb/...,Adam Nicholas Paul,https://cdn.ballotpedia.org/images/thumb/f/fb/...
178,Scott Aubart (American Independent Party) (Wri...,https://cdn.ballotpedia.org/images/thumb/f/fb/...,Scott Aubart,https://cdn.ballotpedia.org/images/thumb/f/fb/...


In [24]:
# Write the de-duplicated table to CSV; index=False leaves the row index out of the file.
# NOTE(review): assumes ./data already exists relative to the working directory — confirm.
df.to_csv("./data/bp2022_sen_scraped_face.csv", index=False)