In [8]:
import pandas as pd 
import csv
from pathlib import Path

from collections import OrderedDict

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time

from num2words import num2words

from collections import OrderedDict

In [9]:
def get_state_code(input, returnType = 'FIPS'):
    """Convert a US state identifier from one format to another.

    `input` may be a state's full name, its two-letter abbreviation, or its
    numeric FIPS code; whichever record contains a field equal to `input`
    is used. Matching is exact (case-sensitive for strings).

    returnType must be either 'full', 'two_digit' or 'FIPS' and selects which
    field of the matching record is returned. Returns None when no record
    matches. Raises ValueError for any other returnType.
    """
    if returnType not in ('full', 'two_digit', 'FIPS'):
        raise ValueError("returnType must be either 'full' 'two_digit' or 'FIPS'")

    # One record per state: (full name, two-letter abbreviation, FIPS code)
    records = (
        ('Alabama', 'AL', 1),
        ('Alaska', 'AK', 2),
        ('Arizona', 'AZ', 4),
        ('Arkansas', 'AR', 5),
        ('California', 'CA', 6),
        ('Colorado', 'CO', 8),
        ('Connecticut', 'CT', 9),
        ('Delaware', 'DE', 10),
        ('District of Columbia', 'DC', 11),
        ('Florida', 'FL', 12),
        ('Georgia', 'GA', 13),
        ('Hawaii', 'HI', 15),
        ('Idaho', 'ID', 16),
        ('Illinois', 'IL', 17),
        ('Indiana', 'IN', 18),
        ('Iowa', 'IA', 19),
        ('Kansas', 'KS', 20),
        ('Kentucky', 'KY', 21),
        ('Louisiana', 'LA', 22),
        ('Maine', 'ME', 23),
        ('Maryland', 'MD', 24),
        ('Massachusetts', 'MA', 25),
        ('Michigan', 'MI', 26),
        ('Minnesota', 'MN', 27),
        ('Mississippi', 'MS', 28),
        ('Missouri', 'MO', 29),
        ('Montana', 'MT', 30),
        ('Nebraska', 'NE', 31),
        ('Nevada', 'NV', 32),
        ('New Hampshire', 'NH', 33),
        ('New Jersey', 'NJ', 34),
        ('New Mexico', 'NM', 35),
        ('New York', 'NY', 36),
        ('North Carolina', 'NC', 37),
        ('North Dakota', 'ND', 38),
        ('Ohio', 'OH', 39),
        ('Oklahoma', 'OK', 40),
        ('Oregon', 'OR', 41),
        ('Pennsylvania', 'PA', 42),
        ('Rhode Island', 'RI', 44),
        ('South Carolina', 'SC', 45),
        ('South Dakota', 'SD', 46),
        ('Tennessee', 'TN', 47),
        ('Texas', 'TX', 48),
        ('Utah', 'UT', 49),
        ('Vermont', 'VT', 50),
        ('Virginia', 'VA', 51),
        ('Washington', 'WA', 53),
        ('West Virginia', 'WV', 54),
        ('Wisconsin', 'WI', 55),
        ('Wyoming', 'WY', 56),
        ('Puerto Rico', 'PR', 72),
    )

    # Position inside each record of the field the caller asked for
    wanted = {'full': 0, 'two_digit': 1, 'FIPS': 2}[returnType]

    for record in records:
        if any(field == input for field in record):
            return record[wanted]
    return None

In [103]:
def get_ballotpedia_page(district):
    """Build the Ballotpedia URL for a state legislative district.

    Parameters
    ----------
    district : str
        Identifier laid out as '<ST>-<CH>-<name>', e.g. 'FL-HD-20' or
        'MN-SD-05'. Characters 0-1 are the state abbreviation, characters
        3-4 the chamber ('HD' selects the lower chamber, anything else is
        treated as the state senate) and everything from index 6 onwards is
        the district name or number.

    Returns
    -------
    str
        The district page URL under https://ballotpedia.org/.

    Raises
    ------
    ValueError
        If the state abbreviation is not known to get_state_code.
    """
    state_name = get_state_code(district[0:2], returnType='full')
    if state_name is None:
        raise ValueError(f"unknown state abbreviation in district {district!r}")

    ballotpedia = 'https://ballotpedia.org/' + state_name + '_'
    is_house = district[3:5] == 'HD'
    district_name = district[6:]

    # deal with annoying MA districts *** still not working for all --> just dont scrape MA
    # Named districts that start with a number (e.g. '1st Suffolk') are spelled
    # out on Ballotpedia ('First_Suffolk_District').
    if len(district_name) > 3:
        digit_count = len(district_name) - len(district_name.lstrip('0123456789'))
        # Require a non-numeric remainder: an all-digit name is a plain
        # numbered district and is handled by the generic path below.
        if 0 < digit_count < len(district_name):
            num_words = num2words(int(district_name[:digit_count]), to='ordinal').capitalize()
            remainder = district_name[digit_count:]
            # Drop only the ordinal suffix directly after the digits; a blanket
            # replace would also mangle names such as 'Plymouth ' or 'West '.
            if remainder[:3] in ('th ', 'st ', 'nd ', 'rd '):
                remainder = remainder[2:]
            dist_str = (num_words + remainder).replace(' ', '_')
            ballotpedia += 'House_of_Representatives_' if is_house else 'State_Senate_'
            ballotpedia += dist_str + '_District'
            return ballotpedia

    # all non MA districts
    if not is_house:
        ballotpedia += 'State_Senate_District_'
    elif district[0:2] == 'WI':
        # Wisconsin's lower chamber is addressed as the State Assembly
        ballotpedia += 'State_Assembly_District_'
    else:
        ballotpedia += 'House_of_Representatives_District_'

    # Strip leading zeros ('05' -> '5'); fall back to the raw text if nothing
    # would be left, instead of indexing past the end of the string.
    district_number = district_name.lstrip('0') or district_name
    ballotpedia += district_number.replace(' ', '_').replace('&', '%26')

    return ballotpedia


In [141]:
# Sanity check: a purely numeric district takes the numbered-district path,
# so no ordinal suffix (th / st / nd / rd) handling is involved.
get_ballotpedia_page('FL-HD-20')

'https://ballotpedia.org/Florida_House_of_Representatives_District_20'

In [56]:
# Sanity check: the leading zero of the district number is dropped in the URL.
get_ballotpedia_page("MN-SD-05")

'https://ballotpedia.org/Minnesota_State_Senate_District_5'

In [12]:
# Base directory for the input CSV, the chromedriver binary and the output CSV;
# this is the current working directory at the time the cell runs.
moneyball_dir = Path.cwd() 

In [13]:
# Set up the dataframe that stores the data as it is scraped.
# Columns carried over from the processed input file:
source_columns = ['state', 'district', 'rep_nominee', 'dem_nominee', 'incumbent']
# Columns filled in by the scraper: r_ = Republican, d_ = Democrat, y_ = any
# other party; *_b_link is the Ballotpedia page, *_p_link the campaign site.
scraped_columns = ['r_cand', 'd_cand', 'y_cand', 'y_cand_party', 'r_cand_b_link', 'd_cand_b_link', 'y_cand_b_link', 'r_cand_p_link', 'd_cand_p_link', 'y_cand_p_link' ]

df = pd.read_csv(moneyball_dir / 'processed_data_8_11.csv')
df = df[source_columns]
# reindex adds the scraped columns as all-NaN without concatenating an empty
# frame (whose dtype handling pandas has deprecated); object dtype lets the
# scraper store strings in them later.
df = df.reindex(columns=source_columns + scraped_columns)
df = df.astype({column: 'object' for column in scraped_columns})

In [61]:
# Preview the first rows; the scraped columns are still empty at this point.
df.head()

Unnamed: 0,state,district,rep_nominee,dem_nominee,incumbent,r_cand,d_cand,y_cand,y_cand_party,r_cand_b_link,d_cand_b_link,y_cand_b_link,r_cand_p_link,d_cand_p_link,y_cand_p_link
0,TX,TX-HD-112,Angie Chen Button,Brandy Chambers,R,,,,,,,,,,
1,TX,TX-HD-26,Jacey Jetton,L. Sarah DeMerchant,Open,,,,,,,,,,
2,TX,TX-HD-66,Matt Shaheen,Sharon Hirsch,R,,,,,,,,,,
3,TX,TX-HD-67,Jeff Leach,Lorenzo Sanchez,R,,,,,,,,,,
4,TX,TX-HD-96,David Cook,Joe Drago,Open,,,,,,,,,,


In [142]:
# Scrape each district's Ballotpedia page and record the candidates listed in
# its results table into df.
FIRST_ROW = 1000                # positional row of df where this pass starts
CAMPAIGN_SITE_ROW_LIMIT = 1000  # only rows before this position get a campaign-site lookup

driver = webdriver.Chrome(moneyball_dir / 'chromedriver')
try:
    for i in range(FIRST_ROW, len(df.index)):
        # MA urls are difficult to map, haven't figured them all out yet
        if df['state'].iloc[i] == 'MA':
            continue

        district = df['district'].iloc[i]
        driver.get(get_ballotpedia_page(district))
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        print('district: ' + district)

        table_soup = soup.find('table', {"class": "results_table"})
        if table_soup is None:
            continue

        for cand_row in table_soup.find_all("td", class_="votebox-results-cell--text"):
            cand_link = cand_row.find('a')
            # Skip cells without a candidate link or without a "(party)" suffix
            if cand_link is None or '(' not in cand_row.text:
                continue

            cand_name = cand_link.text
            cand_party = cand_row.text.split('(',1)[1].replace(')', '').strip().upper()
            cand_b_href = cand_link.get('href', '')
            cand_p_site = ''

            # dont waste time with campaign sites for invaluable districts
            # (with FIRST_ROW == CAMPAIGN_SITE_ROW_LIMIT this lookup never
            # runs during this pass)
            if i < CAMPAIGN_SITE_ROW_LIMIT and cand_b_href:
                driver.get(cand_b_href)
                cand_b_page_soup = BeautifulSoup(driver.page_source, 'html.parser')
                body = cand_b_page_soup.find("div", id="bodyContent")
                infobox = body.find_all("div", class_='infobox person') if body is not None else []
                if infobox:
                    cand_p_obj = infobox[0].find("a", string="Campaign website" )
                    cand_p_site = '' if cand_p_obj is None else cand_p_obj['href']

            # Column prefix by party: r_ / d_, everything else shares y_.
            # NOTE: a later candidate of the same group overwrites an earlier one.
            if cand_party == 'R':
                prefix = 'r'
            elif cand_party == 'D':
                prefix = 'd'
            else:
                prefix = 'y'

            updates = {
                prefix + '_cand': cand_name,
                prefix + '_cand_b_link': cand_b_href,
                prefix + '_cand_p_link': cand_p_site,
            }
            if prefix == 'y':
                updates['y_cand_party'] = cand_party

            # Write through a single .iloc[row, column] indexer; the chained
            # form df.iloc[i][col] = ... can assign to a temporary copy.
            for column, value in updates.items():
                df.iloc[i, df.columns.get_loc(column)] = value

            print(f"""Name: {cand_name} Party: {cand_party} Site: {cand_p_site}""")
finally:
    # Always shut the browser down, even when a page load raises.
    driver.quit()



me: Glen Casada Party: R Site: 
Name: Elizabeth Madeira Party: D Site: 
Name: Brad Fiscus Party: INDEPENDENT Site: 
district: IL-HD-76
Name: Lance Yednock Party: D Site: 
Name: Travis Breeden Party: R Site: 
district: TN-HD-40
Name: Terri Lynn Weaver Party: R Site: 
Name: Paddy Sizemore Party: INDEPENDENT Site: 
district: UT-HD-11
Name: Kelly Miles Party: R Site: 
Name: Jason Allen Party: D Site: 
district: TN-HD-89
Name: Justin Lafferty Party: R Site: 
Name: Greg Mills Party: INDEPENDENT Site: 
district: TN-HD-64
Name: Scott Cepicky Party: R Site: 
Name: James Campbell Party: D Site: 
district: OR-HD-53
Name: Jack Zika Party: R Site: 
Name: Emerson Levy Party: D Site: 
district: TN-HD-38
Name: Kelly Keisling Party: R Site: 
Name: Carol Veneá Abney Party: D Site: 
district: TN-HD-25
Name: Cameron Sexton Party: R Site: 
Name: Robyn Deck Party: D Site: 
district: TN-HD-57
Name: Susan Lynn Party: R Site: 
Name: Tom Sottek Party: INDEPENDENT Site: 
district: GA-HD-124
Name: Henry Howard Pa

WebDriverException: Message: disconnected: Unable to receive message from renderer
  (Session info: chrome=84.0.4147.135)


In [146]:
# Display the frame after scraping (the rendered output is truncated by pandas).
df

Unnamed: 0,state,district,rep_nominee,dem_nominee,incumbent,r_cand,d_cand,y_cand,y_cand_party,r_cand_b_link,d_cand_b_link,y_cand_b_link,r_cand_p_link,d_cand_p_link,y_cand_p_link
0,TX,TX-HD-112,Angie Chen Button,Brandy Chambers,R,Angie Chen Button,Brandy Chambers,Shane Newsom,L,https://ballotpedia.org/Angie_Chen_Button,https://ballotpedia.org/Brandy_Chambers,https://ballotpedia.org/Shane_Newsom,http://www.angiebutton.com/,https://brandykchambers.com/,https://www.newsomforstaterep.com/
1,TX,TX-HD-26,Jacey Jetton,L. Sarah DeMerchant,Open,Jacey Jetton,L. Sarah DeMerchant,,,https://ballotpedia.org/Jacey_Jetton,https://ballotpedia.org/L._Sarah_DeMerchant,,https://www.jaceyjetton.com/,https://www.democratdemerchant.com/,
2,TX,TX-HD-66,Matt Shaheen,Sharon Hirsch,R,Matt Shaheen,Sharon Hirsch,Shawn Jones,L,https://ballotpedia.org/Matt_Shaheen,https://ballotpedia.org/Sharon_Hirsch,https://ballotpedia.org/Shawn_Jones,http://mattshaheen.com/index.html,https://www.sharon4tx.com/,
3,TX,TX-HD-67,Jeff Leach,Lorenzo Sanchez,R,Jeff Leach,Lorenzo Sanchez,,,https://ballotpedia.org/Jeff_Leach,https://ballotpedia.org/Lorenzo_Sanchez,,http://www.jeffleach.com/,https://www.lorenzofortexas.com/,
4,TX,TX-HD-96,David Cook,Joe Drago,Open,David Cook,Joe Drago,Nelson Range,L,https://ballotpedia.org/David_Cook_(Texas_Hous...,https://ballotpedia.org/Joe_Drago,https://ballotpedia.org/Nelson_Range,https://www.davidcookfortexas.com/,https://dragofortexas.com/home-1,https://nelsonrangefortexas96.com
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2176,UT,UT-SD-13,Jake Anderegg,Lyndsey Jarman,R,,,,,,,,,,
2177,UT,UT-SD-19,TBA,Katy Owens,Open,,,,,,,,,,
2178,UT,UT-SD-24,TBA,,Open,,,,,,,,,,
2179,UT,UT-SD-25,TBA,Nancy Huntly,Open,,,,,,,,,,


In [144]:
# Spot-check a single row, selected by position, from the scraped range.
df.iloc[1909]

state                                               IL
district                                      IL-HD-95
rep_nominee                               Avery Bourne
dem_nominee                              Chase Wilhelm
incumbent                                            R
r_cand                                    Avery Bourne
d_cand                                   Chase Wilhelm
y_cand                                             NaN
y_cand_party                                       NaN
r_cand_b_link     https://ballotpedia.org/Avery_Bourne
d_cand_b_link    https://ballotpedia.org/Chase_Wilhelm
y_cand_b_link                                      NaN
r_cand_p_link                                         
d_cand_p_link                                         
y_cand_p_link                                      NaN
Name: 1909, dtype: object

In [145]:
# Write the results next to the notebook; index=False leaves the row index out of the CSV.
df.to_csv(moneyball_dir / 'scraper_output.csv', index=False)