In [42]:
import pandas as pd 
import csv
from pathlib import Path

from collections import OrderedDict

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time

from num2words import num2words

from collections import OrderedDict

In [25]:
# Util method to come between state formats 
# returnType must be either 'full' 'two_digit' or 'FIPS'
def get_state_code(input, returnType = 'FIPS'):
    if (returnType != 'full' and returnType != 'two_digit' and returnType != 'FIPS'):
        raise ValueError("returnType must be either 'full' 'two_digit' or 'FIPS'")

    code_hash = {
        'AL': {'full': 'Alabama', 'two_digit': 'AL', 'FIPS': 1},
        'AK': {'full': 'Alaska', 'two_digit': 'AK', 'FIPS': 2},
        'AZ': {'full': 'Arizona', 'two_digit': 'AZ', 'FIPS': 4},
        'AR': {'full': 'Arkansas', 'two_digit': 'AR', 'FIPS': 5},
        'CA': {'full': 'California', 'two_digit': 'CA', 'FIPS': 6},
        'CO': {'full': 'Colorado', 'two_digit': 'CO', 'FIPS': 8},
        'CT': {'full': 'Connecticut', 'two_digit': 'CT', 'FIPS': 9},
        'DE': {'full': 'Delaware', 'two_digit': 'DE', 'FIPS': 10},
        'DC': {'full': 'District of Columbia', 'two_digit': 'DC', 'FIPS': 11},
        'FL': {'full': 'Florida', 'two_digit': 'FL', 'FIPS': 12},
        'GA': {'full': 'Georgia', 'two_digit': 'GA', 'FIPS': 13},
        'HI': {'full': 'Hawaii', 'two_digit': 'HI', 'FIPS': 15},
        'ID': {'full': 'Idaho', 'two_digit': 'ID', 'FIPS': 16},
        'IL': {'full': 'Illinois', 'two_digit': 'IL', 'FIPS': 17},
        'IN': {'full': 'Indiana', 'two_digit': 'IN', 'FIPS': 18},
        'IA': {'full': 'Iowa', 'two_digit': 'IA', 'FIPS': 19},
        'KS': {'full': 'Kansas', 'two_digit': 'KS', 'FIPS': 20},
        'KY': {'full': 'Kentucky', 'two_digit': 'KY', 'FIPS': 21},
        'LA': {'full': 'Louisiana', 'two_digit': 'LA', 'FIPS': 22},
        'ME': {'full': 'Maine', 'two_digit': 'ME', 'FIPS': 23},
        'MD': {'full': 'Maryland', 'two_digit': 'MD', 'FIPS': 24},
        'MA': {'full': 'Massachusetts', 'two_digit': 'MA', 'FIPS': 25},
        'MI': {'full': 'Michigan', 'two_digit': 'MI', 'FIPS': 26},
        'MN': {'full': 'Minnesota', 'two_digit': 'MN', 'FIPS': 27},
        'MS': {'full': 'Mississippi', 'two_digit': 'MS', 'FIPS': 28},
        'MO': {'full': 'Missouri', 'two_digit': 'MO', 'FIPS': 29},
        'MT': {'full': 'Montana', 'two_digit': 'MT', 'FIPS': 30},
        'NE': {'full': 'Nebraska', 'two_digit': 'NE', 'FIPS': 31},
        'NV': {'full': 'Nevada', 'two_digit': 'NV', 'FIPS': 32},
        'NH': {'full': 'New Hampshire', 'two_digit': 'NH', 'FIPS': 33},
        'NJ': {'full': 'New Jersey', 'two_digit': 'NJ', 'FIPS': 34},
        'NM': {'full': 'New Mexico', 'two_digit': 'NM', 'FIPS': 35},
        'NY': {'full': 'New York', 'two_digit': 'NY', 'FIPS': 36},
        'NC': {'full': 'North Carolina', 'two_digit': 'NC', 'FIPS': 37},
        'ND': {'full': 'North Dakota', 'two_digit': 'ND', 'FIPS': 38},
        'OH': {'full': 'Ohio', 'two_digit': 'OH', 'FIPS': 39},
        'OK': {'full': 'Oklahoma', 'two_digit': 'OK', 'FIPS': 40},
        'OR': {'full': 'Oregon', 'two_digit': 'OR', 'FIPS': 41},
        'PA': {'full': 'Pennsylvania', 'two_digit': 'PA', 'FIPS': 42},
        'RI': {'full': 'Rhode Island', 'two_digit': 'RI', 'FIPS': 44},
        'SC': {'full': 'South Carolina', 'two_digit': 'SC', 'FIPS': 45},
        'SD': {'full': 'South Dakota', 'two_digit': 'SD', 'FIPS': 46},
        'TN': {'full': 'Tennessee', 'two_digit': 'TN', 'FIPS': 47},
        'TX': {'full': 'Texas', 'two_digit': 'TX', 'FIPS': 48},
        'UT': {'full': 'Utah', 'two_digit': 'UT', 'FIPS': 49},
        'VT': {'full': 'Vermont', 'two_digit': 'VT', 'FIPS': 50},
        'VA': {'full': 'Virginia', 'two_digit': 'VA', 'FIPS': 51},
        'WA': {'full': 'Washington', 'two_digit': 'WA', 'FIPS': 53},
        'WV': {'full': 'West Virginia', 'two_digit': 'WV', 'FIPS': 54},
        'WI': {'full': 'Wisconsin', 'two_digit': 'WI', 'FIPS': 55},
        'WY': {'full': 'Wyoming', 'two_digit': 'WY', 'FIPS': 56},
        'PR': {'full': 'Puerto Rico', 'two_digit': 'PR', 'FIPS': 72}
    }
    for row in code_hash:
        for value in code_hash[row]:
            if code_hash[row][value] == input:
                return code_hash[row][returnType]
    return None

In [78]:
def get_ballotpedia_page(district):
    ballotpedia = 'https://ballotpedia.org/'
    ballotpedia += get_state_code(district[0:2], returnType='full') + '_'
    district_name = district[6:]
    # deal with annoying MA districts *** still not working for all --> just dont scrape MA
    district_name = district[6:]
    if len(district_name) > 3:
        i = 0
        while district_name[i] in ['0','1','2','3','4','5','6','7','8','9']:
            i += 1
        if i > 0:
            num_str = district_name[0:i]
            # use to='ordinal' parameter if '-nd' '-rd' etc wanted automatically
            num_words = num2words(num_str, to='ordinal').capitalize()
            name_str = district_name[i:].replace('th ', ' ').replace('st ', ' ').replace('nd ', ' ').replace('rd ', ' ')
            dist_str = (num_words + name_str).replace(' ', '_')
            ballotpedia += 'House_of_Representatives_' if (district[3:5] == 'HD') else 'State_Senate_'
            ballotpedia += dist_str + '_District'
            return(ballotpedia)
    
    # all non MA districts
    ballotpedia += 'House_of_Representatives_District_' if (district[3:5] == 'HD') else 'State_Senate_District_'
    district_name = district[6:]
    ballotpedia += district[6:].replace(' ', '_').replace('&','%26')

    return ballotpedia


In [82]:
# th st nd rd 
get_ballotpedia_page('NC-HD-113')

'https://ballotpedia.org/North Carolina_House_of_Representatives_District_113'

In [19]:
moneyball_dir = Path.cwd() 

In [83]:
# setup dataframe to store data as its scraped
df = pd.read_csv(moneyball_dir / 'processed_data_8_11.csv')
df = df[['state', 'district', 'rep_nominee', 'dem_nominee', 'incumbent']]
df = pd.concat([df, pd.DataFrame(columns=['r_cand', 'd_cand', 'y_cand', 'y_cand_party', 'r_cand_b_link', 'd_cand_b_link', 'y_cand_b_link', 'r_cand_p_link', 'd_cand_p_link', 'y_cand_p_link' ])])

In [89]:
df.head()

Unnamed: 0,state,district,rep_nominee,dem_nominee,incumbent,r_cand,d_cand,y_cand,y_cand_party,r_cand_b_link,d_cand_b_link,y_cand_b_link,r_cand_p_link,d_cand_p_link,y_cand_p_link
0,TX,TX-HD-112,Angie Chen Button,Brandy Chambers,R,TX-HD-112,,,,,,,,,
1,TX,TX-HD-26,Jacey Jetton,L. Sarah DeMerchant,Open,TX-HD-26,,,,,,,,,
2,TX,TX-HD-66,Matt Shaheen,Sharon Hirsch,R,TX-HD-66,,,,,,,,,
3,TX,TX-HD-67,Jeff Leach,Lorenzo Sanchez,R,TX-HD-67,,,,,,,,,
4,TX,TX-HD-96,David Cook,Joe Drago,Open,TX-HD-96,,,,,,,,,


In [21]:
driver = webdriver.Chrome(moneyball_dir / 'chromedriver')
driver.get('http://www.google.com/')
time.sleep(5) # Let the user actually see something!

In [91]:
driver = webdriver.Chrome(moneyball_dir / 'chromedriver')
for i in range (len(df)):
    dist_page_url = get_ballotpedia_page( df.iloc[i]['district'] )
    driver.get(dist_page_url)

    # df.at[i, 'r_cand'] = dist_page_url

NoSuchWindowException: Message: no such window: window was already closed
  (Session info: chrome=84.0.4147.125)


In [107]:
driver = webdriver.Chrome(moneyball_dir / 'chromedriver')
dist_page_url = get_ballotpedia_page( df.iloc[0]['district'] )
driver.get(dist_page_url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()

In [128]:
table_soup = soup.select('#mw-content-text > div:nth-child(26) > div.results_table_container > table')[0]

In [129]:
print(table_soup)

<table class="results_table">
<tbody><tr class="non_result_row" style="margin-top: 6px;">
<td colspan="2"></td>
<td><div>Candidate</div></td>
</tr>
<tr class="results_row">
<td class="votebox-results-cell--check"></td>
<td>
<div class="image-candidate-thumbnail-wrapper Republican">
<p> <img alt="Image of https://s3.amazonaws.com/ballotpedia-api4/files/thumbs/100/100/Angie_Chen_Button.jpg" class="image-candidate-thumbnail" onerror="imgError(this);" src="https://s3.amazonaws.com/ballotpedia-api4/files/thumbs/100/100/Angie_Chen_Button.jpg"/>
</p>
</div>
</td>
<td class="votebox-results-cell--text"><b><u><a href="https://ballotpedia.org/Angie_Chen_Button">Angie Chen Button</a></u></b> (R)										</td>
</tr>
<tr class="results_row">
<td class="votebox-results-cell--check"></td>
<td>
<div class="image-candidate-thumbnail-wrapper Democratic">
<p> <img alt="Image of https://s3.amazonaws.com/ballotpedia-api4/files/thumbs/100/100/Brandy_Chambers.JPG" class="image-candidate-thumbnail" onerror="

In [130]:
for cand_row in table_soup.find_all("td", class_="votebox-results-cell--text"):
    print(cand_row)

<td class="votebox-results-cell--text"><b><u><a href="https://ballotpedia.org/Angie_Chen_Button">Angie Chen Button</a></u></b> (R)										</td>
<td class="votebox-results-cell--text"><a href="https://ballotpedia.org/Brandy_Chambers">Brandy Chambers</a> (D)										</td>
<td class="votebox-results-cell--text"><a href="https://ballotpedia.org/Shane_Newsom">Shane Newsom</a> (L)										</td>


In [118]:
table_soup.find_all("td", class_="votebox-results-cell--text")

AttributeError: ResultSet object has no attribute 'find_all'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?