## After I scraped Politico, it turned out that the individual political party labels in their data were rife with errors. So I scraped the parties from the New York Times and merged them back in. As an added bonus, the NYT had a bunch of races, like state legislatures, that Politico didn't have. As another bonus, all the data for the New York Times election results is hardcoded as a JavaScript object in the page source!

In [1]:
from bs4 import BeautifulSoup
import re
import json
import requests
import os

In [2]:
# Build `state2url`, a dict mapping each state's name (the nav link's `title`)
# to the URL of its NYT results page (the link's `href`).
# The result is cached in nyt_state2url.json; Selenium is only needed when that
# cache file does not exist yet.
if not os.path.isfile('nyt_state2url.json'):
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    driver = webdriver.Chrome()
    try:
        driver.maximize_window()
        driver.implicitly_wait(10)
        driver.get("http://www.nytimes.com/elections/results/alabama")
        # click on 'State Results' item to get list of states and URLs
        # (find_element(By.XPATH, ...) replaces find_element_by_xpath, which
        # newer Selenium releases no longer provide)
        toexpand = driver.find_element(By.XPATH, '//*[@id="shell"]/nav/div/a[6]')
        toexpand.click()
        outerhtml = driver.execute_script("return document.documentElement.outerHTML")
    finally:
        # Always shut the browser down, even if a step above raised.
        driver.quit()
    soup = BeautifulSoup(outerhtml, 'lxml')
    nav = soup.find_all('nav', attrs={'class': "eln-navigation-states"})[0]
    state2url = {tag.attrs['title']: tag.attrs['href'] for tag in nav.find_all('a')}
    with open('nyt_state2url.json', 'w') as f:
        json.dump(state2url, f)
else:
    with open('nyt_state2url.json', 'r') as f:
        state2url = json.load(f)
            

In [5]:
# Accumulator for the scraped results, keyed by state name.
nytresults = dict()

In [6]:
# For every state page: download it, find the inline <script> that mentions
# `eln_election_date`, pick the line holding the `eln_races = [{...}],`
# assignment, and parse the assigned literal as JSON into nytresults[state].
for state, url in state2url.items():
    print(state)
    # A timeout keeps one unresponsive page from hanging the whole run.
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        raise RuntimeError(
            'unexpected HTTP status {} for {} ({})'.format(r.status_code, state, url))
    soup = BeautifulSoup(r.text, 'lxml')
    for script in soup.find_all('script'):
        if 'eln_election_date' in script.text:
            break
    else:
        # Without this, a page with no matching script would silently fall
        # through with whatever <script> happened to be last.
        raise ValueError(
            'no <script> containing eln_election_date for {} ({})'.format(state, url))
    for line in script.text.split('\n'):
        if 'eln_races = [{"' in line:
            break
    else:
        raise ValueError(
            'no eln_races assignment found for {} ({})'.format(state, url))
    # Greedy capture: everything after `eln_races = ` up to the last comma on
    # the line.
    match = re.search(r"eln_races\s+=\s+(.+),", line)
    if match is None:
        raise ValueError(
            'could not extract eln_races value for {} ({})'.format(state, url))
    blob = match.group(1)
    # escape the double quotes inside a url inside json; the replacement keeps
    # the tag's leading `<` so the anchor markup stays intact
    blob = re.sub(r'<a href="(.+?)">', r'<a href=\\"\1\\">', blob)
    nytresults[state] = json.loads(blob)
    

Missouri
South Dakota
Ohio
Colorado
Nevada
North Carolina
Texas
Virginia
Florida
Arkansas
New Mexico
District of Columbia
Tennessee
Oklahoma
North Dakota
Wyoming
Iowa
New Jersey
Minnesota
Rhode Island
Delaware
Wisconsin
Mississippi
Michigan
Indiana
Alaska
California
Massachusetts
New Hampshire
Connecticut
Louisiana
West Virginia
Georgia
Maine
Illinois
Kansas
Arizona
New York
South Carolina
Idaho
Pennsylvania
Montana
Kentucky
Alabama
Utah
Vermont
Nebraska
Oregon
Maryland
Hawaii
Washington


In [7]:
# Save the collected per-state results to disk as a single JSON document.
with open('nyt_election_2016_by_state.json', 'w+') as out:
    payload = json.dumps(nytresults)
    out.write(payload)