In [1]:
import pandas as pd
from pathlib import Path
import bs4 as bs
import requests
import re
import us

##### Map name:abbr for states
# Initial Mapping #
STATES_DICT = us.states.mapping('name', 'abbr')
# Remove all unwanted states #
territories = ['American Samoa', 'Dakota', 'District of Columbia', 'Guam','Northern Mariana Islands', 
               'Orleans', 'Philippine Islands', 'Puerto Rico', 'Virgin Islands']
for t in territories:
    del STATES_DICT[t]

##### Map parties of interest; all others will be cast to NaN
PARTY_DICT = {'republican' : 'r', 'democratic' : 'd'}

##### Regex shortcuts
NAME = r"[\w\s\-\.\']"
def remove_punc_from_series(series):
    result = series.copy()
    
    punc_list = ['-', '.', '\'']
    for punc in punc_list:
        result = result.str.replace(punc, '')
    
    return result


DATA_DIR = Path('.')/'..'/'..'/'data'
pd.set_option('display.max_rows', 500)

# Governor

### Names 2010-2015, 2017-2018

In [2]:
def scrape_governor_names(year):
    """Works for all years except 2016."""
    RACE_TYPE = 'governor'
    if year <= 2013:
        source = requests.get(f'https://en.wikipedia.org/wiki/United_States_gubernatorial_elections,_{year}')
    elif year >= 2014:
        source = requests.get(f'https://en.wikipedia.org/wiki/{year}_United_States_gubernatorial_elections')
    soup = bs.BeautifulSoup(source.content, features='html.parser')
    
    names = []
    parties = []
    race_types = []
    years = []
    states = []
    
    for table in soup.find_all('table', class_="wikitable"):
        if all(['candidates' not in h.text.strip().lower() for h in table.find_all('th', text=True)]):
            continue
            
        for tr in table.find_all('tr')[1:]:
            cells = tr.find_all('td')
            state = re.match("([\w\s\,\.]+).*", cells[0].text).groups()[0].strip()
            
            # incumbent
            incumbent_name = re.match("([\w\s\.\-]+).*", cells[1].text).groups()[0].lower().strip()
            incumbent_party = cells[2].text.lower().strip()
            names.append(incumbent_name)
            parties.append(PARTY_DICT[incumbent_party.lower()] if incumbent_party.lower() in PARTY_DICT.keys() else incumbent_party.lower())
            race_types.append(RACE_TYPE)
            years.append(year)
            states.append(state)
            
            # competitors
            candidates = cells[-1]
            candidates_text = re.findall(r"\w[\s\w\.\- ]+?\(\w+?\)", candidates.text)
            for candidate in candidates_text:
                matcher = re.match(r"([\w\s\.\-]+)\s\((\w+)\)", candidate.lower())
                if matcher:
                    name, party = matcher.groups()
                    if name == incumbent_name:
                        continue
                    names.append(name.strip())
                    parties.append(party)
                    race_types.append(RACE_TYPE)
                    years.append(year)
                    states.append(state)

    # return df
    df = pd.DataFrame({'name': names, 
                       'party': parties,
                       'race_type': race_types,
                       'year': years,
                       'state': states})
    df['name'] = remove_punc_from_series(df['name'])
    df['state'] = df['state'].map(STATES_DICT)
    df = df.dropna(subset=['state'])
    df['party'] = df['party'].map(PARTY_DICT)
    df.loc[df['party'].isnull(), 'party'] = 'o'
    
    
    return df

In [3]:
governor_names_2010 = scrape_governor_names(2010)
governor_names_2011 = scrape_governor_names(2011)
governor_names_2012 = scrape_governor_names(2012)
governor_names_2013 = scrape_governor_names(2013)
governor_names_2014 = scrape_governor_names(2014)
governor_names_2015 = scrape_governor_names(2015)
governor_names_2017 = scrape_governor_names(2017)
governor_names_2018 = scrape_governor_names(2018)

### Names 2016

In [4]:
def scrape_governor_2016():
    """Works for 2016."""
    RACE_TYPE = 'governor'
    source = requests.get('https://en.wikipedia.org/wiki/2016_United_States_gubernatorial_elections')
    soup = bs.BeautifulSoup(source.content, features='html.parser')
    
    names = []
    parties = []
    race_types = []
    years = []
    states = []
    
    for table in soup.find_all('table', class_="wikitable"):
        if all(['candidates' not in h.text.strip().lower() for h in table.find_all('th', text=True)]):
            continue
            
        for tr in table.find_all('tr')[2:]:
            cells = tr.find_all('td')
            state = tr.find('th').text.strip()
            
            # incumbent
            incumbent_name = re.match("([\w\s\.\-]+).*", cells[1].text).groups()[0].lower().strip()
            incumbent_party = cells[2].text.lower().strip()
            names.append(incumbent_name)
            parties.append(PARTY_DICT[incumbent_party.lower()] if incumbent_party.lower() in PARTY_DICT.keys() else incumbent_party.lower())
            race_types.append(RACE_TYPE)
            years.append(2016)
            states.append(state)
            
            # competitors
            candidates = cells[-1]
            candidates_text = re.findall(r"\w[\s\w\.\- ]+?\(\w+?\)", candidates.text)
            for candidate in candidates_text:
                matcher = re.match(r"([\w\s\.\-]+)\s\((\w+)\)", candidate.lower())
                if matcher:
                    name, party = matcher.groups()
                    if name == incumbent_name:
                        continue
                    names.append(name.strip())
                    parties.append(party)
                    race_types.append(RACE_TYPE)
                    years.append(2016)
                    states.append(state)
        break

    # return df
    df = pd.DataFrame({'name': names, 
                       'party': parties,
                       'race_type': race_types,
                       'year': years,
                       'state': states})
    df['name'] = df['name'].str.replace(r'.', '')
    df['name'] = df['name'].str.replace(r'-', '')
    if df['state'][0] in STATES_DICT.keys():
        df['state'] = df['state'].map(STATES_DICT)
    df = df.dropna(subset=['state'])
    df.loc[~df['party'].isin(['d', 'r']), 'party'] = 'o'
        
    return df

In [5]:
governor_names_2016 = scrape_governor_2016()

### Concatenate

In [6]:
governor_names = pd.concat([governor_names_2010, governor_names_2011, governor_names_2012, governor_names_2013, 
                           governor_names_2014, governor_names_2015, governor_names_2016, governor_names_2017,
                           governor_names_2018])

In [7]:
print('Shape:', governor_names.shape)
print('\nNull values:')
print(governor_names.isnull().sum())
governor_names

Shape: (498, 5)

Null values:
name         0
party        0
race_type    0
year         0
state        0
dtype: int64


Unnamed: 0,name,party,race_type,year,state
0,bob riley,o,governor,2010,AL
1,robert bentley,o,governor,2010,AL
2,ron sparks,o,governor,2010,AL
3,sean parnell,o,governor,2010,AK
4,ethan berkowitz,o,governor,2010,AK
5,billy toien,o,governor,2010,AK
6,jan brewer,o,governor,2010,AZ
7,terry goddard,o,governor,2010,AZ
8,barry hess,o,governor,2010,AZ
9,mike beebe,o,governor,2010,AR


In [8]:
# governor_names.to_csv(DATA_DIR/'cleaned'/'candidate_parties_governor.csv')