In [1]:
import pandas as pd
from pathlib import Path
import bs4 as bs
import requests
import re
import unidecode
import us

##### Map name:abbr for states
# Initial Mapping #
STATES_DICT = us.states.mapping('name', 'abbr')
# Remove all unwanted states #
territories = ['American Samoa', 'Dakota', 'District of Columbia', 'Guam','Northern Mariana Islands', 
               'Orleans', 'Philippine Islands', 'Puerto Rico', 'Virgin Islands']
for t in territories:
    del STATES_DICT[t]

##### Map parties of interest; all others will be cast to NaN
PARTY_DICT = {'republican' : 'r', 'democratic' : 'd'}

##### Regex shortcuts
NAME = r"[\w\s\-\.\'\,]"
def clean_name(series):
    """
    Remove following punctuation commonly found in names: - . , '
    Also strip names of accents
    """
    result = series.copy()
    
    punc_list = ['-', '.', '\'', ',']
    for punc in punc_list:
        result = result.str.replace(punc, '')
    series = series.apply(unidecode.unidecode)
    
    return result


DATA_DIR = Path('.')/'..'/'..'/'data'
pd.set_option('display.max_rows', 500)

# House Names

### Names 2014, 2016, 2018

In [2]:
def scrape_house_names_new(year):
    """Works for 2014, 2016, 2018."""
    RACE_TYPE = 'house'
    source = requests.get(f'https://en.wikipedia.org/wiki/{year}_United_States_House_of_Representatives_elections')
    soup = bs.BeautifulSoup(source.content, features='html.parser')
    
    # Initialize empty lists
    names = []
    parties = []
    race_types = []
    years = []
    states = []
    num_states = 0
    
    # TODO: Go through special elections as well, which are not sortable tables
    
    # iterate through all sortable tables
    for table in soup.find_all('table', class_="wikitable sortable"):
        # skip tables without candidate info
        if 'Candidates' not in [h.text.strip() for h in table.find_all('th', text=True)]:
            continue
        
        # each table is one state 
        state = None
        for tr in table.find_all('tr')[2:]:
            # find state if not set yet for this table
            if not state:
                state = re.match(r"([\w\s]+)\s.+", tr.find('th').text).groups()[0]
                num_states += 1
            
            # candidate names are stored in last cell of row
            candidates = tr.find_all('td')[-1]
            candidates_text = re.findall(rf"\w{NAME}+?\(\w+?\)", candidates.text)
            
            for candidate in candidates_text:
                matcher = re.match(rf"({NAME}+)\s\((\w+)\)", candidate.lower())
                # if valid match
                if matcher:
                    name, party = matcher.groups()
                    names.append(name)
                    parties.append(party)
                    race_types.append(RACE_TYPE)
                    years.append(year)
                    states.append(state)
                    
        # stop at 50 states
        if num_states == 50:
            break
    
    # return df
    df = pd.DataFrame({'name': names, 
                       'party': parties,
                       'race_type': race_types,
                       'year': years,
                       'state': states})
    df['name'] = clean_name(df['name'])
    df['state'] = df['state'].map(STATES_DICT)
    df = df.dropna(subset=['state'])
    df['party'] = df['party'].map(PARTY_DICT)
    df.loc[df['party'].isnull(), 'party'] = 'o'
    
    return df

In [3]:
house_names_2014 = scrape_house_names_new(2014)
house_names_2016 = scrape_house_names_new(2016)
house_names_2018 = scrape_house_names_new(2018)

### Names 2012

In [4]:
def scrape_house_names_2012():
    """Works for 2012."""
    RACE_TYPE = 'house'
    source = requests.get(f'https://en.wikipedia.org/wiki/United_States_House_of_Representatives_elections,_2012')
    soup = bs.BeautifulSoup(source.content, features='html.parser')
    
    names = []
    parties = []
    race_types = []
    years = []
    states = []
    num_states = 0
    
    for table in soup.find_all('table', class_='wikitable')[3:]:
    #     print(table.prettify())
        state = None
        for tr in table.find_all('tr')[1:]:
            # find state if not set yet
            if not state:
                state = re.match(r"([\w\s]+)\s.+", tr.find('td').text).groups()[0]
                num_states += 1

            # find candidate names
            candidates = tr.find_all('td')[-1]
            candidates_text = re.findall(rf"\w{NAME}+?\(\w+?\)", candidates.text)
            for candidate in candidates_text:
                matcher = re.match(rf"({NAME}+)\s\((\w+)\)", candidate.lower())
                if matcher:
                    name, party = matcher.groups()
                    names.append(name)
                    parties.append(party)
                    race_types.append(RACE_TYPE)
                    years.append(2012)
                    states.append(state)
                    
        if num_states == 50:
            break
        
    # return df
    df = pd.DataFrame({'name': names, 
                       'party': parties,
                       'race_type': race_types,
                       'year': years,
                       'state': states})
    df['name'] = clean_name(df['name'])
    df['state'] = df['state'].map(STATES_DICT)
    df = df.dropna(subset=['state'])
    df['party'] = df['party'].map(PARTY_DICT)
    df.loc[df['party'].isnull(), 'party'] = 'o'
    
    return df

In [5]:
house_names_2012 = scrape_house_names_2012()

### Names 2010

In [6]:
def scrape_house_names_2010():
    """Works for 2010."""
    RACE_TYPE = 'house'
    source = requests.get(f'https://en.wikipedia.org/wiki/United_States_House_of_Representatives_elections,_2010')
    soup = bs.BeautifulSoup(source.content, features='html.parser')
    
    names = []
    parties = []
    race_types = []
    years = []
    states = []
    num_states = 0
    
    for table in soup.find_all('table', class_='wikitable')[4:]:
        state = None
        for tr in table.find_all('tr')[1:]:
            if not state:
                state = re.match(r"([\w\s]+)\s.+", tr.find('th').text).groups()[0]
                num_states += 1

            # find candidate names
            candidates = tr.find_all('td')[-1]
            candidates_text = re.findall(rf"\w{NAME}+?\(\w+?\)", candidates.text)
            for candidate in candidates_text:
                matcher = re.match(rf"({NAME}+)\s\((\w+)\)", candidate.lower())
                if matcher:
                    name, party = matcher.groups()
                    names.append(name)
                    parties.append(party)
                    race_types.append(RACE_TYPE)
                    years.append(2010)
                    states.append(state)
                    
        if num_states == 50:
            break
        
    # return df
    df = pd.DataFrame({'name': names, 
                       'party': parties,
                       'race_type': race_types,
                       'year': years,
                       'state': states})
    df['name'] = clean_name(df['name'])
    df['state'] = df['state'].map(STATES_DICT)
    df = df.dropna(subset=['state'])
    df['party'] = df['party'].map(PARTY_DICT)
    df.loc[df['party'].isnull(), 'party'] = 'o'
    
    return df

In [7]:
house_names_2010 = scrape_house_names_2010()

### House Concatenate

In [8]:
house_names = pd.concat([house_names_2010, house_names_2012, house_names_2014, house_names_2016, house_names_2018],
                       ignore_index=True)

In [9]:
print('Shape:', house_names.shape)
print('\nNull values:')
print(house_names.isnull().sum())
house_names

Shape: (5595, 5)

Null values:
name         0
party        0
race_type    0
year         0
state        0
dtype: int64


Unnamed: 0,name,party,race_type,year,state
0,jo bonner,o,house,2010,AL
1,david walter,o,house,2010,AL
2,martha roby,o,house,2010,AL
3,bobby bright,o,house,2010,AL
4,mike rogers,o,house,2010,AL
5,steve segrest,o,house,2010,AL
6,robert aderholt,o,house,2010,AL
7,mo brooks,o,house,2010,AL
8,steve raby,o,house,2010,AL
9,spencer bachus,o,house,2010,AL


In [10]:
house_names.to_csv(DATA_DIR/'cleaned'/'candidate_parties_house.csv', index=False)