In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import bs4 as bs
import requests
import re
import unidecode
import us
from datetime import datetime, date as make_date

# Notebook-wide configuration: the data folder sits two directories above the notebook,
# and pandas may render up to 500 rows before truncating a displayed frame.
DATA_DIR = Path('..', '..', 'data')
pd.options.display.max_rows = 500

In [2]:
##### Map name:abbr for states
# Initial mapping
STATES_DICT = us.states.mapping('name', 'abbr')
# Remove all unwanted states.
# pop(..., None) rather than `del`: which of these entries appear in the default mapping
# depends on the installed `us` release (the obsolete ones and DC may be absent), and `del`
# raises KeyError on a missing key, which would abort the whole notebook at this cell.
territories = ['American Samoa', 'Dakota', 'District of Columbia', 'Guam','Northern Mariana Islands', 
               'Orleans', 'Philippine Islands', 'Puerto Rico', 'Virgin Islands']
for t in territories:
    STATES_DICT.pop(t, None)

    
##### Map parties of interest; all others will be cast to NaN #####
# NOTE(review): only the full, lower-cased party names are mapped. The scrapers relabel every
# unmapped party as 'o', so a page that abbreviates parties (e.g. "(R)") would put all of its
# candidates under 'o' -- the recorded 2010 sample output shows only 'o'; confirm against the pages.
PARTY_DICT = {'republican' : 'r', 'democratic' : 'd'}


##### Regex shortcuts #####
# Character class for one character of a candidate name: word characters, whitespace, and - . ' ,
NAME = r"[\w\s\-\.\'\,]"

def clean_name(series):
    """
    Remove following punctuation commonly found in names: - . , '
    Also strip names of accents

    Parameters
    ----------
    series : pd.Series of str
        Names to clean. The input is not modified.

    Returns
    -------
    pd.Series
        Copy of `series` with the punctuation removed and each value passed through
        unidecode.unidecode.
    """
    result = series.copy()

    for punc in ('-', '.', '\'', ','):
        # regex=False makes the replacement literal on every pandas version. The default for
        # `regex` changed between pandas releases, and '.' is a regex wildcard that would
        # otherwise be at risk of matching every character.
        result = result.str.replace(punc, '', regex=False)

    return result.apply(unidecode.unidecode)

# House Names

### Names 2014, 2016, 2018

In [3]:
def scrape_house_names_new(year):
    """Scrape candidate names and parties from the Wikipedia House elections page for `year`.

    Works for 2014, 2016, 2018.

    Every sortable wikitable that has a 'Candidates' header is treated as one state. The state
    name is read from the first row header that parses, and every "Name (Party)" entry is pulled
    out of the last cell of each row. Scanning stops once 50 state tables have been seen.

    Parameters
    ----------
    year : int or str
        Election year, interpolated into the page URL and stored in the 'year' column.

    Returns
    -------
    pd.DataFrame
        Columns name, party, race_type, year, state. Names go through clean_name; states are
        mapped through STATES_DICT and rows whose state is not in the mapping are dropped;
        parties are mapped through PARTY_DICT and anything unmapped becomes 'o'.

    Raises
    ------
    requests.HTTPError
        If the page request comes back with an error status.
    """
    RACE_TYPE = 'house'
    source = requests.get(f'https://en.wikipedia.org/wiki/{year}_United_States_House_of_Representatives_elections')
    # Fail loudly on an HTTP error instead of parsing an error page into an empty frame
    source.raise_for_status()
    soup = bs.BeautifulSoup(source.content, features='html.parser')

    # One dict per candidate; turned into a frame in a single step at the end
    records = []
    num_states = 0

    # TODO: Go through special elections as well, which are not sortable tables

    # iterate through all sortable tables
    for table in soup.find_all('table', class_="wikitable sortable"):
        # skip tables without candidate info
        if 'Candidates' not in [h.text.strip() for h in table.find_all('th', text=True)]:
            continue

        # each table is one state
        state = None
        for tr in table.find_all('tr')[2:]:
            # find state if not set yet for this table; a row without a parsable header
            # leaves it unset so the next row gets a chance
            if not state:
                row_head = tr.find('th')
                state_match = re.match(r"([\w\s]+)\s.+", row_head.text) if row_head else None
                if state_match:
                    state = state_match.groups()[0]
                    num_states += 1

            # candidate names are stored in last cell of row; rows with no data cells
            # (e.g. stray header rows) carry no candidates
            cells = tr.find_all('td')
            if not cells:
                continue
            candidates_text = re.findall(rf"\w{NAME}+?\(\w+?\)", cells[-1].text)

            for candidate in candidates_text:
                matcher = re.match(rf"({NAME}+)\s\((\w+)\)", candidate.lower())
                # if valid match
                if matcher:
                    name, party = matcher.groups()
                    records.append({'name': name,
                                    'party': party,
                                    'race_type': RACE_TYPE,
                                    'year': year,
                                    'state': state})

        # stop at 50 states
        if num_states == 50:
            break

    # return df
    df = pd.DataFrame(records, columns=['name', 'party', 'race_type', 'year', 'state'])
    df['name'] = clean_name(df['name'])
    df['state'] = df['state'].map(STATES_DICT)
    # .copy() so the party assignment below writes to an independent frame rather than a
    # slice of the pre-dropna one (avoids SettingWithCopyWarning)
    df = df.dropna(subset=['state']).copy()
    df['party'] = df['party'].map(PARTY_DICT).fillna('o')

    return df

In [4]:
# Scrape the three elections that share the newer page layout, one frame per year.
house_names_2014, house_names_2016, house_names_2018 = (
    scrape_house_names_new(election_year) for election_year in (2014, 2016, 2018)
)

### Names 2012

In [5]:
def scrape_house_names_2012():
    """Scrape candidate names and parties from the Wikipedia House elections page for 2012.

    Works for 2012.

    Every wikitable after the first three on the page is treated as one state. The state name
    is read from the first data cell that parses, and every "Name (Party)" entry is pulled out
    of the last cell of each row. Scanning stops once 50 state tables have been seen.

    Returns
    -------
    pd.DataFrame
        Columns name, party, race_type, year, state. Names go through clean_name; states are
        mapped through STATES_DICT and rows whose state is not in the mapping are dropped;
        parties are mapped through PARTY_DICT and anything unmapped becomes 'o'.

    Raises
    ------
    requests.HTTPError
        If the page request comes back with an error status.
    """
    RACE_TYPE = 'house'
    source = requests.get('https://en.wikipedia.org/wiki/United_States_House_of_Representatives_elections,_2012')
    # Fail loudly on an HTTP error instead of parsing an error page into an empty frame
    source.raise_for_status()
    soup = bs.BeautifulSoup(source.content, features='html.parser')

    # One dict per candidate; turned into a frame in a single step at the end
    records = []
    num_states = 0

    # The first three wikitables are skipped; each remaining table is scanned as one state
    for table in soup.find_all('table', class_='wikitable')[3:]:
        state = None
        for tr in table.find_all('tr')[1:]:
            # Rows without data cells (e.g. stray header rows) carry neither state nor candidates
            cells = tr.find_all('td')
            if not cells:
                continue

            # find state if not set yet; on this page it is read from the first data cell
            if not state:
                state_match = re.match(r"([\w\s]+)\s.+", cells[0].text)
                if state_match:
                    state = state_match.groups()[0]
                    num_states += 1

            # find candidate names
            candidates_text = re.findall(rf"\w{NAME}+?\(\w+?\)", cells[-1].text)
            for candidate in candidates_text:
                matcher = re.match(rf"({NAME}+)\s\((\w+)\)", candidate.lower())
                if matcher:
                    name, party = matcher.groups()
                    records.append({'name': name,
                                    'party': party,
                                    'race_type': RACE_TYPE,
                                    'year': 2012,
                                    'state': state})

        if num_states == 50:
            break

    # return df
    df = pd.DataFrame(records, columns=['name', 'party', 'race_type', 'year', 'state'])
    df['name'] = clean_name(df['name'])
    df['state'] = df['state'].map(STATES_DICT)
    # .copy() so the party assignment below writes to an independent frame rather than a
    # slice of the pre-dropna one (avoids SettingWithCopyWarning)
    df = df.dropna(subset=['state']).copy()
    df['party'] = df['party'].map(PARTY_DICT).fillna('o')

    return df

In [6]:
# Scrape the 2012 page (performs a live HTTP request to Wikipedia).
house_names_2012 = scrape_house_names_2012()

### Names 2010

In [7]:
def scrape_house_names_2010():
    """Scrape candidate names and parties from the Wikipedia House elections page for 2010.

    Works for 2010.

    Every wikitable after the first four on the page is treated as one state. The state name
    is read from the first row header that parses, and every "Name (Party)" entry is pulled
    out of the last cell of each row. Scanning stops once 50 state tables have been seen.

    Returns
    -------
    pd.DataFrame
        Columns name, party, race_type, year, state. Names go through clean_name; states are
        mapped through STATES_DICT and rows whose state is not in the mapping are dropped;
        parties are mapped through PARTY_DICT and anything unmapped becomes 'o'.

    Raises
    ------
    requests.HTTPError
        If the page request comes back with an error status.
    """
    RACE_TYPE = 'house'
    source = requests.get('https://en.wikipedia.org/wiki/United_States_House_of_Representatives_elections,_2010')
    # Fail loudly on an HTTP error instead of parsing an error page into an empty frame
    source.raise_for_status()
    soup = bs.BeautifulSoup(source.content, features='html.parser')

    # One dict per candidate; turned into a frame in a single step at the end
    records = []
    num_states = 0

    # The first four wikitables are skipped; each remaining table is scanned as one state
    for table in soup.find_all('table', class_='wikitable')[4:]:
        state = None
        for tr in table.find_all('tr')[1:]:
            # find state if not set yet; a row without a parsable header leaves it unset
            # so the next row gets a chance
            if not state:
                row_head = tr.find('th')
                state_match = re.match(r"([\w\s]+)\s.+", row_head.text) if row_head else None
                if state_match:
                    state = state_match.groups()[0]
                    num_states += 1

            # find candidate names; rows with no data cells carry no candidates
            cells = tr.find_all('td')
            if not cells:
                continue
            candidates_text = re.findall(rf"\w{NAME}+?\(\w+?\)", cells[-1].text)
            for candidate in candidates_text:
                matcher = re.match(rf"({NAME}+)\s\((\w+)\)", candidate.lower())
                if matcher:
                    name, party = matcher.groups()
                    records.append({'name': name,
                                    'party': party,
                                    'race_type': RACE_TYPE,
                                    'year': 2010,
                                    'state': state})

        if num_states == 50:
            break

    # return df
    df = pd.DataFrame(records, columns=['name', 'party', 'race_type', 'year', 'state'])
    df['name'] = clean_name(df['name'])
    df['state'] = df['state'].map(STATES_DICT)
    # .copy() so the party assignment below writes to an independent frame rather than a
    # slice of the pre-dropna one (avoids SettingWithCopyWarning)
    df = df.dropna(subset=['state']).copy()
    df['party'] = df['party'].map(PARTY_DICT).fillna('o')

    return df

In [8]:
# Scrape the 2010 page (performs a live HTTP request to Wikipedia).
house_names_2010 = scrape_house_names_2010()

### Concatenate House Names (2010–2018)

In [9]:
# Stack the per-year frames in chronological order under a fresh 0..n-1 index.
yearly_house_names = [
    house_names_2010,
    house_names_2012,
    house_names_2014,
    house_names_2016,
    house_names_2018,
]
house_names = pd.concat(yearly_house_names, ignore_index=True)

In [10]:
# Sanity check of the combined frame: dimensions, per-column null counts, then the frame itself.
null_counts = house_names.isnull().sum()
print('Shape:', house_names.shape)
print('\nNull values:', null_counts, sep='\n')
house_names

Shape: (5595, 5)

Null values:
name         0
party        0
race_type    0
year         0
state        0
dtype: int64


Unnamed: 0,name,party,race_type,year,state
0,jo bonner,o,house,2010,AL
1,david walter,o,house,2010,AL
2,martha roby,o,house,2010,AL
3,bobby bright,o,house,2010,AL
4,mike rogers,o,house,2010,AL
5,steve segrest,o,house,2010,AL
6,robert aderholt,o,house,2010,AL
7,mo brooks,o,house,2010,AL
8,steve raby,o,house,2010,AL
9,spencer bachus,o,house,2010,AL


In [11]:
# Create the output folder first: to_csv does not create missing directories, so on a fresh
# checkout the write would otherwise fail.
cleaned_dir = DATA_DIR/'cleaned'
cleaned_dir.mkdir(parents=True, exist_ok=True)
house_names.to_csv(cleaned_dir/'candidate_parties_house.csv', index=False)

# House Election Results

In [12]:
# Fetch and parse the 2018 House elections page once at module level; the next cell uses
# `soup` to inspect the table structure.
source = requests.get(f'https://en.wikipedia.org/wiki/2018_United_States_House_of_Representatives_elections')
soup = bs.BeautifulSoup(source.content, features='html.parser')

In [13]:
# Exploratory cell: show how the first candidate table and its first data row are structured.
# NOTE(review): `count` is never incremented below, so the final expression always displays 0;
# this looks like leftover debugging.
count = 0
for table in soup.find_all('table', class_='wikitable sortable'):
    # Skip tables that have no 'Candidates' header
    if 'Candidates' not in [h.text.strip() for h in table.find_all('th', text=True)]:
        continue
    
    print(table.attrs)
    # Skip the first two rows, then print the row-header cell of the next row
    # both as a raw tag and as stripped text
    for tr in table.find_all('tr')[2:]:
        print(tr.find('th'))
        print ([td.text.strip() for td in tr.find_all('th')])
        break  # only the first row is inspected
    break  # only the first matching table is inspected
count

{'class': ['wikitable', 'sortable']}
<th><a href="/wiki/Alabama%27s_1st_congressional_district" title="Alabama's 1st congressional district">Alabama 1</a>
</th>
['Alabama\xa01']


0

In [20]:
def scrape_house_2018(year):
    """Scrape per-candidate House election results from the Wikipedia page for `year`.

    Noted by the original author as working for 2014, 2016, 2018.

    Two passes over the page:
      1. Special elections: the first wikitable with a 'Candidates' header. Each row supplies its
         own election date, parsed from the "New member elected <date>." text.
      2. Regular elections: every sortable wikitable with a 'Candidates' header. All rows get the
         general-election date for `year`.

    Parameters
    ----------
    year : int or str
        Election year; interpolated into the page URL and used to derive the general-election date.

    Returns
    -------
    pd.DataFrame
        One row per candidate with columns name, party, race_type, date, state, district,
        incumbent, incumbent_first_elected, pct. Names and incumbents go through clean_name;
        states are mapped through STATES_DICT and rows whose state is not in the mapping are
        dropped; parties are mapped through PARTY_DICT and anything unmapped becomes 'o'.
        `pct` is the scraped text, or NaN when no figure follows the party; `district` is the
        scraped text, or the integer 0 for 'at-large'.

    Raises
    ------
    requests.HTTPError
        If the page request comes back with an error status.
    """
    RACE_TYPE = 'house'
    source = requests.get(f'https://en.wikipedia.org/wiki/{year}_United_States_House_of_Representatives_elections')
    # Fail loudly on an HTTP error instead of parsing an error page into an empty frame
    source.raise_for_status()
    soup = bs.BeautifulSoup(source.content, features='html.parser')

    # General elections fall on the Tuesday after the first Monday in November, i.e. the one
    # Tuesday (weekday() == 1) within November 2-8. Derived from `year` instead of hard-coding
    # 2018-11-06, so other years get the right date; for 2018 this is still November 6.
    general_election_date = next(
        day for day in (make_date(int(year), 11, d) for d in range(2, 9)) if day.weekday() == 1
    )

    # Column name -> list of values, filled in lockstep, one entry per candidate
    columns = {key: [] for key in ('name', 'party', 'race_type', 'date', 'state', 'district',
                                   'incumbent', 'incumbent_first_elected', 'pct')}

    def add_candidates(cell_text, date, state, district, incumbent, incumbent_first_elected):
        """Append one entry per "Name (Party) pct" item found in `cell_text`."""
        for candidate in re.findall(rf"\s*{NAME}+\s\(\w+?\)\s*[\d\.]*", cell_text):
            # Match name, party, pct. If pct missing, assign NaN
            name, party, pct = re.match(rf"\s*({NAME}+)\s\((\w+?)\)\s*([\d\.]*)", candidate.lower()).groups()
            if pct == '':
                pct = np.nan
            columns['name'].append(name)
            columns['party'].append(party)
            columns['race_type'].append(RACE_TYPE)
            columns['date'].append(date)
            columns['state'].append(state)
            columns['district'].append(district)
            columns['incumbent'].append(incumbent)
            columns['incumbent_first_elected'].append(incumbent_first_elected)
            columns['pct'].append(pct)

    ##### Special elections
    for table in soup.find_all('table', class_="wikitable"):
        # Skip tables without candidate info
        if 'Candidates' not in [h.text.strip() for h in table.find_all('th', text=True)]:
            continue

        # Go through each row; non-sortable has one header row
        for tr in table.find_all('tr')[1:]:
            cells = tr.find_all('td')

            # Extract state, district
            state, district = re.match(r"([\w\s]+)\s(.+)", cells[0].text).groups()
            if district == 'at-large':
                district = 0
            # Find incumbent name, elected year, and special election date
            incumbent = cells[1].text.lower().strip()
            incumbent_first_elected = cells[3].text.strip()
            date = re.match(r".*New member elected (.*?)\..*", cells[4].text).groups()[0]
            date = datetime.strptime(date, "%B %d, %Y")

            # All candidate info stored in last cell
            add_candidates(cells[-1].text, date, state, district, incumbent, incumbent_first_elected)
        # Stop after one table (since there's only one table of special elections)
        break

    ##### Regular elections
    # Start with no state so the first regular row cannot inherit the last special election's
    # state/district when it has no row header
    state = district = None
    for table in soup.find_all('table', class_="wikitable sortable"):
        # Skip tables without candidate info
        if 'Candidates' not in [h.text.strip() for h in table.find_all('th', text=True)]:
            continue

        # Go through each row; sortable has two header rows
        for tr in table.find_all('tr')[2:]:
            # For these tables, state info is stored as 'th'.
            # There is an edge case in which row_head does not exist; use previous state.
            row_head = tr.find('th')
            if row_head:
                state, district = re.match(r"([\w\s]+)\s(.+)", row_head.text).groups()
                if district == 'at-large':
                    district = 0
            if state is None:
                # No row header seen yet, so there is no state to attribute this row to
                continue

            cells = tr.find_all('td')
            # Find incumbent name and first-elected year (leading digits only)
            incumbent = cells[1].text.lower().strip()
            incumbent_first_elected = re.match(r"(\d*).*", cells[3].text.strip()).groups()[0]

            # All candidate info stored in last cell
            add_candidates(cells[-1].text, general_election_date, state, district,
                           incumbent, incumbent_first_elected)

    # return df
    df = pd.DataFrame(columns)
    df['name'] = clean_name(df['name'])
    df['incumbent'] = clean_name(df['incumbent'])
    df['state'] = df['state'].apply(unidecode.unidecode)
    df['state'] = df['state'].map(STATES_DICT)
    # .copy() so the party assignment below writes to an independent frame rather than a
    # slice of the pre-dropna one (avoids SettingWithCopyWarning)
    df = df.dropna(subset=['state']).copy()
    df['party'] = df['party'].map(PARTY_DICT).fillna('o')

    return df

In [21]:
# Run the results scraper for 2018 and display the returned frame (the result is not stored).
scrape_house_2018(2018)

Unnamed: 0,name,party,race_type,date,state,district,incumbent,incumbent_first_elected,pct
0,conor lamb,d,house,2018-03-13,PA,18,tim murphy,2002,49.9
1,rick saccone,r,house,2018-03-13,PA,18,tim murphy,2002,49.5
2,drew miller,o,house,2018-03-13,PA,18,tim murphy,2002,0.6
3,debbie lesko,r,house,2018-04-24,AZ,8,trent franks,2002,52.4
4,hiral tipirneni,d,house,2018-04-24,AZ,8,trent franks,2002,47.6
5,michael cloud,r,house,2018-06-30,TX,27,blake farenthold,2010,54.7
6,eric holguin,d,house,2018-06-30,TX,27,blake farenthold,2010,32.0
7,roy barrera,d,house,2018-06-30,TX,27,blake farenthold,2010,4.8
8,bech bruun,r,house,2018-06-30,TX,27,blake farenthold,2010,4.3
9,mike westergren,d,house,2018-06-30,TX,27,blake farenthold,2010,2.4
