In [4]:
import pandas as pd
import bs4 as bs
import requests
import re
import us

STATES_DICT = us.states.mapping('name', 'abbr')
PARTY_DICT = {'republican' : 'r', 'democratic' : 'd', 'independent': 'i'}

# Senate Names

### 2010, 2012, 2014, 2016, 2018

In [5]:
def scrape_senate_names(year):
    """Works for 2010, 2012, 2014, 2016, 2018."""
    RACE_TYPE = 'senate'
    source = requests.get(f'https://en.wikipedia.org/wiki/{year}_United_States_Senate_elections')
    soup = bs.BeautifulSoup(source.content, features='html.parser')
    
    names = []
    parties = []
    race_types = []
    years = []
    states = []
    
    # iterate through all tables with "Candidates"
    for table in soup.find_all('table', class_="wikitable sortable"):
        if 'Candidates' not in [h.text.strip() for h in table.find_all('th', text=True)]:
            continue
        
        for tr in table.find_all('tr')[2:]:
            state = re.match(r"([\w\s]+).*", tr.find('td').text).groups()[0].strip()
                
            # find candidate names
            candidates = tr.find_all('td')[-1]
            candidates_text = re.findall(r"\w[\s\w ]+?\(\w+?\)", candidates.text)
            for candidate in candidates_text:
                matcher = re.match(r"([\w\s]+)\s\((\w+)\)", candidate.lower())
                if matcher:
                    name, party = matcher.groups()
                    names.append(name)
                    parties.append(party)
                    race_types.append(RACE_TYPE)
                    years.append(year)
                    states.append(state)
    
    # return df
    df = pd.DataFrame({'name': names, 
                       'party': parties,
                       'race_type': race_types,
                       'year': years,
                       'state': states})
    df['state'] = df['state'].map(STATES_DICT)
    df['party'] = df['party'].map(PARTY_DICT)
    df.loc[~df['party'].isin(['d', 'r']), 'party'] = 'o'
    
    return df

In [6]:
senate_names_2010 = scrape_senate_names(2010)
senate_names_2012 = scrape_senate_names(2012)
senate_names_2014 = scrape_senate_names(2014)
senate_names_2016 = scrape_senate_names(2016)
senate_names_2018 = scrape_senate_names(2018)

In [13]:
senate_names_2016.head()

Unnamed: 0,name,party,race_type,year,state
0,richard shelby,r,senate,2016,AL
1,ron crumpton,d,senate,2016,AL
2,lisa murkowski,r,senate,2016,AK
3,joe miller,o,senate,2016,AK
4,margaret stock,o,senate,2016,AK


In [15]:
def scrape_senate_names(year):
    """Works for 2010, 2012, 2014, 2016, 2018."""
    RACE_TYPE = 'senate'
    source = requests.get(f'https://en.wikipedia.org/wiki/{year}_United_States_Senate_elections')
    soup = bs.BeautifulSoup(source.content, features='html.parser')
    
    names = []
    parties = []
    race_types = []
    years = []
    states = []
    
    # iterate through all tables with "Candidates"
    for table in soup.find_all('table', class_="wikitable sortable"):
        if 'Candidates' not in [h.text.strip() for h in table.find_all('th', text=True)]:
            print([h.text.strip() for h in table.find_all('th', text=True)])
        
scrape_senate_names(2010)    

### Concatenate

In [10]:
senate_names = pd.concat([senate_names_2010, senate_names_2012, senate_names_2014, senate_names_2016, senate_names_2018])

In [11]:
senate_names.head()

Unnamed: 0,name,party,race_type,year,state
0,scott brown,r,senate,2010,MA
1,martha coakley,d,senate,2010,MA
2,kennedy,o,senate,2010,MA
3,chris coons,d,senate,2010,DE
4,donnell,r,senate,2010,DE
