In [14]:
import pandas as pd
import numpy as np
from pathlib import Path
import bs4 as bs
import requests
import re
import unidecode
import us
from datetime import datetime, date as make_date

DATA_DIR = Path('.')/'..'/'..'/'data'
pd.set_option('display.max_rows', 500)

In [2]:
##### Map name:abbr for states
# Initial mapping
STATES_DICT = us.states.mapping('name', 'abbr')
# Remove all unwanted states
territories = ['American Samoa', 'Dakota', 'District of Columbia', 'Guam','Northern Mariana Islands', 
               'Orleans', 'Philippine Islands', 'Puerto Rico', 'Virgin Islands']
for t in territories:
    del STATES_DICT[t]

    
##### Map parties of interest; all others will be cast to NaN #####
PARTY_DICT = {'republican' : 'r', 'democratic' : 'd'}


##### Regex shortcuts #####
NAME = r"[\w\s\-\.\'\,]"

def clean_name(series):
    """
    Remove following punctuation commonly found in names: - . , '
    Also strip names of accents
    """
    result = series.copy()
    
    punc_list = ['-', '.', '\'', ',']
    for punc in punc_list:
        result = result.str.replace(punc, '')
    result = result.apply(unidecode.unidecode)
    
    return result

# Senate Names

### Names 2010, 2012, 2014, 2016, 2018

In [3]:
def scrape_senate_names(year):
    """Works for 2010, 2012, 2014, 2016, 2018."""
    RACE_TYPE = 'senate'
    source = requests.get(f'https://en.wikipedia.org/wiki/{year}_United_States_Senate_elections')
    soup = bs.BeautifulSoup(source.content, features='html.parser')
    
    names = []
    parties = []
    race_types = []
    years = []
    states = []
    
    # iterate through all tables with "Candidates"
    for table in soup.find_all('table', class_="wikitable sortable"):
        if 'Candidates' not in [h.text.strip() for h in table.find_all('th', text=True)]:
            continue
        
        for tr in table.find_all('tr')[2:]:
            state = re.match(r"([\w\s]+).*", tr.find('td').text).groups()[0].strip()
                
            # find candidate names
            candidates = tr.find_all('td')[-1]
            candidates_text = re.findall(rf"\w*?{NAME}+?\(\w+?\)", candidates.text)
            for candidate in candidates_text:
                matcher = re.match(rf"\w*?({NAME}+)\s\((\w+)\)", candidate.lower())
                if matcher:
                    name, party = matcher.groups()
                    names.append(name)
                    parties.append(party)
                    race_types.append(RACE_TYPE)
                    years.append(year)
                    states.append(state)
    
    # return df
    df = pd.DataFrame({'name': names, 
                       'party': parties,
                       'race_type': race_types,
                       'year': years,
                       'state': states})
    df['name'] = clean_name(df['name'])
    df['state'] = df['state'].map(STATES_DICT)
    df = df.dropna(subset=['state'])
    df['party'] = df['party'].map(PARTY_DICT)
    df.loc[df['party'].isnull(), 'party'] = 'o'
    
    return df

In [4]:
senate_names_2010 = scrape_senate_names(2010)
senate_names_2012 = scrape_senate_names(2012)
senate_names_2014 = scrape_senate_names(2014)
senate_names_2016 = scrape_senate_names(2016)
senate_names_2018 = scrape_senate_names(2018)

### Concatenate

In [5]:
senate_names = pd.concat([senate_names_2010, senate_names_2012, senate_names_2014, senate_names_2016, senate_names_2018],
                        ignore_index=True)

In [6]:
print('Shape:', senate_names.shape)
print('\nNull values:')
print(senate_names.isnull().sum())
senate_names

Shape: (589, 5)

Null values:
name         0
party        0
race_type    0
year         0
state        0
dtype: int64


Unnamed: 0,name,party,race_type,year,state
0,scott brown,r,senate,2010,MA
1,martha coakley,d,senate,2010,MA
2,joseph l kennedy,o,senate,2010,MA
3,chris coons,d,senate,2010,DE
4,christine odonnell,r,senate,2010,DE
5,glenn miller,o,senate,2010,DE
6,james rash,o,senate,2010,DE
7,mark kirk,r,senate,2010,IL
8,alexi giannoulias,d,senate,2010,IL
9,lealan jones,o,senate,2010,IL


In [7]:
senate_names.to_csv(DATA_DIR/'cleaned'/'candidate_parties_senate.csv', index=False)

# Senate Election Results

In [93]:
def scrape_senate(year, month, day):
    """Works for 2010, 2012, 2014, 2016, 2018."""
    RACE_TYPE = 'senate'
    source = requests.get(f'https://en.wikipedia.org/wiki/{year}_United_States_Senate_elections')
    soup = bs.BeautifulSoup(source.content, features='html.parser')
    
    names = []
    parties = []
    race_types = []
    dates = []
    states = []
    incumbents = []
    incumbent_first_electeds = []
    winners = []
    pcts = []
    
    # iterate through all tables with "Candidates"
    for table in soup.find_all('table', class_="wikitable sortable"):
        if 'Candidates' not in [h.text.strip() for h in table.find_all('th', text=True)]:
            continue
        
        for tr in table.find_all('tr')[2:]:
            cells = tr.find_all('td')
            state = re.match(r"([\w\s]+).*", cells[0].text).groups()[0].strip()
            
            incumbent = cells[1].text.lower().strip()
            incumbent_first_elected = re.match(r"(\d{4}).*", cells[3].text).groups()[0]
            
            date = re.match(r".*New senator elected (.*?)\..*", cells[4].text)
            if date:
                date = datetime.strptime(date.groups()[0], "%B %d, %Y")
            else:
                date = make_date(year, month, day)
                
            # find candidate names
            candidates = tr.find_all('td')[-1]
            candidates_text = re.findall(rf"{NAME}+\s\(\w*\)\s*?.*?[\d\.]+%", candidates.text)
            for candidate in candidates_text:
                matcher = re.match(rf"\s*({NAME}+)\s\((\w+?)\)\s?.*?([\d\.]+)%", candidate.lower())
                if matcher:
                    name, party, pct = matcher.groups()
                    names.append(name)
                    parties.append(party)
                    race_types.append(RACE_TYPE)
                    dates.append(date)
                    states.append(state)
                    incumbents.append(incumbent)
                    incumbent_first_electeds.append(incumbent_first_elected)
                    pcts.append(pct)
    
    # return df
    df = pd.DataFrame({'name': names, 
                       'party': parties,
                       'race_type': race_types,
                       'date': dates,
                       'state': states,
                       'incumbent': incumbents,
                       'incumbent_first_elected': incumbent_first_electeds,
                       'pct': pcts})
    df['name'] = clean_name(df['name'])
    df['incumbent'] = clean_name(df['incumbent'])
    df['incumbent'] = df['incumbent'].str.replace(r"\sredistricted.*", "")
    df['incumbent'] = df['incumbent'].replace('nope', np.nan)
    df['state'] = df['state'].apply(unidecode.unidecode)
    df['state'] = df['state'].replace('Democratic-Farmer-Labor', 'Democratic')
    df['state'] = df['state'].map(STATES_DICT)
    df = df.dropna(subset=['state'])
    df['party'] = df['party'].map(PARTY_DICT)
    df.loc[df['party'].isnull(), 'party'] = 'o'
        
    return df

In [94]:
senate_2018 = scrape_senate(2018, 11, 6)
senate_2016 = scrape_senate(2016, 11, 8)
senate_2014 = scrape_senate(2014, 11, 4)
senate_2012 = scrape_senate(2012, 11, 6)
senate_2010 = scrape_senate(2010, 11, 2)

In [95]:
senate = pd.concat([senate_2018, senate_2016, senate_2014, senate_2012, senate_2010], ignore_index=True)

In [96]:
senate

Unnamed: 0,name,party,race_type,date,state,incumbent,incumbent_first_elected,pct
0,tina smith,d,senate,2018-11-06,MN,tina smith,2018,53.6
1,karin housley,r,senate,2018-11-06,MN,tina smith,2018,41.8
2,jerry trooien,o,senate,2018-11-06,MN,tina smith,2018,0.9
3,cindy hydesmith,r,senate,2018-11-06,MS,cindy hydesmith,2018,53.9
4,mike espy,d,senate,2018-11-06,MS,cindy hydesmith,2018,46.1
5,kyrsten sinema,d,senate,2018-11-06,AZ,jeff flake,2012,49.68
6,martha mcsally,r,senate,2018-11-06,AZ,jeff flake,2012,47.96
7,angela green,o,senate,2018-11-06,AZ,jeff flake,2012,2.37
8,dianne feinstein,d,senate,2018-11-06,CA,dianne feinstein,1992,53.8
9,kevin de leon,d,senate,2018-11-06,CA,dianne feinstein,1992,46.2


In [104]:
senate['date'] = pd.to_datetime(senate['date'])
senate_winners = senate.sort_values('pct', ascending=False).groupby(['date', 'state']).first()['name'].reset_index()

In [105]:
senate_winners

Unnamed: 0,date,state,name
0,2010-01-19,MA,scott brown
1,2010-11-02,AK,lisa murkowski
2,2010-11-02,AL,richard shelby
3,2010-11-02,AR,john boozman
4,2010-11-02,AZ,john mccain
5,2010-11-02,CA,barbara boxer
6,2010-11-02,CO,michael bennet
7,2010-11-02,CT,richard blumenthal
8,2010-11-02,DE,chris coons
9,2010-11-02,FL,marco rubio


In [109]:
senate['winner'] = (senate['name'] + senate['date'].astype(str)).isin(
    senate_winners['name'] + senate_winners['date'].astype(str))
special_winner_idx = [0, 3, 194, 197, 200, ]

Unnamed: 0,name,party,race_type,date,state,incumbent,incumbent_first_elected,pct,winner
0,tina smith,d,senate,2018-11-06,MN,tina smith,2018,53.6,False
1,karin housley,r,senate,2018-11-06,MN,tina smith,2018,41.8,False
2,jerry trooien,o,senate,2018-11-06,MN,tina smith,2018,0.9,False
3,cindy hydesmith,r,senate,2018-11-06,MS,cindy hydesmith,2018,53.9,False
4,mike espy,d,senate,2018-11-06,MS,cindy hydesmith,2018,46.1,False
5,kyrsten sinema,d,senate,2018-11-06,AZ,jeff flake,2012,49.68,True
6,martha mcsally,r,senate,2018-11-06,AZ,jeff flake,2012,47.96,False
7,angela green,o,senate,2018-11-06,AZ,jeff flake,2012,2.37,False
8,dianne feinstein,d,senate,2018-11-06,CA,dianne feinstein,1992,53.8,True
9,kevin de leon,d,senate,2018-11-06,CA,dianne feinstein,1992,46.2,False


In [112]:
senate_winners

Unnamed: 0,date,state,name
0,2010-01-19,MA,scott brown
1,2010-11-02,AK,lisa murkowski
2,2010-11-02,AL,richard shelby
3,2010-11-02,AR,john boozman
4,2010-11-02,AZ,john mccain
5,2010-11-02,CA,barbara boxer
6,2010-11-02,CO,michael bennet
7,2010-11-02,CT,richard blumenthal
8,2010-11-02,DE,chris coons
9,2010-11-02,FL,marco rubio


In [None]:
senate.to_csv(DATA_DIR/'cleaned'/'new_senate_election_results.csv')