In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import heapq

from sklearn.base import BaseEstimator, TransformerMixin

import us
# Lookup table built by the `us` package from state name to abbreviation;
# ExtractInfo.transform uses it to turn extracted state names into codes.
STATES_DICT = us.states.mapping('name', 'abbr')

from pathlib import Path
# Root of the project data folder, expressed relative to the current working
# directory (two levels up, then `data`).
DATA_DIR = Path('..')/'..'/'data'

In [2]:
# Load the cleaned RealClearPolitics House polls and preview the first rows.
house = pd.read_csv(DATA_DIR.joinpath('cleaned', 'RCP_house_Final.csv'))
house.head(n=5)

Unnamed: 0,Race/Topic (Click to Sort),Poll,Results,Spread,Date,Year
0,Pennsylvania 12th District Special Election - ...,PPP (D),"Burns 44, Critz 41",Burns +3,2010-04-20,2010
1,New Hampshire 1st District - Guinta vs. Shea-P...,PPP (D),"Guinta 46, Shea-Porter 45",Guinta +1,2010-04-22,2010
2,New Hampshire 2nd District - Bass vs. Swett,PPP (D),"Bass 47, Swett 32",Bass +15,2010-04-22,2010
3,New Hampshire 1st District - Guinta vs. Shea-P...,WMUR/UNH,"Guinta 42, Shea-Porter 38",Guinta +4,2010-04-30,2010
4,New Hampshire 2nd District - Bass vs. Swett,WMUR/UNH,"Bass 44, Swett 27",Bass +17,2010-04-30,2010


In [3]:
# Load the cleaned RealClearPolitics Senate polls and preview the first rows.
senate = pd.read_csv(DATA_DIR.joinpath('cleaned', 'RCP_senate_Final.csv'))
senate.head(n=5)

Unnamed: 0,Race/Topic (Click to Sort),Poll,Results,Spread,Date,Year
0,Florida Senate - Rubio vs. Meek vs. Crist,Quinnipiac,"Crist 32, Rubio 30, Meek 24",Crist +2,2010-04-15,2010
1,Arkansas Senate - Boozman vs. Lincoln,R2000/Daily Kos (D),"Boozman 50, Lincoln 43",Boozman +7,2010-04-15,2010
2,Arkansas Senate - Boozman vs. Halter,R2000/Daily Kos (D),"Boozman 48, Halter 41",Boozman +7,2010-04-15,2010
3,Florida Senate - Rubio vs. Meek,Quinnipiac,"Rubio 42, Meek 38",Rubio +4,2010-04-15,2010
4,Florida Senate - Republican Primary,Quinnipiac,"Rubio 56, Crist 33",Rubio +23,2010-04-15,2010


In [4]:
# Load the cleaned RealClearPolitics Governor polls and preview the first rows.
governor = pd.read_csv(DATA_DIR.joinpath('cleaned', 'RCP_governor_Final.csv'))
governor.head(n=5)

Unnamed: 0,Race/Topic (Click to Sort),Poll,Results,Spread,Date,Year
0,Colorado Governor - McInnis vs. Hickenlooper,Rasmussen Reports,"Hickenlooper 42, McInnis 48",McInnis +6,2010-04-16,2010
1,Arizona Governor - Republican Primary,Rasmussen Reports,"Brewer 26, Mills 18, Martin 12, Munger 14",Brewer +8,2010-04-17,2010
2,New York Governor - Republican Primary,Siena,"Lazio 29, Levy 15, Paladino 13",Lazio +14,2010-04-18,2010
3,New York Governor - Levy vs. Cuomo,Siena,"Cuomo 58, Levy 23",Cuomo +35,2010-04-18,2010
4,New York Governor - Lazio vs. Cuomo,Siena,"Cuomo 61, Lazio 24",Cuomo +37,2010-04-18,2010


In [5]:
class ExtractInfo(BaseEstimator, TransformerMixin):
    """
    Extract information from RealClearPolitics polling data.

    The transformer is stateless: ``fit`` does nothing and ``transform`` parses
    the raw text columns of a polling table into structured columns.

    Parameters
    ----------
    race_type : string
        Type of race. Supported race types:
            - 'house'
            - 'senate'
            - 'governor'

    Raises
    ------
    NotImplementedError
        If ``race_type`` is not one of the supported race types.
    """
    # Regex (one capture group) that pulls the state name out of the race name,
    # keyed by race type. Also serves as the list of supported race types.
    _STATE_PATTERNS = {
        'house': r"(.*) \d.*",
        'senate': r"(.*) Senate",
        'governor': r"(.*) Governor",
    }

    def __init__(self, race_type):
        if race_type not in self._STATE_PATTERNS:
            raise NotImplementedError(f"Race type '{race_type}' is not supported")
        self.race_type = race_type

    def fit(self, X, y=None):
        """No-op fit; returns the transformer itself."""
        return self

    @staticmethod
    def _parse_results(results):
        """Parse one 'Results' cell (e.g. 'Burns 44, Critz 41') into {name: value}."""
        parsed = {}
        if not isinstance(results, str):
            # Missing cell (NaN/None): nothing to parse.
            return parsed
        for entry in results.split(','):
            tokens = entry.split()
            if not tokens:
                # Blank entry left behind by a stray comma.
                continue
            if len(tokens) > 1:
                name = ' '.join(tokens[:-1]).lower()
                value = float(tokens[-1])
            else:
                # A lone token carries no number: keep the name with value 0.
                name = tokens[0].lower()
                value = 0.0
            parsed[name] = value
        return parsed

    @staticmethod
    def _parse_spread(spread):
        """Parse one 'Spread' cell (e.g. 'Burns +3') into (leader, margin)."""
        tokens = spread.split() if isinstance(spread, str) else []
        if len(tokens) > 1:
            return ' '.join(tokens[:-1]).lower(), float(tokens[-1])
        # Single-token or missing spreads have no leader/margin pair.
        return np.nan, np.nan

    def transform(self, X, y=None):
        """
        Parse the raw polling columns of ``X`` into structured columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns 'Race/Topic (Click to Sort)', 'Results',
            'Spread' and 'Date'.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Indexed like ``X`` with columns 'race_name', 'race_type', 'state',
            'top_candidates', 'poll_results', 'poll_leader', 'poll_spread'
            and 'date'.

        Raises
        ------
        NotImplementedError
            If ``race_type`` was changed to an unsupported value after
            construction.
        """
        pattern = self._STATE_PATTERNS.get(self.race_type)
        if pattern is None:
            raise NotImplementedError(f"Race type '{self.race_type}' is not supported")

        # Extract race name: the text before the first ' - ' separator.
        # The .str accessor keeps missing names as NaN instead of raising.
        race_name = X['Race/Topic (Click to Sort)'].str.split(' - ').str[0].str.strip()

        # Extract state
        state_source = race_name
        if self.race_type == 'house':
            # At-large districts have no district number; substitute a digit so
            # the house pattern (which anchors on a digit) still matches.
            state_source = state_source.str.replace('At-Large', '0', regex=False)
        state = state_source.str.extract(pattern, expand=False).map(STATES_DICT)

        # Extract candidate names and values. The Series must reuse X's index:
        # the DataFrame constructor aligns its columns on index labels.
        candidates = pd.Series([self._parse_results(cell) for cell in X['Results']],
                               index=X.index, dtype=object)

        # Extract top two candidates
        relevant_candidates = candidates.apply(lambda row: heapq.nlargest(2, row, key=row.get))

        # Extract spread
        spreads = [self._parse_spread(cell) for cell in X['Spread']]
        poll_leader = pd.Series([leader for leader, _ in spreads], index=X.index, dtype=object)
        poll_spread = pd.Series([margin for _, margin in spreads], index=X.index, dtype=float)

        return pd.DataFrame({'race_name': race_name.str.lower(),
                             'race_type': self.race_type,
                             'state': state,
                             'top_candidates': relevant_candidates,
                             'poll_results': candidates,
                             'poll_leader': poll_leader,
                             'poll_spread': poll_spread,
                             'date': X['Date']})

In [6]:
# One extractor per supported race type.
ei_house, ei_senate, ei_governor = (
    ExtractInfo(race) for race in ('house', 'senate', 'governor')
)

In [7]:
# Parse each raw polling table with the extractor for its race type.
# transform() is called without fit(): ExtractInfo.fit() only returns self.
house_tr = ei_house.transform(house)
senate_tr = ei_senate.transform(senate)
governor_tr = ei_governor.transform(governor)

In [9]:
# Preview the parsed House polls.
house_tr.head(n=5)

Unnamed: 0,race_name,race_type,state,top_candidates,poll_results,poll_leader,poll_spread,date
0,pennsylvania 12th district special election,house,PA,"[burns, critz]","{'burns': 44.0, 'critz': 41.0}",burns,3.0,2010-04-20
1,new hampshire 1st district,house,NH,"[guinta, shea-porter]","{'guinta': 46.0, 'shea-porter': 45.0}",guinta,1.0,2010-04-22
2,new hampshire 2nd district,house,NH,"[bass, swett]","{'bass': 47.0, 'swett': 32.0}",bass,15.0,2010-04-22
3,new hampshire 1st district,house,NH,"[guinta, shea-porter]","{'guinta': 42.0, 'shea-porter': 38.0}",guinta,4.0,2010-04-30
4,new hampshire 2nd district,house,NH,"[bass, swett]","{'bass': 44.0, 'swett': 27.0}",bass,17.0,2010-04-30


In [10]:
# Preview the parsed Senate polls.
senate_tr.head(n=5)

Unnamed: 0,race_name,race_type,state,top_candidates,poll_results,poll_leader,poll_spread,date
0,florida senate,senate,FL,"[crist, rubio]","{'crist': 32.0, 'rubio': 30.0, 'meek': 24.0}",crist,2.0,2010-04-15
1,arkansas senate,senate,AR,"[boozman, lincoln]","{'boozman': 50.0, 'lincoln': 43.0}",boozman,7.0,2010-04-15
2,arkansas senate,senate,AR,"[boozman, halter]","{'boozman': 48.0, 'halter': 41.0}",boozman,7.0,2010-04-15
3,florida senate,senate,FL,"[rubio, meek]","{'rubio': 42.0, 'meek': 38.0}",rubio,4.0,2010-04-15
4,florida senate,senate,FL,"[rubio, crist]","{'rubio': 56.0, 'crist': 33.0}",rubio,23.0,2010-04-15


In [11]:
# Preview the parsed Governor polls.
governor_tr.head(n=5)

Unnamed: 0,race_name,race_type,state,top_candidates,poll_results,poll_leader,poll_spread,date
0,colorado governor,governor,CO,"[mcinnis, hickenlooper]","{'hickenlooper': 42.0, 'mcinnis': 48.0}",mcinnis,6.0,2010-04-16
1,arizona governor,governor,AZ,"[brewer, mills]","{'brewer': 26.0, 'mills': 18.0, 'martin': 12.0...",brewer,8.0,2010-04-17
2,new york governor,governor,NY,"[lazio, levy]","{'lazio': 29.0, 'levy': 15.0, 'paladino': 13.0}",lazio,14.0,2010-04-18
3,new york governor,governor,NY,"[cuomo, levy]","{'cuomo': 58.0, 'levy': 23.0}",cuomo,35.0,2010-04-18
4,new york governor,governor,NY,"[cuomo, lazio]","{'cuomo': 61.0, 'lazio': 24.0}",cuomo,37.0,2010-04-18


# Temporary code to get the FEC downloads pipeline running

In [8]:
# Disabled one-off export, kept for reference. When uncommented it stacks the
# three parsed frames, keeps only race_type / top_candidates / date, resets the
# index and pickles the result to DATA_DIR/'tmp_for_nick.pkl'.
# (
#     pd.concat([house_tr, senate_tr, governor_tr])
#     .loc[:, ['race_type', 'top_candidates', 'date']]
#     .reset_index(drop=True)
#     .to_pickle(DATA_DIR/'tmp_for_nick.pkl')
# )