In [4]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin

In [7]:
# Load the cleaned RCP House polls CSV; index_col=0 uses the file's first column as the row index.
house = pd.read_csv('../data/cleaned/RCP_house.csv', index_col=0)
# Preview the first rows only (cell output).
house.head()

Unnamed: 0,Race/Topic (Click to Sort),Poll,Results,Spread,Date,Year
0,Pennsylvania 12th District Special Election - ...,PPP (D),"Burns 44, Critz 41",Burns +3,2010-04-20,2010.0
1,New Hampshire 1st District - Guinta vs. Shea-P...,PPP (D),"Guinta 46, Shea-Porter 45",Guinta +1,2010-04-22,2010.0
2,New Hampshire 2nd District - Bass vs. Swett,PPP (D),"Bass 47, Swett 32",Bass +15,2010-04-22,2010.0
3,New Hampshire 1st District - Guinta vs. Shea-P...,WMUR/UNH,"Guinta 42, Shea-Porter 38",Guinta +4,2010-04-30,2010.0
4,New Hampshire 2nd District - Bass vs. Swett,WMUR/UNH,"Bass 44, Swett 27",Bass +17,2010-04-30,2010.0


In [82]:
# Load the cleaned RCP Senate polls CSV; index_col=0 uses the file's first column as the row index.
senate = pd.read_csv('../data/cleaned/RCP_senate.csv', index_col=0)
# Preview the first rows only (cell output).
senate.head()

Unnamed: 0,Race/Topic (Click to Sort),Poll,Results,Spread,Date,Year
0,Florida Senate - Rubio vs. Meek vs. Crist,Quinnipiac,"Crist 32, Rubio 30, Meek 24",Crist +2,2010-04-15,2010.0
1,Arkansas Senate - Boozman vs. Lincoln,R2000/Daily Kos (D),"Boozman 50, Lincoln 43",Boozman +7,2010-04-15,2010.0
2,Arkansas Senate - Boozman vs. Halter,R2000/Daily Kos (D),"Boozman 48, Halter 41",Boozman +7,2010-04-15,2010.0
3,Florida Senate - Rubio vs. Meek,Quinnipiac,"Rubio 42, Meek 38",Rubio +4,2010-04-15,2010.0
4,Florida Senate - Republican Primary,Quinnipiac,"Rubio 56, Crist 33",Rubio +23,2010-04-15,2010.0


In [84]:
# Load the cleaned RCP Governor polls CSV; index_col=0 uses the file's first column as the row index.
governor = pd.read_csv('../data/cleaned/RCP_governor.csv', index_col=0)
# Preview the first rows only (cell output).
governor.head()

Unnamed: 0,Race/Topic (Click to Sort),Poll,Results,Spread,Date,Year
0,Colorado Governor - McInnis vs. Hickenlooper,Rasmussen Reports,"Hickenlooper 42, McInnis 48",McInnis +6,2010-04-16,2010.0
1,Arizona Governor - Republican Primary,Rasmussen Reports,"Brewer 26, Mills 18, Martin 12, Munger 14",Brewer +8,2010-04-17,2010.0
2,New York Governor - Republican Primary,Siena,"Lazio 29, Levy 15, Paladino 13",Lazio +14,2010-04-18,2010.0
3,New York Governor - Levy vs. Cuomo,Siena,"Cuomo 58, Levy 23",Cuomo +35,2010-04-18,2010.0
4,New York Governor - Lazio vs. Cuomo,Siena,"Cuomo 61, Lazio 24",Cuomo +37,2010-04-18,2010.0


In [139]:
class ExtractInfo(BaseEstimator, TransformerMixin):
    """
    Extract structured fields from RealClearPolitics polling data.

    Supported race types:
        - 'house'
        - 'senate'
        - 'governor'

    Parameters
    ----------
    race_type : str
        One of the supported race types. It selects the pattern used to
        pull the state out of the race name and is copied verbatim into
        the ``race_type`` output column.

    Raises
    ------
    NotImplementedError
        If ``race_type`` is not one of the supported values.
    """

    # One-capture-group regex per race type that isolates the state
    # from the race name (e.g. "New Hampshire 1st District" -> "New Hampshire").
    _STATE_PATTERNS = {
        'house': r"(.*) \d.*",
        'senate': r"(.*) Senate",
        'governor': r"(.*) Governor",
    }

    def __init__(self, race_type):
        # Membership is tested against a list (not the dict) so that an
        # unhashable argument still ends in NotImplementedError.
        if race_type in list(self._STATE_PATTERNS):
            self.race_type = race_type
        else:
            raise NotImplementedError(f"Race type '{race_type}' is not supported")

    def fit(self, X, y=None):
        """No-op: the transformer learns nothing from the data. Returns ``self``."""
        return self

    @staticmethod
    def _parse_results(results):
        """Turn ``"Burns 44, Critz 41"`` into ``{'Burns': 44.0, 'Critz': 41.0}``.

        Missing (non-string) input yields an empty dict, and empty segments
        (e.g. from a trailing comma) are skipped. A segment made of a single
        token is stored with value 0. A non-numeric trailing token still
        raises ``ValueError``.
        """
        if not isinstance(results, str):
            # NaN / None: nothing to parse for this poll.
            return {}

        parsed = {}
        for segment in results.split(','):
            tokens = segment.split()
            if not tokens:
                # Blank segment such as "A 1, , B 2" or a trailing comma.
                continue
            if len(tokens) == 1:
                parsed[tokens[0]] = 0
            else:
                # The last token is the value; everything before it is the name.
                parsed[' '.join(tokens[:-1])] = float(tokens[-1])
        return parsed

    @staticmethod
    def _parse_spread(spread):
        """Turn ``"Burns +3"`` into ``('Burns', 3.0)``.

        Returns ``(nan, nan)`` when the value is missing or has fewer than
        two whitespace-separated tokens (e.g. ``"Tie"``).
        """
        tokens = spread.split() if isinstance(spread, str) else []
        if len(tokens) < 2:
            return np.nan, np.nan
        return ' '.join(tokens[:-1]), float(tokens[-1])

    def transform(self, X, y=None):
        """Build a tidy frame of race, state, results and spread information.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns 'Race/Topic (Click to Sort)',
            'Results', 'Spread' and 'Date'.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Columns: 'race_name', 'race_type', 'state', 'poll_results'
            (dict of candidate -> value), 'poll_leader', 'poll_spread'
            and 'date', indexed like ``X``.
        """
        # Extract race name: the text before the first ' - ' separator.
        # The .str accessors propagate missing values instead of raising.
        race_name = (X['Race/Topic (Click to Sort)']
                     .str.split(' - ')
                     .str[0]
                     .str.strip())

        # Extract state
        state = race_name
        if self.race_type == 'house':
            # Give at-large districts a digit so the house pattern matches them.
            state = state.str.replace('At-Large', '0', regex=False)
        state = state.str.extract(self._STATE_PATTERNS[self.race_type], expand=False)

        # Extract candidate names and values (one dict per poll, in row order).
        candidates = [self._parse_results(results) for results in X['Results']]

        # Extract spread: leader name and numeric margin.
        spread_pairs = X['Spread'].map(self._parse_spread)
        poll_leader = spread_pairs.map(lambda pair: pair[0])
        poll_spread = spread_pairs.map(lambda pair: pair[1])

        return pd.DataFrame({'race_name': race_name,
                             'race_type': self.race_type,
                             'state': state,
                             'poll_results': candidates,
                             'poll_leader': poll_leader,
                             'poll_spread': poll_spread,
                             'date': X['Date']})

In [140]:
# One extractor per supported race type; the race type picks the state-parsing rule used in transform().
ei_house = ExtractInfo('house')
ei_senate = ExtractInfo('senate')
ei_governor = ExtractInfo('governor')

In [141]:
# fit() is a no-op, so transform() is called directly on the unfitted extractor.
# NOTE(review): this displays the whole transformed frame; consider appending .head() to keep the output short.
ei_house.transform(house)

Unnamed: 0,race_name,race_type,state,poll_results,poll_leader,poll_spread,date
0,Pennsylvania 12th District Special Election,house,Pennsylvania,"{'Burns': 44.0, 'Critz': 41.0}",Burns,3.0,2010-04-20
1,New Hampshire 1st District,house,New Hampshire,"{'Guinta': 46.0, 'Shea-Porter': 45.0}",Guinta,1.0,2010-04-22
2,New Hampshire 2nd District,house,New Hampshire,"{'Bass': 47.0, 'Swett': 32.0}",Bass,15.0,2010-04-22
3,New Hampshire 1st District,house,New Hampshire,"{'Guinta': 42.0, 'Shea-Porter': 38.0}",Guinta,4.0,2010-04-30
4,New Hampshire 2nd District,house,New Hampshire,"{'Bass': 44.0, 'Swett': 27.0}",Bass,17.0,2010-04-30
5,Pennsylvania 12th District Special Election,house,Pennsylvania,"{'Critz': 40.0, 'Burns': 46.0}",Burns,6.0,2010-05-01
6,Pennsylvania 12th District Special Election,house,Pennsylvania,"{'Burns': 46.0, 'Critz': 40.0}",Burns,6.0,2010-05-01
7,Maryland 1st District,house,Maryland,"{'Harris': 39.0, 'Kratovil': 36.0}",Harris,3.0,2010-05-01
8,Hawaii 1st District Special Election,house,Hawaii,"{'Djou': 36.0, 'Case': 28.0, 'Hanabusa': 22.0}",Djou,8.0,2010-05-02
9,Pennsylvania 12th District Special Election,house,Pennsylvania,"{'Critz': 44.0, 'Burns': 38.0}",Critz,6.0,2010-05-12


In [138]:
# NOTE(review): this cell's recorded execution count (138) precedes the class-definition cell (139),
# and the recorded output has no `date` column although transform() returns one.
# Re-run with Restart Kernel -> Run All to refresh the output.
ei_senate.transform(senate)

Unnamed: 0,race_name,race_type,state,poll_results,poll_leader,poll_spread
0,Florida Senate,senate,Florida,"{'Crist': 32.0, 'Rubio': 30.0, 'Meek': 24.0}",Crist,2.0
1,Arkansas Senate,senate,Arkansas,"{'Boozman': 50.0, 'Lincoln': 43.0}",Boozman,7.0
2,Arkansas Senate,senate,Arkansas,"{'Boozman': 48.0, 'Halter': 41.0}",Boozman,7.0
3,Florida Senate,senate,Florida,"{'Rubio': 42.0, 'Meek': 38.0}",Rubio,4.0
4,Florida Senate,senate,Florida,"{'Rubio': 56.0, 'Crist': 33.0}",Rubio,23.0
5,Arkansas Senate,senate,Arkansas,"{'Lincoln': 45.0, 'Halter': 33.0}",Lincoln,12.0
6,Florida Senate,senate,Florida,"{'Crist': 48.0, 'Meek': 34.0}",Crist,14.0
7,Arizona Senate,senate,Arizona,"{'McCain': 47.0, 'Hayworth': 42.0}",McCain,5.0
8,Nevada Senate,senate,Nevada,"{'Lowden': 47.0, 'Reid': 37.0}",Lowden,10.0
9,New York Senate,senate,New York,"{'Gillibrand': 46.0, 'DioGuardi': 27.0}",Gillibrand,19.0
