In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import heapq

from sklearn.base import BaseEstimator, TransformerMixin

import us
# Full state name -> two-letter abbreviation (e.g. 'Pennsylvania' -> 'PA'); used to
# convert the state parsed out of an RCP race name into its abbreviation.
STATES_DICT = us.states.mapping('name', 'abbr')
# Ordinal score for each letter grade: higher is better ('A+' = 13 down to 'F' = 1).
GRADE_TO_SCORE = {'A+':13, 'A':12, 'A-':11, 'B+':10, 'B':9, 'B-':8, 
                  'C+':7, 'C':6, 'C-':5, 'D+':4, 'D':3, 'D-':2, 'F':1}

from pathlib import Path
# Data directory, resolved relative to the notebook's working directory
# (two levels up, then `data/`).
DATA_DIR = Path('..')/'..'/'data'

# Extract info purely from RCP polling data

First, take a look at the raw RCP data. The polls are split into separate House, Senate, and governor files (the process that produced them can be found in the Practice_with_Polls notebook).

In [2]:
# Raw RCP House polls; preview the first rows.
house = pd.read_csv(DATA_DIR.joinpath('cleaned', 'RCP_house_Final.csv'))
house.head(n=5)

Unnamed: 0,Race/Topic (Click to Sort),Poll,Results,Spread,Date,Year
0,Pennsylvania 12th District Special Election - ...,PPP (D),"Burns 44, Critz 41",Burns +3,2010-04-20,2010
1,New Hampshire 1st District - Guinta vs. Shea-P...,PPP (D),"Guinta 46, Shea-Porter 45",Guinta +1,2010-04-22,2010
2,New Hampshire 2nd District - Bass vs. Swett,PPP (D),"Bass 47, Swett 32",Bass +15,2010-04-22,2010
3,New Hampshire 1st District - Guinta vs. Shea-P...,WMUR/UNH,"Guinta 42, Shea-Porter 38",Guinta +4,2010-04-30,2010
4,New Hampshire 2nd District - Bass vs. Swett,WMUR/UNH,"Bass 44, Swett 27",Bass +17,2010-04-30,2010


In [3]:
# Raw RCP Senate polls; preview the first rows.
senate = pd.read_csv(DATA_DIR.joinpath('cleaned', 'RCP_senate_Final.csv'))
senate.head(n=5)

Unnamed: 0,Race/Topic (Click to Sort),Poll,Results,Spread,Date,Year
0,Florida Senate - Rubio vs. Meek vs. Crist,Quinnipiac,"Crist 32, Rubio 30, Meek 24",Crist +2,2010-04-15,2010
1,Arkansas Senate - Boozman vs. Lincoln,R2000/Daily Kos (D),"Boozman 50, Lincoln 43",Boozman +7,2010-04-15,2010
2,Arkansas Senate - Boozman vs. Halter,R2000/Daily Kos (D),"Boozman 48, Halter 41",Boozman +7,2010-04-15,2010
3,Florida Senate - Rubio vs. Meek,Quinnipiac,"Rubio 42, Meek 38",Rubio +4,2010-04-15,2010
4,Florida Senate - Republican Primary,Quinnipiac,"Rubio 56, Crist 33",Rubio +23,2010-04-15,2010


In [4]:
# Raw RCP governor polls; preview the first rows.
governor = pd.read_csv(DATA_DIR.joinpath('cleaned', 'RCP_governor_Final.csv'))
governor.head(n=5)

Unnamed: 0,Race/Topic (Click to Sort),Poll,Results,Spread,Date,Year
0,Colorado Governor - McInnis vs. Hickenlooper,Rasmussen Reports,"Hickenlooper 42, McInnis 48",McInnis +6,2010-04-16,2010
1,Arizona Governor - Republican Primary,Rasmussen Reports,"Brewer 26, Mills 18, Martin 12, Munger 14",Brewer +8,2010-04-17,2010
2,New York Governor - Republican Primary,Siena,"Lazio 29, Levy 15, Paladino 13",Lazio +14,2010-04-18,2010
3,New York Governor - Levy vs. Cuomo,Siena,"Cuomo 58, Levy 23",Cuomo +35,2010-04-18,2010
4,New York Governor - Lazio vs. Cuomo,Siena,"Cuomo 61, Lazio 24",Cuomo +37,2010-04-18,2010


Below is a class that cleans RCP data.

In [5]:
class ExtractInfo(BaseEstimator, TransformerMixin):
    """
    Extract information from RealClearPolitics polling data.

    Parameters
    ----------
    race_type : string
        Type of race. Supported race types:
            - 'house'
            - 'senate'
            - 'governor'

    Raises
    ------
    NotImplementedError
        If `race_type` is not one of the supported race types.
    """
    def __init__(self, race_type):
        if race_type in ['house', 'senate', 'governor']:
            self.race_type = race_type
        else:
            raise NotImplementedError(f"Race type '{race_type}' is not supported")

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer unchanged."""
        return self

    @staticmethod
    def _parse_results(results):
        """Parse one 'Results' cell such as 'Burns 44, Critz 41' into {name: value}."""
        poll_results = {}
        if not isinstance(results, str):
            # Missing cell (NaN/None): there are no candidates to report.
            return poll_results

        for candidate in results.split(','):
            tokens = candidate.split()
            if not tokens:
                # Empty piece, e.g. produced by a trailing or doubled comma.
                continue
            if len(tokens) > 1:
                candidate_name = ' '.join(tokens[:-1]).lower()
                candidate_value = float(tokens[-1])
            else:
                # A lone token carries no number; record it with a value of 0.
                candidate_name = tokens[0].lower()
                candidate_value = 0
            poll_results[candidate_name] = candidate_value
        return poll_results

    @staticmethod
    def _spread_leader(tokens):
        """Leader name from a split 'Spread' cell, or NaN when there is no leader."""
        if isinstance(tokens, list) and len(tokens) > 1:
            return ' '.join(tokens[:-1]).lower()
        return np.nan

    @staticmethod
    def _spread_margin(tokens):
        """Numeric margin from a split 'Spread' cell, or NaN when there is none."""
        if isinstance(tokens, list) and len(tokens) > 1:
            return float(tokens[-1])
        return np.nan

    def transform(self, X, y=None):
        """
        Build a cleaned frame from raw RCP polling data.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw RCP data with the columns 'Race/Topic (Click to Sort)',
            'Poll', 'Results', 'Spread' and 'Date'.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Indexed like `X`, with the columns race_name, race_type, state,
            top_candidates, pollster, poll_results, poll_leader, poll_spread
            and date.
        """
        # Extract race name: the text before the first ' - ' separator
        race_name = X['Race/Topic (Click to Sort)'].str.split(' - ').str[0].str.strip()

        # Extract state
        state = race_name
        if self.race_type == 'house':
            # At-large races have no district number; insert one so the
            # "<state> <digit>..." pattern below still matches.
            state = state.str.replace('At-Large', '0', regex=False)
            state = state.str.extract(r"(.*) \d.*", expand=False)
        elif self.race_type == 'senate':
            state = state.str.extract(r"(.*) Senate", expand=False)
        elif self.race_type == 'governor':
            state = state.str.extract(r"(.*) Governor", expand=False)
        state = state.map(STATES_DICT)

        # Extract candidate names and values. The Series must share X's index:
        # with a fresh RangeIndex the DataFrame constructor below would align
        # on labels and misplace rows whenever X is not indexed 0..n-1.
        candidates = pd.Series(
            [self._parse_results(results) for results in X['Results']],
            index=X.index,
            dtype=object,
        )

        # Extract top two candidates (by polled value)
        relevant_candidates = candidates.apply(lambda row: heapq.nlargest(2, row, key=row.get))

        # Extract spread, e.g. 'Burns +3' -> leader 'burns', spread 3.0
        spread_split = X['Spread'].str.split()
        poll_leader = spread_split.map(self._spread_leader)
        poll_spread = spread_split.map(self._spread_margin)

        return pd.DataFrame({'race_name': race_name.str.lower(),
                             'race_type': self.race_type,
                             'state': state,
                             'top_candidates': relevant_candidates,
                             'pollster': X['Poll'].str.lower(),
                             'poll_results': candidates,
                             'poll_leader': poll_leader,
                             'poll_spread': poll_spread,
                             'date': X['Date']})

In [6]:
# One extractor per race type; the race type selects how the state is parsed.
ei_house = ExtractInfo(race_type='house')
ei_senate = ExtractInfo(race_type='senate')
ei_governor = ExtractInfo(race_type='governor')

In [7]:
# Clean each raw RCP frame with its matching extractor.
house_tr = ei_house.transform(X=house)
senate_tr = ei_senate.transform(X=senate)
governor_tr = ei_governor.transform(X=governor)

In [8]:
house_tr.head(n=5)

Unnamed: 0,race_name,race_type,state,top_candidates,pollster,poll_results,poll_leader,poll_spread,date
0,pennsylvania 12th district special election,house,PA,"[burns, critz]",ppp (d),"{'burns': 44.0, 'critz': 41.0}",burns,3.0,2010-04-20
1,new hampshire 1st district,house,NH,"[guinta, shea-porter]",ppp (d),"{'guinta': 46.0, 'shea-porter': 45.0}",guinta,1.0,2010-04-22
2,new hampshire 2nd district,house,NH,"[bass, swett]",ppp (d),"{'bass': 47.0, 'swett': 32.0}",bass,15.0,2010-04-22
3,new hampshire 1st district,house,NH,"[guinta, shea-porter]",wmur/unh,"{'guinta': 42.0, 'shea-porter': 38.0}",guinta,4.0,2010-04-30
4,new hampshire 2nd district,house,NH,"[bass, swett]",wmur/unh,"{'bass': 44.0, 'swett': 27.0}",bass,17.0,2010-04-30


In [9]:
senate_tr.head(n=5)

Unnamed: 0,race_name,race_type,state,top_candidates,pollster,poll_results,poll_leader,poll_spread,date
0,florida senate,senate,FL,"[crist, rubio]",quinnipiac,"{'crist': 32.0, 'rubio': 30.0, 'meek': 24.0}",crist,2.0,2010-04-15
1,arkansas senate,senate,AR,"[boozman, lincoln]",r2000/daily kos (d),"{'boozman': 50.0, 'lincoln': 43.0}",boozman,7.0,2010-04-15
2,arkansas senate,senate,AR,"[boozman, halter]",r2000/daily kos (d),"{'boozman': 48.0, 'halter': 41.0}",boozman,7.0,2010-04-15
3,florida senate,senate,FL,"[rubio, meek]",quinnipiac,"{'rubio': 42.0, 'meek': 38.0}",rubio,4.0,2010-04-15
4,florida senate,senate,FL,"[rubio, crist]",quinnipiac,"{'rubio': 56.0, 'crist': 33.0}",rubio,23.0,2010-04-15


In [10]:
governor_tr.head(n=5)

Unnamed: 0,race_name,race_type,state,top_candidates,pollster,poll_results,poll_leader,poll_spread,date
0,colorado governor,governor,CO,"[mcinnis, hickenlooper]",rasmussen reports,"{'hickenlooper': 42.0, 'mcinnis': 48.0}",mcinnis,6.0,2010-04-16
1,arizona governor,governor,AZ,"[brewer, mills]",rasmussen reports,"{'brewer': 26.0, 'mills': 18.0, 'martin': 12.0...",brewer,8.0,2010-04-17
2,new york governor,governor,NY,"[lazio, levy]",siena,"{'lazio': 29.0, 'levy': 15.0, 'paladino': 13.0}",lazio,14.0,2010-04-18
3,new york governor,governor,NY,"[cuomo, levy]",siena,"{'cuomo': 58.0, 'levy': 23.0}",cuomo,35.0,2010-04-18
4,new york governor,governor,NY,"[cuomo, lazio]",siena,"{'cuomo': 61.0, 'lazio': 24.0}",cuomo,37.0,2010-04-18


# [WIP] Add information from FiveThirtyEight pollster grades

FiveThirtyEight publishes a list of pollster grades. We want to add this information to our poll data. First, a look at the pollster grades dataset:

In [11]:
# FiveThirtyEight pollster grades; preview the first rows.
pollster = pd.read_csv(DATA_DIR.joinpath('cleaned', 'pollster_grades_538.csv'))
pollster.head(n=5)

Unnamed: 0,Pollster,538 Grade
0,Selzer & Co.,A+
1,Monmouth University,A+
2,Field Research Corp. (Field Poll),A+
3,ABC News/Washington Post,A+
4,Elway Research,A+


To match our transformed RCP data, the pollster names in the grades dataset need to be lowercased. Furthermore, a single RCP row can list multiple pollsters; in that case, we take the maximum of their grades. Our reasoning: if a reputable organization (e.g. NYT) collaborates with a less reputable one (e.g. some_pollster), the reputable one will likely make sure the poll meets its own polling standards.

In [12]:
class AddPollsterGrades(BaseEstimator, TransformerMixin):
    """
    Add pollster grades to already-cleaned Real Clear Politics polling data.

    Parameters
    ----------
    pollster_grades_filepath : filepath (str or path-like)
        File path to pollster grades (collected by FiveThirtyEight). The CSV
        must contain the columns 'Pollster' and '538 Grade'.

    grade_map : dict, optional
        Dictionary mapping grades to numeric scores of choice. Grades missing
        from the map get a NaN score. Defaults to an ordinal scale:
        A+ = 13, A = 12, A- = 11, B+ = 10, B = 9, B- = 8, C+ = 7, C = 6,
        C- = 5, D+ = 4, D = 3, D- = 2, F = 1.
    """
    def __init__(self, pollster_grades_filepath, grade_map=None):
        # Stored under the parameter's own name: BaseEstimator.get_params
        # (used by repr and clone) looks up each __init__ argument by name.
        self.pollster_grades_filepath = pollster_grades_filepath
        self.pollster_grades = pd.read_csv(pollster_grades_filepath)
        if grade_map is None:
            self.grade_map = {'A+':13, 'A':12, 'A-':11, 'B+':10, 'B':9, 'B-':8, 'C+':7,
                              'C':6, 'C-':5, 'D+':4, 'D':3, 'D-':2, 'F':1}
        else:
            self.grade_map = grade_map

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer unchanged."""
        return self

    def transform(self, X, y=None):
        """
        Clean the 'pollster' column and attach the best matching pollster score.

        Parameters
        ----------
        X : pandas.DataFrame
            Cleaned RCP polling data with a lowercase 'pollster' column.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            A copy of `X` (the input frame is left untouched) in which
            'pollster' has asterisks and bracketed suffixes such as ' (d)'
            removed, plus a 'pollster_score' column. For rows naming several
            pollsters separated by '/', the score is the maximum over the
            pollsters found in the grades file; it is NaN when none is found.
        """
        # Clean pollster grades on a copy so repeated calls start from the raw file contents
        grades = self.pollster_grades.copy()
        grades['Pollster'] = (
            grades['Pollster']
            .str.lower()
            # Names are already lowercase here, so the pattern must be too;
            # RCP abbreviates this pollster as 'ppp'.
            .str.replace('public policy polling', 'ppp', regex=False)
        )

        # Add pollster score from grade
        grades['score'] = grades['538 Grade'].map(self.grade_map)
        # One score per name (the best one if a name is listed more than once)
        score_by_pollster = (
            grades.dropna(subset=['Pollster'])
            .groupby('Pollster')['score']
            .max()
        )

        # Clean RCP polling data. `regex=True` is explicit because pandas 2.0
        # changed the default of Series.str.replace to literal matching.
        X = X.copy()
        X['pollster'] = (
            X['pollster']
            .str.replace(r'\*', '', regex=True)
            .str.replace(r'\s*[\(\[].*?[\)\]]\s*', '', regex=True)
        )

        # One column per pollster named in the row, e.g. 'wmur/unh' -> 'wmur', 'unh'
        expanded_polls = X['pollster'].str.split('/', n=10, expand=True)

        # Look every name up by exact match and keep the best score per row
        X['pollster_score'] = (
            expanded_polls
            .apply(lambda names: names.str.strip().map(score_by_pollster))
            .max(axis=1)
        )

        return X

In [14]:
### CONG YANG
pollster = pd.read_csv(DATA_DIR/'cleaned'/'pollster_grades_538.csv')

# Rename only the rows whose full name is exactly 'Public Policy Polling';
# names that merely contain that text are left as they are.
pollster['Pollster'] = pollster['Pollster'].replace('Public Policy Polling', 'PPP')

# gradetoscore.csv: first column is the grade, second column its score.
gradetoscore = pd.read_csv(DATA_DIR/'cleaned'/'gradetoscore.csv')
grade_lookup = dict(zip(gradetoscore.iloc[:, 0], gradetoscore.iloc[:, 1]))

# Grades absent from the lookup get a NaN score.
pollster["score"] = pollster['538 Grade'].map(grade_lookup)

def aggregate(df,pollster):
    """
    Score every poll in `df` with the best grade among the pollsters it names.

    Parameters
    ----------
    df : pandas.DataFrame
        Polling data with a 'Poll' column holding pollster names; several
        pollsters may be joined with '/'. A 'score' column is added to this
        frame in place.
    pollster : pandas.DataFrame
        Pollster grades with the columns 'Pollster' and 'score'.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        `df` with the new 'score' column, and the frame of pollster names
        split on '/' (one column per name). A poll's score is the highest
        score of any grades row whose 'Pollster' contains one of the poll's
        names (case-insensitive), or 0 when nothing matches.
    """
    # for each poll, remove bracketed suffixes such as ' (D)', then '*'
    cleaned = (
        df['Poll']
        .str.replace(r"\s*[\(\[].*?[\)\]]", '', regex=True)
        .str.rstrip()
        .str.replace('*', '', regex=False)
    )

    # expand on /
    poll = cleaned.str.split("/", n = 10, expand = True)

    known_names = pollster['Pollster']
    scores = []
    # for each poll
    for _, names in poll.iterrows():
        max_score = 0
        # iterate through each pollster
        for name in names:
            if pd.isna(name):
                # Missing values only pad the end of shorter rows.
                break
            name = name.strip()
            if not name:
                # An empty name would match every pollster; ignore it.
                continue
            # Literal, case-insensitive substring match; regex=False keeps
            # characters such as '+' or '.' from being read as a pattern.
            matches = known_names.str.contains(name, case = False, regex = False, na = False)
            best = pollster.loc[matches, 'score'].max()
            if pd.notna(best) and best > max_score:
                max_score = best
        scores.append(max_score)

    # Assigned positionally in one go (no chained indexing on df['score']).
    df["score"] = np.asarray(scores, dtype = float)
    return df,poll

In [None]:
### CONG YANG
# NOTE(review): `RCP_c_approval` is not defined in any cell visible in this
# notebook — confirm where it is loaded before running this cell.
aggregate(RCP_c_approval.iloc[:],pollster)

# Temp code to get FEC downloads pipeline running

In [12]:
# (
#     pd.concat([house_tr, senate_tr, governor_tr])
#     .reset_index(drop=True)
#     .to_pickle(DATA_DIR/'tmp_for_nick.pkl')
# )

In [13]:
# Load the combined, transformed polls saved for the FEC downloads pipeline.
pd.read_pickle(DATA_DIR.joinpath('tmp_for_nick.pkl'))

Unnamed: 0,race_name,race_type,state,top_candidates,pollster,poll_results,poll_leader,poll_spread,date
0,pennsylvania 12th district special election,house,PA,"[burns, critz]",ppp (d),"{'burns': 44.0, 'critz': 41.0}",burns,3.0,2010-04-20
1,new hampshire 1st district,house,NH,"[guinta, shea-porter]",ppp (d),"{'guinta': 46.0, 'shea-porter': 45.0}",guinta,1.0,2010-04-22
2,new hampshire 2nd district,house,NH,"[bass, swett]",ppp (d),"{'bass': 47.0, 'swett': 32.0}",bass,15.0,2010-04-22
3,new hampshire 1st district,house,NH,"[guinta, shea-porter]",wmur/unh,"{'guinta': 42.0, 'shea-porter': 38.0}",guinta,4.0,2010-04-30
4,new hampshire 2nd district,house,NH,"[bass, swett]",wmur/unh,"{'bass': 44.0, 'swett': 27.0}",bass,17.0,2010-04-30
5,maryland 1st district,house,MD,"[harris, kratovil]",pos (r),"{'harris': 39.0, 'kratovil': 36.0}",harris,3.0,2010-05-01
6,pennsylvania 12th district special election,house,PA,"[burns, critz]",r2000/daily kos (d),"{'burns': 46.0, 'critz': 40.0}",burns,6.0,2010-05-01
7,pennsylvania 12th district special election,house,PA,"[burns, critz]",r2000/daily kos (d),"{'critz': 40.0, 'burns': 46.0}",burns,6.0,2010-05-01
8,hawaii 1st district special election,house,HI,"[djou, case]",honolulu advertiser,"{'djou': 36.0, 'case': 28.0, 'hanabusa': 22.0}",djou,8.0,2010-05-02
9,pennsylvania 12th district special election,house,PA,"[critz, burns]",susquehanna,"{'critz': 44.0, 'burns': 38.0}",critz,6.0,2010-05-12


In [None]:
# Pollster spellings to translate; presumably paired index-by-index with
# `to_rcp` — TODO confirm.
from_rcp = [
    "monmouth university",
    "abc news/wash post",
    "abc/wp",
    "abc news",
    "abc/wash post tracking",
    "abc news tracking",
]

In [None]:
# Normalised names: one Monmouth entry followed by five ABC News / WaPo entries.
to_rcp = ["monmouth"] + ["abc news, wapo"] * 5

In [None]:
# Lowercased pollster names to translate; presumably paired index-by-index
# with `to_538` — TODO confirm.
from_538 = [
    "selzer & co.",
    "monmouth university",
    "field research corp. (field poll)",
    "abc news/washington post",
]

In [None]:
# Normalised short names, one per pollster.
to_538 = [
    "selzer",
    "monmouth",
    "field",
    "abc news, wapo",
]