In [39]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from datetime import date, timedelta
from scipy.stats.stats import spearmanr 
from scipy import stats
from pandas import *
import re
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix
import re
!pip install us
import us
plt.style.use(['seaborn-darkgrid'])

import warnings
warnings.filterwarnings('ignore')



# Preprocessing

In [65]:
# Set datetime index
# Filter out primary elections 
def make_pretty_RCP(CSV_list):
    clean_dfs = []
    for csv in CSV_list:
        df = read_csv(csv)
        df['Date'] = to_datetime(df['Date'])
        df['Year'] = df['Year'].astype(int)
        df.columns = ['Race','Poll','Result','Spread','Date','Year']
        df = df[df['Race'].str.contains('Primary')==False]
        clean_dfs.append(df)
    return clean_dfs

clean_dfs = make_pretty_RCP([
    'RCP_governor_Final.csv',
    'RCP_senate_Final.csv',
    'RCP_house_Final.csv',
])
# Frames come back in the same order as the file list above
gov_df, sen_df, house_df = clean_dfs

# All three frames share this layout; the gubernatorial one is shown as an example
gov_df.head()

Unnamed: 0,Race,Poll,Result,Spread,Date,Year
0,Colorado Governor - McInnis vs. Hickenlooper,Rasmussen Reports,"Hickenlooper 42, McInnis 48",McInnis +6,2010-04-16,2010
3,New York Governor - Levy vs. Cuomo,Siena,"Cuomo 58, Levy 23",Cuomo +35,2010-04-18,2010
4,New York Governor - Lazio vs. Cuomo,Siena,"Cuomo 61, Lazio 24",Cuomo +37,2010-04-18,2010
5,Texas Governor - Perry vs. White,Rasmussen Reports,"Perry 48, White 44",Perry +4,2010-04-19,2010
6,Massachusetts Governor - Baker vs. Patrick vs....,Western NE College,"Patrick 34, Baker 27, Cahill 29",Patrick +5,2010-04-19,2010


In [49]:
def make_pretty_Winners(CSV_list):
    win_dfs = []
    for csv in CSV_list:
        df = read_csv(csv)
        df = df.drop(['Unnamed: 0'], axis=1)
        df['date'] = to_datetime(df['date'])
        df = df.set_index('date')
        df['lastname'] = [i[-1:][0] for i in df['name'].str.split()]
        # JI is our join index
        df['JI'] = df['lastname']+' '+df['state']

        # This is the part we care about in the aggregation
        win = df[['party','JI']]
        win_dfs.append(win)
    return win_dfs, pd.concat(win_dfs)

# win_dfs holds one winners frame per office (same order as the file list);
# all_win is winners from all races together in a single df
win_dfs, all_win = make_pretty_Winners([
    'Clean_Governor_Winners.csv',
    'Clean_Senate_Winners.csv',
    'Clean_House_Winners.csv',
])
gov_win, sen_win, house_win = win_dfs

# All three frames share this layout; the gubernatorial one is shown as an example
gov_win.head()

Unnamed: 0_level_0,party,JI
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2000-12-15,r,hoeven ND
2000-12-21,r,perry TX
2000-12-21,r,perry TX
2002-12-02,r,lingle HI
2003-01-01,d,richardson NM


In [50]:
# Helper function that encodes party as a number
# 'r' (Republican) -> 0, 'd' (Democrat) -> 1, anything else (including nans) -> nan
def party_class(df):
    '''
    Return a list with one numeric party code per entry of df['party'],
    in the same order as the column.
    '''
    return [
        0 if party == 'r' else 1 if party == 'd' else np.nan
        for party in df['party']
    ]

In [51]:
# Dictionary mapping state names to abbreviations, built by the `us` package.
# It is passed to Party_Classification to turn the state names parsed from race titles into abbreviations.
states_dict = us.states.mapping('name', 'abbr')

# Party Classification for Gubernatorial, Senatorial, and Congressional Races

In [125]:
def _race_type(first_race, states_dict):
    '''
    Return the office word (e.g. "Governor") that follows the state name in a race title.

    The longest state name in states_dict that prefixes the title is skipped first, so
    multi-word states ("New York Governor - ...") yield "Governor" rather than "York".
    If no known state name prefixes the title, the second word is used.
    '''
    matches = [name for name in states_dict
               if isinstance(name, str) and first_race.startswith(name + ' ')]
    if matches:
        remainder = first_race[len(max(matches, key=len)):].split()
        if remainder:
            return remainder[0]
    return first_race.split()[1]


def Party_Classification(RCP_df, win_df, states_dict):
    '''
    Label the leader of every poll with a party by looking the leader up among past winners.

    RCP_df is RealClearPolitics polling data  
    win_df is data about past winners
    Both are formatted as shown in preprocessing
    
    states_dict is a dictionary mapping state names to abbreviation; shown in preprocessing

    RCP_df is not modified; all work happens on a copy.

    Returns a DataFrame indexed by (Date, State) that keeps the remaining poll columns and
    adds 'Lead' (lower-cased leader name), 'Party' (the matching winner's party, NaN when
    no winner matches) and 'Party Class' (numeric code produced by party_class).
    '''
    # Work on a copy so the caller's frame (already displayed by earlier cells) is left untouched
    polls = RCP_df.copy()

    # Create column for poll leaders: first word of the spread, lower-cased.
    # The .str accessors leave missing/empty spreads as NaN instead of raising.
    polls['Lead'] = polls['Spread'].str.split().str[0].str.lower()

    # Create column for state name.
    # The first row decides the title layout; it is fetched by position because the
    # preprocessing filter leaves gaps in the row labels (label 0 may not exist).
    races = polls['Race']
    if races.empty:
        polls['State'] = races
    # The case we have congressional polls
    elif 'District' in races.iloc[0]:
        polls['State'] = races.str.findall(r"^[A-Za-z ]+ ").str[0].str.strip()
    # The case we have gubernatorial/senatorial polls
    else:
        race_type = _race_type(races.iloc[0], states_dict)
        polls['State'] = [race[:race.index(race_type) - 1] for race in races]

    # Map state names to abbreviations and create join index (JI).
    # Assign the result back: an in-place replace on the selected column is a chained
    # update and does not reach the frame when pandas Copy-on-Write is active.
    polls['State'] = polls['State'].replace(states_dict)
    # Weird edge case with New Hampshire (in certain places it's N.H.), which the
    # congressional pattern cannot parse and leaves as NaN; hard-coded for now
    polls['State'] = polls['State'].fillna('NH')
    polls['JI'] = polls['Lead'] + ' ' + polls['State']

    # Join polls to win_df on JI. A winner listed several times repeats the poll row,
    # so identical rows are collapsed afterwards.
    classified = polls.set_index('JI').join(win_df.set_index('JI')).drop_duplicates()
    classified = classified.set_index(['Date', 'State'])

    # Democrat=1, Republican=0, nans stay as is
    classified['Party Class'] = party_class(classified)
    classified = classified.rename(columns={'party': 'Party'})

    return classified

In [101]:
# Fewer nans if we use all_win versus sen_win.
# Makes sense -- winners of one office often run for something else down the line
# (e.g. a Congressman wants to become a Senator).

# Tradeoffs:
# With sen_win we have high confidence that a match is correct, but we're limited to
# winners who've held that specific office in the past.
# With all_win we can map to winners who've held ANY office in the past -- this makes our
# data richer, but we're slightly more prone to error (e.g. a candidate switched parties
# between races).

# I think using all_win is better, but we should discuss this further.
# Each line prints the number of Senate poll rows with no party: first using sen_win, then all_win.
print(Party_Classification(sen_df, sen_win, states_dict)['Party'].isnull().sum())
print(Party_Classification(sen_df, all_win, states_dict)['Party'].isnull().sum())

1002
756


### Classified Gubernatorial polling data

In [127]:
# Classify the leader of each gubernatorial poll, looking leaders up among winners of all offices (all_win)
gov_df_c = Party_Classification(gov_df, all_win, states_dict)
gov_df_c.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Race,Poll,Result,Spread,Year,Lead,Party,Party Class
Date,State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2013-07-02,TX,Texas Governor - Abbott vs. Davis,PPP (D),"Abbott 48, Davis 40",Abbott +8,2013,abbott,r,0.0
2013-10-02,TX,Texas Governor - Abbott vs. Davis,Texas Lyceum,"Abbott 29, Davis 21",Abbott +8,2013,abbott,r,0.0
2013-11-05,TX,Texas Governor - Abbott vs. Davis,UT/Texas Tribune,"Abbott 40, Davis 34",Abbott +6,2013,abbott,r,0.0
2013-11-05,TX,Texas Governor - Abbott vs. Davis,PPP (D),"Abbott 50, Davis 35",Abbott +15,2013,abbott,r,0.0
2014-02-24,TX,Texas Governor - Abbott vs. Davis,UT/Texas Tribune,"Abbott 47, Davis 36",Abbott +11,2014,abbott,r,0.0


### Classified Senatorial polling data

In [128]:
# Classify the leader of each senatorial poll, looking leaders up among winners of all offices (all_win)
sen_df_c = Party_Classification(sen_df, all_win, states_dict)
sen_df_c.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Race,Poll,Result,Spread,Year,Lead,Party,Party Class
Date,State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-04-19,MO,Missouri Senate - Akin vs. McCaskill,Rasmussen Reports,"Akin 48, McCaskill 43",Akin +5,2012,akin,r,0.0
2012-07-28,MO,Missouri Senate - Akin vs. McCaskill,Post-Dispatch/Mason-Dixon,"McCaskill 44, Akin 49",Akin +5,2012,akin,r,0.0
2012-08-01,MO,Missouri Senate - Akin vs. McCaskill,Rasmussen Reports,"McCaskill 44, Akin 47",Akin +3,2012,akin,r,0.0
2012-08-13,MO,Missouri Senate - Akin vs. McCaskill,SurveyUSA*,"McCaskill 40, Akin 51",Akin +11,2012,akin,r,0.0
2012-08-21,MO,Missouri Senate - Akin vs. McCaskill,PPP (D),"McCaskill 43, Akin 44",Akin +1,2012,akin,r,0.0


### Classified Congressional polling data

In [129]:
# Classify the leader of each congressional poll, looking leaders up among winners of all offices (all_win)
house_df_c = Party_Classification(house_df, all_win, states_dict)
house_df_c.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Race,Poll,Result,Spread,Year,Lead,Party,Party Class
Date,State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-09-27,NJ,New Jersey 3rd District - Runyan vs. Adler,Stockton/Zogby,"Adler 38, Runyan 30",Adler +8,2010,adler,,
2010-09-29,NJ,New Jersey 3rd District - Runyan vs. Adler,Rutgers-Eagleton,"Adler 41, Runyan 39",Adler +2,2010,adler,,
2010-09-30,NJ,New Jersey 3rd District - Runyan vs. Adler,Monmouth/Gannett,"Adler 42, Runyan 39",Adler +3,2010,adler,,
2014-11-01,GA,Georgia 12th District - Allen vs. Barrow,Landmark Communications,"Allen 48, Barrow 44",Allen +4,2014,allen,r,0.0
2014-11-03,GA,Georgia 12th District - Allen vs. Barrow,Landmark Communications,"Allen 47, Barrow 46",Allen +1,2014,allen,r,0.0


## Edge cases (and not so edge cases)
* This is the best way to aggregate the data, but there are obvious issues
* We have no way to classify the party of the individual leading a poll unless that individual has held office before (shows up in the winners data)
* We get NaN for party if the poll results are tied (the spread reads "Tie", so there is no leader to look up) -- this may be okay as is

In [115]:
# Allen doesn't have a party because no 'allen VA' entry was matched in the winners data
# (the join key is leader last name + state). The original note attributes this to him
# never having won an election -- NOTE(review): confirm against the winners files.
sen_df_c.reset_index().set_index('Lead').loc['allen',].head()

Unnamed: 0_level_0,Date,State,Race,Poll,Result,Spread,Year,Party,Party Class
Lead,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
allen,2011-04-08,VA,Virginia Senate - Allen vs. Kaine,Roanoke College,"Allen 45, Kaine 32",Allen +13,2011,,
allen,2011-09-15,VA,Virginia Senate - Allen vs. Kaine,Quinnipiac,"Allen 45, Kaine 44",Allen +1,2011,,
allen,2011-09-25,VA,Virginia Senate - Allen vs. Kaine,Roanoke College,"Allen 42, Kaine 39",Allen +3,2011,,
allen,2011-12-21,VA,Virginia Senate - Allen vs. Kaine,Quinnipiac,"Kaine 42, Allen 44",Allen +2,2011,,
allen,2012-02-20,VA,Virginia Senate - Allen vs. Kaine,CNU/Times-Dispatch,"Kaine 40, Allen 42",Allen +2,2012,,


In [116]:
# Here's our output when there's a tie: the spread is 'Tie', so Lead becomes 'tie'
# and Party / Party Class come out as NaN
sen_df_c[sen_df_c['Result'].str.contains('Rubio')].tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Race,Poll,Result,Spread,Year,Lead,Party,Party Class
Date,State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-06-09,FL,Florida Senate - Rubio vs. Meek vs. Crist,Rasmussen Reports,"Crist 37, Rubio 37, Meek 15",Tie,2010,tie,,
2010-08-04,FL,Florida Senate - Rubio vs. Greene vs. Crist,AIF/McLaughlin (R),"Crist 37, Rubio 37, Greene 16",Tie,2010,tie,,
2016-06-29,FL,Florida Senate - Rubio vs. Murphy,Bay News 9/SurveyUSA,"Rubio 43, Murphy 43",Tie,2016,tie,,
2016-10-21,FL,Florida Senate - Rubio vs. Murphy,FOX 13/Opinion Savvy,"Rubio 46, Murphy 46",Tie,2016,tie,,
2016-10-30,FL,Florida Senate - Rubio vs. Murphy,Gravis,"Rubio 46, Murphy 46",Tie,2016,tie,,
