In [994]:
import pandas as pd
import pickle as pkl
from pathlib import Path
import numpy as np
import sklearn 

 # machine learning libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

In [995]:
house_spread = pd.read_pickle(Path('.')/'..'/'..'/'data'/'cleaned'/'house_avg_spread.pkl')
sen_spread = pd.read_pickle(Path('.')/'..'/'..'/'data'/'cleaned'/'sen_avg_spread.pkl')
gov_spread = pd.read_pickle(Path('.')/'..'/'..'/'data'/'cleaned'/'gov_avg_spread.pkl')

df = pd.read_pickle(Path('.')/'..'/'..'/'data'/'cleaned'/'df_donor_clean-Copy1.pkl')

In [996]:
house_spread.head()
# sen_spread.head()
# gov_spread.head()

Unnamed: 0_level_0,spread,race_name,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
critz,-1.0,pennsylvania 12th district special election,2010
burns,1.0,pennsylvania 12th district special election,2010
shea-porter,-7.4,new hampshire 1st district,2010
guinta,7.4,new hampshire 1st district,2010
swett,-12.666667,new hampshire 2nd district,2010


In [997]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,West Virginia,Wyoming,Foreign Countries,Donation Level 1,Donation Level 2,Donation Level 3,Donation Level 4,Donation Level 5,election_result,total_donations
0,abeler james j,"[franken, abeler]",S4MN00353,2014.0,MN,Challenger,Senate,REPUBLICAN PARTY,[ABELER4SENATE],1.0,...,0.0,0.0,0.0,53740.0,27348.0,25244.52,27611.0,24014.0,franken,157957.52
1,adams terry glen jr,"[alexander, adams]",S4TN00328,2014.0,TN,Challenger,Senate,DEMOCRATIC PARTY,[FRIENDS OF TERRY ADAMS],1.0,...,0.0,0.0,0.0,23208.28,16735.0,17200.0,34800.0,22300.0,alexander,114243.28
2,addivinola frank j jr,"[markey, addivinola]",S4MA00242,2014.0,MA,Challenger,Senate,REPUBLICAN PARTY,[ ADDIVINOLA COMMITTEE; THE],1.0,...,0.0,0.0,0.0,33194.35,8125.0,11850.0,1500.0,3900.0,markey,58569.35
3,adler john h,"[adler, runyan]",H8NJ03156,2010.0,NJ,Challenger,House,DEMOCRATIC PARTY,0,0.0,...,4800.0,0.0,0.0,116284.98,205176.19,434250.0,596800.0,443050.0,runyan,1795561.17
4,adler shelley,"[runyan, adler]",H2NJ03183,2012.0,NJ,Challenger,House,DEMOCRATIC PARTY,[SHELLEY ADLER FOR CONGRESS],1.0,...,5000.0,0.0,0.0,166446.66,87420.95,137550.0,158655.5,207600.0,runyan,757673.11


# Feature Engineering

## Number of Donations

In [998]:
# by_size/by_candidate
#  -0    $200 and under
#  -200  $200.01 - $499.99
#  -500  $500 - $999.99
#  -1000 $1000 - $1999.99
#  -2000 $2000 +

donorLevels = dict([('d1', 200), ('d2', 499.99), ('d3', 999.99), ('d4', 1999.99), ('d5', 2000)])
adHeur = lambda col, level: np.divide(col, donorLevels[level])

def donor_level_features(table):
    
    """Adds features to the FEC table that include the minimum number of possible donors
    per donor level"""
    
    adHeur = lambda col, level: np.divide(col, donorLevels[level])
    donorLevels = dict([('d1', 200), ('d2', 499.99), ('d3', 999.99), ('d4', 1999.99), ('d5', 2000)])
    
    table['estimated_num_d1_donors'] = adHeur(table['Donation Level 1'], 'd1')
    table['estimated_num_d2_donors'] = adHeur(table['Donation Level 2'], 'd2')
    table['estimated_num_d3_donors'] = adHeur(table['Donation Level 3'], 'd3')
    table['estimated_num_d4_donors'] = adHeur(table['Donation Level 4'], 'd4')
    table['estimated_num_d5_donors'] = adHeur(table['Donation Level 5'], 'd5')
    return table
    
df  = donor_level_features(df)
df

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,Donation Level 3,Donation Level 4,Donation Level 5,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors
0,abeler james j,"[franken, abeler]",S4MN00353,2014.0,MN,Challenger,Senate,REPUBLICAN PARTY,[ABELER4SENATE],1.0,...,2.524452e+04,2.761100e+04,2.401400e+04,franken,1.579575e+05,268.700000,54.697094,25.244772,13.805569,12.007000
1,adams terry glen jr,"[alexander, adams]",S4TN00328,2014.0,TN,Challenger,Senate,DEMOCRATIC PARTY,[FRIENDS OF TERRY ADAMS],1.0,...,1.720000e+04,3.480000e+04,2.230000e+04,alexander,1.142433e+05,116.041400,33.470669,17.200172,17.400087,11.150000
2,addivinola frank j jr,"[markey, addivinola]",S4MA00242,2014.0,MA,Challenger,Senate,REPUBLICAN PARTY,[ ADDIVINOLA COMMITTEE; THE],1.0,...,1.185000e+04,1.500000e+03,3.900000e+03,markey,5.856935e+04,165.971750,16.250325,11.850119,0.750004,1.950000
3,adler john h,"[adler, runyan]",H8NJ03156,2010.0,NJ,Challenger,House,DEMOCRATIC PARTY,0,0.0,...,4.342500e+05,5.968000e+05,4.430500e+05,runyan,1.795561e+06,581.424900,410.360587,434.254343,298.401492,221.525000
4,adler shelley,"[runyan, adler]",H2NJ03183,2012.0,NJ,Challenger,House,DEMOCRATIC PARTY,[SHELLEY ADLER FOR CONGRESS],1.0,...,1.375500e+05,1.586555e+05,2.076000e+05,runyan,7.576731e+05,832.233300,174.845397,137.551376,79.328147,103.800000
5,aiken clayton clay,"[ellmers, aiken]",H4NC02127,2014.0,NC,Challenger,House,DEMOCRATIC PARTY,[CLAY AIKEN FOR NORTH CAROLINA],1.0,...,1.205295e+05,2.140461e+05,2.249964e+05,ellmers,1.058519e+06,1866.147900,251.439569,120.530735,107.023575,112.498180
6,akin w todd,"[akin, mccaskill]",S2MO00429,2012.0,MO,Challenger,Senate,REPUBLICAN PARTY,0,0.0,...,4.583552e+05,6.254501e+05,1.598676e+06,mccaskill,5.081955e+06,10241.937250,702.186904,458.359774,312.726604,799.337995
7,alameel david m,"[cornyn, alameel]",S4TX00516,2014.0,TX,Challenger,Senate,DEMOCRATIC PARTY,[DAVID M ALAMEEL FOR UNITED STATES SENATE],1.0,...,3.500000e+03,6.000000e+03,2.500000e+03,cornyn,4.179300e+04,137.665000,4.520090,3.500035,3.000015,1.250000
8,alexander lamar,"[alexander, adams]",S2TN00058,2014.0,TN,Incumbent,Senate,REPUBLICAN PARTY,"[TENNESSEE SENATE VICTORY FUND, 2013 SENATORS ...",4.0,...,2.652728e+05,8.344301e+05,1.799528e+06,alexander,3.154669e+06,652.785000,249.768093,265.275453,417.217111,899.763979
9,allen george,"[kaine, allen]",S8VA00214,2012.0,VA,Open seat,Senate,REPUBLICAN PARTY,"[GOOD GOVERNMENT FUND; THE, GEORGE ALLEN FOR U...",8.0,...,1.241766e+06,2.675433e+06,6.230647e+06,kaine,1.319114e+07,11765.076907,1380.587600,1241.778333,1337.723273,3115.323546


In [999]:
len(df[df.office_full == 'Senate'].name)
len(df[df.office_full == 'House'].name)
len(df[(df.office_full != 'Senate') & (df.office_full != 'House')].name)

0

## Election Result

In [1000]:
# Update election_result column & last_name for joining df
# 0 if lost, 1 if won. 

binary_election_results = []
last_name_array = []
for index, row in df.iterrows():
    last_name = row[0].split()[0]
    last_name_array = np.append(last_name_array, last_name)
    if last_name == row.election_result:
        binary_election_results = np.append(binary_election_results, 1)
    else: 
        binary_election_results = np.append(binary_election_results, 0)
df['election_result'] = binary_election_results
df['last_name'] = last_name_array

## Incumbent

In [1001]:
# Update incumbent column. 
# 0 if incumbent, 1 if challenger, 2 if Open seat
incumbent_feature = df.incumbent
incumbent_feature = incumbent_feature.where(incumbent_feature != 'Incumbent', 0)
incumbent_feature = incumbent_feature.where(incumbent_feature != 'Challenger', 1)
incumbent_feature = incumbent_feature.where(incumbent_feature != 'Open seat', 2)

print(incumbent_feature.unique())
df['incumbent'] = incumbent_feature


[1 0 2]


## Party

In [1002]:
# Update party column. 
# 0 if Republican, 1 if Democratic or Democratic Farm Labor, 2 if Other: NONE, Libertarian, Independent

party_feature = df.party
party_feature = party_feature.where(party_feature != 'REPUBLICAN PARTY', 0)
party_feature = party_feature.where((party_feature != 'DEMOCRATIC PARTY') & (party_feature != 'DEMOCRATIC-FARM-LABOR'), 1)
party_feature = party_feature.where((party_feature != 'NONE') & (party_feature != 'OTHER') & (party_feature != 'LIBERTARIAN PARTY') & (party_feature != 'INDEPENDENT') , 2)

print(party_feature.unique())
df['party'] = party_feature


[0 1 2]


## Spread From Polls

In [1003]:
# Biased Dataset?
print('Size of loss data: {}'.format(len(df[df.election_result == 0].name)))
print('Size of winner data: {}'.format(len(df[df.election_result == 1].name)))


Size of loss data: 375
Size of winner data: 235


In [1004]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,Donation Level 4,Donation Level 5,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,last_name
0,abeler james j,"[franken, abeler]",S4MN00353,2014.0,MN,1,Senate,0,[ABELER4SENATE],1.0,...,27611.0,24014.0,0.0,157957.52,268.7,54.697094,25.244772,13.805569,12.007,abeler
1,adams terry glen jr,"[alexander, adams]",S4TN00328,2014.0,TN,1,Senate,1,[FRIENDS OF TERRY ADAMS],1.0,...,34800.0,22300.0,0.0,114243.28,116.0414,33.470669,17.200172,17.400087,11.15,adams
2,addivinola frank j jr,"[markey, addivinola]",S4MA00242,2014.0,MA,1,Senate,0,[ ADDIVINOLA COMMITTEE; THE],1.0,...,1500.0,3900.0,0.0,58569.35,165.97175,16.250325,11.850119,0.750004,1.95,addivinola
3,adler john h,"[adler, runyan]",H8NJ03156,2010.0,NJ,1,House,1,0,0.0,...,596800.0,443050.0,0.0,1795561.17,581.4249,410.360587,434.254343,298.401492,221.525,adler
4,adler shelley,"[runyan, adler]",H2NJ03183,2012.0,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,...,158655.5,207600.0,0.0,757673.11,832.2333,174.845397,137.551376,79.328147,103.8,adler


In [1005]:
# house_spread.set_index
df_house = df[df.office_full == 'House']
df_house = df_house.set_index('last_name').join(house_spread)

df_sen = df[df.office_full == 'Senate']
df_sen = df_sen.set_index('last_name').join(sen_spread)


In [1006]:
df = pd.concat([df_house, df_sen])

In [1007]:
df = df.drop(columns=['cycle'])

In [1008]:

df

Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name,year
adler,adler john h,"[adler, runyan]",H8NJ03156,NJ,1,House,1,0,0.0,0.0,...,0.0,1.795561e+06,581.424900,410.360587,434.254343,298.401492,221.525000,4.333333,new jersey 3rd district,2010
adler,adler john h,"[adler, runyan]",H8NJ03156,NJ,1,House,1,0,0.0,0.0,...,0.0,1.795561e+06,581.424900,410.360587,434.254343,298.401492,221.525000,-10.000000,new jersey 3rd district,2012
adler,adler shelley,"[runyan, adler]",H2NJ03183,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,0.0,...,0.0,7.576731e+05,832.233300,174.845397,137.551376,79.328147,103.800000,4.333333,new jersey 3rd district,2010
adler,adler shelley,"[runyan, adler]",H2NJ03183,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,0.0,...,0.0,7.576731e+05,832.233300,174.845397,137.551376,79.328147,103.800000,-10.000000,new jersey 3rd district,2012
aiken,aiken clayton clay,"[ellmers, aiken]",H4NC02127,NC,1,House,1,[CLAY AIKEN FOR NORTH CAROLINA],1.0,0.0,...,0.0,1.058519e+06,1866.147900,251.439569,120.530735,107.023575,112.498180,-8.000000,north carolina 2nd district,2014
allen,allen richard w,"[allen, barrow]",H2GA12121,GA,0,House,0,[RICK W. ALLEN FOR CONGRESS],1.0,0.0,...,1.0,1.526469e+06,679.775000,329.706594,335.592356,170.038350,275.000000,2.500000,georgia 12th district,2014
altmire,altmire jason,"[altmire, critz]",H6PA04110,PA,1,House,1,[JARED POLIS VICTORY FUND 2012],1.0,0.0,...,0.0,7.152095e+05,136.983771,93.740831,202.666968,112.497757,106.641977,4.000000,pennsylvania 12th district,2012
altschuler,altschuler randolph mr.,"[bishop, altschuler]",H0NY01129,NY,1,House,0,[LONG ISLAND VICTORY FUND],1.0,0.0,...,0.0,1.486438e+06,378.988650,219.736955,134.721997,147.059070,435.968335,-12.000000,new york 1st district,2010
altschuler,altschuler randolph mr.,"[bishop, altschuler]",H0NY01129,NY,1,House,0,[LONG ISLAND VICTORY FUND],1.0,0.0,...,0.0,1.486438e+06,378.988650,219.736955,134.721997,147.059070,435.968335,-13.000000,new york 1st district,2012
amash,amash justin,"[pestka, amash]",H0MI03126,MI,0,House,0,"[MICHIGAN YOUNG GUNS VICTORY FUND (AMASH, BENI...",2.0,0.0,...,1.0,1.141024e+06,1157.937850,126.322926,132.726327,96.775484,260.000700,-1.000000,michigan 3rd district,2012


### There are multiple spreads for some candidates. I will limit it to one. Dropping duplicates, arbitrarily keeping the first spread we see.


In [1009]:
# Replace NAN spread values with the average. This may not be a good idea. We could just drop these rows. Open to alternatives.
# average_spread = df.spread.mean()
# spread = df.spread.fillna(average_spread)
# df['spread'] = spread
# df = df.drop(columns = ['year'])
df = df[df.spread.notnull()]
df = df.drop_duplicates('name')


In [1010]:
df

Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name,year
adler,adler john h,"[adler, runyan]",H8NJ03156,NJ,1,House,1,0,0.0,0.0,...,0.0,1.795561e+06,581.424900,410.360587,434.254343,298.401492,221.525000,4.333333,new jersey 3rd district,2010
adler,adler shelley,"[runyan, adler]",H2NJ03183,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,0.0,...,0.0,7.576731e+05,832.233300,174.845397,137.551376,79.328147,103.800000,4.333333,new jersey 3rd district,2010
aiken,aiken clayton clay,"[ellmers, aiken]",H4NC02127,NC,1,House,1,[CLAY AIKEN FOR NORTH CAROLINA],1.0,0.0,...,0.0,1.058519e+06,1866.147900,251.439569,120.530735,107.023575,112.498180,-8.000000,north carolina 2nd district,2014
allen,allen richard w,"[allen, barrow]",H2GA12121,GA,0,House,0,[RICK W. ALLEN FOR CONGRESS],1.0,0.0,...,1.0,1.526469e+06,679.775000,329.706594,335.592356,170.038350,275.000000,2.500000,georgia 12th district,2014
altmire,altmire jason,"[altmire, critz]",H6PA04110,PA,1,House,1,[JARED POLIS VICTORY FUND 2012],1.0,0.0,...,0.0,7.152095e+05,136.983771,93.740831,202.666968,112.497757,106.641977,4.000000,pennsylvania 12th district,2012
altschuler,altschuler randolph mr.,"[bishop, altschuler]",H0NY01129,NY,1,House,0,[LONG ISLAND VICTORY FUND],1.0,0.0,...,0.0,1.486438e+06,378.988650,219.736955,134.721997,147.059070,435.968335,-12.000000,new york 1st district,2010
amash,amash justin,"[pestka, amash]",H0MI03126,MI,0,House,0,"[MICHIGAN YOUNG GUNS VICTORY FUND (AMASH, BENI...",2.0,0.0,...,1.0,1.141024e+06,1157.937850,126.322926,132.726327,96.775484,260.000700,-1.000000,michigan 3rd district,2012
appel,appel staci,"[appel, young]",H4IA03065,IA,2,House,1,"[APPEL FOR IOWA, INC.]",1.0,0.0,...,0.0,1.703116e+06,3037.725100,441.701914,254.501765,148.850744,161.262500,0.000000,iowa 3rd district,2014
arnold-jones,arnold-jones janice e,"[grisham, arnold-jones]",H2NM01128,NM,2,House,0,[JANICE ARNOLD-JONES FOR CONGRESS],1.0,0.0,...,0.0,5.266928e+05,735.422700,123.510570,79.272273,64.841694,54.450000,-13.666667,new mexico 1st district,2012
bachmann,bachmann michele,"[bachmann, clark]",H6MN06074,MN,1,House,0,[BACHMANN FOR PRESIDENT],1.0,0.0,...,1.0,1.499294e+07,47403.997550,2765.925059,1172.883029,622.590223,855.578000,9.000000,minnesota 6th district,2010


## Majority Donations & State Association

In [1011]:
df

Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name,year
adler,adler john h,"[adler, runyan]",H8NJ03156,NJ,1,House,1,0,0.0,0.0,...,0.0,1.795561e+06,581.424900,410.360587,434.254343,298.401492,221.525000,4.333333,new jersey 3rd district,2010
adler,adler shelley,"[runyan, adler]",H2NJ03183,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,0.0,...,0.0,7.576731e+05,832.233300,174.845397,137.551376,79.328147,103.800000,4.333333,new jersey 3rd district,2010
aiken,aiken clayton clay,"[ellmers, aiken]",H4NC02127,NC,1,House,1,[CLAY AIKEN FOR NORTH CAROLINA],1.0,0.0,...,0.0,1.058519e+06,1866.147900,251.439569,120.530735,107.023575,112.498180,-8.000000,north carolina 2nd district,2014
allen,allen richard w,"[allen, barrow]",H2GA12121,GA,0,House,0,[RICK W. ALLEN FOR CONGRESS],1.0,0.0,...,1.0,1.526469e+06,679.775000,329.706594,335.592356,170.038350,275.000000,2.500000,georgia 12th district,2014
altmire,altmire jason,"[altmire, critz]",H6PA04110,PA,1,House,1,[JARED POLIS VICTORY FUND 2012],1.0,0.0,...,0.0,7.152095e+05,136.983771,93.740831,202.666968,112.497757,106.641977,4.000000,pennsylvania 12th district,2012
altschuler,altschuler randolph mr.,"[bishop, altschuler]",H0NY01129,NY,1,House,0,[LONG ISLAND VICTORY FUND],1.0,0.0,...,0.0,1.486438e+06,378.988650,219.736955,134.721997,147.059070,435.968335,-12.000000,new york 1st district,2010
amash,amash justin,"[pestka, amash]",H0MI03126,MI,0,House,0,"[MICHIGAN YOUNG GUNS VICTORY FUND (AMASH, BENI...",2.0,0.0,...,1.0,1.141024e+06,1157.937850,126.322926,132.726327,96.775484,260.000700,-1.000000,michigan 3rd district,2012
appel,appel staci,"[appel, young]",H4IA03065,IA,2,House,1,"[APPEL FOR IOWA, INC.]",1.0,0.0,...,0.0,1.703116e+06,3037.725100,441.701914,254.501765,148.850744,161.262500,0.000000,iowa 3rd district,2014
arnold-jones,arnold-jones janice e,"[grisham, arnold-jones]",H2NM01128,NM,2,House,0,[JANICE ARNOLD-JONES FOR CONGRESS],1.0,0.0,...,0.0,5.266928e+05,735.422700,123.510570,79.272273,64.841694,54.450000,-13.666667,new mexico 1st district,2012
bachmann,bachmann michele,"[bachmann, clark]",H6MN06074,MN,1,House,0,[BACHMANN FOR PRESIDENT],1.0,0.0,...,1.0,1.499294e+07,47403.997550,2765.925059,1172.883029,622.590223,855.578000,9.000000,minnesota 6th district,2010


In [1012]:
# Load in States and their abbreviations
states = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming'
}

In [1013]:
#Replace State Abbrevation with their full state name
df.state
states['MN']
state_array = []
for state in df.state:
    state_array = np.append(state_array, states[state])
df['state'] = state_array

In [1014]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name,year
adler,adler john h,"[adler, runyan]",H8NJ03156,New Jersey,1,House,1,0,0.0,0.0,...,0.0,1795561.0,581.4249,410.360587,434.254343,298.401492,221.525,4.333333,new jersey 3rd district,2010
adler,adler shelley,"[runyan, adler]",H2NJ03183,New Jersey,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,0.0,...,0.0,757673.1,832.2333,174.845397,137.551376,79.328147,103.8,4.333333,new jersey 3rd district,2010
aiken,aiken clayton clay,"[ellmers, aiken]",H4NC02127,North Carolina,1,House,1,[CLAY AIKEN FOR NORTH CAROLINA],1.0,0.0,...,0.0,1058519.0,1866.1479,251.439569,120.530735,107.023575,112.49818,-8.0,north carolina 2nd district,2014
allen,allen richard w,"[allen, barrow]",H2GA12121,Georgia,0,House,0,[RICK W. ALLEN FOR CONGRESS],1.0,0.0,...,1.0,1526469.0,679.775,329.706594,335.592356,170.03835,275.0,2.5,georgia 12th district,2014
altmire,altmire jason,"[altmire, critz]",H6PA04110,Pennsylvania,1,House,1,[JARED POLIS VICTORY FUND 2012],1.0,0.0,...,0.0,715209.5,136.983771,93.740831,202.666968,112.497757,106.641977,4.0,pennsylvania 12th district,2012


## Data

In [1015]:
#Select only features and result from df
df.columns
data = df[['incumbent','party','amnt_committees',
       'Armed Forces Americas', 'Armed Forces Europe', 'Alaska', 'Alabama',
       'Armed Forces Pacific', 'Arkansas', 'American Samoa', 'Arizona',
       'California', 'Colorado', 'Connecticut', 'District Of Columbia',
       'Delaware', 'Florida', 'Georgia', 'Guam', 'Hawaii', 'Iowa', 'Idaho',
       'Illinois', 'Indiana', 'Kansas', 'Kentucky', 'Louisiana',
       'Massachusetts', 'Maryland', 'Maine', 'Michigan', 'Minnesota',
       'Missouri', 'Northern Mariana Islands', 'Mississippi', 'Montana',
       'North Carolina', 'North Dakota', 'Nebraska', 'New Hampshire',
       'New Jersey', 'New Mexico', 'Nevada', 'New York', 'Ohio', 'Oklahoma',
       'Oregon', 'Other', 'Pennsylvania', 'Puerto Rico', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
       'Virginia', 'U.S. Virgin Islands', 'Vermont', 'Washington', 'Wisconsin',
       'West Virginia', 'Wyoming', 'Foreign Countries', 'Donation Level 1',
       'Donation Level 2', 'Donation Level 3', 'Donation Level 4',
       'Donation Level 5','total_donations',
       'estimated_num_d1_donors', 'estimated_num_d2_donors',
       'estimated_num_d3_donors', 'estimated_num_d4_donors',
       'estimated_num_d5_donors', 'spread', 'election_result']]
data.head()
print(len(data))

610


In [1016]:
len(data.columns)

77

In [1017]:
#Shuffle
data = shuffle(data).reset_index(drop=True)

## Train Model 

In [1018]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(data, test_size = 0.2, random_state = 1337)
# train_df, test_df = train_test_split(data, test_size = 0.15, random_state = 1337)
train_df.head()

Unnamed: 0,incumbent,party,amnt_committees,Armed Forces Americas,Armed Forces Europe,Alaska,Alabama,Armed Forces Pacific,Arkansas,American Samoa,...,Donation Level 4,Donation Level 5,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,election_result
544,1,0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,200502.0,766400.0,1331810.0,1007.6424,120.552411,103.106031,100.251501,383.2,-7.0,0.0
204,1,1,12.0,1500.0,868.125,20082.546818,49810.967727,1.5,25899.422273,0.0,...,4753799.0,10216510.0,27692040.0,44453.003182,3310.051727,2176.16438,2376.911318,5108.253673,0.107143,0.0
413,2,1,0.0,0.0,0.0,500.0,0.0,0.0,650.0,0.0,...,107781.5,118000.0,763246.4,1601.2708,243.686994,95.370654,53.890999,59.0,-9.0,0.0
181,0,0,8.0,0.0,0.0,3500.0,22400.0,0.0,55300.0,0.0,...,1777632.0,5274520.0,8788458.0,3596.866767,589.173401,722.35818,888.820654,2637.260193,18.944444,1.0
218,0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,205070.5,287314.0,845361.5,780.596071,134.675949,129.522527,102.535772,143.656977,12.5,1.0


In [1019]:
test_df.head()

Unnamed: 0,incumbent,party,amnt_committees,Armed Forces Americas,Armed Forces Europe,Alaska,Alabama,Armed Forces Pacific,Arkansas,American Samoa,...,Donation Level 4,Donation Level 5,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,election_result
559,2,1,1.0,0.0,0.0,100.0,38107.5,0.0,1000.0,0.0,...,1005522.18,3436019.84,5474505.0,2680.6845,337.563631,328.050431,502.763604,1718.00992,3.125,0.0
80,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,29492.79,88100.0,190356.8,229.82,29.600592,12.00012,14.746469,44.05,-18.666667,0.0
545,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,263705.0,495300.0,1108838.0,1007.53215,115.435229,90.610886,131.853159,247.65,8.0,1.0
174,0,1,3.0,0.0,0.0,0.0,0.0,0.0,5200.0,0.0,...,715482.7,650550.0,1911181.0,735.01015,275.580112,260.361077,357.743139,325.275,35.5,1.0
56,0,0,1.0,0.0,0.0,0.0,2400.0,0.0,0.0,0.0,...,97317.77,186000.0,553995.5,703.85875,88.565771,85.624846,48.659128,93.0,-10.0,1.0


In [1020]:
X_train = train_df.iloc[:, :76]
Y_train = train_df.election_result
X_test  = test_df.iloc[:, :76]
Y_test = test_df.election_result

In [1021]:
def train_model():
    # Logistic Regression
    
    logreg = LogisticRegression()
    logreg.fit(X_train, Y_train)
    logreg_train_acc = logreg.score(X_train, Y_train)
    logreg_test_acc = logreg.score(X_test, Y_test)
    print ('logreg training acuracy= ',logreg_train_acc)
    print('logreg test accuracy= ',logreg_test_acc)
    print('')
    # Perceptron

    perceptron = Perceptron(max_iter = 1000, tol=1e-3)
    perceptron.fit(X_train, Y_train)
    perceptron_train_acc = perceptron.score(X_train, Y_train)
    perceptron_test_acc = perceptron.score(X_test, Y_test)
    print ('perceptron training acuracy= ',perceptron_train_acc)
    print('perceptron test accuracy= ',perceptron_test_acc)
    print('')
    # Adaboost

    adaboost = AdaBoostClassifier()
    adaboost.fit(X_train, Y_train)
    adaboost_train_acc = adaboost.score(X_train, Y_train)
    adaboost_test_acc = adaboost.score(X_test, Y_test)
    print ('adaboost training acuracy= ',adaboost_train_acc)
    print('adaboost test accuracy= ',adaboost_test_acc)
    print('')
    # Random Forest

    random_forest = RandomForestClassifier()
    random_forest.fit(X_train, Y_train)
    random_forest_train_acc = random_forest.score(X_train, Y_train)
    random_forest_test_acc = random_forest.score(X_test, Y_test)
    print('random_forest training acuracy= ',random_forest_train_acc)
    print('random_forest test accuracy= ',random_forest_test_acc)
    
    return

In [1022]:
train_model()

logreg training acuracy=  0.7090163934426229
logreg test accuracy=  0.5819672131147541

perceptron training acuracy=  0.4057377049180328
perceptron test accuracy=  0.3770491803278688

adaboost training acuracy=  0.9938524590163934
adaboost test accuracy=  0.9180327868852459

random_forest training acuracy=  1.0
random_forest test accuracy=  0.9016393442622951


## Add more Features / Improve Features, Run Model Again

### Majority Donation & State Association 

In [1023]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name,year
adler,adler john h,"[adler, runyan]",H8NJ03156,New Jersey,1,House,1,0,0.0,0.0,...,0.0,1795561.0,581.4249,410.360587,434.254343,298.401492,221.525,4.333333,new jersey 3rd district,2010
adler,adler shelley,"[runyan, adler]",H2NJ03183,New Jersey,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,0.0,...,0.0,757673.1,832.2333,174.845397,137.551376,79.328147,103.8,4.333333,new jersey 3rd district,2010
aiken,aiken clayton clay,"[ellmers, aiken]",H4NC02127,North Carolina,1,House,1,[CLAY AIKEN FOR NORTH CAROLINA],1.0,0.0,...,0.0,1058519.0,1866.1479,251.439569,120.530735,107.023575,112.49818,-8.0,north carolina 2nd district,2014
allen,allen richard w,"[allen, barrow]",H2GA12121,Georgia,0,House,0,[RICK W. ALLEN FOR CONGRESS],1.0,0.0,...,1.0,1526469.0,679.775,329.706594,335.592356,170.03835,275.0,2.5,georgia 12th district,2014
altmire,altmire jason,"[altmire, critz]",H6PA04110,Pennsylvania,1,House,1,[JARED POLIS VICTORY FUND 2012],1.0,0.0,...,0.0,715209.5,136.983771,93.740831,202.666968,112.497757,106.641977,4.0,pennsylvania 12th district,2012


In [1024]:
# Percentage of contributions made from home state. 
home_state_contrib_array = []
for index, row in df.iterrows():
    home_state = row.state
    home_state_contribution = row[home_state] / row.total_donations
    home_state_contrib_array = np.append(home_state_contrib_array, home_state_contribution)
df['home_state_contrib'] = home_state_contrib_array

In [1025]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name,year,home_state_contrib
adler,adler john h,"[adler, runyan]",H8NJ03156,New Jersey,1,House,1,0,0.0,0.0,...,1795561.0,581.4249,410.360587,434.254343,298.401492,221.525,4.333333,new jersey 3rd district,2010,0.620806
adler,adler shelley,"[runyan, adler]",H2NJ03183,New Jersey,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,0.0,...,757673.1,832.2333,174.845397,137.551376,79.328147,103.8,4.333333,new jersey 3rd district,2010,0.422481
aiken,aiken clayton clay,"[ellmers, aiken]",H4NC02127,North Carolina,1,House,1,[CLAY AIKEN FOR NORTH CAROLINA],1.0,0.0,...,1058519.0,1866.1479,251.439569,120.530735,107.023575,112.49818,-8.0,north carolina 2nd district,2014,0.22729
allen,allen richard w,"[allen, barrow]",H2GA12121,Georgia,0,House,0,[RICK W. ALLEN FOR CONGRESS],1.0,0.0,...,1526469.0,679.775,329.706594,335.592356,170.03835,275.0,2.5,georgia 12th district,2014,0.556754
altmire,altmire jason,"[altmire, critz]",H6PA04110,Pennsylvania,1,House,1,[JARED POLIS VICTORY FUND 2012],1.0,0.0,...,715209.5,136.983771,93.740831,202.666968,112.497757,106.641977,4.0,pennsylvania 12th district,2012,0.714757


In [1026]:
#Select only features and result from df
df.columns
data = df[['incumbent','party','amnt_committees',
       'Armed Forces Americas', 'Armed Forces Europe', 'Alaska', 'Alabama',
       'Armed Forces Pacific', 'Arkansas', 'American Samoa', 'Arizona',
       'California', 'Colorado', 'Connecticut', 'District Of Columbia',
       'Delaware', 'Florida', 'Georgia', 'Guam', 'Hawaii', 'Iowa', 'Idaho',
       'Illinois', 'Indiana', 'Kansas', 'Kentucky', 'Louisiana',
       'Massachusetts', 'Maryland', 'Maine', 'Michigan', 'Minnesota',
       'Missouri', 'Northern Mariana Islands', 'Mississippi', 'Montana',
       'North Carolina', 'North Dakota', 'Nebraska', 'New Hampshire',
       'New Jersey', 'New Mexico', 'Nevada', 'New York', 'Ohio', 'Oklahoma',
       'Oregon', 'Other', 'Pennsylvania', 'Puerto Rico', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
       'Virginia', 'U.S. Virgin Islands', 'Vermont', 'Washington', 'Wisconsin',
       'West Virginia', 'Wyoming', 'Foreign Countries', 'Donation Level 1',
       'Donation Level 2', 'Donation Level 3', 'Donation Level 4',
       'Donation Level 5','total_donations',
       'estimated_num_d1_donors', 'estimated_num_d2_donors',
       'estimated_num_d3_donors', 'estimated_num_d4_donors',
       'estimated_num_d5_donors', 'spread', 'home_state_contrib','election_result']]
data.head()


Unnamed: 0,incumbent,party,amnt_committees,Armed Forces Americas,Armed Forces Europe,Alaska,Alabama,Armed Forces Pacific,Arkansas,American Samoa,...,Donation Level 5,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,home_state_contrib,election_result
adler,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,443050.0,1795561.0,581.4249,410.360587,434.254343,298.401492,221.525,4.333333,0.620806,0.0
adler,1,1,1.0,0.0,0.0,1214.28,550.0,0.0,450.0,0.0,...,207600.0,757673.1,832.2333,174.845397,137.551376,79.328147,103.8,4.333333,0.422481,0.0
aiken,1,1,1.0,0.0,0.0,0.0,751.46,0.0,857.3,0.0,...,224996.36,1058519.0,1866.1479,251.439569,120.530735,107.023575,112.49818,-8.0,0.22729,0.0
allen,0,0,1.0,0.0,0.0,0.0,8700.0,0.0,2600.0,0.0,...,550000.0,1526469.0,679.775,329.706594,335.592356,170.03835,275.0,2.5,0.556754,1.0
altmire,1,1,1.0,0.0,0.0,0.0,5000.0,0.0,0.0,0.0,...,213283.953488,715209.5,136.983771,93.740831,202.666968,112.497757,106.641977,4.0,0.714757,0.0


In [1027]:
#Visualize Features
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import seaborn as sns



In [1028]:
#Shuffle
data = shuffle(data).reset_index(drop=True)

In [1029]:
train_df, test_df = train_test_split(data, test_size = 0.2, random_state = 1337)

X_train = train_df.iloc[:, :77]
Y_train = train_df.election_result
X_test  = test_df.iloc[:, :77]
Y_test = test_df.election_result

In [1030]:
train_model()

logreg training acuracy=  0.6577868852459017
logreg test accuracy=  0.5655737704918032

perceptron training acuracy=  0.39959016393442626
perceptron test accuracy=  0.32786885245901637

adaboost training acuracy=  0.985655737704918
adaboost test accuracy=  0.9672131147540983

random_forest training acuracy=  0.9959016393442623
random_forest test accuracy=  0.9098360655737705


## Add more Features / Improve Features, Run Model Again (NOT A GOOD FEATURE)

In [1031]:
cpgd = pd.read_pickle(Path('.')/'..'/'..'/'data'/'cleaned'/'cpgd_actual_final.pkl')
cpgd = cpgd.reset_index()
years = []
for date in cpgd.date:
    year = date.split('-')[0]
    years = np.append(years, year)


In [1032]:
cpgd['year'] = years


In [1034]:
cpgd_average = cpgd.groupby('year').mean()
cpgd_average = cpgd_average.reset_index()
cpgd_average

Unnamed: 0,year,approval rating,disapproval rating,spread
0,2010,39.722917,52.474887,-12.438908
1,2011,35.545477,56.023353,-19.884956
2,2012,39.176656,53.378286,-13.681125
3,2013,33.416246,56.401354,-22.860458
4,2014,31.687253,58.056611,-26.075218
5,2015,32.142225,58.557728,-26.398743
6,2016,34.256134,57.349746,-23.074154
7,2017,33.66294,55.073535,-21.410594
8,2018,38.395796,52.69517,-14.297161


In [1035]:
year_reformat_array = []
for year in df.year:
    year_reformat = year.strip()
    year_reformat_array = np.append(year_reformat_array, year_reformat)

df['year'] = year_reformat_array


In [1036]:
df3 = df.merge(cpgd_average, left_on='year', right_on='year', how='outer')
df3

Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread_x,race_name,year,home_state_contrib,approval rating,disapproval rating,spread_y
0,adler john h,"[adler, runyan]",H8NJ03156,New Jersey,1,House,1,0,0.0,0.00,...,434.254343,298.401492,221.525000,4.333333,new jersey 3rd district,2010,0.620806,39.722917,52.474887,-12.438908
1,adler shelley,"[runyan, adler]",H2NJ03183,New Jersey,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,0.00,...,137.551376,79.328147,103.800000,4.333333,new jersey 3rd district,2010,0.422481,39.722917,52.474887,-12.438908
2,altschuler randolph mr.,"[bishop, altschuler]",H0NY01129,New York,1,House,0,[LONG ISLAND VICTORY FUND],1.0,0.00,...,134.721997,147.059070,435.968335,-12.000000,new york 1st district,2010,0.621923,39.722917,52.474887,-12.438908
3,bachmann michele,"[bachmann, clark]",H6MN06074,Minnesota,1,House,0,[BACHMANN FOR PRESIDENT],1.0,0.00,...,1172.883029,622.590223,855.578000,9.000000,minnesota 6th district,2010,0.126658,39.722917,52.474887,-12.438908
4,barela jon,"[barela, heinrich]",H0NM01163,New Mexico,1,House,0,0,0.0,0.00,...,191.259493,128.957995,210.049995,-2.666667,new mexico 1st district,2010,0.806308,39.722917,52.474887,-12.438908
5,barletta lou,"[barletta, kanjorski]",H2PA11098,Pennsylvania,0,House,0,"[PA + 5 COMMITTEE, PATRIOT DAY 2011]",2.0,0.00,...,110.191102,65.413827,92.987500,7.750000,pennsylvania 11th district,2010,0.493122,39.722917,52.474887,-12.438908
6,barr garland andy,"[chandler, barr]",H0KY06104,Kentucky,0,House,0,"[WIRE TO WIRE COMMITTEE, DOWN WITH DEBT, ANDY ...",7.0,0.00,...,239.366005,200.592616,326.654817,-14.000000,kentucky 6th district,2010,0.761131,39.722917,52.474887,-12.438908
7,benishek daniel j.,"[benishek, mcdowell]",H0MI01088,Michigan,0,House,0,"[FOUNDERS' COMMITTEE, PATRIOT DAY III, MICHIGA...",4.0,0.00,...,79.359225,82.841563,191.138093,3.000000,michigan 1st district,2010,0.273310,39.722917,52.474887,-12.438908
8,berryhill michael clare sr,"[cardoza, berryhill]",H0CA18050,California,2,House,0,0,0.0,0.00,...,16.500165,15.995970,20.725000,-6.000000,california 18th district,2010,0.718340,39.722917,52.474887,-12.438908
9,bishop mike,"[bishop, slotkin]",H4MI08135,Michigan,0,House,0,"[WALBERG BISHOP VICTORY FUND, MIKE BISHOP FOR ...",2.0,0.00,...,109.246782,139.939390,379.658800,12.000000,new york 1st district,2010,0.531101,39.722917,52.474887,-12.438908


In [1037]:
df3.head()

Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread_x,race_name,year,home_state_contrib,approval rating,disapproval rating,spread_y
0,adler john h,"[adler, runyan]",H8NJ03156,New Jersey,1,House,1,0,0.0,0.0,...,434.254343,298.401492,221.525,4.333333,new jersey 3rd district,2010,0.620806,39.722917,52.474887,-12.438908
1,adler shelley,"[runyan, adler]",H2NJ03183,New Jersey,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,0.0,...,137.551376,79.328147,103.8,4.333333,new jersey 3rd district,2010,0.422481,39.722917,52.474887,-12.438908
2,altschuler randolph mr.,"[bishop, altschuler]",H0NY01129,New York,1,House,0,[LONG ISLAND VICTORY FUND],1.0,0.0,...,134.721997,147.05907,435.968335,-12.0,new york 1st district,2010,0.621923,39.722917,52.474887,-12.438908
3,bachmann michele,"[bachmann, clark]",H6MN06074,Minnesota,1,House,0,[BACHMANN FOR PRESIDENT],1.0,0.0,...,1172.883029,622.590223,855.578,9.0,minnesota 6th district,2010,0.126658,39.722917,52.474887,-12.438908
4,barela jon,"[barela, heinrich]",H0NM01163,New Mexico,1,House,0,0,0.0,0.0,...,191.259493,128.957995,210.049995,-2.666667,new mexico 1st district,2010,0.806308,39.722917,52.474887,-12.438908


In [1045]:
#Select only features and result from df
data = df3[['incumbent','party','amnt_committees',
       'Armed Forces Americas', 'Armed Forces Europe', 'Alaska', 'Alabama',
       'Armed Forces Pacific', 'Arkansas', 'American Samoa', 'Arizona',
       'California', 'Colorado', 'Connecticut', 'District Of Columbia',
       'Delaware', 'Florida', 'Georgia', 'Guam', 'Hawaii', 'Iowa', 'Idaho',
       'Illinois', 'Indiana', 'Kansas', 'Kentucky', 'Louisiana',
       'Massachusetts', 'Maryland', 'Maine', 'Michigan', 'Minnesota',
       'Missouri', 'Northern Mariana Islands', 'Mississippi', 'Montana',
       'North Carolina', 'North Dakota', 'Nebraska', 'New Hampshire',
       'New Jersey', 'New Mexico', 'Nevada', 'New York', 'Ohio', 'Oklahoma',
       'Oregon', 'Other', 'Pennsylvania', 'Puerto Rico', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
       'Virginia', 'U.S. Virgin Islands', 'Vermont', 'Washington', 'Wisconsin',
       'West Virginia', 'Wyoming', 'Foreign Countries', 'Donation Level 1',
       'Donation Level 2', 'Donation Level 3', 'Donation Level 4',
       'Donation Level 5','total_donations',
       'estimated_num_d1_donors', 'estimated_num_d2_donors',
       'estimated_num_d3_donors', 'estimated_num_d4_donors',
       'estimated_num_d5_donors', 'spread_x', 'home_state_contrib', 'approval rating', 'disapproval rating', 'spread_y', 'election_result']]
data.head()


#Shuffle
data = shuffle(data).reset_index(drop=True)

train_df, test_df = train_test_split(data, test_size = 0.2, random_state = 1337)

X_train = train_df.iloc[:, :80]
Y_train = train_df.election_result
X_test  = test_df.iloc[:, :80]
Y_test = test_df.election_result

train_model()

logreg training acuracy=  0.7172131147540983
logreg test accuracy=  0.5737704918032787

perceptron training acuracy=  0.3770491803278688
perceptron test accuracy=  0.4180327868852459

adaboost training acuracy=  0.9897540983606558
adaboost test accuracy=  0.9016393442622951

random_forest training acuracy=  0.9979508196721312
random_forest test accuracy=  0.860655737704918


## Other Features to include: R or D in Presidential, Positive or Negative based on the party affiliation of candidate.

In [1049]:
wh_party = pd.read_csv(Path('.')/'..'/'..'/'data'/'cleaned'/'WhiteHouseParty.csv')
wh_party.head()

Unnamed: 0,date,WhiteHouseParty
0,01/01/2010,r
1,01/02/2010,r
2,01/03/2010,r
3,01/04/2010,r
4,01/05/2010,r


In [1053]:
years = []
for date in wh_party.date:
    year = date.split('/')[2]
    years = np.append(years, year)
wh_party['year'] = years

In [1062]:
wh_party_years = wh_party.drop_duplicates('year')

In [1063]:
wh_party_years.set_index('year')

Unnamed: 0_level_0,date,WhiteHouseParty
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,01/01/2010,r
2011,01/01/2011,r
2012,01/01/2012,r
2013,01/01/2013,r
2014,01/01/2014,r
2015,01/01/2015,r
2016,01/01/2016,r
2017,01/01/2017,r
2018,01/01/2018,d


In [1067]:
df4 = df.merge(wh_party_years, left_on='year', right_on='year', how='outer')
df4 = df4.drop(columns=['date'])

In [1110]:
# 0 if Republican, 1 if Democratic or Democratic Farm Labor, 2 if Other: NONE, Libertarian, Independent
party_sentiment = []
for index, row in df4.iterrows():
    if (row.party == 0) & (row.WhiteHouseParty == 'r'):
        party_sentiment = np.append(party_sentiment, 1)
        continue
    if (row.party == 1) & (row.WhiteHouseParty == 'd'):
        party_sentiment = np.append(party_sentiment, 1)
        continue
    if (row.party == 2) & ((row.WhiteHouseParty != 'r') & (row.WhiteHouseParty != 'd')):
        party_sentiment = np.append(party_sentiment, 1)
        continue
    if (row.party == 2) &  ((row.WhiteHouseParty == 'r') | (row.WhiteHouseParty == 'd')):
        party_sentiment = np.append(party_sentiment, 0)
        continue
    else: 
        party_sentiment = np.append(party_sentiment, -1)
        continue
    

In [1111]:
len(party_sentiment)
# len(df4.name)

610

In [1113]:
df4['party_sentiment'] = party_sentiment


Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name,year,home_state_contrib,WhiteHouseParty,party_sentiment
0,adler john h,"[adler, runyan]",H8NJ03156,New Jersey,1,House,1,0,0.0,0.00,...,410.360587,434.254343,298.401492,221.525000,4.333333,new jersey 3rd district,2010,0.620806,r,-1.0
1,adler shelley,"[runyan, adler]",H2NJ03183,New Jersey,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,0.00,...,174.845397,137.551376,79.328147,103.800000,4.333333,new jersey 3rd district,2010,0.422481,r,-1.0
2,altschuler randolph mr.,"[bishop, altschuler]",H0NY01129,New York,1,House,0,[LONG ISLAND VICTORY FUND],1.0,0.00,...,219.736955,134.721997,147.059070,435.968335,-12.000000,new york 1st district,2010,0.621923,r,1.0
3,bachmann michele,"[bachmann, clark]",H6MN06074,Minnesota,1,House,0,[BACHMANN FOR PRESIDENT],1.0,0.00,...,2765.925059,1172.883029,622.590223,855.578000,9.000000,minnesota 6th district,2010,0.126658,r,1.0
4,barela jon,"[barela, heinrich]",H0NM01163,New Mexico,1,House,0,0,0.0,0.00,...,200.505290,191.259493,128.957995,210.049995,-2.666667,new mexico 1st district,2010,0.806308,r,1.0
5,barletta lou,"[barletta, kanjorski]",H2PA11098,Pennsylvania,0,House,0,"[PA + 5 COMMITTEE, PATRIOT DAY 2011]",2.0,0.00,...,158.593612,110.191102,65.413827,92.987500,7.750000,pennsylvania 11th district,2010,0.493122,r,1.0
6,barr garland andy,"[chandler, barr]",H0KY06104,Kentucky,0,House,0,"[WIRE TO WIRE COMMITTEE, DOWN WITH DEBT, ANDY ...",7.0,0.00,...,308.100989,239.366005,200.592616,326.654817,-14.000000,kentucky 6th district,2010,0.761131,r,1.0
7,benishek daniel j.,"[benishek, mcdowell]",H0MI01088,Michigan,0,House,0,"[FOUNDERS' COMMITTEE, PATRIOT DAY III, MICHIGA...",4.0,0.00,...,100.104971,79.359225,82.841563,191.138093,3.000000,michigan 1st district,2010,0.273310,r,1.0
8,berryhill michael clare sr,"[cardoza, berryhill]",H0CA18050,California,2,House,0,0,0.0,0.00,...,31.050621,16.500165,15.995970,20.725000,-6.000000,california 18th district,2010,0.718340,r,1.0
9,bishop mike,"[bishop, slotkin]",H4MI08135,Michigan,0,House,0,"[WALBERG BISHOP VICTORY FUND, MIKE BISHOP FOR ...",2.0,0.00,...,112.636933,109.246782,139.939390,379.658800,12.000000,new york 1st district,2010,0.531101,r,1.0


In [1114]:
#Select only features and result from df
data = df4[['incumbent','party','amnt_committees',
       'Armed Forces Americas', 'Armed Forces Europe', 'Alaska', 'Alabama',
       'Armed Forces Pacific', 'Arkansas', 'American Samoa', 'Arizona',
       'California', 'Colorado', 'Connecticut', 'District Of Columbia',
       'Delaware', 'Florida', 'Georgia', 'Guam', 'Hawaii', 'Iowa', 'Idaho',
       'Illinois', 'Indiana', 'Kansas', 'Kentucky', 'Louisiana',
       'Massachusetts', 'Maryland', 'Maine', 'Michigan', 'Minnesota',
       'Missouri', 'Northern Mariana Islands', 'Mississippi', 'Montana',
       'North Carolina', 'North Dakota', 'Nebraska', 'New Hampshire',
       'New Jersey', 'New Mexico', 'Nevada', 'New York', 'Ohio', 'Oklahoma',
       'Oregon', 'Other', 'Pennsylvania', 'Puerto Rico', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
       'Virginia', 'U.S. Virgin Islands', 'Vermont', 'Washington', 'Wisconsin',
       'West Virginia', 'Wyoming', 'Foreign Countries', 'Donation Level 1',
       'Donation Level 2', 'Donation Level 3', 'Donation Level 4',
       'Donation Level 5','total_donations',
       'estimated_num_d1_donors', 'estimated_num_d2_donors',
       'estimated_num_d3_donors', 'estimated_num_d4_donors',
       'estimated_num_d5_donors', 'spread', 'home_state_contrib', 'party_sentiment','election_result']]
data.head()


#Shuffle
data = shuffle(data).reset_index(drop=True)

train_df, test_df = train_test_split(data, test_size = 0.2, random_state = 1337)

X_train = train_df.iloc[:, :78]
Y_train = train_df.election_result
X_test  = test_df.iloc[:, :78]
Y_test = test_df.election_result

train_model()

logreg training acuracy=  0.7213114754098361
logreg test accuracy=  0.5245901639344263

perceptron training acuracy=  0.3770491803278688
perceptron test accuracy=  0.4180327868852459

adaboost training acuracy=  0.9979508196721312
adaboost test accuracy=  0.9344262295081968

random_forest training acuracy=  0.9897540983606558
random_forest test accuracy=  0.9016393442622951
