In [1]:
import pandas as pd
import pickle as pkl
from pathlib import Path
import numpy as np
import sklearn 

 # machine learning libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

In [2]:
house_spread = pd.read_pickle(Path('.')/'..'/'..'/'data'/'cleaned'/'house_avg_spread.pkl')
sen_spread = pd.read_pickle(Path('.')/'..'/'..'/'data'/'cleaned'/'sen_avg_spread.pkl')
gov_spread = pd.read_pickle(Path('.')/'..'/'..'/'data'/'cleaned'/'gov_avg_spread.pkl')

df = pd.read_pickle(Path('.')/'..'/'..'/'data'/'cleaned'/'df_donor_clean-Copy1.pkl')

In [3]:
house_spread.head()
# sen_spread.head()
# gov_spread.head()

Unnamed: 0_level_0,spread,race_name,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
critz,-1.0,pennsylvania 12th district special election,2010
burns,1.0,pennsylvania 12th district special election,2010
shea-porter,-7.4,new hampshire 1st district,2010
guinta,7.4,new hampshire 1st district,2010
swett,-12.666667,new hampshire 2nd district,2010


In [4]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,West Virginia,Wyoming,Foreign Countries,Donation Level 1,Donation Level 2,Donation Level 3,Donation Level 4,Donation Level 5,election_result,total_donations
0,abeler james j,"[franken, abeler]",S4MN00353,2014.0,MN,Challenger,Senate,REPUBLICAN PARTY,[ABELER4SENATE],1.0,...,0.0,0.0,0.0,53740.0,27348.0,25244.52,27611.0,24014.0,franken,157957.52
1,adams terry glen jr,"[alexander, adams]",S4TN00328,2014.0,TN,Challenger,Senate,DEMOCRATIC PARTY,[FRIENDS OF TERRY ADAMS],1.0,...,0.0,0.0,0.0,23208.28,16735.0,17200.0,34800.0,22300.0,alexander,114243.28
2,addivinola frank j jr,"[markey, addivinola]",S4MA00242,2014.0,MA,Challenger,Senate,REPUBLICAN PARTY,[ ADDIVINOLA COMMITTEE; THE],1.0,...,0.0,0.0,0.0,33194.35,8125.0,11850.0,1500.0,3900.0,markey,58569.35
3,adler john h,"[adler, runyan]",H8NJ03156,2010.0,NJ,Challenger,House,DEMOCRATIC PARTY,0,0.0,...,4800.0,0.0,0.0,116284.98,205176.19,434250.0,596800.0,443050.0,runyan,1795561.17
4,adler shelley,"[runyan, adler]",H2NJ03183,2012.0,NJ,Challenger,House,DEMOCRATIC PARTY,[SHELLEY ADLER FOR CONGRESS],1.0,...,5000.0,0.0,0.0,166446.66,87420.95,137550.0,158655.5,207600.0,runyan,757673.11


# Feature Engineering

## Number of Donations

In [5]:
# by_size/by_candidate
#  -0    $200 and under
#  -200  $200.01 - $499.99
#  -500  $500 - $999.99
#  -1000 $1000 - $1999.99
#  -2000 $2000 +

donorLevels = dict([('d1', 200), ('d2', 499.99), ('d3', 999.99), ('d4', 1999.99), ('d5', 2000)])
adHeur = lambda col, level: np.divide(col, donorLevels[level])

def donor_level_features(table):
    
    """Adds features to the FEC table that include the minimum number of possible donors
    per donor level"""
    
    adHeur = lambda col, level: np.divide(col, donorLevels[level])
    donorLevels = dict([('d1', 200), ('d2', 499.99), ('d3', 999.99), ('d4', 1999.99), ('d5', 2000)])
    
    table['estimated_num_d1_donors'] = adHeur(table['Donation Level 1'], 'd1')
    table['estimated_num_d2_donors'] = adHeur(table['Donation Level 2'], 'd2')
    table['estimated_num_d3_donors'] = adHeur(table['Donation Level 3'], 'd3')
    table['estimated_num_d4_donors'] = adHeur(table['Donation Level 4'], 'd4')
    table['estimated_num_d5_donors'] = adHeur(table['Donation Level 5'], 'd5')
    return table
    
df  = donor_level_features(df)
df

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,Donation Level 3,Donation Level 4,Donation Level 5,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors
0,abeler james j,"[franken, abeler]",S4MN00353,2014.0,MN,Challenger,Senate,REPUBLICAN PARTY,[ABELER4SENATE],1.0,...,2.524452e+04,2.761100e+04,2.401400e+04,franken,1.579575e+05,268.700000,54.697094,25.244772,13.805569,12.007000
1,adams terry glen jr,"[alexander, adams]",S4TN00328,2014.0,TN,Challenger,Senate,DEMOCRATIC PARTY,[FRIENDS OF TERRY ADAMS],1.0,...,1.720000e+04,3.480000e+04,2.230000e+04,alexander,1.142433e+05,116.041400,33.470669,17.200172,17.400087,11.150000
2,addivinola frank j jr,"[markey, addivinola]",S4MA00242,2014.0,MA,Challenger,Senate,REPUBLICAN PARTY,[ ADDIVINOLA COMMITTEE; THE],1.0,...,1.185000e+04,1.500000e+03,3.900000e+03,markey,5.856935e+04,165.971750,16.250325,11.850119,0.750004,1.950000
3,adler john h,"[adler, runyan]",H8NJ03156,2010.0,NJ,Challenger,House,DEMOCRATIC PARTY,0,0.0,...,4.342500e+05,5.968000e+05,4.430500e+05,runyan,1.795561e+06,581.424900,410.360587,434.254343,298.401492,221.525000
4,adler shelley,"[runyan, adler]",H2NJ03183,2012.0,NJ,Challenger,House,DEMOCRATIC PARTY,[SHELLEY ADLER FOR CONGRESS],1.0,...,1.375500e+05,1.586555e+05,2.076000e+05,runyan,7.576731e+05,832.233300,174.845397,137.551376,79.328147,103.800000
5,aiken clayton clay,"[ellmers, aiken]",H4NC02127,2014.0,NC,Challenger,House,DEMOCRATIC PARTY,[CLAY AIKEN FOR NORTH CAROLINA],1.0,...,1.205295e+05,2.140461e+05,2.249964e+05,ellmers,1.058519e+06,1866.147900,251.439569,120.530735,107.023575,112.498180
6,akin w todd,"[akin, mccaskill]",S2MO00429,2012.0,MO,Challenger,Senate,REPUBLICAN PARTY,0,0.0,...,4.583552e+05,6.254501e+05,1.598676e+06,mccaskill,5.081955e+06,10241.937250,702.186904,458.359774,312.726604,799.337995
7,alameel david m,"[cornyn, alameel]",S4TX00516,2014.0,TX,Challenger,Senate,DEMOCRATIC PARTY,[DAVID M ALAMEEL FOR UNITED STATES SENATE],1.0,...,3.500000e+03,6.000000e+03,2.500000e+03,cornyn,4.179300e+04,137.665000,4.520090,3.500035,3.000015,1.250000
8,alexander lamar,"[alexander, adams]",S2TN00058,2014.0,TN,Incumbent,Senate,REPUBLICAN PARTY,"[TENNESSEE SENATE VICTORY FUND, 2013 SENATORS ...",4.0,...,2.652728e+05,8.344301e+05,1.799528e+06,alexander,3.154669e+06,652.785000,249.768093,265.275453,417.217111,899.763979
9,allen george,"[kaine, allen]",S8VA00214,2012.0,VA,Open seat,Senate,REPUBLICAN PARTY,"[GOOD GOVERNMENT FUND; THE, GEORGE ALLEN FOR U...",8.0,...,1.241766e+06,2.675433e+06,6.230647e+06,kaine,1.319114e+07,11765.076907,1380.587600,1241.778333,1337.723273,3115.323546


In [6]:
len(df[df.office_full == 'Senate'].name)
len(df[df.office_full == 'House'].name)
len(df[(df.office_full != 'Senate') & (df.office_full != 'House')].name)

0

## Election Result

In [7]:
# Update election_result column & last_name for joining df
# 0 if lost, 1 if won. 

binary_election_results = []
last_name_array = []
for index, row in df.iterrows():
    last_name = row[0].split()[0]
    last_name_array = np.append(last_name_array, last_name)
    if last_name == row.election_result:
        binary_election_results = np.append(binary_election_results, 1)
    else: 
        binary_election_results = np.append(binary_election_results, 0)
df['election_result'] = binary_election_results
df['last_name'] = last_name_array

## Incumbent

In [8]:
# Update incumbent column. 
# 0 if incumbent, 1 if challenger, 2 if Open seat
incumbent_feature = df.incumbent
incumbent_feature = incumbent_feature.where(incumbent_feature != 'Incumbent', 0)
incumbent_feature = incumbent_feature.where(incumbent_feature != 'Challenger', 1)
incumbent_feature = incumbent_feature.where(incumbent_feature != 'Open seat', 2)

print(incumbent_feature.unique())
df['incumbent'] = incumbent_feature


[1 0 2]


## Party

In [9]:
# Update party column. 
# 0 if Republican, 1 if Democratic or Democratic Farm Labor, 2 if Other: NONE, Libertarian, Independent

party_feature = df.party
party_feature = party_feature.where(party_feature != 'REPUBLICAN PARTY', 0)
party_feature = party_feature.where((party_feature != 'DEMOCRATIC PARTY') & (party_feature != 'DEMOCRATIC-FARM-LABOR'), 1)
party_feature = party_feature.where((party_feature != 'NONE') & (party_feature != 'OTHER') & (party_feature != 'LIBERTARIAN PARTY') & (party_feature != 'INDEPENDENT') , 2)

print(party_feature.unique())
df['party'] = party_feature


[0 1 2]


## Spread From Polls

In [10]:
# Biased Dataset?
print('Size of loss data: {}'.format(len(df[df.election_result == 0].name)))
print('Size of winner data: {}'.format(len(df[df.election_result == 1].name)))


Size of loss data: 375
Size of winner data: 235


In [11]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,Donation Level 4,Donation Level 5,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,last_name
0,abeler james j,"[franken, abeler]",S4MN00353,2014.0,MN,1,Senate,0,[ABELER4SENATE],1.0,...,27611.0,24014.0,0.0,157957.52,268.7,54.697094,25.244772,13.805569,12.007,abeler
1,adams terry glen jr,"[alexander, adams]",S4TN00328,2014.0,TN,1,Senate,1,[FRIENDS OF TERRY ADAMS],1.0,...,34800.0,22300.0,0.0,114243.28,116.0414,33.470669,17.200172,17.400087,11.15,adams
2,addivinola frank j jr,"[markey, addivinola]",S4MA00242,2014.0,MA,1,Senate,0,[ ADDIVINOLA COMMITTEE; THE],1.0,...,1500.0,3900.0,0.0,58569.35,165.97175,16.250325,11.850119,0.750004,1.95,addivinola
3,adler john h,"[adler, runyan]",H8NJ03156,2010.0,NJ,1,House,1,0,0.0,...,596800.0,443050.0,0.0,1795561.17,581.4249,410.360587,434.254343,298.401492,221.525,adler
4,adler shelley,"[runyan, adler]",H2NJ03183,2012.0,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,...,158655.5,207600.0,0.0,757673.11,832.2333,174.845397,137.551376,79.328147,103.8,adler


In [12]:
# house_spread.set_index
df_house = df[df.office_full == 'House']
df_house = df_house.set_index('last_name').join(house_spread)

df_sen = df[df.office_full == 'Senate']
df_sen = df_sen.set_index('last_name').join(sen_spread)


In [13]:
df = pd.concat([df_house, df_sen])

In [14]:
df = df.drop(columns=['year'])

In [15]:

df

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,Donation Level 5,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name
adler,adler john h,"[adler, runyan]",H8NJ03156,2010.0,NJ,1,House,1,0,0.0,...,4.430500e+05,0.0,1.795561e+06,581.424900,410.360587,434.254343,298.401492,221.525000,4.333333,new jersey 3rd district
adler,adler john h,"[adler, runyan]",H8NJ03156,2010.0,NJ,1,House,1,0,0.0,...,4.430500e+05,0.0,1.795561e+06,581.424900,410.360587,434.254343,298.401492,221.525000,-10.000000,new jersey 3rd district
adler,adler shelley,"[runyan, adler]",H2NJ03183,2012.0,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,...,2.076000e+05,0.0,7.576731e+05,832.233300,174.845397,137.551376,79.328147,103.800000,4.333333,new jersey 3rd district
adler,adler shelley,"[runyan, adler]",H2NJ03183,2012.0,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,...,2.076000e+05,0.0,7.576731e+05,832.233300,174.845397,137.551376,79.328147,103.800000,-10.000000,new jersey 3rd district
aiken,aiken clayton clay,"[ellmers, aiken]",H4NC02127,2014.0,NC,1,House,1,[CLAY AIKEN FOR NORTH CAROLINA],1.0,...,2.249964e+05,0.0,1.058519e+06,1866.147900,251.439569,120.530735,107.023575,112.498180,-8.000000,north carolina 2nd district
allen,allen richard w,"[allen, barrow]",H2GA12121,2014.0,GA,0,House,0,[RICK W. ALLEN FOR CONGRESS],1.0,...,5.500000e+05,1.0,1.526469e+06,679.775000,329.706594,335.592356,170.038350,275.000000,2.500000,georgia 12th district
altmire,altmire jason,"[altmire, critz]",H6PA04110,2012.0,PA,1,House,1,[JARED POLIS VICTORY FUND 2012],1.0,...,2.132840e+05,0.0,7.152095e+05,136.983771,93.740831,202.666968,112.497757,106.641977,4.000000,pennsylvania 12th district
altschuler,altschuler randolph mr.,"[bishop, altschuler]",H0NY01129,2010.0,NY,1,House,0,[LONG ISLAND VICTORY FUND],1.0,...,8.719367e+05,0.0,1.486438e+06,378.988650,219.736955,134.721997,147.059070,435.968335,-12.000000,new york 1st district
altschuler,altschuler randolph mr.,"[bishop, altschuler]",H0NY01129,2010.0,NY,1,House,0,[LONG ISLAND VICTORY FUND],1.0,...,8.719367e+05,0.0,1.486438e+06,378.988650,219.736955,134.721997,147.059070,435.968335,-13.000000,new york 1st district
amash,amash justin,"[pestka, amash]",H0MI03126,2012.0,MI,0,House,0,"[MICHIGAN YOUNG GUNS VICTORY FUND (AMASH, BENI...",2.0,...,5.200014e+05,1.0,1.141024e+06,1157.937850,126.322926,132.726327,96.775484,260.000700,-1.000000,michigan 3rd district


### There are multiple spreads for some candidates. I will limit it to one. Dropping duplicates, arbitrarily keeping the first spread we see.


In [16]:
# Replace NAN spread values with the average. This may not be a good idea. We could just drop these rows. Open to alternatives.
# average_spread = df.spread.mean()
# spread = df.spread.fillna(average_spread)
# df['spread'] = spread
# df = df.drop(columns = ['year'])
df = df[df.spread.notnull()]
df = df.drop_duplicates('name')


In [17]:
df

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,Donation Level 5,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name
adler,adler john h,"[adler, runyan]",H8NJ03156,2010.0,NJ,1,House,1,0,0.0,...,4.430500e+05,0.0,1.795561e+06,581.424900,410.360587,434.254343,298.401492,221.525000,4.333333,new jersey 3rd district
adler,adler shelley,"[runyan, adler]",H2NJ03183,2012.0,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,...,2.076000e+05,0.0,7.576731e+05,832.233300,174.845397,137.551376,79.328147,103.800000,4.333333,new jersey 3rd district
aiken,aiken clayton clay,"[ellmers, aiken]",H4NC02127,2014.0,NC,1,House,1,[CLAY AIKEN FOR NORTH CAROLINA],1.0,...,2.249964e+05,0.0,1.058519e+06,1866.147900,251.439569,120.530735,107.023575,112.498180,-8.000000,north carolina 2nd district
allen,allen richard w,"[allen, barrow]",H2GA12121,2014.0,GA,0,House,0,[RICK W. ALLEN FOR CONGRESS],1.0,...,5.500000e+05,1.0,1.526469e+06,679.775000,329.706594,335.592356,170.038350,275.000000,2.500000,georgia 12th district
altmire,altmire jason,"[altmire, critz]",H6PA04110,2012.0,PA,1,House,1,[JARED POLIS VICTORY FUND 2012],1.0,...,2.132840e+05,0.0,7.152095e+05,136.983771,93.740831,202.666968,112.497757,106.641977,4.000000,pennsylvania 12th district
altschuler,altschuler randolph mr.,"[bishop, altschuler]",H0NY01129,2010.0,NY,1,House,0,[LONG ISLAND VICTORY FUND],1.0,...,8.719367e+05,0.0,1.486438e+06,378.988650,219.736955,134.721997,147.059070,435.968335,-12.000000,new york 1st district
amash,amash justin,"[pestka, amash]",H0MI03126,2012.0,MI,0,House,0,"[MICHIGAN YOUNG GUNS VICTORY FUND (AMASH, BENI...",2.0,...,5.200014e+05,1.0,1.141024e+06,1157.937850,126.322926,132.726327,96.775484,260.000700,-1.000000,michigan 3rd district
appel,appel staci,"[appel, young]",H4IA03065,2014.0,IA,2,House,1,"[APPEL FOR IOWA, INC.]",1.0,...,3.225250e+05,0.0,1.703116e+06,3037.725100,441.701914,254.501765,148.850744,161.262500,0.000000,iowa 3rd district
arnold-jones,arnold-jones janice e,"[grisham, arnold-jones]",H2NM01128,2012.0,NM,2,House,0,[JANICE ARNOLD-JONES FOR CONGRESS],1.0,...,1.089000e+05,0.0,5.266928e+05,735.422700,123.510570,79.272273,64.841694,54.450000,-13.666667,new mexico 1st district
bachmann,bachmann michele,"[bachmann, clark]",H6MN06074,2010.0,MN,1,House,0,[BACHMANN FOR PRESIDENT],1.0,...,1.711156e+06,1.0,1.499294e+07,47403.997550,2765.925059,1172.883029,622.590223,855.578000,9.000000,minnesota 6th district


## Majority Donations & State Association

In [18]:
df

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,Donation Level 5,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name
adler,adler john h,"[adler, runyan]",H8NJ03156,2010.0,NJ,1,House,1,0,0.0,...,4.430500e+05,0.0,1.795561e+06,581.424900,410.360587,434.254343,298.401492,221.525000,4.333333,new jersey 3rd district
adler,adler shelley,"[runyan, adler]",H2NJ03183,2012.0,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,...,2.076000e+05,0.0,7.576731e+05,832.233300,174.845397,137.551376,79.328147,103.800000,4.333333,new jersey 3rd district
aiken,aiken clayton clay,"[ellmers, aiken]",H4NC02127,2014.0,NC,1,House,1,[CLAY AIKEN FOR NORTH CAROLINA],1.0,...,2.249964e+05,0.0,1.058519e+06,1866.147900,251.439569,120.530735,107.023575,112.498180,-8.000000,north carolina 2nd district
allen,allen richard w,"[allen, barrow]",H2GA12121,2014.0,GA,0,House,0,[RICK W. ALLEN FOR CONGRESS],1.0,...,5.500000e+05,1.0,1.526469e+06,679.775000,329.706594,335.592356,170.038350,275.000000,2.500000,georgia 12th district
altmire,altmire jason,"[altmire, critz]",H6PA04110,2012.0,PA,1,House,1,[JARED POLIS VICTORY FUND 2012],1.0,...,2.132840e+05,0.0,7.152095e+05,136.983771,93.740831,202.666968,112.497757,106.641977,4.000000,pennsylvania 12th district
altschuler,altschuler randolph mr.,"[bishop, altschuler]",H0NY01129,2010.0,NY,1,House,0,[LONG ISLAND VICTORY FUND],1.0,...,8.719367e+05,0.0,1.486438e+06,378.988650,219.736955,134.721997,147.059070,435.968335,-12.000000,new york 1st district
amash,amash justin,"[pestka, amash]",H0MI03126,2012.0,MI,0,House,0,"[MICHIGAN YOUNG GUNS VICTORY FUND (AMASH, BENI...",2.0,...,5.200014e+05,1.0,1.141024e+06,1157.937850,126.322926,132.726327,96.775484,260.000700,-1.000000,michigan 3rd district
appel,appel staci,"[appel, young]",H4IA03065,2014.0,IA,2,House,1,"[APPEL FOR IOWA, INC.]",1.0,...,3.225250e+05,0.0,1.703116e+06,3037.725100,441.701914,254.501765,148.850744,161.262500,0.000000,iowa 3rd district
arnold-jones,arnold-jones janice e,"[grisham, arnold-jones]",H2NM01128,2012.0,NM,2,House,0,[JANICE ARNOLD-JONES FOR CONGRESS],1.0,...,1.089000e+05,0.0,5.266928e+05,735.422700,123.510570,79.272273,64.841694,54.450000,-13.666667,new mexico 1st district
bachmann,bachmann michele,"[bachmann, clark]",H6MN06074,2010.0,MN,1,House,0,[BACHMANN FOR PRESIDENT],1.0,...,1.711156e+06,1.0,1.499294e+07,47403.997550,2765.925059,1172.883029,622.590223,855.578000,9.000000,minnesota 6th district


In [19]:
# Load in States and their abbreviations
states = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming'
}

In [20]:
#Replace State Abbrevation with their full state name
df.state
states['MN']
state_array = []
for state in df.state:
    state_array = np.append(state_array, states[state])
df['state'] = state_array

In [21]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,Donation Level 5,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name
adler,adler john h,"[adler, runyan]",H8NJ03156,2010.0,New Jersey,1,House,1,0,0.0,...,443050.0,0.0,1795561.0,581.4249,410.360587,434.254343,298.401492,221.525,4.333333,new jersey 3rd district
adler,adler shelley,"[runyan, adler]",H2NJ03183,2012.0,New Jersey,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,...,207600.0,0.0,757673.1,832.2333,174.845397,137.551376,79.328147,103.8,4.333333,new jersey 3rd district
aiken,aiken clayton clay,"[ellmers, aiken]",H4NC02127,2014.0,North Carolina,1,House,1,[CLAY AIKEN FOR NORTH CAROLINA],1.0,...,224996.36,0.0,1058519.0,1866.1479,251.439569,120.530735,107.023575,112.49818,-8.0,north carolina 2nd district
allen,allen richard w,"[allen, barrow]",H2GA12121,2014.0,Georgia,0,House,0,[RICK W. ALLEN FOR CONGRESS],1.0,...,550000.0,1.0,1526469.0,679.775,329.706594,335.592356,170.03835,275.0,2.5,georgia 12th district
altmire,altmire jason,"[altmire, critz]",H6PA04110,2012.0,Pennsylvania,1,House,1,[JARED POLIS VICTORY FUND 2012],1.0,...,213283.953488,0.0,715209.5,136.983771,93.740831,202.666968,112.497757,106.641977,4.0,pennsylvania 12th district


## Data

In [22]:
#Select only features and result from df
df.columns
data = df[['incumbent','party','amnt_committees',
       'Armed Forces Americas', 'Armed Forces Europe', 'Alaska', 'Alabama',
       'Armed Forces Pacific', 'Arkansas', 'American Samoa', 'Arizona',
       'California', 'Colorado', 'Connecticut', 'District Of Columbia',
       'Delaware', 'Florida', 'Georgia', 'Guam', 'Hawaii', 'Iowa', 'Idaho',
       'Illinois', 'Indiana', 'Kansas', 'Kentucky', 'Louisiana',
       'Massachusetts', 'Maryland', 'Maine', 'Michigan', 'Minnesota',
       'Missouri', 'Northern Mariana Islands', 'Mississippi', 'Montana',
       'North Carolina', 'North Dakota', 'Nebraska', 'New Hampshire',
       'New Jersey', 'New Mexico', 'Nevada', 'New York', 'Ohio', 'Oklahoma',
       'Oregon', 'Other', 'Pennsylvania', 'Puerto Rico', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
       'Virginia', 'U.S. Virgin Islands', 'Vermont', 'Washington', 'Wisconsin',
       'West Virginia', 'Wyoming', 'Foreign Countries', 'Donation Level 1',
       'Donation Level 2', 'Donation Level 3', 'Donation Level 4',
       'Donation Level 5','total_donations',
       'estimated_num_d1_donors', 'estimated_num_d2_donors',
       'estimated_num_d3_donors', 'estimated_num_d4_donors',
       'estimated_num_d5_donors', 'spread', 'election_result']]
data.head()
print(len(data))

610


In [23]:
len(data.columns)

77

In [24]:
#Shuffle
data = shuffle(data).reset_index(drop=True)

NameError: name 'shuffle' is not defined

## Train Model 

In [25]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(data, test_size = 0.2, random_state = 1337)
# train_df, test_df = train_test_split(data, test_size = 0.15, random_state = 1337)
train_df.head()

Unnamed: 0,incumbent,party,amnt_committees,Armed Forces Americas,Armed Forces Europe,Alaska,Alabama,Armed Forces Pacific,Arkansas,American Samoa,...,Donation Level 4,Donation Level 5,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,election_result
rutledge,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,12500.0,4400.0,144249.3,459.13685,28.51839,21.263253,6.250031,2.2,-27.3,0.0
mowrer,1,1,1.0,0.0,0.0,2000.0,200.0,0.0,2100.0,0.0,...,332502.9,283240.0,1972332.0,3596.07205,566.564671,354.101871,166.252281,141.62,-11.5,0.0
gillespie,1,0,7.0,0.0,0.0,0.0,13300.0,0.0,19600.0,0.0,...,996736.042982,3985571.0,6590395.0,3930.298967,531.989518,556.043797,498.370513,1992.785611,-15.842105,0.0
marino,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,150733.19,197650.0,567430.4,453.6619,100.913758,77.859739,75.366972,98.825,8.333333,1.0
obermueller,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,86161.0,107400.0,1249095.0,3199.26575,473.105662,179.134141,43.080715,53.7,-22.0,0.0


In [587]:
test_df.head()

Unnamed: 0,incumbent,party,amnt_committees,Armed Forces Americas,Armed Forces Europe,Alaska,Alabama,Armed Forces Pacific,Arkansas,American Samoa,...,Donation Level 4,Donation Level 5,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,election_result
559,2,0,1.0,200.0,623.0,25023.0,110130.38,100.0,57131.0,0.0,...,2289455.0,2626458.0,27996300.0,92944.4958,4616.030501,2183.541035,1144.733104,1313.228805,-1.569892,0.0
80,1,1,0.0,0.0,0.0,250.0,800.0,0.0,2100.0,0.0,...,334459.8,186560.0,1744161.0,3283.70465,535.753955,298.531135,167.230736,93.28,-9.0,0.0
545,1,0,4.0,0.0,0.0,4950.0,14800.0,0.0,57000.0,0.0,...,3046979.0,9768928.0,15356400.0,3818.06675,1107.185424,1223.311194,1523.496921,4884.464163,-6.0,1.0
174,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,175463.0,161300.0,472787.9,195.46905,83.077662,55.393584,87.731959,80.65,-4.0,0.0
56,0,1,0.0,0.0,1200.0,0.0,450.0,0.0,2400.0,0.0,...,238566.7,182360.0,1132805.0,2143.5346,239.539871,163.405634,119.283926,91.18,-7.4,0.0


In [26]:
X_train = train_df.iloc[:, :76]
Y_train = train_df.election_result
X_test  = test_df.iloc[:, :76]
Y_test = test_df.election_result

In [27]:
def train_model():
    # Logistic Regression
    
    logreg = LogisticRegression()
    logreg.fit(X_train, Y_train)
    logreg_train_acc = logreg.score(X_train, Y_train)
    logreg_test_acc = logreg.score(X_test, Y_test)
    print ('logreg training acuracy= ',logreg_train_acc)
    print('logreg test accuracy= ',logreg_test_acc)
    print('')
    # Perceptron

    perceptron = Perceptron(max_iter = 1000, tol=1e-3)
    perceptron.fit(X_train, Y_train)
    perceptron_train_acc = perceptron.score(X_train, Y_train)
    perceptron_test_acc = perceptron.score(X_test, Y_test)
    print ('perceptron training acuracy= ',perceptron_train_acc)
    print('perceptron test accuracy= ',perceptron_test_acc)
    print('')
    # Adaboost

    adaboost = AdaBoostClassifier()
    adaboost.fit(X_train, Y_train)
    adaboost_train_acc = adaboost.score(X_train, Y_train)
    adaboost_test_acc = adaboost.score(X_test, Y_test)
    print ('adaboost training acuracy= ',adaboost_train_acc)
    print('adaboost test accuracy= ',adaboost_test_acc)
    print('')
    # Random Forest

    random_forest = RandomForestClassifier()
    random_forest.fit(X_train, Y_train)
    random_forest_train_acc = random_forest.score(X_train, Y_train)
    random_forest_test_acc = random_forest.score(X_test, Y_test)
    print('random_forest training acuracy= ',random_forest_train_acc)
    print('random_forest test accuracy= ',random_forest_test_acc)
    
    return

In [28]:
train_model()



logreg training acuracy=  0.684426229508
logreg test accuracy=  0.598360655738

perceptron training acuracy=  0.608606557377
perceptron test accuracy=  0.540983606557

adaboost training acuracy=  0.993852459016
adaboost test accuracy=  0.877049180328

random_forest training acuracy=  0.991803278689
random_forest test accuracy=  0.885245901639




## Add more Features / Improve Features, Run Model Again

### Majority Donation & State Association 

In [591]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name,home_state_contrib
adler,adler john h,"[adler, runyan]",H8NJ03156,2010.0,New Jersey,1,House,1,0,0.0,...,0.0,1795561.0,581.4249,410.360587,434.254343,298.401492,221.525,4.333333,new jersey 3rd district,0.620806
adler,adler shelley,"[runyan, adler]",H2NJ03183,2012.0,New Jersey,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,...,0.0,757673.1,832.2333,174.845397,137.551376,79.328147,103.8,4.333333,new jersey 3rd district,0.422481
aiken,aiken clayton clay,"[ellmers, aiken]",H4NC02127,2014.0,North Carolina,1,House,1,[CLAY AIKEN FOR NORTH CAROLINA],1.0,...,0.0,1058519.0,1866.1479,251.439569,120.530735,107.023575,112.49818,-8.0,north carolina 2nd district,0.22729
allen,allen richard w,"[allen, barrow]",H2GA12121,2014.0,Georgia,0,House,0,[RICK W. ALLEN FOR CONGRESS],1.0,...,1.0,1526469.0,679.775,329.706594,335.592356,170.03835,275.0,2.5,georgia 12th district,0.556754
altmire,altmire jason,"[altmire, critz]",H6PA04110,2012.0,Pennsylvania,1,House,1,[JARED POLIS VICTORY FUND 2012],1.0,...,0.0,715209.5,136.983771,93.740831,202.666968,112.497757,106.641977,4.0,pennsylvania 12th district,0.714757


In [592]:
# Percentage of contributions made from home state. 
home_state_contrib_array = []
for index, row in df.iterrows():
    home_state = row.state
    home_state_contribution = row[home_state] / row.total_donations
    home_state_contrib_array = np.append(home_state_contrib_array, home_state_contribution)
df['home_state_contrib'] = home_state_contrib_array

In [593]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name,home_state_contrib
adler,adler john h,"[adler, runyan]",H8NJ03156,2010.0,New Jersey,1,House,1,0,0.0,...,0.0,1795561.0,581.4249,410.360587,434.254343,298.401492,221.525,4.333333,new jersey 3rd district,0.620806
adler,adler shelley,"[runyan, adler]",H2NJ03183,2012.0,New Jersey,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,...,0.0,757673.1,832.2333,174.845397,137.551376,79.328147,103.8,4.333333,new jersey 3rd district,0.422481
aiken,aiken clayton clay,"[ellmers, aiken]",H4NC02127,2014.0,North Carolina,1,House,1,[CLAY AIKEN FOR NORTH CAROLINA],1.0,...,0.0,1058519.0,1866.1479,251.439569,120.530735,107.023575,112.49818,-8.0,north carolina 2nd district,0.22729
allen,allen richard w,"[allen, barrow]",H2GA12121,2014.0,Georgia,0,House,0,[RICK W. ALLEN FOR CONGRESS],1.0,...,1.0,1526469.0,679.775,329.706594,335.592356,170.03835,275.0,2.5,georgia 12th district,0.556754
altmire,altmire jason,"[altmire, critz]",H6PA04110,2012.0,Pennsylvania,1,House,1,[JARED POLIS VICTORY FUND 2012],1.0,...,0.0,715209.5,136.983771,93.740831,202.666968,112.497757,106.641977,4.0,pennsylvania 12th district,0.714757


In [594]:
#Select only features and result from df
df.columns
data = df[['incumbent','party','amnt_committees',
       'Armed Forces Americas', 'Armed Forces Europe', 'Alaska', 'Alabama',
       'Armed Forces Pacific', 'Arkansas', 'American Samoa', 'Arizona',
       'California', 'Colorado', 'Connecticut', 'District Of Columbia',
       'Delaware', 'Florida', 'Georgia', 'Guam', 'Hawaii', 'Iowa', 'Idaho',
       'Illinois', 'Indiana', 'Kansas', 'Kentucky', 'Louisiana',
       'Massachusetts', 'Maryland', 'Maine', 'Michigan', 'Minnesota',
       'Missouri', 'Northern Mariana Islands', 'Mississippi', 'Montana',
       'North Carolina', 'North Dakota', 'Nebraska', 'New Hampshire',
       'New Jersey', 'New Mexico', 'Nevada', 'New York', 'Ohio', 'Oklahoma',
       'Oregon', 'Other', 'Pennsylvania', 'Puerto Rico', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
       'Virginia', 'U.S. Virgin Islands', 'Vermont', 'Washington', 'Wisconsin',
       'West Virginia', 'Wyoming', 'Foreign Countries', 'Donation Level 1',
       'Donation Level 2', 'Donation Level 3', 'Donation Level 4',
       'Donation Level 5','total_donations',
       'estimated_num_d1_donors', 'estimated_num_d2_donors',
       'estimated_num_d3_donors', 'estimated_num_d4_donors',
       'estimated_num_d5_donors', 'spread', 'home_state_contrib','election_result']]
data.head()

Unnamed: 0,incumbent,party,amnt_committees,Armed Forces Americas,Armed Forces Europe,Alaska,Alabama,Armed Forces Pacific,Arkansas,American Samoa,...,Donation Level 5,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,home_state_contrib,election_result
adler,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,443050.0,1795561.0,581.4249,410.360587,434.254343,298.401492,221.525,4.333333,0.620806,0.0
adler,1,1,1.0,0.0,0.0,1214.28,550.0,0.0,450.0,0.0,...,207600.0,757673.1,832.2333,174.845397,137.551376,79.328147,103.8,4.333333,0.422481,0.0
aiken,1,1,1.0,0.0,0.0,0.0,751.46,0.0,857.3,0.0,...,224996.36,1058519.0,1866.1479,251.439569,120.530735,107.023575,112.49818,-8.0,0.22729,0.0
allen,0,0,1.0,0.0,0.0,0.0,8700.0,0.0,2600.0,0.0,...,550000.0,1526469.0,679.775,329.706594,335.592356,170.03835,275.0,2.5,0.556754,1.0
altmire,1,1,1.0,0.0,0.0,0.0,5000.0,0.0,0.0,0.0,...,213283.953488,715209.5,136.983771,93.740831,202.666968,112.497757,106.641977,4.0,0.714757,0.0


In [595]:
#Visualize Features
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import seaborn as sns

# data.iloc[:, :76].hist()


In [597]:
#Shuffle
data = shuffle(data).reset_index(drop=True)

In [598]:
train_df, test_df = train_test_split(data, test_size = 0.2, random_state = 1337)

X_train = train_df.iloc[:, :77]
Y_train = train_df.election_result
X_test  = test_df.iloc[:, :77]
Y_test = test_df.election_result

In [599]:
train_model()

logreg training acuracy=  0.6639344262295082
logreg test accuracy=  0.5983606557377049

perceptron training acuracy=  0.5184426229508197
perceptron test accuracy=  0.4672131147540984

adaboost training acuracy=  0.9979508196721312
adaboost test accuracy=  0.9016393442622951

random_forest training acuracy=  1.0
random_forest test accuracy=  0.9098360655737705


## Add more Features / Improve Features, Run Model Again

In [571]:
cpgd = pd.read_pickle(Path('.')/'..'/'..'/'data'/'cleaned'/'cpgd_actual_final.pkl')
cpgd

Unnamed: 0_level_0,approval rating,disapproval rating,spread,approval type
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-04-21,20.000000,71.0,-51.000000,Congressional Job Approval
2010-04-22,22.000000,68.0,-46.000000,Congressional Job Approval
2010-05-12,21.000000,72.0,-51.000000,Congressional Job Approval
2010-05-14,21.000000,75.0,-54.000000,Congressional Job Approval
2010-05-15,28.000000,71.0,-43.000000,Congressional Job Approval
2010-05-21,22.000000,65.0,-43.000000,Congressional Job Approval
2010-05-25,15.000000,77.0,-62.000000,Congressional Job Approval
2010-06-08,26.000000,71.0,-45.000000,Congressional Job Approval
2010-06-11,22.000000,69.0,-47.000000,Congressional Job Approval
2010-06-15,24.000000,73.0,-49.000000,Congressional Job Approval
