In [215]:
import pandas as pd
import pickle as pkl
from pathlib import Path
import numpy as np
import sklearn 

 # machine learning libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

In [159]:
cand_poll_spread = pd.read_pickle(Path('.')/'..'/'..'/'data'/'cleaned'/'candidates_avg_spread.pkl')
df = pd.read_pickle(Path('.')/'..'/'..'/'data'/'cleaned'/'df_donor_clean-Copy1.pkl')

In [160]:
cand_poll_spread.head()

Unnamed: 0_level_0,spread,race_name,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
burns,1.0,pennsylvania 12th district special election,2010
critz,-1.0,pennsylvania 12th district special election,2010
guinta,7.4,new hampshire 1st district,2010
shea-porter,-7.4,new hampshire 1st district,2010
bass,2.416667,new hampshire 2nd district,2010


In [161]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,West Virginia,Wyoming,Foreign Countries,Donation Level 1,Donation Level 2,Donation Level 3,Donation Level 4,Donation Level 5,election_result,total_donations
0,abeler james j,"[franken, abeler]",S4MN00353,2014.0,MN,Challenger,Senate,REPUBLICAN PARTY,[ABELER4SENATE],1.0,...,0.0,0.0,0.0,53740.0,27348.0,25244.52,27611.0,24014.0,franken,157957.52
1,adams terry glen jr,"[alexander, adams]",S4TN00328,2014.0,TN,Challenger,Senate,DEMOCRATIC PARTY,[FRIENDS OF TERRY ADAMS],1.0,...,0.0,0.0,0.0,23208.28,16735.0,17200.0,34800.0,22300.0,alexander,114243.28
2,addivinola frank j jr,"[markey, addivinola]",S4MA00242,2014.0,MA,Challenger,Senate,REPUBLICAN PARTY,[ ADDIVINOLA COMMITTEE; THE],1.0,...,0.0,0.0,0.0,33194.35,8125.0,11850.0,1500.0,3900.0,markey,58569.35
3,adler john h,"[adler, runyan]",H8NJ03156,2010.0,NJ,Challenger,House,DEMOCRATIC PARTY,0,0.0,...,4800.0,0.0,0.0,116284.98,205176.19,434250.0,596800.0,443050.0,runyan,1795561.17
4,adler shelley,"[runyan, adler]",H2NJ03183,2012.0,NJ,Challenger,House,DEMOCRATIC PARTY,[SHELLEY ADLER FOR CONGRESS],1.0,...,5000.0,0.0,0.0,166446.66,87420.95,137550.0,158655.5,207600.0,runyan,757673.11


# Feature Engineering

## Number of Donations

In [162]:
# by_size/by_candidate
#  -0    $200 and under
#  -200  $200.01 - $499.99
#  -500  $500 - $999.99
#  -1000 $1000 - $1999.99
#  -2000 $2000 +

donorLevels = dict([('d1', 200), ('d2', 499.99), ('d3', 999.99), ('d4', 1999.99), ('d5', 2000)])
adHeur = lambda col, level: np.divide(col, donorLevels[level])

def donor_level_features(table):
    
    """Adds features to the FEC table that include the minimum number of possible donors
    per donor level"""
    
    adHeur = lambda col, level: np.divide(col, donorLevels[level])
    donorLevels = dict([('d1', 200), ('d2', 499.99), ('d3', 999.99), ('d4', 1999.99), ('d5', 2000)])
    
    table['estimated_num_d1_donors'] = adHeur(table['Donation Level 1'], 'd1')
    table['estimated_num_d2_donors'] = adHeur(table['Donation Level 2'], 'd2')
    table['estimated_num_d3_donors'] = adHeur(table['Donation Level 3'], 'd3')
    table['estimated_num_d4_donors'] = adHeur(table['Donation Level 4'], 'd4')
    table['estimated_num_d5_donors'] = adHeur(table['Donation Level 5'], 'd5')
    return table
    
df  = donor_level_features(df)
df

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,Donation Level 3,Donation Level 4,Donation Level 5,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors
0,abeler james j,"[franken, abeler]",S4MN00353,2014.0,MN,Challenger,Senate,REPUBLICAN PARTY,[ABELER4SENATE],1.0,...,2.524452e+04,2.761100e+04,2.401400e+04,franken,1.579575e+05,268.700000,54.697094,25.244772,13.805569,12.007000
1,adams terry glen jr,"[alexander, adams]",S4TN00328,2014.0,TN,Challenger,Senate,DEMOCRATIC PARTY,[FRIENDS OF TERRY ADAMS],1.0,...,1.720000e+04,3.480000e+04,2.230000e+04,alexander,1.142433e+05,116.041400,33.470669,17.200172,17.400087,11.150000
2,addivinola frank j jr,"[markey, addivinola]",S4MA00242,2014.0,MA,Challenger,Senate,REPUBLICAN PARTY,[ ADDIVINOLA COMMITTEE; THE],1.0,...,1.185000e+04,1.500000e+03,3.900000e+03,markey,5.856935e+04,165.971750,16.250325,11.850119,0.750004,1.950000
3,adler john h,"[adler, runyan]",H8NJ03156,2010.0,NJ,Challenger,House,DEMOCRATIC PARTY,0,0.0,...,4.342500e+05,5.968000e+05,4.430500e+05,runyan,1.795561e+06,581.424900,410.360587,434.254343,298.401492,221.525000
4,adler shelley,"[runyan, adler]",H2NJ03183,2012.0,NJ,Challenger,House,DEMOCRATIC PARTY,[SHELLEY ADLER FOR CONGRESS],1.0,...,1.375500e+05,1.586555e+05,2.076000e+05,runyan,7.576731e+05,832.233300,174.845397,137.551376,79.328147,103.800000
5,aiken clayton clay,"[ellmers, aiken]",H4NC02127,2014.0,NC,Challenger,House,DEMOCRATIC PARTY,[CLAY AIKEN FOR NORTH CAROLINA],1.0,...,1.205295e+05,2.140461e+05,2.249964e+05,ellmers,1.058519e+06,1866.147900,251.439569,120.530735,107.023575,112.498180
6,akin w todd,"[akin, mccaskill]",S2MO00429,2012.0,MO,Challenger,Senate,REPUBLICAN PARTY,0,0.0,...,4.583552e+05,6.254501e+05,1.598676e+06,mccaskill,5.081955e+06,10241.937250,702.186904,458.359774,312.726604,799.337995
7,alameel david m,"[cornyn, alameel]",S4TX00516,2014.0,TX,Challenger,Senate,DEMOCRATIC PARTY,[DAVID M ALAMEEL FOR UNITED STATES SENATE],1.0,...,3.500000e+03,6.000000e+03,2.500000e+03,cornyn,4.179300e+04,137.665000,4.520090,3.500035,3.000015,1.250000
8,alexander lamar,"[alexander, adams]",S2TN00058,2014.0,TN,Incumbent,Senate,REPUBLICAN PARTY,"[TENNESSEE SENATE VICTORY FUND, 2013 SENATORS ...",4.0,...,2.652728e+05,8.344301e+05,1.799528e+06,alexander,3.154669e+06,652.785000,249.768093,265.275453,417.217111,899.763979
9,allen george,"[kaine, allen]",S8VA00214,2012.0,VA,Open seat,Senate,REPUBLICAN PARTY,"[GOOD GOVERNMENT FUND; THE, GEORGE ALLEN FOR U...",8.0,...,1.241766e+06,2.675433e+06,6.230647e+06,kaine,1.319114e+07,11765.076907,1380.587600,1241.778333,1337.723273,3115.323546


## Election Result

In [163]:
# Update election_result column & last_name for joining df
# 0 if lost, 1 if won. 

binary_election_results = []
last_name_array = []
for index, row in df.iterrows():
    last_name = row[0].split()[0]
    last_name_array = np.append(last_name_array, last_name)
    if last_name == row.election_result:
        binary_election_results = np.append(binary_election_results, 1)
    else: 
        binary_election_results = np.append(binary_election_results, 0)
df['election_result'] = binary_election_results

## Incumbent

In [164]:
# Update incumbent column. 
# 0 if incumbent, 1 if challenger, 2 if Open seat
incumbent_feature = df.incumbent
incumbent_feature = incumbent_feature.where(incumbent_feature != 'Incumbent', 0)
incumbent_feature = incumbent_feature.where(incumbent_feature != 'Challenger', 1)
incumbent_feature = incumbent_feature.where(incumbent_feature != 'Open seat', 2)

print(incumbent_feature.unique())
df['incumbent'] = incumbent_feature


[1 0 2]


## Party

In [165]:
# Update party column. 
# 0 if Republican, 1 if Democratic or Democratic Farm Labor, 2 if Other: NONE, Libertarian, Independent

party_feature = df.party
party_feature = party_feature.where(party_feature != 'REPUBLICAN PARTY', 0)
party_feature = party_feature.where((party_feature != 'DEMOCRATIC PARTY') & (party_feature != 'DEMOCRATIC-FARM-LABOR'), 1)
party_feature = party_feature.where((party_feature != 'NONE') & (party_feature != 'OTHER') & (party_feature != 'LIBERTARIAN PARTY') & (party_feature != 'INDEPENDENT') , 2)

print(party_feature.unique())
df['party'] = party_feature


[0 1 2]


## Spread From Polls

In [167]:
df['last_name'] = last_name_array

In [168]:
# Biased Dataset?
print('Size of loss data: {}'.format(len(df[df.election_result == 0].name)))
print('Size of winner data: {}'.format(len(df[df.election_result == 1].name)))


Size of loss data: 375
Size of winner data: 235


In [169]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,Donation Level 4,Donation Level 5,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,last_name
0,abeler james j,"[franken, abeler]",S4MN00353,2014.0,MN,1,Senate,0,[ABELER4SENATE],1.0,...,27611.0,24014.0,0.0,157957.52,268.7,54.697094,25.244772,13.805569,12.007,abeler
1,adams terry glen jr,"[alexander, adams]",S4TN00328,2014.0,TN,1,Senate,1,[FRIENDS OF TERRY ADAMS],1.0,...,34800.0,22300.0,0.0,114243.28,116.0414,33.470669,17.200172,17.400087,11.15,adams
2,addivinola frank j jr,"[markey, addivinola]",S4MA00242,2014.0,MA,1,Senate,0,[ ADDIVINOLA COMMITTEE; THE],1.0,...,1500.0,3900.0,0.0,58569.35,165.97175,16.250325,11.850119,0.750004,1.95,addivinola
3,adler john h,"[adler, runyan]",H8NJ03156,2010.0,NJ,1,House,1,0,0.0,...,596800.0,443050.0,0.0,1795561.17,581.4249,410.360587,434.254343,298.401492,221.525,adler
4,adler shelley,"[runyan, adler]",H2NJ03183,2012.0,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,...,158655.5,207600.0,0.0,757673.11,832.2333,174.845397,137.551376,79.328147,103.8,adler


In [170]:
# Join on last name to poll spread data
df = df.set_index('last_name').join(cand_poll_spread)

In [171]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name,year
abeler,abeler james j,"[franken, abeler]",S4MN00353,2014.0,MN,1,Senate,0,[ABELER4SENATE],1.0,...,0.0,157957.52,268.7,54.697094,25.244772,13.805569,12.007,,,
adams,adams terry glen jr,"[alexander, adams]",S4TN00328,2014.0,TN,1,Senate,1,[FRIENDS OF TERRY ADAMS],1.0,...,0.0,114243.28,116.0414,33.470669,17.200172,17.400087,11.15,,,
addivinola,addivinola frank j jr,"[markey, addivinola]",S4MA00242,2014.0,MA,1,Senate,0,[ ADDIVINOLA COMMITTEE; THE],1.0,...,0.0,58569.35,165.97175,16.250325,11.850119,0.750004,1.95,,,
adler,adler john h,"[adler, runyan]",H8NJ03156,2010.0,NJ,1,House,1,0,0.0,...,0.0,1795561.17,581.4249,410.360587,434.254343,298.401492,221.525,4.333333,new jersey 3rd district,2010.0
adler,adler john h,"[adler, runyan]",H8NJ03156,2010.0,NJ,1,House,1,0,0.0,...,0.0,1795561.17,581.4249,410.360587,434.254343,298.401492,221.525,-10.0,new jersey 3rd district,2012.0


In [172]:
# Replace NAN spread values with the average. This may not be a good idea. We could just drop these rows. Open to alternatives.
average_spread = df.spread.mean()
spread = df.spread.fillna(average_spread)
df['spread'] = spread
df = df.drop(columns = ['year'])

## Majority Donations & State Association

In [177]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,Donation Level 5,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name
abeler,abeler james j,"[franken, abeler]",S4MN00353,2014.0,MN,1,Senate,0,[ABELER4SENATE],1.0,...,24014.0,0.0,157957.52,268.7,54.697094,25.244772,13.805569,12.007,-0.011529,
adams,adams terry glen jr,"[alexander, adams]",S4TN00328,2014.0,TN,1,Senate,1,[FRIENDS OF TERRY ADAMS],1.0,...,22300.0,0.0,114243.28,116.0414,33.470669,17.200172,17.400087,11.15,-0.011529,
addivinola,addivinola frank j jr,"[markey, addivinola]",S4MA00242,2014.0,MA,1,Senate,0,[ ADDIVINOLA COMMITTEE; THE],1.0,...,3900.0,0.0,58569.35,165.97175,16.250325,11.850119,0.750004,1.95,-0.011529,
adler,adler john h,"[adler, runyan]",H8NJ03156,2010.0,NJ,1,House,1,0,0.0,...,443050.0,0.0,1795561.17,581.4249,410.360587,434.254343,298.401492,221.525,4.333333,new jersey 3rd district
adler,adler john h,"[adler, runyan]",H8NJ03156,2010.0,NJ,1,House,1,0,0.0,...,443050.0,0.0,1795561.17,581.4249,410.360587,434.254343,298.401492,221.525,-10.0,new jersey 3rd district


In [174]:
# Load in States and their abbreviations
states = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming'
}

In [189]:
#Replace State Abbrevation with their full state name
df.state
states['MN']
state_array = []
for state in df.state:
    state_array = np.append(state_array, states[state])
df['state'] = state_array

In [191]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,Donation Level 5,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name
abeler,abeler james j,"[franken, abeler]",S4MN00353,2014.0,Minnesota,1,Senate,0,[ABELER4SENATE],1.0,...,24014.0,0.0,157957.52,268.7,54.697094,25.244772,13.805569,12.007,-0.011529,
adams,adams terry glen jr,"[alexander, adams]",S4TN00328,2014.0,Tennessee,1,Senate,1,[FRIENDS OF TERRY ADAMS],1.0,...,22300.0,0.0,114243.28,116.0414,33.470669,17.200172,17.400087,11.15,-0.011529,
addivinola,addivinola frank j jr,"[markey, addivinola]",S4MA00242,2014.0,Massachusetts,1,Senate,0,[ ADDIVINOLA COMMITTEE; THE],1.0,...,3900.0,0.0,58569.35,165.97175,16.250325,11.850119,0.750004,1.95,-0.011529,
adler,adler john h,"[adler, runyan]",H8NJ03156,2010.0,New Jersey,1,House,1,0,0.0,...,443050.0,0.0,1795561.17,581.4249,410.360587,434.254343,298.401492,221.525,4.333333,new jersey 3rd district
adler,adler john h,"[adler, runyan]",H8NJ03156,2010.0,New Jersey,1,House,1,0,0.0,...,443050.0,0.0,1795561.17,581.4249,410.360587,434.254343,298.401492,221.525,-10.0,new jersey 3rd district


## Data

In [205]:
#Select only features and result from df
df.columns
data = df[['incumbent','party','amnt_committees',
       'Armed Forces Americas', 'Armed Forces Europe', 'Alaska', 'Alabama',
       'Armed Forces Pacific', 'Arkansas', 'American Samoa', 'Arizona',
       'California', 'Colorado', 'Connecticut', 'District Of Columbia',
       'Delaware', 'Florida', 'Georgia', 'Guam', 'Hawaii', 'Iowa', 'Idaho',
       'Illinois', 'Indiana', 'Kansas', 'Kentucky', 'Louisiana',
       'Massachusetts', 'Maryland', 'Maine', 'Michigan', 'Minnesota',
       'Missouri', 'Northern Mariana Islands', 'Mississippi', 'Montana',
       'North Carolina', 'North Dakota', 'Nebraska', 'New Hampshire',
       'New Jersey', 'New Mexico', 'Nevada', 'New York', 'Ohio', 'Oklahoma',
       'Oregon', 'Other', 'Pennsylvania', 'Puerto Rico', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
       'Virginia', 'U.S. Virgin Islands', 'Vermont', 'Washington', 'Wisconsin',
       'West Virginia', 'Wyoming', 'Foreign Countries', 'Donation Level 1',
       'Donation Level 2', 'Donation Level 3', 'Donation Level 4',
       'Donation Level 5','total_donations',
       'estimated_num_d1_donors', 'estimated_num_d2_donors',
       'estimated_num_d3_donors', 'estimated_num_d4_donors',
       'estimated_num_d5_donors', 'spread', 'election_result']]
data.head()

Unnamed: 0,incumbent,party,amnt_committees,Armed Forces Americas,Armed Forces Europe,Alaska,Alabama,Armed Forces Pacific,Arkansas,American Samoa,...,Donation Level 4,Donation Level 5,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,election_result
abeler,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,27611.0,24014.0,157957.52,268.7,54.697094,25.244772,13.805569,12.007,-0.011529,0.0
adams,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,34800.0,22300.0,114243.28,116.0414,33.470669,17.200172,17.400087,11.15,-0.011529,0.0
addivinola,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1500.0,3900.0,58569.35,165.97175,16.250325,11.850119,0.750004,1.95,-0.011529,0.0
adler,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,596800.0,443050.0,1795561.17,581.4249,410.360587,434.254343,298.401492,221.525,4.333333,0.0
adler,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,596800.0,443050.0,1795561.17,581.4249,410.360587,434.254343,298.401492,221.525,-10.0,0.0


In [208]:
len(data.columns)

77

## Train Model 

In [241]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(data, test_size = 0.2, random_state = 1337)
# train_df, test_df = train_test_split(data, test_size = 0.15, random_state = 1337)
train_df.head()

Unnamed: 0,incumbent,party,amnt_committees,Armed Forces Americas,Armed Forces Europe,Alaska,Alabama,Armed Forces Pacific,Arkansas,American Samoa,...,Donation Level 4,Donation Level 5,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,election_result
dahlkemper,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255998.33,187850.0,891295.2,732.8846,239.434789,181.156812,127.999805,93.925,-10.0,0.0
hughes,1,1,1.0,0.0,0.0,0.0,1000.0,0.0,0.0,0.0,...,177347.0,250800.0,647570.0,350.29315,90.590632,104.071041,88.673943,125.4,-14.666667,0.0
mills,1,0,4.0,0.0,0.0,0.0,0.0,0.0,2600.0,0.0,...,431760.69,1039894.0,2621179.0,3945.2622,273.0002,223.97682,215.881424,519.94686,8.0,0.0
merkley,0,1,9.0,0.0,217.62,1630.0,2683.336667,0.0,750.0,0.0,...,846109.68,1976481.0,7702703.0,19721.478633,672.792803,599.433361,423.056955,988.240321,-0.011529,1.0
conway,2,1,0.0,0.0,50.0,1920.0,3362.0,0.0,3250.0,0.0,...,1325344.22,1917292.0,5632233.0,5833.62105,935.333067,755.222272,662.675423,958.646125,-0.011529,0.0


In [242]:
test_df.head()

Unnamed: 0,incumbent,party,amnt_committees,Armed Forces Americas,Armed Forces Europe,Alaska,Alabama,Armed Forces Pacific,Arkansas,American Samoa,...,Donation Level 4,Donation Level 5,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,election_result
conlin,1,1,0.0,0.0,0.0,2000.0,14307.0,0.0,3400.0,0.0,...,463634.2,758235.0,2041961.0,2269.8011,310.053801,211.109711,231.818259,379.1175,-0.011529,0.0
heller,0,0,14.0,0.0,0.0,0.0,58950.594762,0.0,22636.97,0.0,...,1061183.0,8503185.0,11274430.0,4248.720507,580.405817,570.122443,530.594127,4251.592723,-0.011529,1.0
brunner,1,0,1.0,0.0,0.0,0.0,200.0,0.0,1000.0,0.0,...,125100.0,375996.2,706936.2,453.245,87.48373,71.450715,62.550313,187.998125,-0.011529,0.0
steele,1,0,0.0,0.0,0.0,0.0,6100.0,0.0,250.0,0.0,...,187530.0,225525.0,954678.6,1745.2137,164.349287,110.408934,93.765469,112.7625,-7.5,0.0
horsford,2,1,3.0,0.0,0.0,0.0,500.0,0.0,0.0,0.0,...,579397.8,788608.0,2097158.0,1113.9169,322.764575,344.99349,289.700364,394.303985,3.0,1.0


In [243]:
X_train = train_df.iloc[:, :76]
Y_train = train_df.election_result
X_test  = test_df.iloc[:, :76]
Y_test = test_df.election_result

In [244]:
# Logistic Regression

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
logreg_train_acc = logreg.score(X_train, Y_train)
logreg_test_acc = logreg.score(X_test, Y_test)
print ('logreg training acuracy= ',logreg_train_acc)
print('logreg test accuracy= ',logreg_test_acc)

logreg training acuracy=  0.6515837104072398
logreg test accuracy=  0.5421686746987951


In [245]:
# Perceptron

perceptron = Perceptron(max_iter = 1000, tol=1e-3)
perceptron.fit(X_train, Y_train)
perceptron_train_acc = perceptron.score(X_train, Y_train)
perceptron_test_acc = perceptron.score(X_test, Y_test)
print ('perceptron training acuracy= ',perceptron_train_acc)
print('perceptron test accuracy= ',perceptron_test_acc)


perceptron training acuracy=  0.5761689291101055
perceptron test accuracy=  0.5542168674698795


In [246]:
# Adaboost

adaboost = AdaBoostClassifier()
adaboost.fit(X_train, Y_train)
adaboost_train_acc = adaboost.score(X_train, Y_train)
adaboost_test_acc = adaboost.score(X_test, Y_test)
print ('adaboost training acuracy= ',adaboost_train_acc)
print('adaboost test accuracy= ',adaboost_test_acc)

adaboost training acuracy=  0.9788838612368024
adaboost test accuracy=  0.9518072289156626


In [247]:
# Random Forest

random_forest = RandomForestClassifier()
random_forest.fit(X_train, Y_train)
random_forest_train_acc = random_forest.score(X_train, Y_train)
random_forest_test_acc = random_forest.score(X_test, Y_test)
print('random_forest training acuracy= ',random_forest_train_acc)
print('random_forest test accuracy= ',random_forest_test_acc)

random_forest training acuracy=  0.9924585218702866
random_forest test accuracy=  0.9337349397590361


In [266]:
def train_model():
    # Logistic Regression
    
    logreg = LogisticRegression()
    logreg.fit(X_train, Y_train)
    logreg_train_acc = logreg.score(X_train, Y_train)
    logreg_test_acc = logreg.score(X_test, Y_test)
    print ('logreg training acuracy= ',logreg_train_acc)
    print('logreg test accuracy= ',logreg_test_acc)
    print('')
    # Perceptron

    perceptron = Perceptron(max_iter = 1000, tol=1e-3)
    perceptron.fit(X_train, Y_train)
    perceptron_train_acc = perceptron.score(X_train, Y_train)
    perceptron_test_acc = perceptron.score(X_test, Y_test)
    print ('perceptron training acuracy= ',perceptron_train_acc)
    print('perceptron test accuracy= ',perceptron_test_acc)
    print('')
    # Adaboost

    adaboost = AdaBoostClassifier()
    adaboost.fit(X_train, Y_train)
    adaboost_train_acc = adaboost.score(X_train, Y_train)
    adaboost_test_acc = adaboost.score(X_test, Y_test)
    print ('adaboost training acuracy= ',adaboost_train_acc)
    print('adaboost test accuracy= ',adaboost_test_acc)
    print('')
    # Random Forest

    random_forest = RandomForestClassifier()
    random_forest.fit(X_train, Y_train)
    random_forest_train_acc = random_forest.score(X_train, Y_train)
    random_forest_test_acc = random_forest.score(X_test, Y_test)
    print('random_forest training acuracy= ',random_forest_train_acc)
    print('random_forest test accuracy= ',random_forest_test_acc)
    
    return

## Add more Features / Improve Features, Run Model Again

### Majority Donation & State Association 

In [249]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,Donation Level 5,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name
abeler,abeler james j,"[franken, abeler]",S4MN00353,2014.0,Minnesota,1,Senate,0,[ABELER4SENATE],1.0,...,24014.0,0.0,157957.52,268.7,54.697094,25.244772,13.805569,12.007,-0.011529,
adams,adams terry glen jr,"[alexander, adams]",S4TN00328,2014.0,Tennessee,1,Senate,1,[FRIENDS OF TERRY ADAMS],1.0,...,22300.0,0.0,114243.28,116.0414,33.470669,17.200172,17.400087,11.15,-0.011529,
addivinola,addivinola frank j jr,"[markey, addivinola]",S4MA00242,2014.0,Massachusetts,1,Senate,0,[ ADDIVINOLA COMMITTEE; THE],1.0,...,3900.0,0.0,58569.35,165.97175,16.250325,11.850119,0.750004,1.95,-0.011529,
adler,adler john h,"[adler, runyan]",H8NJ03156,2010.0,New Jersey,1,House,1,0,0.0,...,443050.0,0.0,1795561.17,581.4249,410.360587,434.254343,298.401492,221.525,4.333333,new jersey 3rd district
adler,adler john h,"[adler, runyan]",H8NJ03156,2010.0,New Jersey,1,House,1,0,0.0,...,443050.0,0.0,1795561.17,581.4249,410.360587,434.254343,298.401492,221.525,-10.0,new jersey 3rd district


In [256]:
# Percentage of contributions made from home state. 
home_state_contrib_array = []
for index, row in df.iterrows():
    home_state = row.state
    home_state_contribution = row[home_state] / row.total_donations
    home_state_contrib_array = np.append(home_state_contrib_array, home_state_contribution)
df['home_state_contrib'] = home_state_contrib_array

In [257]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name,home_state_contrib
abeler,abeler james j,"[franken, abeler]",S4MN00353,2014.0,Minnesota,1,Senate,0,[ABELER4SENATE],1.0,...,0.0,157957.52,268.7,54.697094,25.244772,13.805569,12.007,-0.011529,,0.628172
adams,adams terry glen jr,"[alexander, adams]",S4TN00328,2014.0,Tennessee,1,Senate,1,[FRIENDS OF TERRY ADAMS],1.0,...,0.0,114243.28,116.0414,33.470669,17.200172,17.400087,11.15,-0.011529,,0.793441
addivinola,addivinola frank j jr,"[markey, addivinola]",S4MA00242,2014.0,Massachusetts,1,Senate,0,[ ADDIVINOLA COMMITTEE; THE],1.0,...,0.0,58569.35,165.97175,16.250325,11.850119,0.750004,1.95,-0.011529,,0.581618
adler,adler john h,"[adler, runyan]",H8NJ03156,2010.0,New Jersey,1,House,1,0,0.0,...,0.0,1795561.17,581.4249,410.360587,434.254343,298.401492,221.525,4.333333,new jersey 3rd district,0.620806
adler,adler john h,"[adler, runyan]",H8NJ03156,2010.0,New Jersey,1,House,1,0,0.0,...,0.0,1795561.17,581.4249,410.360587,434.254343,298.401492,221.525,-10.0,new jersey 3rd district,0.620806


In [258]:
#Select only features and result from df
df.columns
data = df[['incumbent','party','amnt_committees',
       'Armed Forces Americas', 'Armed Forces Europe', 'Alaska', 'Alabama',
       'Armed Forces Pacific', 'Arkansas', 'American Samoa', 'Arizona',
       'California', 'Colorado', 'Connecticut', 'District Of Columbia',
       'Delaware', 'Florida', 'Georgia', 'Guam', 'Hawaii', 'Iowa', 'Idaho',
       'Illinois', 'Indiana', 'Kansas', 'Kentucky', 'Louisiana',
       'Massachusetts', 'Maryland', 'Maine', 'Michigan', 'Minnesota',
       'Missouri', 'Northern Mariana Islands', 'Mississippi', 'Montana',
       'North Carolina', 'North Dakota', 'Nebraska', 'New Hampshire',
       'New Jersey', 'New Mexico', 'Nevada', 'New York', 'Ohio', 'Oklahoma',
       'Oregon', 'Other', 'Pennsylvania', 'Puerto Rico', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
       'Virginia', 'U.S. Virgin Islands', 'Vermont', 'Washington', 'Wisconsin',
       'West Virginia', 'Wyoming', 'Foreign Countries', 'Donation Level 1',
       'Donation Level 2', 'Donation Level 3', 'Donation Level 4',
       'Donation Level 5','total_donations',
       'estimated_num_d1_donors', 'estimated_num_d2_donors',
       'estimated_num_d3_donors', 'estimated_num_d4_donors',
       'estimated_num_d5_donors', 'spread', 'home_state_contrib','election_result']]
data.head()

Unnamed: 0,incumbent,party,amnt_committees,Armed Forces Americas,Armed Forces Europe,Alaska,Alabama,Armed Forces Pacific,Arkansas,American Samoa,...,Donation Level 5,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,home_state_contrib,election_result
abeler,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,24014.0,157957.52,268.7,54.697094,25.244772,13.805569,12.007,-0.011529,0.628172,0.0
adams,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,22300.0,114243.28,116.0414,33.470669,17.200172,17.400087,11.15,-0.011529,0.793441,0.0
addivinola,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3900.0,58569.35,165.97175,16.250325,11.850119,0.750004,1.95,-0.011529,0.581618,0.0
adler,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,443050.0,1795561.17,581.4249,410.360587,434.254343,298.401492,221.525,4.333333,0.620806,0.0
adler,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,443050.0,1795561.17,581.4249,410.360587,434.254343,298.401492,221.525,-10.0,0.620806,0.0


In [287]:
train_df, test_df = train_test_split(data, test_size = 0.2, random_state = 1337)

X_train = train_df.iloc[:, :77]
Y_train = train_df.election_result
X_test  = test_df.iloc[:, :77]
Y_test = test_df.election_result

In [293]:
train_model()

logreg training acuracy=  0.6606334841628959
logreg test accuracy=  0.5421686746987951

perceptron training acuracy=  0.5761689291101055
perceptron test accuracy=  0.5542168674698795

adaboost training acuracy=  0.9849170437405732
adaboost test accuracy=  0.9337349397590361

random_forest training acuracy=  0.9939668174962293
random_forest test accuracy=  0.9518072289156626


## Add more Features / Improve Features, Run Model Again

In [296]:
cpgd_final = pd.read_pickle(Path('.')/'..'/'..'/'data'/'cleaned'/'cpgd_final.pkl')
cpgd_final.head()

Unnamed: 0_level_0,approval rating,disapproval rating,spread
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-04-16,42.5,50.0,-7.5
2010-04-19,46.0,39.5,6.5
2010-04-21,32.0,58.5,-26.5
2010-04-22,36.666667,50.666667,-14.0
2010-04-23,39.0,57.0,-18.0
