In [1682]:
import pandas as pd
import pickle as pkl
from pathlib import Path
import numpy as np
import sklearn 

 # machine learning libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

#Visualize Features
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import seaborn as sns


In [1683]:
house_spread = pd.read_pickle(Path('.')/'..'/'..'/'data'/'cleaned'/'house_avg_spread.pkl')
sen_spread = pd.read_pickle(Path('.')/'..'/'..'/'data'/'cleaned'/'sen_avg_spread.pkl')
gov_spread = pd.read_pickle(Path('.')/'..'/'..'/'data'/'cleaned'/'gov_avg_spread.pkl')

df = pd.read_pickle(Path('.')/'..'/'..'/'data'/'cleaned'/'df_donor_clean-Copy1.pkl')

In [1684]:
house_spread.head()
# sen_spread.head()
# gov_spread.head()

Unnamed: 0_level_0,spread,race_name,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
critz,-1.0,pennsylvania 12th district special election,2010
burns,1.0,pennsylvania 12th district special election,2010
shea-porter,-7.4,new hampshire 1st district,2010
guinta,7.4,new hampshire 1st district,2010
swett,-12.666667,new hampshire 2nd district,2010


In [1685]:
df.head()
df.columns

Index(['name', 'race_candidates', 'cand_id', 'cycle', 'state', 'incumbent',
       'office_full', 'party', 'committee_name', 'amnt_committees',
       'Armed Forces Americas', 'Armed Forces Europe', 'Alaska', 'Alabama',
       'Armed Forces Pacific', 'Arkansas', 'American Samoa', 'Arizona',
       'California', 'Colorado', 'Connecticut', 'District Of Columbia',
       'Delaware', 'Florida', 'Georgia', 'Guam', 'Hawaii', 'Iowa', 'Idaho',
       'Illinois', 'Indiana', 'Kansas', 'Kentucky', 'Louisiana',
       'Massachusetts', 'Maryland', 'Maine', 'Michigan', 'Minnesota',
       'Missouri', 'Northern Mariana Islands', 'Mississippi', 'Montana',
       'North Carolina', 'North Dakota', 'Nebraska', 'New Hampshire',
       'New Jersey', 'New Mexico', 'Nevada', 'New York', 'Ohio', 'Oklahoma',
       'Oregon', 'Other', 'Pennsylvania', 'Puerto Rico', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
       'Virginia', 'U.S. Virgin Islands', 'Vermont', 'Washing

# Feature Engineering

## Number of Donations

In [1686]:
# by_size/by_candidate
#  -0    $200 and under
#  -200  $200.01 - $499.99
#  -500  $500 - $999.99
#  -1000 $1000 - $1999.99
#  -2000 $2000 +

donorLevels = dict([('d1', 200), ('d2', 499.99), ('d3', 999.99), ('d4', 1999.99), ('d5', 2000)])
adHeur = lambda col, level: np.divide(col, donorLevels[level])

def donor_level_features(table):
    
    """Adds features to the FEC table that include the minimum number of possible donors
    per donor level"""
    
    adHeur = lambda col, level: np.divide(col, donorLevels[level])
    donorLevels = dict([('d1', 200), ('d2', 499.99), ('d3', 999.99), ('d4', 1999.99), ('d5', 2000)])
    
    table['estimated_num_d1_donors'] = adHeur(table['Donation Level 1'], 'd1')
    table['estimated_num_d2_donors'] = adHeur(table['Donation Level 2'], 'd2')
    table['estimated_num_d3_donors'] = adHeur(table['Donation Level 3'], 'd3')
    table['estimated_num_d4_donors'] = adHeur(table['Donation Level 4'], 'd4')
    table['estimated_num_d5_donors'] = adHeur(table['Donation Level 5'], 'd5')
    return table
    
df  = donor_level_features(df)
df

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,Donation Level 3,Donation Level 4,Donation Level 5,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors
0,abeler james j,"[franken, abeler]",S4MN00353,2014.0,MN,Challenger,Senate,REPUBLICAN PARTY,[ABELER4SENATE],1.0,...,2.524452e+04,2.761100e+04,2.401400e+04,franken,1.579575e+05,268.700000,54.697094,25.244772,13.805569,12.007000
1,adams terry glen jr,"[alexander, adams]",S4TN00328,2014.0,TN,Challenger,Senate,DEMOCRATIC PARTY,[FRIENDS OF TERRY ADAMS],1.0,...,1.720000e+04,3.480000e+04,2.230000e+04,alexander,1.142433e+05,116.041400,33.470669,17.200172,17.400087,11.150000
2,addivinola frank j jr,"[markey, addivinola]",S4MA00242,2014.0,MA,Challenger,Senate,REPUBLICAN PARTY,[ ADDIVINOLA COMMITTEE; THE],1.0,...,1.185000e+04,1.500000e+03,3.900000e+03,markey,5.856935e+04,165.971750,16.250325,11.850119,0.750004,1.950000
3,adler john h,"[adler, runyan]",H8NJ03156,2010.0,NJ,Challenger,House,DEMOCRATIC PARTY,0,0.0,...,4.342500e+05,5.968000e+05,4.430500e+05,runyan,1.795561e+06,581.424900,410.360587,434.254343,298.401492,221.525000
4,adler shelley,"[runyan, adler]",H2NJ03183,2012.0,NJ,Challenger,House,DEMOCRATIC PARTY,[SHELLEY ADLER FOR CONGRESS],1.0,...,1.375500e+05,1.586555e+05,2.076000e+05,runyan,7.576731e+05,832.233300,174.845397,137.551376,79.328147,103.800000
5,aiken clayton clay,"[ellmers, aiken]",H4NC02127,2014.0,NC,Challenger,House,DEMOCRATIC PARTY,[CLAY AIKEN FOR NORTH CAROLINA],1.0,...,1.205295e+05,2.140461e+05,2.249964e+05,ellmers,1.058519e+06,1866.147900,251.439569,120.530735,107.023575,112.498180
6,akin w todd,"[akin, mccaskill]",S2MO00429,2012.0,MO,Challenger,Senate,REPUBLICAN PARTY,0,0.0,...,4.583552e+05,6.254501e+05,1.598676e+06,mccaskill,5.081955e+06,10241.937250,702.186904,458.359774,312.726604,799.337995
7,alameel david m,"[cornyn, alameel]",S4TX00516,2014.0,TX,Challenger,Senate,DEMOCRATIC PARTY,[DAVID M ALAMEEL FOR UNITED STATES SENATE],1.0,...,3.500000e+03,6.000000e+03,2.500000e+03,cornyn,4.179300e+04,137.665000,4.520090,3.500035,3.000015,1.250000
8,alexander lamar,"[alexander, adams]",S2TN00058,2014.0,TN,Incumbent,Senate,REPUBLICAN PARTY,"[TENNESSEE SENATE VICTORY FUND, 2013 SENATORS ...",4.0,...,2.652728e+05,8.344301e+05,1.799528e+06,alexander,3.154669e+06,652.785000,249.768093,265.275453,417.217111,899.763979
9,allen george,"[kaine, allen]",S8VA00214,2012.0,VA,Open seat,Senate,REPUBLICAN PARTY,"[GOOD GOVERNMENT FUND; THE, GEORGE ALLEN FOR U...",8.0,...,1.241766e+06,2.675433e+06,6.230647e+06,kaine,1.319114e+07,11765.076907,1380.587600,1241.778333,1337.723273,3115.323546


In [1687]:
len(df[df.office_full == 'Senate'].name)
len(df[df.office_full == 'House'].name)
len(df[(df.office_full != 'Senate') & (df.office_full != 'House')].name)

0

## Election Result

In [1688]:
# Update election_result column & last_name for joining df
# 0 if lost, 1 if won. 

binary_election_results = []
last_name_array = []
for index, row in df.iterrows():
    last_name = row[0].split()[0]
    last_name_array = np.append(last_name_array, last_name)
    if last_name == row.election_result:
        binary_election_results = np.append(binary_election_results, 1)
    else: 
        binary_election_results = np.append(binary_election_results, 0)
df['election_result'] = binary_election_results
df['last_name'] = last_name_array

## Incumbent

In [1689]:
# Update incumbent column. 
# 0 if incumbent, 1 if challenger, 2 if Open seat
incumbent_feature = df.incumbent
incumbent_feature = incumbent_feature.where(incumbent_feature != 'Incumbent', 0)
incumbent_feature = incumbent_feature.where(incumbent_feature != 'Challenger', 1)
incumbent_feature = incumbent_feature.where(incumbent_feature != 'Open seat', 2)

print(incumbent_feature.unique())
df['incumbent'] = incumbent_feature


[1 0 2]


## Party

In [1690]:
# Update party column. 
# 0 if Republican, 1 if Democratic or Democratic Farm Labor, 2 if Other: NONE, Libertarian, Independent

party_feature = df.party
party_feature = party_feature.where(party_feature != 'REPUBLICAN PARTY', 0)
party_feature = party_feature.where((party_feature != 'DEMOCRATIC PARTY') & (party_feature != 'DEMOCRATIC-FARM-LABOR'), 1)
party_feature = party_feature.where((party_feature != 'NONE') & (party_feature != 'OTHER') & (party_feature != 'LIBERTARIAN PARTY') & (party_feature != 'INDEPENDENT') , 2)

print(party_feature.unique())
df['party'] = party_feature


[0 1 2]


## Spread From Polls

In [1691]:
# Biased Dataset?
print('Size of loss data: {}'.format(len(df[df.election_result == 0].name)))
print('Size of winner data: {}'.format(len(df[df.election_result == 1].name)))


Size of loss data: 375
Size of winner data: 235


In [1692]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,cycle,state,incumbent,office_full,party,committee_name,amnt_committees,...,Donation Level 4,Donation Level 5,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,last_name
0,abeler james j,"[franken, abeler]",S4MN00353,2014.0,MN,1,Senate,0,[ABELER4SENATE],1.0,...,27611.0,24014.0,0.0,157957.52,268.7,54.697094,25.244772,13.805569,12.007,abeler
1,adams terry glen jr,"[alexander, adams]",S4TN00328,2014.0,TN,1,Senate,1,[FRIENDS OF TERRY ADAMS],1.0,...,34800.0,22300.0,0.0,114243.28,116.0414,33.470669,17.200172,17.400087,11.15,adams
2,addivinola frank j jr,"[markey, addivinola]",S4MA00242,2014.0,MA,1,Senate,0,[ ADDIVINOLA COMMITTEE; THE],1.0,...,1500.0,3900.0,0.0,58569.35,165.97175,16.250325,11.850119,0.750004,1.95,addivinola
3,adler john h,"[adler, runyan]",H8NJ03156,2010.0,NJ,1,House,1,0,0.0,...,596800.0,443050.0,0.0,1795561.17,581.4249,410.360587,434.254343,298.401492,221.525,adler
4,adler shelley,"[runyan, adler]",H2NJ03183,2012.0,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,...,158655.5,207600.0,0.0,757673.11,832.2333,174.845397,137.551376,79.328147,103.8,adler


In [1693]:
# house_spread.set_index
df_house = df[df.office_full == 'House']
df_house = df_house.set_index('last_name').join(house_spread)

df_sen = df[df.office_full == 'Senate']
df_sen = df_sen.set_index('last_name').join(sen_spread)


In [1694]:
df = pd.concat([df_house, df_sen])

In [1695]:
df = df.drop(columns=['cycle'])

### There are multiple spreads for some candidates. I will limit it to one. Dropping duplicates, arbitrarily keeping the first spread we see.


In [1696]:
# Replace NAN spread values with the average. This may not be a good idea. We could just drop these rows. Open to alternatives.
# average_spread = df.spread.mean()
# spread = df.spread.fillna(average_spread)
# df['spread'] = spread
# df = df.drop(columns = ['year'])
df = df[df.spread.notnull()]
df = df.drop_duplicates('name')


In [1697]:
df

Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name,year
adler,adler john h,"[adler, runyan]",H8NJ03156,NJ,1,House,1,0,0.0,0.0,...,0.0,1.795561e+06,581.424900,410.360587,434.254343,298.401492,221.525000,4.333333,new jersey 3rd district,2010
adler,adler shelley,"[runyan, adler]",H2NJ03183,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,0.0,...,0.0,7.576731e+05,832.233300,174.845397,137.551376,79.328147,103.800000,4.333333,new jersey 3rd district,2010
aiken,aiken clayton clay,"[ellmers, aiken]",H4NC02127,NC,1,House,1,[CLAY AIKEN FOR NORTH CAROLINA],1.0,0.0,...,0.0,1.058519e+06,1866.147900,251.439569,120.530735,107.023575,112.498180,-8.000000,north carolina 2nd district,2014
allen,allen richard w,"[allen, barrow]",H2GA12121,GA,0,House,0,[RICK W. ALLEN FOR CONGRESS],1.0,0.0,...,1.0,1.526469e+06,679.775000,329.706594,335.592356,170.038350,275.000000,2.500000,georgia 12th district,2014
altmire,altmire jason,"[altmire, critz]",H6PA04110,PA,1,House,1,[JARED POLIS VICTORY FUND 2012],1.0,0.0,...,0.0,7.152095e+05,136.983771,93.740831,202.666968,112.497757,106.641977,4.000000,pennsylvania 12th district,2012
altschuler,altschuler randolph mr.,"[bishop, altschuler]",H0NY01129,NY,1,House,0,[LONG ISLAND VICTORY FUND],1.0,0.0,...,0.0,1.486438e+06,378.988650,219.736955,134.721997,147.059070,435.968335,-12.000000,new york 1st district,2010
amash,amash justin,"[pestka, amash]",H0MI03126,MI,0,House,0,"[MICHIGAN YOUNG GUNS VICTORY FUND (AMASH, BENI...",2.0,0.0,...,1.0,1.141024e+06,1157.937850,126.322926,132.726327,96.775484,260.000700,-1.000000,michigan 3rd district,2012
appel,appel staci,"[appel, young]",H4IA03065,IA,2,House,1,"[APPEL FOR IOWA, INC.]",1.0,0.0,...,0.0,1.703116e+06,3037.725100,441.701914,254.501765,148.850744,161.262500,0.000000,iowa 3rd district,2014
arnold-jones,arnold-jones janice e,"[grisham, arnold-jones]",H2NM01128,NM,2,House,0,[JANICE ARNOLD-JONES FOR CONGRESS],1.0,0.0,...,0.0,5.266928e+05,735.422700,123.510570,79.272273,64.841694,54.450000,-13.666667,new mexico 1st district,2012
bachmann,bachmann michele,"[bachmann, clark]",H6MN06074,MN,1,House,0,[BACHMANN FOR PRESIDENT],1.0,0.0,...,1.0,1.499294e+07,47403.997550,2765.925059,1172.883029,622.590223,855.578000,9.000000,minnesota 6th district,2010


## Majority Donations & State Association

In [1698]:
df

Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,election_result,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name,year
adler,adler john h,"[adler, runyan]",H8NJ03156,NJ,1,House,1,0,0.0,0.0,...,0.0,1.795561e+06,581.424900,410.360587,434.254343,298.401492,221.525000,4.333333,new jersey 3rd district,2010
adler,adler shelley,"[runyan, adler]",H2NJ03183,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,0.0,...,0.0,7.576731e+05,832.233300,174.845397,137.551376,79.328147,103.800000,4.333333,new jersey 3rd district,2010
aiken,aiken clayton clay,"[ellmers, aiken]",H4NC02127,NC,1,House,1,[CLAY AIKEN FOR NORTH CAROLINA],1.0,0.0,...,0.0,1.058519e+06,1866.147900,251.439569,120.530735,107.023575,112.498180,-8.000000,north carolina 2nd district,2014
allen,allen richard w,"[allen, barrow]",H2GA12121,GA,0,House,0,[RICK W. ALLEN FOR CONGRESS],1.0,0.0,...,1.0,1.526469e+06,679.775000,329.706594,335.592356,170.038350,275.000000,2.500000,georgia 12th district,2014
altmire,altmire jason,"[altmire, critz]",H6PA04110,PA,1,House,1,[JARED POLIS VICTORY FUND 2012],1.0,0.0,...,0.0,7.152095e+05,136.983771,93.740831,202.666968,112.497757,106.641977,4.000000,pennsylvania 12th district,2012
altschuler,altschuler randolph mr.,"[bishop, altschuler]",H0NY01129,NY,1,House,0,[LONG ISLAND VICTORY FUND],1.0,0.0,...,0.0,1.486438e+06,378.988650,219.736955,134.721997,147.059070,435.968335,-12.000000,new york 1st district,2010
amash,amash justin,"[pestka, amash]",H0MI03126,MI,0,House,0,"[MICHIGAN YOUNG GUNS VICTORY FUND (AMASH, BENI...",2.0,0.0,...,1.0,1.141024e+06,1157.937850,126.322926,132.726327,96.775484,260.000700,-1.000000,michigan 3rd district,2012
appel,appel staci,"[appel, young]",H4IA03065,IA,2,House,1,"[APPEL FOR IOWA, INC.]",1.0,0.0,...,0.0,1.703116e+06,3037.725100,441.701914,254.501765,148.850744,161.262500,0.000000,iowa 3rd district,2014
arnold-jones,arnold-jones janice e,"[grisham, arnold-jones]",H2NM01128,NM,2,House,0,[JANICE ARNOLD-JONES FOR CONGRESS],1.0,0.0,...,0.0,5.266928e+05,735.422700,123.510570,79.272273,64.841694,54.450000,-13.666667,new mexico 1st district,2012
bachmann,bachmann michele,"[bachmann, clark]",H6MN06074,MN,1,House,0,[BACHMANN FOR PRESIDENT],1.0,0.0,...,1.0,1.499294e+07,47403.997550,2765.925059,1172.883029,622.590223,855.578000,9.000000,minnesota 6th district,2010


In [1699]:
# Load in States and their abbreviations
states = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming'
}

In [1700]:
#Replace State Abbrevation with their full state name
df.state
states['MN']
state_array = []
for state in df.state:
    state_array = np.append(state_array, states[state])
df['state_full'] = state_array

In [1701]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name,year,state_full
adler,adler john h,"[adler, runyan]",H8NJ03156,NJ,1,House,1,0,0.0,0.0,...,1795561.0,581.4249,410.360587,434.254343,298.401492,221.525,4.333333,new jersey 3rd district,2010,New Jersey
adler,adler shelley,"[runyan, adler]",H2NJ03183,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,0.0,...,757673.1,832.2333,174.845397,137.551376,79.328147,103.8,4.333333,new jersey 3rd district,2010,New Jersey
aiken,aiken clayton clay,"[ellmers, aiken]",H4NC02127,NC,1,House,1,[CLAY AIKEN FOR NORTH CAROLINA],1.0,0.0,...,1058519.0,1866.1479,251.439569,120.530735,107.023575,112.49818,-8.0,north carolina 2nd district,2014,North Carolina
allen,allen richard w,"[allen, barrow]",H2GA12121,GA,0,House,0,[RICK W. ALLEN FOR CONGRESS],1.0,0.0,...,1526469.0,679.775,329.706594,335.592356,170.03835,275.0,2.5,georgia 12th district,2014,Georgia
altmire,altmire jason,"[altmire, critz]",H6PA04110,PA,1,House,1,[JARED POLIS VICTORY FUND 2012],1.0,0.0,...,715209.5,136.983771,93.740831,202.666968,112.497757,106.641977,4.0,pennsylvania 12th district,2012,Pennsylvania


## Data

In [1702]:
#Select only features and result from df
df.columns
data = df[['incumbent','party','amnt_committees',
       'Armed Forces Americas', 'Armed Forces Europe', 'Alaska', 'Alabama',
       'Armed Forces Pacific', 'Arkansas', 'American Samoa', 'Arizona',
       'California', 'Colorado', 'Connecticut', 'District Of Columbia',
       'Delaware', 'Florida', 'Georgia', 'Guam', 'Hawaii', 'Iowa', 'Idaho',
       'Illinois', 'Indiana', 'Kansas', 'Kentucky', 'Louisiana',
       'Massachusetts', 'Maryland', 'Maine', 'Michigan', 'Minnesota',
       'Missouri', 'Northern Mariana Islands', 'Mississippi', 'Montana',
       'North Carolina', 'North Dakota', 'Nebraska', 'New Hampshire',
       'New Jersey', 'New Mexico', 'Nevada', 'New York', 'Ohio', 'Oklahoma',
       'Oregon', 'Other', 'Pennsylvania', 'Puerto Rico', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
       'Virginia', 'U.S. Virgin Islands', 'Vermont', 'Washington', 'Wisconsin',
       'West Virginia', 'Wyoming', 'Foreign Countries', 'Donation Level 1',
       'Donation Level 2', 'Donation Level 3', 'Donation Level 4',
       'Donation Level 5','total_donations',
       'estimated_num_d1_donors', 'estimated_num_d2_donors',
       'estimated_num_d3_donors', 'estimated_num_d4_donors',
       'estimated_num_d5_donors', 'spread', 'election_result']]
data.head()
print(len(data))

610


In [1703]:
len(data.columns)

77

In [1704]:
#Shuffle
data = shuffle(data).reset_index(drop=True)

## Train Model 

In [1705]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(data, test_size = 0.2, random_state = 1337)
# train_df, test_df = train_test_split(data, test_size = 0.15, random_state = 1337)
train_df.head()

Unnamed: 0,incumbent,party,amnt_committees,Armed Forces Americas,Armed Forces Europe,Alaska,Alabama,Armed Forces Pacific,Arkansas,American Samoa,...,Donation Level 4,Donation Level 5,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,election_result
544,0,1,0.0,0.0,0.0,500.0,1000.0,0.0,0.0,0.0,...,98100.0,82600.0,665733.1,1773.727,105.325847,77.626596,49.050245,41.3,6.0,1.0
204,0,0,1.0,0.0,0.0,0.0,250.0,0.0,7800.0,0.0,...,358450.0,510200.0,1145125.0,411.2628,104.738495,141.855309,179.225896,255.1,-23.0,1.0
413,2,1,0.0,100.0,0.0,2000.0,0.0,0.0,0.0,0.0,...,50487.0,101600.0,340088.9,493.6791,77.243705,50.645506,25.243626,50.8,-14.0,0.0
181,1,0,4.0,0.0,0.0,0.0,0.0,0.0,2600.0,0.0,...,431760.69,1039894.0,2621179.0,3945.2622,273.0002,223.97682,215.881424,519.94686,8.0,0.0
218,0,0,4.0,0.0,0.0,0.0,3903.0,0.0,0.0,0.0,...,250862.921579,691034.2,1365822.0,1219.34,101.455029,129.331609,125.432088,345.517105,9.833333,1.0


In [1706]:
test_df.head()

Unnamed: 0,incumbent,party,amnt_committees,Armed Forces Americas,Armed Forces Europe,Alaska,Alabama,Armed Forces Pacific,Arkansas,American Samoa,...,Donation Level 4,Donation Level 5,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,election_result
559,0,0,10.0,0.0,0.0,275.0,102625.0,0.0,45550.0,0.0,...,1874574.0,7425357.0,12090590.0,7228.512467,900.117182,894.918169,937.29147,3712.6785,-4.0,1.0
80,1,0,1.0,0.0,0.0,250.0,0.0,0.0,8100.0,0.0,...,552740.4,2412098.0,3421412.0,1040.12035,177.067541,160.0196,276.371607,1206.048835,-9.192308,0.0
545,0,1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,244646.4,329214.0,1454918.0,2583.748171,354.73035,186.948381,122.323791,164.606977,5.0,1.0
174,0,0,2.0,0.0,0.0,0.0,9576.0,0.0,5250.0,0.0,...,437803.1,538906.1,1376421.0,406.36395,160.953239,237.96617,218.90265,269.453071,12.0,1.0
56,0,1,2.0,0.0,0.0,0.0,500.0,0.0,0.0,0.0,...,438985.9,1889035.0,2633637.0,216.02135,122.088482,201.370434,219.494037,944.5175,1.0,1.0


In [1717]:
X_train = train_df.iloc[:, :76]
Y_train = train_df.election_result
X_test  = test_df.iloc[:, :76]
Y_test = test_df.election_result

In [1718]:
def train_model():
    # Logistic Regression
    
    logreg = LogisticRegression()
    logreg.fit(X_train, Y_train)
    logreg_train_acc = logreg.score(X_train, Y_train)
    logreg_test_acc = logreg.score(X_test, Y_test)
    print ('logreg training acuracy= ',logreg_train_acc)
    print('logreg test accuracy= ',logreg_test_acc)
    print('')
    # Perceptron

    perceptron = Perceptron(max_iter = 1000, tol=1e-3)
    perceptron.fit(X_train, Y_train)
    perceptron_train_acc = perceptron.score(X_train, Y_train)
    perceptron_test_acc = perceptron.score(X_test, Y_test)
    print ('perceptron training acuracy= ',perceptron_train_acc)
    print('perceptron test accuracy= ',perceptron_test_acc)
    print('')
    # Adaboost

    adaboost = AdaBoostClassifier()
    adaboost.fit(X_train, Y_train)
    adaboost_train_acc = adaboost.score(X_train, Y_train)
    adaboost_test_acc = adaboost.score(X_test, Y_test)
    print ('adaboost training acuracy= ',adaboost_train_acc)
    print('adaboost test accuracy= ',adaboost_test_acc)
    print('')
    # Random Forest

    random_forest = RandomForestClassifier()
    random_forest.fit(X_train, Y_train)
    random_forest_train_acc = random_forest.score(X_train, Y_train)
    random_forest_test_acc = random_forest.score(X_test, Y_test)
    print('random_forest training acuracy= ',random_forest_train_acc)
    print('random_forest test accuracy= ',random_forest_test_acc)
    
    return

In [1719]:
train_model()

logreg training acuracy=  0.6762295081967213
logreg test accuracy=  0.5983606557377049

perceptron training acuracy=  0.6188524590163934
perceptron test accuracy=  0.6311475409836066

adaboost training acuracy=  0.985655737704918
adaboost test accuracy=  0.9426229508196722

random_forest training acuracy=  0.9938524590163934
random_forest test accuracy=  0.9180327868852459


## Add more Features / Improve Features, Run Model Again

### Majority Donation & State Association 

In [1720]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name,year,state_full
adler,adler john h,"[adler, runyan]",H8NJ03156,NJ,1,House,1,0,0.0,0.0,...,1795561.0,581.4249,410.360587,434.254343,298.401492,221.525,4.333333,new jersey 3rd district,2010,New Jersey
adler,adler shelley,"[runyan, adler]",H2NJ03183,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,0.0,...,757673.1,832.2333,174.845397,137.551376,79.328147,103.8,4.333333,new jersey 3rd district,2010,New Jersey
aiken,aiken clayton clay,"[ellmers, aiken]",H4NC02127,NC,1,House,1,[CLAY AIKEN FOR NORTH CAROLINA],1.0,0.0,...,1058519.0,1866.1479,251.439569,120.530735,107.023575,112.49818,-8.0,north carolina 2nd district,2014,North Carolina
allen,allen richard w,"[allen, barrow]",H2GA12121,GA,0,House,0,[RICK W. ALLEN FOR CONGRESS],1.0,0.0,...,1526469.0,679.775,329.706594,335.592356,170.03835,275.0,2.5,georgia 12th district,2014,Georgia
altmire,altmire jason,"[altmire, critz]",H6PA04110,PA,1,House,1,[JARED POLIS VICTORY FUND 2012],1.0,0.0,...,715209.5,136.983771,93.740831,202.666968,112.497757,106.641977,4.0,pennsylvania 12th district,2012,Pennsylvania


In [1739]:
# Percentage of contributions made from home state. 
home_state_contrib_array = []
for index, row in df.iterrows():
    home_state = row.state_full
    home_state_contribution = row[home_state] / row.total_donations
    home_state_contrib_array = np.append(home_state_contrib_array, home_state_contribution)
df['home_state_contrib'] = home_state_contrib_array

In [1740]:
df.head()

Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name,year,state_full,home_state_contrib
adler,adler john h,"[adler, runyan]",H8NJ03156,NJ,1,House,1,0,0.0,0.0,...,581.4249,410.360587,434.254343,298.401492,221.525,4.333333,new jersey 3rd district,2010,New Jersey,0.620806
adler,adler shelley,"[runyan, adler]",H2NJ03183,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,0.0,...,832.2333,174.845397,137.551376,79.328147,103.8,4.333333,new jersey 3rd district,2010,New Jersey,0.422481
aiken,aiken clayton clay,"[ellmers, aiken]",H4NC02127,NC,1,House,1,[CLAY AIKEN FOR NORTH CAROLINA],1.0,0.0,...,1866.1479,251.439569,120.530735,107.023575,112.49818,-8.0,north carolina 2nd district,2014,North Carolina,0.22729
allen,allen richard w,"[allen, barrow]",H2GA12121,GA,0,House,0,[RICK W. ALLEN FOR CONGRESS],1.0,0.0,...,679.775,329.706594,335.592356,170.03835,275.0,2.5,georgia 12th district,2014,Georgia,0.556754
altmire,altmire jason,"[altmire, critz]",H6PA04110,PA,1,House,1,[JARED POLIS VICTORY FUND 2012],1.0,0.0,...,136.983771,93.740831,202.666968,112.497757,106.641977,4.0,pennsylvania 12th district,2012,Pennsylvania,0.714757


In [1741]:
#Select only features and result from df
df.columns
data = df[['incumbent','party','amnt_committees',
       'Armed Forces Americas', 'Armed Forces Europe', 'Alaska', 'Alabama',
       'Armed Forces Pacific', 'Arkansas', 'American Samoa', 'Arizona',
       'California', 'Colorado', 'Connecticut', 'District Of Columbia',
       'Delaware', 'Florida', 'Georgia', 'Guam', 'Hawaii', 'Iowa', 'Idaho',
       'Illinois', 'Indiana', 'Kansas', 'Kentucky', 'Louisiana',
       'Massachusetts', 'Maryland', 'Maine', 'Michigan', 'Minnesota',
       'Missouri', 'Northern Mariana Islands', 'Mississippi', 'Montana',
       'North Carolina', 'North Dakota', 'Nebraska', 'New Hampshire',
       'New Jersey', 'New Mexico', 'Nevada', 'New York', 'Ohio', 'Oklahoma',
       'Oregon', 'Other', 'Pennsylvania', 'Puerto Rico', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
       'Virginia', 'U.S. Virgin Islands', 'Vermont', 'Washington', 'Wisconsin',
       'West Virginia', 'Wyoming', 'Foreign Countries', 'Donation Level 1',
       'Donation Level 2', 'Donation Level 3', 'Donation Level 4',
       'Donation Level 5','total_donations',
       'estimated_num_d1_donors', 'estimated_num_d2_donors',
       'estimated_num_d3_donors', 'estimated_num_d4_donors',
       'estimated_num_d5_donors', 'spread', 'home_state_contrib','election_result']]
data.head()


Unnamed: 0,incumbent,party,amnt_committees,Armed Forces Americas,Armed Forces Europe,Alaska,Alabama,Armed Forces Pacific,Arkansas,American Samoa,...,Donation Level 5,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,home_state_contrib,election_result
adler,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,443050.0,1795561.0,581.4249,410.360587,434.254343,298.401492,221.525,4.333333,0.620806,0.0
adler,1,1,1.0,0.0,0.0,1214.28,550.0,0.0,450.0,0.0,...,207600.0,757673.1,832.2333,174.845397,137.551376,79.328147,103.8,4.333333,0.422481,0.0
aiken,1,1,1.0,0.0,0.0,0.0,751.46,0.0,857.3,0.0,...,224996.36,1058519.0,1866.1479,251.439569,120.530735,107.023575,112.49818,-8.0,0.22729,0.0
allen,0,0,1.0,0.0,0.0,0.0,8700.0,0.0,2600.0,0.0,...,550000.0,1526469.0,679.775,329.706594,335.592356,170.03835,275.0,2.5,0.556754,1.0
altmire,1,1,1.0,0.0,0.0,0.0,5000.0,0.0,0.0,0.0,...,213283.953488,715209.5,136.983771,93.740831,202.666968,112.497757,106.641977,4.0,0.714757,0.0


In [1742]:
#Shuffle
data = shuffle(data).reset_index(drop=True)

In [1743]:
train_df, test_df = train_test_split(data, test_size = 0.2, random_state = 1337)

X_train = train_df.iloc[:, :77]
Y_train = train_df.election_result
X_test  = test_df.iloc[:, :77]
Y_test = test_df.election_result

In [1744]:
train_model()

logreg training acuracy=  0.694672131147541
logreg test accuracy=  0.6147540983606558

perceptron training acuracy=  0.6127049180327869
perceptron test accuracy=  0.6065573770491803

adaboost training acuracy=  0.9938524590163934
adaboost test accuracy=  0.9098360655737705

random_forest training acuracy=  0.9938524590163934
random_forest test accuracy=  0.8524590163934426


## Add more Features / Improve Features, Run Model Again (NOT A GOOD FEATURE)

In [1745]:
cpgd = pd.read_pickle(Path('.')/'..'/'..'/'data'/'cleaned'/'cpgd_actual_final.pkl')
cpgd = cpgd.reset_index()
years = []
for date in cpgd.date:
    year = date.split('-')[0]
    years = np.append(years, year)


In [1746]:
cpgd['year'] = years


In [1747]:
cpgd_average = cpgd.groupby('year').mean()
cpgd_average = cpgd_average.reset_index()
cpgd_average

Unnamed: 0,year,approval rating,disapproval rating,spread
0,2010,39.722917,52.474887,-12.438908
1,2011,35.545477,56.023353,-19.884956
2,2012,39.176656,53.378286,-13.681125
3,2013,33.416246,56.401354,-22.860458
4,2014,31.687253,58.056611,-26.075218
5,2015,32.142225,58.557728,-26.398743
6,2016,34.256134,57.349746,-23.074154
7,2017,33.66294,55.073535,-21.410594
8,2018,38.395796,52.69517,-14.297161


In [1748]:
year_reformat_array = []
for year in df.year:
    year_reformat = year.strip()
    year_reformat_array = np.append(year_reformat_array, year_reformat)

df['year'] = year_reformat_array


In [1749]:
df3 = df.merge(cpgd_average, left_on='year', right_on='year', how='outer')
df3

Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,estimated_num_d4_donors,estimated_num_d5_donors,spread_x,race_name,year,state_full,home_state_contrib,approval rating,disapproval rating,spread_y
0,adler john h,"[adler, runyan]",H8NJ03156,NJ,1,House,1,0,0.0,0.00,...,298.401492,221.525000,4.333333,new jersey 3rd district,2010,New Jersey,0.620806,39.722917,52.474887,-12.438908
1,adler shelley,"[runyan, adler]",H2NJ03183,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,0.00,...,79.328147,103.800000,4.333333,new jersey 3rd district,2010,New Jersey,0.422481,39.722917,52.474887,-12.438908
2,altschuler randolph mr.,"[bishop, altschuler]",H0NY01129,NY,1,House,0,[LONG ISLAND VICTORY FUND],1.0,0.00,...,147.059070,435.968335,-12.000000,new york 1st district,2010,New York,0.621923,39.722917,52.474887,-12.438908
3,bachmann michele,"[bachmann, clark]",H6MN06074,MN,1,House,0,[BACHMANN FOR PRESIDENT],1.0,0.00,...,622.590223,855.578000,9.000000,minnesota 6th district,2010,Minnesota,0.126658,39.722917,52.474887,-12.438908
4,barela jon,"[barela, heinrich]",H0NM01163,NM,1,House,0,0,0.0,0.00,...,128.957995,210.049995,-2.666667,new mexico 1st district,2010,New Mexico,0.806308,39.722917,52.474887,-12.438908
5,barletta lou,"[barletta, kanjorski]",H2PA11098,PA,0,House,0,"[PA + 5 COMMITTEE, PATRIOT DAY 2011]",2.0,0.00,...,65.413827,92.987500,7.750000,pennsylvania 11th district,2010,Pennsylvania,0.493122,39.722917,52.474887,-12.438908
6,barr garland andy,"[chandler, barr]",H0KY06104,KY,0,House,0,"[WIRE TO WIRE COMMITTEE, DOWN WITH DEBT, ANDY ...",7.0,0.00,...,200.592616,326.654817,-14.000000,kentucky 6th district,2010,Kentucky,0.761131,39.722917,52.474887,-12.438908
7,benishek daniel j.,"[benishek, mcdowell]",H0MI01088,MI,0,House,0,"[FOUNDERS' COMMITTEE, PATRIOT DAY III, MICHIGA...",4.0,0.00,...,82.841563,191.138093,3.000000,michigan 1st district,2010,Michigan,0.273310,39.722917,52.474887,-12.438908
8,berryhill michael clare sr,"[cardoza, berryhill]",H0CA18050,CA,2,House,0,0,0.0,0.00,...,15.995970,20.725000,-6.000000,california 18th district,2010,California,0.718340,39.722917,52.474887,-12.438908
9,bishop mike,"[bishop, slotkin]",H4MI08135,MI,0,House,0,"[WALBERG BISHOP VICTORY FUND, MIKE BISHOP FOR ...",2.0,0.00,...,139.939390,379.658800,12.000000,new york 1st district,2010,Michigan,0.531101,39.722917,52.474887,-12.438908


In [1750]:
df3.head()

Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,estimated_num_d4_donors,estimated_num_d5_donors,spread_x,race_name,year,state_full,home_state_contrib,approval rating,disapproval rating,spread_y
0,adler john h,"[adler, runyan]",H8NJ03156,NJ,1,House,1,0,0.0,0.0,...,298.401492,221.525,4.333333,new jersey 3rd district,2010,New Jersey,0.620806,39.722917,52.474887,-12.438908
1,adler shelley,"[runyan, adler]",H2NJ03183,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,0.0,...,79.328147,103.8,4.333333,new jersey 3rd district,2010,New Jersey,0.422481,39.722917,52.474887,-12.438908
2,altschuler randolph mr.,"[bishop, altschuler]",H0NY01129,NY,1,House,0,[LONG ISLAND VICTORY FUND],1.0,0.0,...,147.05907,435.968335,-12.0,new york 1st district,2010,New York,0.621923,39.722917,52.474887,-12.438908
3,bachmann michele,"[bachmann, clark]",H6MN06074,MN,1,House,0,[BACHMANN FOR PRESIDENT],1.0,0.0,...,622.590223,855.578,9.0,minnesota 6th district,2010,Minnesota,0.126658,39.722917,52.474887,-12.438908
4,barela jon,"[barela, heinrich]",H0NM01163,NM,1,House,0,0,0.0,0.0,...,128.957995,210.049995,-2.666667,new mexico 1st district,2010,New Mexico,0.806308,39.722917,52.474887,-12.438908


In [1751]:
#Select only features and result from df
data = df3[['incumbent','party','amnt_committees',
       'Armed Forces Americas', 'Armed Forces Europe', 'Alaska', 'Alabama',
       'Armed Forces Pacific', 'Arkansas', 'American Samoa', 'Arizona',
       'California', 'Colorado', 'Connecticut', 'District Of Columbia',
       'Delaware', 'Florida', 'Georgia', 'Guam', 'Hawaii', 'Iowa', 'Idaho',
       'Illinois', 'Indiana', 'Kansas', 'Kentucky', 'Louisiana',
       'Massachusetts', 'Maryland', 'Maine', 'Michigan', 'Minnesota',
       'Missouri', 'Northern Mariana Islands', 'Mississippi', 'Montana',
       'North Carolina', 'North Dakota', 'Nebraska', 'New Hampshire',
       'New Jersey', 'New Mexico', 'Nevada', 'New York', 'Ohio', 'Oklahoma',
       'Oregon', 'Other', 'Pennsylvania', 'Puerto Rico', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
       'Virginia', 'U.S. Virgin Islands', 'Vermont', 'Washington', 'Wisconsin',
       'West Virginia', 'Wyoming', 'Foreign Countries', 'Donation Level 1',
       'Donation Level 2', 'Donation Level 3', 'Donation Level 4',
       'Donation Level 5','total_donations',
       'estimated_num_d1_donors', 'estimated_num_d2_donors',
       'estimated_num_d3_donors', 'estimated_num_d4_donors',
       'estimated_num_d5_donors', 'spread_x', 'home_state_contrib', 'approval rating', 'disapproval rating', 'spread_y', 'election_result']]
data.head()


#Shuffle
data = shuffle(data).reset_index(drop=True)

train_df, test_df = train_test_split(data, test_size = 0.2, random_state = 1337)

X_train = train_df.iloc[:, :80]
Y_train = train_df.election_result
X_test  = test_df.iloc[:, :80]
Y_test = test_df.election_result

train_model()

logreg training acuracy=  0.7151639344262295
logreg test accuracy=  0.6147540983606558

perceptron training acuracy=  0.6127049180327869
perceptron test accuracy=  0.5737704918032787

adaboost training acuracy=  0.9897540983606558
adaboost test accuracy=  0.9180327868852459

random_forest training acuracy=  0.9959016393442623
random_forest test accuracy=  0.8852459016393442


## Other Features to include: R or D in Presidential, Positive or Negative based on the party affiliation of candidate.

In [1752]:
wh_party = pd.read_csv(Path('.')/'..'/'..'/'data'/'cleaned'/'WhiteHouseParty.csv')
wh_party.head()

Unnamed: 0,date,WhiteHouseParty
0,01/01/2010,r
1,01/02/2010,r
2,01/03/2010,r
3,01/04/2010,r
4,01/05/2010,r


In [1753]:
years = []
for date in wh_party.date:
    year = date.split('/')[2]
    years = np.append(years, year)
wh_party['year'] = years

In [1754]:
wh_party_years = wh_party.drop_duplicates('year')

In [1755]:
wh_party_years.set_index('year')

Unnamed: 0_level_0,date,WhiteHouseParty
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,01/01/2010,r
2011,01/01/2011,r
2012,01/01/2012,r
2013,01/01/2013,r
2014,01/01/2014,r
2015,01/01/2015,r
2016,01/01/2016,r
2017,01/01/2017,r
2018,01/01/2018,d


In [1756]:
df4 = df.merge(wh_party_years, left_on='year', right_on='year', how='outer')
df4 = df4.drop(columns=['date'])

In [1757]:
# 0 if Republican, 1 if Democratic or Democratic Farm Labor, 2 if Other: NONE, Libertarian, Independent
party_sentiment = []
for index, row in df4.iterrows():
    if (row.party == 0) & (row.WhiteHouseParty == 'r'):
        party_sentiment = np.append(party_sentiment, 1)
        continue
    if (row.party == 1) & (row.WhiteHouseParty == 'd'):
        party_sentiment = np.append(party_sentiment, 1)
        continue
    if (row.party == 2) & ((row.WhiteHouseParty != 'r') & (row.WhiteHouseParty != 'd')):
        party_sentiment = np.append(party_sentiment, 1)
        continue
    if (row.party == 2) &  ((row.WhiteHouseParty == 'r') | (row.WhiteHouseParty == 'd')):
        party_sentiment = np.append(party_sentiment, 0)
        continue
    else: 
        party_sentiment = np.append(party_sentiment, -1)
        continue
    

In [1758]:
len(party_sentiment)
# len(df4.name)

610

In [1759]:
df4['party_sentiment'] = party_sentiment


In [1760]:
#Select only features and result from df
data = df4[['incumbent','party','amnt_committees',
       'Armed Forces Americas', 'Armed Forces Europe', 'Alaska', 'Alabama',
       'Armed Forces Pacific', 'Arkansas', 'American Samoa', 'Arizona',
       'California', 'Colorado', 'Connecticut', 'District Of Columbia',
       'Delaware', 'Florida', 'Georgia', 'Guam', 'Hawaii', 'Iowa', 'Idaho',
       'Illinois', 'Indiana', 'Kansas', 'Kentucky', 'Louisiana',
       'Massachusetts', 'Maryland', 'Maine', 'Michigan', 'Minnesota',
       'Missouri', 'Northern Mariana Islands', 'Mississippi', 'Montana',
       'North Carolina', 'North Dakota', 'Nebraska', 'New Hampshire',
       'New Jersey', 'New Mexico', 'Nevada', 'New York', 'Ohio', 'Oklahoma',
       'Oregon', 'Other', 'Pennsylvania', 'Puerto Rico', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
       'Virginia', 'U.S. Virgin Islands', 'Vermont', 'Washington', 'Wisconsin',
       'West Virginia', 'Wyoming', 'Foreign Countries', 'Donation Level 1',
       'Donation Level 2', 'Donation Level 3', 'Donation Level 4',
       'Donation Level 5','total_donations',
       'estimated_num_d1_donors', 'estimated_num_d2_donors',
       'estimated_num_d3_donors', 'estimated_num_d4_donors',
       'estimated_num_d5_donors', 'spread', 'home_state_contrib', 'party_sentiment','election_result']]
data.head()


#Shuffle
data = shuffle(data).reset_index(drop=True)

train_df, test_df = train_test_split(data, test_size = 0.2, random_state = 1337)

X_train = train_df.iloc[:, :78]
Y_train = train_df.election_result
X_test  = test_df.iloc[:, :78]
Y_test = test_df.election_result

train_model()

logreg training acuracy=  0.6864754098360656
logreg test accuracy=  0.5081967213114754

perceptron training acuracy=  0.5286885245901639
perceptron test accuracy=  0.45081967213114754

adaboost training acuracy=  0.9836065573770492
adaboost test accuracy=  0.9016393442622951

random_forest training acuracy=  0.9959016393442623
random_forest test accuracy=  0.8524590163934426


## Testing Features

In [1761]:
#Select only features and result from df
data = df4[['incumbent','amnt_committees',
       'Armed Forces Americas', 'Armed Forces Europe', 'Alaska', 'Alabama',
       'Armed Forces Pacific', 'Arkansas', 'American Samoa', 'Arizona',
       'California', 'Colorado', 'Connecticut', 'District Of Columbia',
       'Delaware', 'Florida', 'Georgia', 'Guam', 'Hawaii', 'Iowa', 'Idaho',
       'Illinois', 'Indiana', 'Kansas', 'Kentucky', 'Louisiana',
       'Massachusetts', 'Maryland', 'Maine', 'Michigan', 'Minnesota',
       'Missouri', 'Northern Mariana Islands', 'Mississippi', 'Montana',
       'North Carolina', 'North Dakota', 'Nebraska', 'New Hampshire',
       'New Jersey', 'New Mexico', 'Nevada', 'New York', 'Ohio', 'Oklahoma',
       'Oregon', 'Other', 'Pennsylvania', 'Puerto Rico', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
       'Virginia', 'U.S. Virgin Islands', 'Vermont', 'Washington', 'Wisconsin',
       'West Virginia', 'Wyoming', 'Foreign Countries', 'Donation Level 1',
       'Donation Level 2', 'Donation Level 3', 'Donation Level 4',
       'Donation Level 5','total_donations',
       'estimated_num_d1_donors', 'estimated_num_d2_donors',
       'estimated_num_d3_donors', 'estimated_num_d4_donors',
       'estimated_num_d5_donors', 'spread', 'home_state_contrib', 'party_sentiment','election_result']]
data.head()


#Shuffle
data = shuffle(data).reset_index(drop=True)

train_df, test_df = train_test_split(data, test_size = 0.2, random_state = 1337)

X_train = train_df.iloc[:, :77]
Y_train = train_df.election_result
X_test  = test_df.iloc[:, :77]
Y_test = test_df.election_result

train_model()

logreg training acuracy=  0.7049180327868853
logreg test accuracy=  0.5409836065573771

perceptron training acuracy=  0.5881147540983607
perceptron test accuracy=  0.4426229508196721

adaboost training acuracy=  0.9877049180327869
adaboost test accuracy=  0.9344262295081968

random_forest training acuracy=  0.9959016393442623
random_forest test accuracy=  0.9262295081967213


In [1862]:
#Select only features and result from df
data = df4[['incumbent','amnt_committees', 'Donation Level 1',
       'Donation Level 2', 'Donation Level 3', 'Donation Level 4',
       'Donation Level 5','total_donations',
       'estimated_num_d1_donors', 'estimated_num_d2_donors',
       'estimated_num_d3_donors', 'estimated_num_d4_donors',
       'estimated_num_d5_donors', 'spread', 'home_state_contrib', 'party_sentiment','election_result']]
print(len(data.columns))
data.head()


#Shuffle
data = shuffle(data).reset_index(drop=True)

train_df, test_df = train_test_split(data, test_size = 0.2, random_state = 1337)

X_train = train_df.iloc[:, :16]
Y_train = train_df.election_result
X_test  = test_df.iloc[:, :16]
Y_test = test_df.election_result

train_model()

17
logreg training acuracy=  0.6331967213114754
logreg test accuracy=  0.5245901639344263

perceptron training acuracy=  0.6372950819672131
perceptron test accuracy=  0.6147540983606558

adaboost training acuracy=  0.9733606557377049
adaboost test accuracy=  0.9016393442622951

random_forest training acuracy=  0.9959016393442623
random_forest test accuracy=  0.9180327868852459


In [1764]:
data.head()

Unnamed: 0,incumbent,amnt_committees,Donation Level 1,Donation Level 2,Donation Level 3,Donation Level 4,Donation Level 5,total_donations,estimated_num_d1_donors,estimated_num_d2_donors,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,home_state_contrib,party_sentiment,election_result
0,1,1.0,639853.15,236548.1,179132.35,86161.0,107400.0,1249095.0,3199.26575,473.105662,179.134141,43.080715,53.7,-22.0,0.387683,-1.0,0.0
1,2,1.0,607545.02,220846.54,254499.22,297700.0,322525.0,1703116.0,3037.7251,441.701914,254.501765,148.850744,161.2625,0.0,0.436239,-1.0,0.0
2,0,2.0,476127.75,223198.92,390858.715789,626583.631579,1650994.0,3367763.0,2380.63875,446.406768,390.862624,313.293382,825.497105,-7.865385,0.702889,1.0,1.0
3,2,1.0,279761.0,18467.0,19000.0,26600.0,50000.0,393828.0,1398.805,36.934739,19.00019,13.300067,25.0,-21.0,0.334514,-1.0,0.0
4,1,2.0,570205.71,58831.0,63351.0,55950.0,203000.0,951337.7,2851.02855,117.664353,63.351634,27.97514,101.5,-20.857143,0.34917,1.0,0.0


In [1765]:
df4.head()

Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,estimated_num_d3_donors,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name,year,state_full,home_state_contrib,WhiteHouseParty,party_sentiment
0,adler john h,"[adler, runyan]",H8NJ03156,NJ,1,House,1,0,0.0,0.0,...,434.254343,298.401492,221.525,4.333333,new jersey 3rd district,2010,New Jersey,0.620806,r,-1.0
1,adler shelley,"[runyan, adler]",H2NJ03183,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,0.0,...,137.551376,79.328147,103.8,4.333333,new jersey 3rd district,2010,New Jersey,0.422481,r,-1.0
2,altschuler randolph mr.,"[bishop, altschuler]",H0NY01129,NY,1,House,0,[LONG ISLAND VICTORY FUND],1.0,0.0,...,134.721997,147.05907,435.968335,-12.0,new york 1st district,2010,New York,0.621923,r,1.0
3,bachmann michele,"[bachmann, clark]",H6MN06074,MN,1,House,0,[BACHMANN FOR PRESIDENT],1.0,0.0,...,1172.883029,622.590223,855.578,9.0,minnesota 6th district,2010,Minnesota,0.126658,r,1.0
4,barela jon,"[barela, heinrich]",H0NM01163,NM,1,House,0,0,0.0,0.0,...,191.259493,128.957995,210.049995,-2.666667,new mexico 1st district,2010,New Mexico,0.806308,r,1.0


## Unbiasing Candidate Pool

In [1863]:
race_id = []
for index, row in df4.iterrows():
    race_string = row.race_candidates[0] + '|' + row.race_candidates[1]
    race_id = np.append(race_id, race_string)
df4['race_id'] = race_id

In [1864]:
len(df4.drop_duplicates('race_id'))

390

#### There should be 390 x 2 rows in this dataframe. Two candidate rows per race

- Lets do this iteratively. 



In [1865]:
len(last_names)
len(df4)
df4.to_pickle('df4.pkl')

In [1866]:
df4.head()

Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,spread,race_name,year,state_full,home_state_contrib,WhiteHouseParty,party_sentiment,last_name,race_id,candidate_unique_id
0,adler john h,"[adler, runyan]",H8NJ03156,NJ,1,House,1,0,0.0,0.0,...,4.333333,new jersey 3rd district,2010,New Jersey,0.620806,r,-1.0,adler,adler|runyan,adler_NJ
1,adler shelley,"[runyan, adler]",H2NJ03183,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,0.0,...,4.333333,new jersey 3rd district,2010,New Jersey,0.422481,r,-1.0,adler,runyan|adler,adler_NJ
2,altschuler randolph mr.,"[bishop, altschuler]",H0NY01129,NY,1,House,0,[LONG ISLAND VICTORY FUND],1.0,0.0,...,-12.0,new york 1st district,2010,New York,0.621923,r,1.0,altschuler,bishop|altschuler,altschuler_NY
3,bachmann michele,"[bachmann, clark]",H6MN06074,MN,1,House,0,[BACHMANN FOR PRESIDENT],1.0,0.0,...,9.0,minnesota 6th district,2010,Minnesota,0.126658,r,1.0,bachmann,bachmann|clark,bachmann_MN
4,barela jon,"[barela, heinrich]",H0NM01163,NM,1,House,0,0,0.0,0.0,...,-2.666667,new mexico 1st district,2010,New Mexico,0.806308,r,1.0,barela,barela|heinrich,barela_NM


In [1851]:
df4['candidate_unique_id'] = df4['last_name'] + '_' + df4['state']

In [1852]:
df4.head()

Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,spread,race_name,year,state_full,home_state_contrib,WhiteHouseParty,party_sentiment,last_name,race_id,candidate_unique_id
0,adler john h,"[adler, runyan]",H8NJ03156,NJ,1,House,1,0,0.0,0.0,...,4.333333,new jersey 3rd district,2010,New Jersey,0.620806,r,-1.0,adler,adler|runyan,adler_NJ
1,adler shelley,"[runyan, adler]",H2NJ03183,NJ,1,House,1,[SHELLEY ADLER FOR CONGRESS],1.0,0.0,...,4.333333,new jersey 3rd district,2010,New Jersey,0.422481,r,-1.0,adler,runyan|adler,adler_NJ
2,altschuler randolph mr.,"[bishop, altschuler]",H0NY01129,NY,1,House,0,[LONG ISLAND VICTORY FUND],1.0,0.0,...,-12.0,new york 1st district,2010,New York,0.621923,r,1.0,altschuler,bishop|altschuler,altschuler_NY
3,bachmann michele,"[bachmann, clark]",H6MN06074,MN,1,House,0,[BACHMANN FOR PRESIDENT],1.0,0.0,...,9.0,minnesota 6th district,2010,Minnesota,0.126658,r,1.0,bachmann,bachmann|clark,bachmann_MN
4,barela jon,"[barela, heinrich]",H0NM01163,NM,1,House,0,0,0.0,0.0,...,-2.666667,new mexico 1st district,2010,New Mexico,0.806308,r,1.0,barela,barela|heinrich,barela_NM


In [1867]:
uncontained_names = []
for index, row in df4.iterrows():
    for i in np.arange(2):
        cand_name = row[0]
        name = row.race_candidates[i]
        year = row.year
        race= row.race_candidates
        state = row.state
        office = row.office_full
        
        contains = df4[(df4.last_name.str.contains(name))]
#         (contains.race_candidates == race) & (contains.year == year) & (contains.office_full == office)
        contains = contains[(contains.state == state) ]
#         string_race = row.race_candidates[0] + '|' + row.race_candidates[1]
#         contains = df4[(df4.last_name.str.contains(name)) & (df4.race_candidates.str.contains(string_race))]

#         if len(contains) > 1:
#             print(name)

        if len(contains) == 0:
            element = []
            element.append(name)
            element.append(year)
            element.append(state)
            element.append(race)
            if df4[df4.name == cand_name].election_result.values[0] == 1:
                element.append(0)
            else:
                element.append(1)
            uncontained_names.append(element)
#             uncontained_names = np.append(uncontained_names, name)
    

In [1868]:
print(len(uncontained_names))
uncontained_names

25


[['kanjorski', '2010', 'PA', ['barletta', 'kanjorski'], 0],
 ['etheridge', '2010', 'NC', ['ellmers', 'etheridge'], 0],
 ['boucher', '2010', 'VA', ['boucher', 'griffith'], 0],
 ['bass', '2010', 'NH', ['bass', 'kuster'], 1],
 ['rahall', '2010', 'WV', ['rahall', 'maynard'], 1],
 ['fitzpatrick', '2010', 'PA', ['fitzpatrick', 'murphy'], 1],
 ['fitzpatrick', '2010', 'PA', ['fitzpatrick', 'murphy'], 1],
 ['dingell', '2010', 'MI', ['dingell', 'steele'], 1],
 ['reid', '2010', 'NV', ['reid', 'angle'], 1],
 ['leahy', '2010', 'VT', ['leahy', 'britton'], 1],
 ['inouye', '2010', 'HI', ['inouye', 'cavasso'], 1],
 ['grassley', '2010', 'IA', ['grassley', 'conlin'], 1],
 ['reid', '2010', 'NV', ['lowden', 'reid'], 1],
 ['specter', '2010', 'PA', ['toomey', 'specter'], 0],
 ['young', '2014', 'AK', ['young', 'dunbar'], 1],
 ['rangel', '2014', 'NY', ['rangel', 'espaillat'], 1],
 ['young', '2014', 'AK', ['young', 'moore'], 1],
 ['cochran', '2014', 'MS', ['cochran', 'childers'], 1],
 ['wheby', '2014', 'OR', ['

In [1873]:
uncontained_df = pd.DataFrame(uncontained_names)
uncontained_df.to_pickle('uncontained_names.pkl')


In [1821]:
df4[df4.last_name.str.contains('altschuler')]

Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name,year,state_full,home_state_contrib,WhiteHouseParty,party_sentiment,last_name
2,altschuler randolph mr.,"[bishop, altschuler]",H0NY01129,NY,1,House,0,[LONG ISLAND VICTORY FUND],1.0,0.0,...,147.05907,435.968335,-12.0,new york 1st district,2010,New York,0.621923,r,1.0,altschuler


In [1620]:
contains = df4[df4.last_name.str.contains('runyan')]
len(df4[df4.last_name == 'barr'])

# df4[df4.name.str.contains('kanjorski')]

1

### Get Candidate IDs Matched on Last Name, Year, & Office

In [1625]:
df_candidate_id = pd.read_pickle(Path('.')/'..'/'candidate_ids.pkl')
df_candidate_id.head()

Unnamed: 0,name,candidate_id,state,incumbent_challenge_full,party_full,office_full,cycles,election_years,last_file_date,load_date
0,"AALDERS, TIMOTHY NOEL",S2UT00229,UT,Open seat,CONSTITUTION PARTY,Senate,"[2012, 2014, 2016, 2018]","[2012, 2018]",2018-04-23,2018-08-01T00:13:22
1,"AANESTAD, SAMUEL",H2CA01110,CA,Challenger,REPUBLICAN PARTY,House,"[2012, 2014, 2016]",[2012],2012-02-22,2013-04-26T09:04:30
2,"AARESTAD, DAVID",H8CO06237,CO,Challenger,DEMOCRATIC PARTY,House,[2018],[2018],2017-04-26,2017-08-01T20:57:28
3,"AARON, LAURA DAVIS",P80002926,US,Open seat,DEMOCRATIC PARTY,President,"[2006, 2008, 2010, 2012, 2014, 2016]",[2008],2007-03-13,2016-11-17T06:10:48
4,"ABAIR, PETER JON",H0MA01024,MA,Challenger,REPUBLICAN PARTY,House,"[2000, 2002, 2004]",[2000],2000-02-02,2002-04-12T00:00:00


In [1626]:
last_name_array = []
for index, row in df_candidate_id.iterrows():
    last_name = row[0].split()[0][:-1].lower()
    last_name_array = np.append(last_name_array, last_name)

df_candidate_id['last_name'] = last_name_array
df_candidate_id.head()

Unnamed: 0,name,candidate_id,state,incumbent_challenge_full,party_full,office_full,cycles,election_years,last_file_date,load_date,last_name
0,"AALDERS, TIMOTHY NOEL",S2UT00229,UT,Open seat,CONSTITUTION PARTY,Senate,"[2012, 2014, 2016, 2018]","[2012, 2018]",2018-04-23,2018-08-01T00:13:22,aalders
1,"AANESTAD, SAMUEL",H2CA01110,CA,Challenger,REPUBLICAN PARTY,House,"[2012, 2014, 2016]",[2012],2012-02-22,2013-04-26T09:04:30,aanestad
2,"AARESTAD, DAVID",H8CO06237,CO,Challenger,DEMOCRATIC PARTY,House,[2018],[2018],2017-04-26,2017-08-01T20:57:28,aarestad
3,"AARON, LAURA DAVIS",P80002926,US,Open seat,DEMOCRATIC PARTY,President,"[2006, 2008, 2010, 2012, 2014, 2016]",[2008],2007-03-13,2016-11-17T06:10:48,aaron
4,"ABAIR, PETER JON",H0MA01024,MA,Challenger,REPUBLICAN PARTY,House,"[2000, 2002, 2004]",[2000],2000-02-02,2002-04-12T00:00:00,abair


In [1644]:
df_temp = df_candidate_id.copy()
df_temp = df_temp.drop(columns = ['election_years'])

# Create the DataFrame with all Nans
cols = df_temp.columns
df_cand = pd.DataFrame(columns=cols)
# df_cand = pd.DataFrame(np.full((100, len(cols)), np.nan), columns=cols)
# df_cand['cycles'] = df_cand['cycles'].astype(list)
i = 0

for uncontained_name in uncontained_names:
    cand_contains = df_temp[(df_temp.last_name.str.contains(uncontained_name[0])) 
                            & (uncontained_name[1] in df_temp.cycles)]
    if len(cand_contains) != 0:
        df_cand = pd.concat([df_cand, cand_contains])
     

                name candidate_id state incumbent_challenge_full  \
5021  ETHERIDGE, BOB    H6NC02080    NC                     None   

            party_full office_full  \
5021  DEMOCRATIC PARTY       House   

                                                 cycles last_file_date  \
5021  [1996, 1998, 2000, 2002, 2004, 2006, 2008, 201...     2010-11-19   

                load_date  last_name  
5021  2011-02-24T10:53:11  etheridge  
                      name candidate_id state incumbent_challenge_full  \
1669  BOUCHER, FREDERICK C    H2VA09010    VA                     None   

            party_full office_full  \
1669  DEMOCRATIC PARTY       House   

                                                 cycles last_file_date  \
1669  [1982, 1984, 1986, 1988, 1990, 1992, 1994, 199...     2009-11-23   

                load_date last_name  
1669  2011-02-24T10:53:11   boucher  
                           name candidate_id state incumbent_challenge_full  \
959     BASSA, REGINALD LEON 

                 name candidate_id state incumbent_challenge_full  \
15835  SPECTER, ARLEN    P60003233    US               Challenger   

             party_full office_full  \
15835  REPUBLICAN PARTY   President   

                                                 cycles last_file_date  \
15835  [1996, 1998, 2000, 2002, 2004, 2006, 2008, 2010]     1995-01-20   

                 load_date last_name  
15835  2002-04-12T00:00:00   specter  
                  name candidate_id state incumbent_challenge_full  \
13634  RANGEL, JORGE C    H2TX27018    TX               Challenger   
13635  RANGEL, RICARDO    H6FL09153    FL                Open seat   

             party_full office_full  \
13634  DEMOCRATIC PARTY       House   
13635  DEMOCRATIC PARTY       House   

                                           cycles last_file_date  \
13634  [1982, 1984, 1986, 1988, 1990, 1992, 1994]     1981-11-02   
13635                                      [2016]     2015-07-01   

                 load

In [1645]:
df_cand

Unnamed: 0,name,candidate_id,state,incumbent_challenge_full,party_full,office_full,cycles,last_file_date,load_date,last_name
5021,"ETHERIDGE, BOB",H6NC02080,NC,,DEMOCRATIC PARTY,House,"[1996, 1998, 2000, 2002, 2004, 2006, 2008, 201...",2010-11-19,2011-02-24T10:53:11,etheridge
1669,"BOUCHER, FREDERICK C",H2VA09010,VA,,DEMOCRATIC PARTY,House,"[1982, 1984, 1986, 1988, 1990, 1992, 1994, 199...",2009-11-23,2011-02-24T10:53:11,boucher
959,"BASSA, REGINALD LEON JR",H4MO05101,MO,Challenger,DEMOCRATIC PARTY,House,"[1994, 1996]",1994-04-13,2002-04-02T00:00:00,bassa
960,"BASSETT, JAMES P",H4NH02068,NH,Challenger,REPUBLICAN PARTY,House,"[1994, 1996, 1998, 2000]",1994-03-09,2002-04-07T00:00:00,bassett
961,"BASSETT, STEPHEN GERALD",H2MD08134,MD,Challenger,INDEPENDENT,House,"[2002, 2004]",2002-03-11,2003-07-02T00:00:00,bassett
962,"BASSILIAN, RON",H8CA37293,CA,Challenger,REPUBLICAN PARTY,House,[2018],2017-01-31,2018-10-16T21:05:48,bassilian
963,"BASS, KAREN",H0CA33117,CA,Incumbent,DEMOCRATIC PARTY,House,"[2010, 2012, 2014, 2016, 2018]",2017-03-06,2018-04-12T21:15:49,bass
964,"BASS, TOM",H2TX25020,TX,Challenger,DEMOCRATIC PARTY,House,"[1982, 1984]",1981-10-05,2002-03-30T00:00:00,bass
965,"BASS, WILLIAM HENRY MR. JR",H8NY24116,NY,Challenger,NO PARTY AFFILIATION,House,[2018],2018-04-13,2018-04-27T00:10:45,bass
5379,"FITZPATRICK, BRIAN",H6PA08277,PA,Incumbent,REPUBLICAN PARTY,House,"[2016, 2018]",2018-03-02,2018-03-02T21:07:24,fitzpatrick


In [1767]:
#Using Lists, Approach that I went with. Dictionary removed duplicate last names
all_last_names = []
last_name_array = []
for index, row in df4.iterrows():
    last_name = row[0].split()[0]
    last_name_array = np.append(last_name_array, last_name)
    for i in np.arange(2):
        element = []
        element.append(row.race_candidates[i])
        element.append(row.year)
        element.append(row.office_full)
        element.append(row.race_candidates)
        element.append(row.state)
        all_last_names.append(element)
    
df4['last_name'] = last_name_array
len(all_last_names)


1220

In [1768]:
last_names = []
for row in all_last_names:
    last_names = np.append(last_names, row[0])
len(last_names)

1220

In [1769]:
#narrow down to only rows that match with the last name.
cand_ids = df_candidate_id[df_candidate_id.last_name.isin(last_names)]
print(len(cand_ids.name))
cand_ids

3441


Unnamed: 0,name,candidate_id,state,incumbent_challenge_full,party_full,office_full,cycles,election_years,last_file_date,load_date,last_name
19,"ABELER, JAMES J",S4MN00353,MN,Challenger,REPUBLICAN PARTY,Senate,"[2014, 2016]",[2014],2013-07-08,2016-11-17T06:10:49,abeler
66,"ADAMS, ALMA SHEALEY",H4NC12100,NC,Incumbent,DEMOCRATIC PARTY,House,"[2014, 2016, 2018]","[2014, 2016, 2018]",2017-01-23,2018-04-09T21:07:45,adams
67,"ADAMS, ANN MARIE DR",S8CT00105,CT,Challenger,DEMOCRATIC PARTY,Senate,[2018],[2018],2017-04-24,2018-02-17T09:16:20,adams
68,"ADAMS, ANTHONY T",H2CA08180,CA,Challenger,NO PARTY AFFILIATION,House,"[2012, 2014]",[2012],2012-02-22,2014-04-01T09:54:32,adams
69,"ADAMS, BILL",H6IL17098,IL,Challenger,REPUBLICAN PARTY,House,[1996],[1996],1995-11-29,2002-04-02T00:00:00,adams
70,"ADAMS, BOYCE",H6MS01156,MS,Open seat,REPUBLICAN PARTY,House,"[2016, 2018]",[2015],2015-03-14,2017-02-23T14:47:46,adams
71,"ADAMS, BRENT THOMAS",H8NV02012,NV,Challenger,DEMOCRATIC PARTY,House,[1988],[1988],1987-07-23,2005-05-26T00:00:00,adams
72,"ADAMS, CHARLES H",H0AL03077,AL,Challenger,DEMOCRATIC PARTY,House,"[1990, 1992, 1994, 1996]",[1990],1989-01-24,2002-04-02T00:00:00,adams
73,"ADAMS, DANIELLE",H4NC06078,NC,Open seat,DEMOCRATIC PARTY,House,"[2014, 2016]",[2014],2013-07-12,2015-04-02T15:52:44,adams
74,"ADAMS, DENISE DARCEL",H8NC05063,NC,Challenger,DEMOCRATIC PARTY,House,[2018],[2018],2017-03-29,2018-04-09T21:07:45,adams


In [1771]:
def select_loser_names(df):
    df_temp = df.copy()
    df_temp = df_temp.drop(columns = ['election_years'])
    # Create the DataFrame with all Nans
    cols = df_temp.columns
    df_cand = pd.DataFrame(np.full((5000, len(cols)), np.nan), columns=cols)
    df_cand['cycles'] = df_cand['cycles'].astype(list)
    
    i = 0
    j= 0
    
    for index, row in df_temp.iterrows():
        for person in all_last_names:
            last_name = person[0]
            year = person[1]
            office = person[2]
            state = person[4]
            if (last_name == row.last_name) & (office == row.office_full) & (int(year) in row.cycles) & (state == row.state):
                print('yes')
                df_cand.iloc[j, :] = df_temp.iloc[i, :]
                df_cand.iloc[j, 6] = year
                df_cand.iloc[j, 8] = person[3]
                j += 1
                
        i += 1 
        print(i)
    
    return df_cand.dropna()

In [1772]:
selected_losers = select_loser_names(cand_ids)
selected_losers
# for person in all_last_names:
#     print(person)
#     print(all_last_names[person])



yes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
yes
yes
33
34
35
yes
36
37
38
39
40
yes
yes
yes
41
42
43
44
yes
45
46
47
48
49
yes
yes
50
51
52
53
54
yes
yes
yes
yes
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
yes
yes
90
91
92
93
94
95
96
97
yes
98
99
yes
yes
100
yes
yes
yes
101
102
yes
103
104
105
yes
yes
106
107
yes
108
yes
yes
yes
yes
yes
109
yes
yes
yes
110
yes
yes
111
112
113
114
115
116
yes
yes
yes
117
118
119
120
121
122
yes
123
124
125
126
127
128
129
130
131
132
133
yes
yes
134
135
yes
yes
136
137
yes
138
139
yes
140
141
142
143
144
145
146
147
148
yes
yes
149
150
151
152
yes
yes
153
154
yes
yes
155
156
157
158
159
160
161
162
yes
yes
163
164
165
166
167
168
169
170
yes
171
172
yes
173
yes
yes
174
175
yes
yes
176
yes
yes
177
yes
yes
yes
yes
yes
178
yes
yes
yes
179
yes
180
181
182
183
184
yes
185
186
187
188
189
yes
190
191
192
193
194
yes
yes
195
yes
196
197
yes
yes
198
199


1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
yes
yes
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
yes
yes
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
yes
1557
1558
1559
1560
1561
1562
1563
yes
yes
yes
1564
yes
1565
yes
1566
yes
yes
1567
1568
yes
1569
1570
1571
yes
yes
1572
1573
1574
1575
1576
1577
yes
yes
1578
yes
yes
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
yes
yes
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
yes
1632
1633
yes
yes
1634
1635
yes
yes
1636
1637
yes
yes
1638
1639
1640
yes
yes
yes
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1

2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
yes
yes
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
yes
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
yes
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
yes
2873
2874
2875
2876
2877
yes
yes
2878
2879
2880
2881
yes
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
yes
yes
2909
2910
2911
2912
2913
2914
2915
yes
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
yes
yes
2944
2945
2946
2947
2948
2949
2950
yes
2951
2952
2953
2954
2955
2956
yes
yes
2957
yes
yes
2958
yes
2959
2960
yes
yes
2961
2962
2963
2964
2965
2966
2967
2968
yes
yes
2969
2970
2971
2972
2973
2974
2975
2976
2977
297

Unnamed: 0,name,candidate_id,state,incumbent_challenge_full,party_full,office_full,cycles,last_file_date,load_date,last_name
0,"ABELER, JAMES J",S4MN00353,MN,Challenger,REPUBLICAN PARTY,Senate,2014,2013-07-08,"[franken, abeler]",abeler
1,"ADAMS, TERRY GLEN JR",S4TN00328,TN,Challenger,DEMOCRATIC PARTY,Senate,2014,2013-10-18,"[alexander, adams]",adams
2,"ADAMS, TERRY GLEN JR",S4TN00328,TN,Challenger,DEMOCRATIC PARTY,Senate,2014,2013-10-18,"[alexander, adams]",adams
3,"ADDIVINOLA, FRANK J JR",S4MA00242,MA,Challenger,REPUBLICAN PARTY,Senate,2014,2014-03-19,"[markey, addivinola]",addivinola
4,"ADLER, JOHN H",H8NJ03156,NJ,Challenger,DEMOCRATIC PARTY,House,2010,2008-12-10,"[adler, runyan]",adler
5,"ADLER, JOHN H",H8NJ03156,NJ,Challenger,DEMOCRATIC PARTY,House,2010,2008-12-10,"[runyan, adler]",adler
6,"ADLER, JOHN H",H8NJ03156,NJ,Challenger,DEMOCRATIC PARTY,House,2010,2008-12-10,"[adler, runyan]",adler
7,"AIKEN, CLAYTON CLAY",H4NC02127,NC,Challenger,DEMOCRATIC PARTY,House,2014,2014-01-31,"[ellmers, aiken]",aiken
8,"ALAMEEL, DAVID M",S4TX00516,TX,Challenger,DEMOCRATIC PARTY,Senate,2014,2015-10-15,"[cornyn, alameel]",alameel
9,"ALAMEEL, DAVID M",S4TX00516,TX,Challenger,DEMOCRATIC PARTY,Senate,2014,2015-10-15,"[cornyn, alameel]",alameel


In [1777]:
print(len(selected_losers))
selected_losers_unique = selected_losers.drop_duplicates('name')
len(selected_losers_unique)

998


530

In [1778]:
selected_losers_unique.rename(columns={'load_date': 'race_candidates'})

Unnamed: 0,name,candidate_id,state,incumbent_challenge_full,party_full,office_full,cycles,last_file_date,race_candidates,last_name
0,"ABELER, JAMES J",S4MN00353,MN,Challenger,REPUBLICAN PARTY,Senate,2014,2013-07-08,"[franken, abeler]",abeler
1,"ADAMS, TERRY GLEN JR",S4TN00328,TN,Challenger,DEMOCRATIC PARTY,Senate,2014,2013-10-18,"[alexander, adams]",adams
3,"ADDIVINOLA, FRANK J JR",S4MA00242,MA,Challenger,REPUBLICAN PARTY,Senate,2014,2014-03-19,"[markey, addivinola]",addivinola
4,"ADLER, JOHN H",H8NJ03156,NJ,Challenger,DEMOCRATIC PARTY,House,2010,2008-12-10,"[adler, runyan]",adler
7,"AIKEN, CLAYTON CLAY",H4NC02127,NC,Challenger,DEMOCRATIC PARTY,House,2014,2014-01-31,"[ellmers, aiken]",aiken
8,"ALAMEEL, DAVID M",S4TX00516,TX,Challenger,DEMOCRATIC PARTY,Senate,2014,2015-10-15,"[cornyn, alameel]",alameel
10,"ALEXANDER, LAMAR",S2TN00058,TN,Incumbent,REPUBLICAN PARTY,Senate,2014,2014-12-16,"[alexander, adams]",alexander
14,"ALLEN, RICHARD W",H2GA12121,GA,Incumbent,REPUBLICAN PARTY,House,2014,2017-04-01,"[allen, barrow]",allen
16,"ALTMIRE, JASON",H6PA04110,PA,Challenger,DEMOCRATIC PARTY,House,2012,2010-12-13,"[altmire, critz]",altmire
17,"ALTSCHULER, RANDOLPH MR.",H0NY01129,NY,Challenger,REPUBLICAN PARTY,House,2010,2012-09-26,"[bishop, altschuler]",altschuler


In [1779]:
df4[df4.last_name == 'markey']

Unnamed: 0,name,race_candidates,cand_id,state,incumbent,office_full,party,committee_name,amnt_committees,Armed Forces Americas,...,estimated_num_d4_donors,estimated_num_d5_donors,spread,race_name,year,state_full,home_state_contrib,WhiteHouseParty,party_sentiment,last_name
86,markey betsy,"[gardner, markey]",H8CO04067,CO,1,House,1,0,0.0,0.0,...,281.885064,199.368335,-3.0,colorado 4th district,2010,Colorado,0.4513,r,-1.0,markey
539,markey edward john mr,"[markey, herr]",S4MA00028,MA,0,Senate,1,"[MARKEY SENATE VICTORY, MARKEY GRASSROOTS VICT...",2.0,0.0,...,2200.735414,5304.32987,-20.125,massachusetts senate,2011,Massachusetts,0.381065,r,-1.0,markey


In [1541]:
len(all_last_names)

566

#### Approach That Did Not Work Properly, Need to match on more than just the names. 

In [1859]:

included_names = df4.last_name
included_names = np.array(included_names)

names_to_collect = []
for name in all_last_names:
    if name not in included_names:
        names_to_collect = np.append(names_to_collect, name)
print(len(names_to_collect))
names_to_collect

  import sys


ValueError: setting an array element with a sequence

In [1364]:
len(included_names)
#len(all_last_names)

610

In [1336]:

included_names = df4.last_name
included_names = np.array(included_names)
names_to_collect = []
for name in all_last_names:
    contains = False
    for name_in in included_names:
        if name == name_in:
            contains = True
    if contains == False:
        names_to_collect = np.append(names_to_collect, name)
print(len(names_to_collect))
names_to_collect

23


array(['kanjorski', 'etheridge', 'boucher', 'bass', 'rahall',
       'fitzpatrick', 'fitzpatrick', 'dingell', 'reid', 'leahy', 'inouye',
       'grassley', 'reid', 'specter', 'rangel', 'cochran', 'wheby',
       'grisham', 'clayton', 'hatch', 'grassley', 'leahy', 'cochran'],
      dtype='<U32')

In [1338]:
all_last_names

array(['adler', 'runyan', 'runyan', ..., 'ward', 'thune', 'williams'],
      dtype='<U32')

In [None]:
#Using Lists, Approach that I went with. Dictionary removed duplicate last names
all_last_names = []
last_name_array = []
for index, row in df4.iterrows():
    last_name = row[0].split()[0]
    last_name_array = np.append(last_name_array, last_name)
    for i in np.arange(2):
        element = []
        element.append(row.race_candidates[i])
        element.append(row.year)
        element.append(row.office_full)
        element.append(row.race_candidates)
        all_last_names.append(element)
    
df4['last_name'] = last_name_array
len(all_last_names)
