In [1]:
import numpy as np
import pandas as pd
from src.modules import *
import os 

In [2]:
# Why would we be using a third dataset to group our election data by party?
# Rather than attempting to account for every possible abbreviation and spelling of US political parties across all 50 States,
# we use a single database (the FEC results) and join it on the candidate names, which are less likely to vary between the State and Federal data.

# Including the FEC data will also allow the incumbency status to be included, giving another dimension to examine the impact of the IRS data on turnout

In [3]:
# Folder holding the raw FEC workbooks (one per election year)
fec_raw_folder_path = 'data/FEC/raw'

# os.listdir returns entries in arbitrary order, so sort them to get a
# reproducible, chronological ordering for every per-year loop below.
# Only Excel workbooks are kept so stray files (e.g. .DS_Store) or Excel
# lock files ('~$...') cannot break the pd.ExcelFile / pd.read_excel calls.
fec_raw_files = sorted(
    entry
    for entry in os.listdir(fec_raw_folder_path)
    if os.path.isfile(os.path.join(fec_raw_folder_path, entry))
    and entry.lower().endswith(('.xls', '.xlsx'))
    and not entry.startswith('~$')
)

In [4]:
fec_raw_files

['FEC_2012.xls',
 'FEC_2014.xls',
 'FEC_2016.xlsx',
 'FEC_2018.xlsx',
 'FEC_2020.xlsx']

In [5]:

# List the sheet names found in each FEC workbook
for workbook_name in fec_raw_files:
    # The context manager closes the workbook handle once the names are read,
    # instead of leaving one open file per iteration.
    with pd.ExcelFile(f'{fec_raw_folder_path}/{workbook_name}') as workbook:
        print(workbook_name)
        print(workbook.sheet_names)

FEC_2012.xls
['Publication Information', 'Table 5. P&G VotesCastforCong', 'Table 6. Senate by Party', 'Table 7. House by Party', '2012 US House Results by State', 'CT, UT, OR Party Conventions', '2012 Party Labels', '2012 Primary Dates']
FEC_2014.xls
['Publication Information', 'Table 1. GE Votes Cast', 'Table 2. GE Votes Cast by Party', 'Table 3. P&G VotesCastforCong', 'Table 4. Senate by Party', 'Table 5. House by Party', '2014 US Senate Results by State', '2014 US House Results by State', 'CT, IA, UT Party Conventions', '2014 Party Labels', '2014 Primary Dates']
FEC_2016.xlsx
['Publication Information', 'Table 1. 2016 Pres Popular Vote', 'Table 2. Electoral &  Pop Vote', 'Table 3. GEVotes for Pres, H, S', 'Table 4. GE Votes Cast by Party', 'Table 5. P&G VotesCastforCong', 'Table 6. Senate by Party', 'Table 7. House by Party', '2016 Pres General Results', '2016 Pres Primary Results', '2016 Pres Primary Party Summary', '2016 US Senate Results by State', '2016 US House Results by State

In [6]:
# It looks like the table for the house data changes the sheet name per year, but in a predictable way. 

In [7]:
# Load the "<year> US House Results by State" sheet from every workbook and
# store it alongside the file stem (e.g. 'FEC_2012').
fec_data = []
for workbook_name in fec_raw_files:
    # Characters 6-7 of 'FEC_2012.xls' are the two-digit year ('12')
    two_digit_year = workbook_name[6:8]
    house_sheet = f"20{two_digit_year} US House Results by State"
    house_results = pd.read_excel(
        f"{fec_raw_folder_path}/{workbook_name}",
        sheet_name=house_sheet,
        header=0,
    )
    file_stem = workbook_name.split('.')[0]
    fec_data.append((file_stem, house_results))

In [8]:
# Show the column labels present in each year's House results sheet
for file_stem, house_results in fec_data:
    print(file_stem)
    print(house_results.columns)

FEC_2012
Index([                                      1,
                          'STATE ABBREVIATION',
                                       'STATE',
                                           'D',
                                     'FEC ID#',
                                         '(I)',
                      'CANDIDATE NAME (First)',
                       'CANDIDATE NAME (Last)',
                              'CANDIDATE NAME',
                                 'TOTAL VOTES',
                                       'PARTY',
                               'PRIMARY VOTES',
                                   'PRIMARY %',
                                'RUNOFF VOTES',
                                    'RUNOFF %',
                              'GENERAL VOTES ',
                                   'GENERAL %',
               'GE RUNOFF ELECTION VOTES (LA)',
                   'GE RUNOFF ELECTION % (LA)',
       'COMBINED GE PARTY TOTALS (CT, NY, SC)',
                     'COMBINED 

In [9]:
# Let's do a  cursory check of the party ids present in the FEC data

In [10]:
# Print the distinct PARTY labels found in each year's sheet
for _, house_results in fec_data:
    party_labels = house_results['PARTY'].unique()
    print(party_labels)

[nan 'R' 'W' 'D' 'LIB' 'NAF' 'CON' 'IND' 'NPA' 'W(LIB)/LIB' 'W(D)'
 'W(AE)/AE' 'AE' 'W(GRE)/GRE' 'GRE' 'W(R)' 'NOP' 'PAF' 'W(R)/R' 'W(PAF)'
 'W(LIB)' 'AIP' 'UN' 'AMC' 'D*' 'WF' 'R*' 'PC' 'IP' 'IDE' 'DCG' 'W(DCG)'
 'W(IND)' 'N' 'D/W' 'NP' 'SWP' 'IFM' 'NON' 'IFL' 'W(GR)' 'GR' 'W(D)/D'
 'UST' 'W(R)/W' 'NLP' 'DFL' 'IDP' 'GRT' 'MOP' 'REF' 'N(R)' 'N(D)' 'IAP'
 'CC' 'TPA' 'CCC' 'LMP' 'BFJ' 'NS' 'NOT' 'NSF' 'OAI' 'IR' 'PAC' 'RAP'
 'UIS' 'AA' 'CHA' 'OCG' 'TVH' 'JSI' 'TIC' 'AF' 'RFI' 'SUS' 'RN'
 'D/WF Combined Parties' 'R/CRV/IDP Combined Parties' 'CRV'
 'R/CRV/IDP/TRP Combined Parties' 'R/TRP' 'D/WF/IDP Combined Parties'
 'R/CRV/TRP Combined Parties' 'D/IDP/WF Combined Parties'
 'R/TRP Combined Parties' 'TRP' 'R/CRV/LIB Combined Parties' 'CRV/LIB'
 'R/CRV Combined Parties' 'WTP' 'R/IDP Combined Parties' 'CSP' 'R  ' 'D  '
 'DNL' 'W(DNL)' 'W(CON)' 'W(GRE)' 'D/WF' 'LIB/PG/PRO' 'PG/PRO' 'D/PRO/WF'
 'R/CON' 'PG' 'BFC' 'NPP' 'PPD' 'PRI' 'PPT' 'MUS' 'PPR' 'UJP' 'LU' 'VKS'
 'W(PRO)' 'PRO' 'D/IND' 'USM'

In [11]:
# There are three cases where the details of party ids will be slightly more complicated
## Write-in candidates that win office
### This has happened only a handful of times, but we should always be concerned with potential outliers!

## States where candidates can be nominated by multiple parties (referred to as 'Electoral Fusion' or 'Fusion voting')
### Connecticut and New York both fall under this category

## States where the main national parties run under a different name
### Minnesota has the DFL (Democratic–Farmer–Labor Party), an affiliate of the national Democratic party with four house members as of 2024

In [12]:
# Print the distinct PARTY labels used in Minnesota for each year
for _, house_results in fec_data:
    is_minnesota = house_results['STATE ABBREVIATION'] == 'MN'
    print(house_results.loc[is_minnesota, 'PARTY'].unique())

[nan 'DFL' 'R' 'W' 'IDP' 'GRT' 'MOP']
['DFL' 'R' 'W' nan 'IDP' 'GRE']
[nan 'DFL' 'R' 'W' 'IDP' 'LMN']
['R' 'DFL' 'W' nan 'LMN' 'IDP']
['R' 'DFL' 'GLC' 'W' nan 'LMN']


In [13]:
# Since our first level of analysis will be primarily concerned with incumbency rather than party ID, we can simplify this a bit more hastily than we would prefer, but future analysis should endeavor to complete this transformation to a two-party map more rigorously

In [14]:
def two_party(data):
    """Collapse a record's party label to 'D', 'R' or 'OTHER'.

    Parameters
    ----------
    data : mapping or pandas.Series
        A single record exposing a 'PARTY' entry, e.g. the row handed over by
        ``DataFrame.apply(two_party, axis=1)``.

    Returns
    -------
    str
        'R' or 'D' for the two major parties, 'D' for the 'DFL' label
        (Minnesota's Democratic affiliate), and 'OTHER' for everything else,
        including missing values and any label that is not exactly one of
        those three after trimming whitespace.

    There is a joke here somewhere about 'this is how the political system
    actually works'.
    """
    party = data['PARTY']
    # The FEC sheets contain padded labels such as 'R  ' and 'D  '; trim them
    # so major-party candidates are not misfiled as 'OTHER'. Non-string values
    # (NaN for missing labels) are left alone and fall through to 'OTHER'.
    if isinstance(party, str):
        party = party.strip()
    if party in ('R', 'D'):
        return party
    if party == 'DFL':
        return "D"
    return "OTHER"
    

In [15]:
# Let's examine the names format

# Notes beforehand:
## Because we will be working on a State-by-State and year-by-year basis, we can be slightly more flexible about how rigorously we ensure that duplicate names for candidates are dealt with cleanly

## Still, using the FEC data as a general check for duplicate names within a particular state and for a particular year is the easiest check, so after formatting we will perform just such a test.

In [16]:
# Show the three name-related columns for each year's sheet
name_columns = ['CANDIDATE NAME (First)', 'CANDIDATE NAME (Last)', 'CANDIDATE NAME']
for _, house_results in fec_data:
    print(house_results[name_columns])

     CANDIDATE NAME (First) CANDIDATE NAME (Last)   CANDIDATE NAME
0                       NaN                   NaN       DISTRICT 1
1                        Jo                Bonner       Bonner, Jo
2                      Dean                 Young      Young, Dean
3                      Pete                 Riehm      Riehm, Pete
4                     Peter              Gounares  Gounares, Peter
...                     ...                   ...              ...
4927                    NaN                   NaN              NaN
4928                   Joel                  Otto       Otto, Joel
4929                    NaN             Scattered        Scattered
4930                    NaN                   NaN              NaN
4931                    NaN                   NaN              NaN

[4932 rows x 3 columns]
     CANDIDATE NAME (First) CANDIDATE NAME (Last)          CANDIDATE NAME
0                   Bradley                 Byrne          Byrne, Bradley
1                 Burto

In [17]:
# A quick glance shows that we have first names, middle names and titles like 'jr.' in the 'CANDIDATE NAME (First)' column, and lastnames, NaNs and 'Scattered' in the 'CANDIDATE NAME (Last)' column

# We can use the 'trim_party' function used on the Ohio data to remove the middle names (see src.modules for the full function) and then concatenate the 'CANDIDATE NAME (First)' and 'CANDIDATE NAME (Last)' to produce a standard name format

# We will also perform several other formatting changes:
## Reduce the number of columns to what is necessary to join with the State election data
## apply the 'two_party' function to reduce the number of parties we will be grouping by
## homogenize the formatting of the column names
## transforming the incumbency marker to a binary

In [18]:
# Column labels that differ between years are mapped onto one short label
new_col_dic = {'(I) Incumbent Indicator':'(I)','District':'D', 'DISTRICT':'D'}

# Columns needed to join with the State election data
keep_cols = ['STATE ABBREVIATION','D','CANDIDATE NAME (First)','CANDIDATE NAME (Last)','CANDIDATE NAME(f)','CANDIDATE NAME','PARTY','(I)','GENERAL VOTES ']

formatted_fec = []
for year, raw_frame in fec_data:
    # Drop all rows that received no votes in the general election or have no
    # first name, then apply the shared column names. Every step returns a new
    # frame, so the raw data held in fec_data is never modified.
    frame = (
        raw_frame
        .dropna(subset=['GENERAL VOTES ','CANDIDATE NAME (First)'])
        .reset_index(drop=True)
        .rename(columns=new_col_dic)
    )

    # Convert incumbency to a binary value: any non-missing marker counts as 1
    frame['(I)'] = frame['(I)'].notna().astype(int)

    # Apply the two_party function to collapse party labels to D / R / OTHER
    frame['PARTY'] = frame.apply(two_party, axis=1)

    # Use trim_party (see modules.py) to remove middle names or titles stored in FEC First Names column
    frame['SIMPLE_FIRST'] = trim_party(frame['CANDIDATE NAME (First)'],' ')

    # Create full name from first and last, lower-cased to allow for easy comparison across datasets
    full_names = (frame['SIMPLE_FIRST']+' '+frame['CANDIDATE NAME (Last)']).astype(str).str.lower()

    # Remove all punctuation from candidate names (keep only letters and whitespace)
    frame['CANDIDATE NAME(f)'] = [
        ''.join(char for char in full_name if char.isalpha() or char.isspace())
        for full_name in full_names
    ]

    # Keep only relevant columns
    formatted_fec.append((year, frame[keep_cols].copy()))

In [19]:
# Confirm that every formatted year exposes the same set of columns
for file_stem, formatted_frame in formatted_fec:
    print(file_stem)
    print(formatted_frame.columns)

FEC_2012
Index(['STATE ABBREVIATION', 'D', 'CANDIDATE NAME (First)',
       'CANDIDATE NAME (Last)', 'CANDIDATE NAME(f)', 'CANDIDATE NAME', 'PARTY',
       '(I)', 'GENERAL VOTES '],
      dtype='object')
FEC_2014
Index(['STATE ABBREVIATION', 'D', 'CANDIDATE NAME (First)',
       'CANDIDATE NAME (Last)', 'CANDIDATE NAME(f)', 'CANDIDATE NAME', 'PARTY',
       '(I)', 'GENERAL VOTES '],
      dtype='object')
FEC_2016
Index(['STATE ABBREVIATION', 'D', 'CANDIDATE NAME (First)',
       'CANDIDATE NAME (Last)', 'CANDIDATE NAME(f)', 'CANDIDATE NAME', 'PARTY',
       '(I)', 'GENERAL VOTES '],
      dtype='object')
FEC_2018
Index(['STATE ABBREVIATION', 'D', 'CANDIDATE NAME (First)',
       'CANDIDATE NAME (Last)', 'CANDIDATE NAME(f)', 'CANDIDATE NAME', 'PARTY',
       '(I)', 'GENERAL VOTES '],
      dtype='object')
FEC_2020
Index(['STATE ABBREVIATION', 'D', 'CANDIDATE NAME (First)',
       'CANDIDATE NAME (Last)', 'CANDIDATE NAME(f)', 'CANDIDATE NAME', 'PARTY',
       '(I)', 'GENERAL VOTES '],
  

In [20]:
# Count and list the state/territory codes present in each formatted year
for _, formatted_frame in formatted_fec:
    state_codes = formatted_frame['STATE ABBREVIATION']
    states = list(state_codes.unique())
    print(state_codes.nunique())
    print(states)

56
['AL', 'AK', 'AS', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA', 'GU', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'MP', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VI', 'VA', 'WA', 'WV', 'WI', 'WY']
55
['AL', 'AK', 'AS', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA', 'GU', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'MP', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VI', 'VA', 'WA', 'WV', 'WI', 'WY']
56
['AL', 'AK', 'AS', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA', 'GU', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'MP', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'VI', 'WA', 'WV'

In [21]:
# Probably better to have too many than too few States..

In [22]:
# Report every state/year in which two rows share the same formatted candidate name
for year, formatted_frame in formatted_fec:
    for state in formatted_frame['STATE ABBREVIATION'].unique():
        in_state = formatted_frame['STATE ABBREVIATION'] == state
        state_names = formatted_frame.loc[in_state, 'CANDIDATE NAME(f)']
        # More rows than distinct names means at least one name is repeated
        if len(state_names) > len(state_names.unique()):
            print(f"{state} in {year} has candidates with duplicate names")


CT in FEC_2012 has candidates with duplicate names
KY in FEC_2012 has candidates with duplicate names
MI in FEC_2012 has candidates with duplicate names
NJ in FEC_2012 has candidates with duplicate names
NY in FEC_2012 has candidates with duplicate names
SC in FEC_2012 has candidates with duplicate names
WA in FEC_2012 has candidates with duplicate names
CT in FEC_2014 has candidates with duplicate names
NJ in FEC_2014 has candidates with duplicate names
NY in FEC_2014 has candidates with duplicate names
NC in FEC_2014 has candidates with duplicate names
SC in FEC_2014 has candidates with duplicate names
VA in FEC_2014 has candidates with duplicate names
CT in FEC_2016 has candidates with duplicate names
HI in FEC_2016 has candidates with duplicate names
KY in FEC_2016 has candidates with duplicate names
NY in FEC_2016 has candidates with duplicate names
PA in FEC_2016 has candidates with duplicate names
RI in FEC_2016 has candidates with duplicate names
SC in FEC_2016 has candidates w

In [None]:

# Since we are processing states individually, we should keep this list in mind and make sure to be diligent when they do come up

In [None]:
# We should also build a workflow now that incorporates the formatting choices we have made here.

# Keeping in mind our principle of non-destructive processes, we will plan to make the minimal number of formatting choices when handling the FEC data at this stage

# This will mean repeating other formatting choices when adding in the State data,
## which will thus be more resource heavy each time we do so than simply performing the alterations in a single step

In [None]:
# Empty list to hold FEC files
fec_files = []

# format FEC data
print('Formatting FEC data')
for year,i in fec_data: # Call item in the file list
    
    # Read each file from the FEC file list
    file = pd.read_csv(fr'{fec_folder_path}{i}', index_col=0)
    
    
    # Generate a name for each dataframe based on the filename without the file extension
    name = f'{i}' 
    name = name[:-4] 
    
    # Assign the dataframe to the variable name
    globals()[name] = file # from the documentation: 'the globals() function is a built-in function that returns a dictionary representing the current global symbol table' only half understand this, but it works (#programming)
    
    # Append both to the empty list
    fec_files.append(file)

In [None]:
# Show the column labels of every frame collected in fec_files
for fec_frame in fec_files:
    print(fec_frame.columns)

In [None]:
# How nice to have data that is consistent over the years examined!

In [None]:
# In terms of the structure, remember that the shape of our final data for analysis will be:
## rows of counties with
### columns for each party
### columns for each IRS datafield

## we will need to format the FEC data so that it can be used to replace the candidate names and allow the data to be grouped by party and incumbency

# A consistent formatting for candidate names must be used. 

In [None]:
# Peek at the combined 'Last, First' name column of the first frame in fec_files
fec_files[0].loc[:, 'CANDIDATE NAME']

In [None]:
######
# Potential pitfalls
######
## Candidates might have the same first and last names as other candidates
## Absolute worst case, candidates with the same name might be running in the same race

## Checks should be instituted to ensure this is not the case

# Mitigating this issue: The data to be examined will be broken up and processed with the FEC data by year and by state
# This siloing will help limit the chances of duplicate names for different candidates becoming an issue before we reduce the dimension of the data to include only incumbency and party affiliation


In [None]:
# At this time, we can also consider another aspect of the data which will need to be handled - the presence of third party candidates
## To reduce the dimensionality of our data, for this initial analysis, all third parties will be labeled as 'Other'
## Future analysis may wish to look at this in more detail

In [None]:
# Print the distinct PARTY labels in every frame collected in fec_files
for fec_frame in fec_files:
    party_labels = fec_frame['PARTY'].unique()
    print(party_labels)

In [None]:
def FEC_simplifier(i, strip_punctuation=False):
    """Reduce a raw FEC House-results frame to the columns needed for joining.

    Parameters
    ----------
    i : pandas.DataFrame
        One year's raw FEC sheet. It must contain 'GENERAL VOTES ',
        'CANDIDATE NAME (First)', 'CANDIDATE NAME (Last)', 'CANDIDATE NAME',
        'STATE ABBREVIATION' and 'PARTY', plus an incumbency column named
        '(I)' or '(I) Incumbent Indicator' and a district column named 'D',
        'District' or 'DISTRICT'. The caller's frame is not modified.
    strip_punctuation : bool, default False
        When True, every character of 'CANDIDATE NAME(f)' that is neither a
        letter nor whitespace is removed, matching the interactive formatting
        loop earlier in this notebook. The default keeps the previous
        behaviour of this function (punctuation retained).

    Returns
    -------
    pandas.DataFrame
        A new frame restricted to the join columns, in which:
        * rows lacking general-election votes or a first name are dropped,
        * '(I)' is 1 where an incumbency marker was present and 0 otherwise,
        * 'PARTY' is collapsed by ``two_party``,
        * 'CANDIDATE NAME(f)' is the lower-cased "<first> <last>" string.

    Raises
    ------
    KeyError
        If a required column is missing from ``i``.
    """
    # Rename columns to simple format across all years
    new_col_dic = {'(I) Incumbent Indicator':'(I)','District':'D', 'DISTRICT':'D'}

    # Current columns needed
    keep_cols = ['STATE ABBREVIATION','D','CANDIDATE NAME (First)','CANDIDATE NAME (Last)','CANDIDATE NAME(f)','CANDIDATE NAME','PARTY','(I)','GENERAL VOTES ']

    # Drop all rows that received no votes in the general election or do not
    # have a first name, then apply the shared column names. Each step returns
    # a new frame, so a single working copy is enough.
    frame = (
        i.dropna(subset=['GENERAL VOTES ','CANDIDATE NAME (First)'])
        .reset_index(drop=True)
        .rename(columns=new_col_dic)
    )

    # Convert incumbency to a binary value: any non-missing marker counts as 1
    frame['(I)'] = frame['(I)'].notna().astype(int)

    # Apply the two_party function to collapse party labels
    frame['PARTY'] = frame.apply(two_party, axis=1)

    # Use trim_party (defined in src.modules, not shown here) to remove middle
    # names or titles stored in the FEC first-name column
    frame['SIMPLE_FIRST'] = trim_party(frame['CANDIDATE NAME (First)'],' ')

    # Create full name from first and last, lower-cased to allow for easy comparison across datasets
    frame['CANDIDATE NAME(f)'] = (
        frame['SIMPLE_FIRST']+' '+frame['CANDIDATE NAME (Last)']
    ).astype(str).str.lower()

    if strip_punctuation:
        # Keep only letters and whitespace
        frame['CANDIDATE NAME(f)'] = [
            ''.join(char for char in full_name if char.isalpha() or char.isspace())
            for full_name in frame['CANDIDATE NAME(f)']
        ]

    # Keep only relevant columns
    return frame[keep_cols].copy()