In [1]:
import numpy as np
import pandas as pd
from modules import *

import os #Used when reading/writing csv files programatically

In [2]:
# fed_folder_path = '../data/FEC/'

# fec_files = [file for file in os.listdir(fed_folder_path) if os.path.isfile(os.path.join(fed_folder_path, file))]

# # Empty list to hold FEC files
# FEC_files = []

# for i in fec_files: # Call item in the file list
    
#     # Read each file from the FEC file list
#     file = pd.read_csv(fr'{fed_folder_path}{i}', index_col=0)
    
    
#     # Generate a name for each dataframe based on the filename without the file extension
#     name = f'{i}' 
#     name = name[:-4] 
    
#     # Assign the dataframe to the variable name
#     globals()[name] = file # from the documentation: 'the globals() function is a built-in function that returns a dictionary representing the current global symbol table' only half understand this, but it works (#programming)
    
#     # Append both to the empty list
#     FEC_files.append(file)

In [11]:
fec_folder_path = '../data/FEC/'

# os.listdir() returns entries in arbitrary order. The FEC frames are later
# paired with the state election frames purely by list position (via zip), so
# the filenames are sorted to give a stable, reproducible order.
fec_files = sorted(
    file for file in os.listdir(fec_folder_path)
    if os.path.isfile(os.path.join(fec_folder_path, file))
)

# Holds one DataFrame per FEC file, in the same order as `fec_files`
FEC_files = []

# Read FEC data
print('Formatting FEC data')
for fec_name in fec_files:
    # Read each file from the FEC file list and add the dataframe to the list
    FEC_files.append(pd.read_csv(os.path.join(fec_folder_path, fec_name)))
print("FEC files formatted")

Formatting FEC data
FEC files formatted


In [10]:
# Show which FEC files were discovered, in the order they will be read
print(fec_files)

['FEC_2012.csv', 'FEC_2014.csv', 'FEC_2016.csv', 'FEC_2018.csv', 'FEC_2020.csv']


In [4]:
# All raw election data is stored in the same folder, ordered by the State abbreviation.
print("Loading WI election data")
elec_folder_path = '../data/raw_elec_totals'

# Sorted so the workbooks are processed in a stable order: os.listdir() makes no
# ordering guarantee, and the frames built from these files are later paired
# with the FEC frames by list position.
WI_files = sorted(
    file for file in os.listdir(elec_folder_path)
    if os.path.isfile(os.path.join(elec_folder_path, file)) and file.startswith('WI')
)

# Collect the data-sheet names found across all WI workbooks (first-seen order,
# no duplicates) rather than keeping only those of the last workbook opened.
WI_sheetnames = []
for workbook_name in WI_files:
    # The context manager closes the workbook once its sheet names have been read
    with pd.ExcelFile(f'{elec_folder_path}/{workbook_name}') as workbook:
        for sheet in workbook.sheet_names:
            if sheet.startswith('Sheet') and sheet not in WI_sheetnames:
                WI_sheetnames.append(sheet)

Loading WI election data


In [5]:
print("Formattting WI election data")

# Create empty lists to hold dataframes/filenames
WI_cong = []
WI_filenames = []

# Expected column names to reformat (the same for every workbook, so defined once)
col_dic = {'Unnamed: 0': 'County'}
drop_col = ['Unnamed: 1', 'Unnamed: 2', 'SCATTERING']

for workbook_name in WI_files:
    # Create name based on the first seven characters of the filename
    name = workbook_name[:7]
    workbook_path = f'{elec_folder_path}/{workbook_name}'

    # Open the workbook once, read every data sheet from that single handle,
    # and let the context manager close it afterwards
    with pd.ExcelFile(workbook_path) as file:
        # Get a list of sheetnames in the WI data file
        WI_sheetnames = [sheet for sheet in file.sheet_names if sheet.startswith('Sheet')]
        sheets = [pd.read_excel(file, sheet_name=sheet, header=5) for sheet in WI_sheetnames]

    if not sheets:
        # Without any data sheet there is nothing to format; fail with a clear message
        raise ValueError(f"No sheet starting with 'Sheet' found in {workbook_path}")

    # Concatenate all sheets in a single call instead of growing a frame sheet by sheet
    data = pd.concat(sheets, ignore_index=True)

    # Drop the columns containing vote totals per county and the empty 2nd column
    data_copy = data.drop(columns=drop_col)
    # Rename the unnamed first column to 'County'
    data_copy = data_copy.rename(columns=col_dic)
    # Remove the 'Total' rows; na=False keeps blank County cells from breaking the
    # boolean mask (groupby below discards rows whose County is missing)
    data_copy = data_copy[~data_copy['County'].str.contains('Total', na=False)].copy()
    # Normalise county names before grouping so spellings that differ only in
    # case or surrounding whitespace are summed into a single row
    data_copy['County'] = data_copy['County'].str.strip().str.lower()
    data_copy = data_copy.groupby('County').sum().reset_index()

    # To begin, candidate names will be rendered in all lowercase to match the FEC data
    cols = [col.lower() for col in data_copy.columns.tolist()]

    # Drop party/write-in labels and remove middle names
    cols = trim_party(cols)
    cols = remove_middle_name(cols)
    # Keep only letters and whitespace in each column label
    cols = [''.join(char for char in col if char.isalpha() or char.isspace()) for col in cols]
    data_copy.columns = cols
    data_copy = data_copy.fillna(0)

    WI_cong.append(data_copy)
    WI_filenames.append(name)

Formattting WI election data


In [12]:
# Create zipped list of formatted State Election data, FEC list of candidates and parties, and the filenames found in raw_elec data
# zip() stops at the shortest input, so a length mismatch would silently drop files; report it.
if not (len(WI_cong) == len(FEC_files) == len(WI_filenames)):
    print(
        f"Warning: list lengths differ (WI_cong={len(WI_cong)}, "
        f"FEC_files={len(FEC_files)}, WI_filenames={len(WI_filenames)}); "
        "unpaired entries will be skipped"
    )

# Materialised as a list: a bare zip object is exhausted after one pass, so a
# cell looping over it would do nothing when run a second time.
zipped_WI_FEC = list(zip(WI_cong, FEC_files, WI_filenames))

In [13]:
# Further process and transform election data, grouping vote totals by party and incumbency
# Allows analysis on these two metrics
# Each tuple unpacks as: i = formatted state election frame, j = FEC frame, k = name taken from the raw filename
# NOTE: the loop variables (and transformed_data) stay bound after the loop ends; a later cell reuses k and transformed_data
for i, j, k in zipped_WI_FEC:
    
    # Joins FEC and State data for each year, produces list of counties as well
    # If an error is generated here, there is likely a mismatch between the counties in these files
    formatted_WI_FEC, counties = state_join_FEC(i,j)
    # state_trans is not defined in this notebook -- presumably it comes from `modules`; TODO confirm what it returns
    transformed_data = state_trans(formatted_WI_FEC, counties)
    
    # Writes the transformed data to a .csv file whose name references the original filename
    # (k[:7] keeps at most the first seven characters of the name)
    transformed_data.to_csv(fr"../data/formatted_house_totals/{k[:7]}.csv", index=False)

In [None]:
def state_join_FEC(data, fec_data, fec_name_col='CANDIDATE NAME(f)'):
    """
    Prepare state election data and join it to the FEC candidate data.

    Parameters
    ----------
    data : pandas.DataFrame
        State election data. The first column holds the county names; every
        other column is one candidate, labelled with the candidate's name.
    fec_data : pandas.DataFrame
        FEC candidate data. Must contain the column named by `fec_name_col`.
    fec_name_col : str, default 'CANDIDATE NAME(f)'
        Column of `fec_data` whose values are matched against the candidate
        column labels of `data`.

    Returns
    -------
    data_t : pandas.DataFrame
        One row per candidate found in both inputs. Its columns are the
        transposed county values (labelled with the row labels of `data`)
        followed by the columns of `fec_data`. The index is reset to 0..n-1.
    counties : list of str
        The county names of `data`, lowercased, in their original row order.

    Notes
    -----
    The merge is an inner join on exact string equality, so a candidate column
    with no matching FEC name is dropped from the result without an error.
    """
    # For state data, the first column is always the county names
    county_col = data.columns[0]

    # County names will be reinserted later for the merger with IRS data
    counties = [county.lower() for county in data[county_col].tolist()]

    # Transpose the dataframe so that our columns are the county vote totals and candidates are rows
    # This is done to aid the transformation and grouping of candidates by party
    data_t = data.drop(columns=county_col).transpose()

    # Merge FEC data, associating each candidate (the transposed index) with the FEC columns
    data_t = pd.merge(
        data_t, fec_data, left_index=True, right_on=fec_name_col
    ).reset_index(drop=True)

    # Return dataframe of candidates and list of counties
    return data_t, counties

In [36]:
WI_formatted_path = '../data/formatted_house_totals'

# List the formatted folder itself: these are the files that are checked and
# read below. Listing the raw election folder would produce names that need
# not exist in the formatted folder. Sorted for a stable order.
WI_files_f = sorted(
    file for file in os.listdir(WI_formatted_path)
    if os.path.isfile(os.path.join(WI_formatted_path, file)) and file.startswith('WI')
)

# One DataFrame per formatted WI file, in the same order as `WI_files_f`
formatted_WI = []

for formatted_name in WI_files_f:
    # Read each formatted WI file and add the dataframe to the list
    formatted_WI.append(pd.read_csv(f'{WI_formatted_path}/{formatted_name}'))

In [37]:
# Display the list of formatted WI DataFrames loaded above
formatted_WI

[         County        D0        D1   OTHER0       R0        R1
 0         adams       0.0    5183.0      0.0   3799.0       0.0
 1       ashland    5051.0       0.0      0.0      0.0    3172.0
 2        barron    9708.0       0.0     17.0      0.0   11621.0
 3      bayfield    5573.0       0.0      0.0      0.0    4026.0
 4         brown   55730.0       0.0      0.0      0.0   67021.0
 5       buffalo       0.0    4290.0      0.0   2252.0       0.0
 6       burnett    3587.0       0.0      0.0      0.0    4526.0
 7       calumet   10245.0       0.0      0.0      0.0   15354.0
 8      chippewa    5249.0   11053.0      0.0   6094.0    6966.0
 9         clark    5702.0       0.0      0.0      0.0    7823.0
 10     columbia   14699.0       0.0      0.0      0.0   13709.0
 11     crawford       0.0    5203.0      0.0   2158.0       0.0
 12         dane  204185.0       0.0      6.0  81979.0       0.0
 13        dodge   15206.0       0.0      0.0      0.0   26924.0
 14         door    8544.

In [17]:
irs_folder_path = '../data/irs_data/'

# Names of the regular files in the IRS data folder (sub-directories are skipped)
irs_files = [
    entry
    for entry in os.listdir(irs_folder_path)
    if os.path.isfile(os.path.join(irs_folder_path, entry))
]

In [18]:
house_data_path = '../data/formatted_house_totals/'

# Names of the regular files in the formatted house-totals folder that start with 'WI'
WI_formatted = [
    entry
    for entry in os.listdir(house_data_path)
    if entry.startswith('WI') and os.path.isfile(os.path.join(house_data_path, entry))
]

In [25]:
# Receives one '1' per house file whose county names do not all match the IRS data
county_check = []

# Receives one (name, IRS dataframe) tuple per house file
WI_FEC_IRS_merge = []

# These two counties are misnamed in the IRS data (loop-invariant, so defined once)
irs_misnamed = {'de witt': 'dewitt', 'jo daviess': 'jodaviess'}

for house_file in WI_formatted:
    name = house_file[:7]
    state = house_file[0:2]  # first two letters of the filename, corresponding to the state abbv.
    year = house_file[5:7]   # two digit year

    # Import house file
    house = pd.read_csv(f'{house_data_path}{house_file}')

    irs = pd.read_csv(f'{irs_folder_path}irs_count_20{year}_f.csv')

    # Get IRS data only for the relevant state (this is done to ensure that states
    # with identical county names do not have their data mixed)
    irs_state = irs.loc[irs['STATE'] == state]

    # Due to the length of some county names and the character limit of the 'COUNTYNAME'
    # field in the original IRS data, some counties have 'County' displayed as 'Count',
    # 'Coun' or 'Co'. Thus, the current solution is to drop the final word instead.
    irs_county = [' '.join(county.split()[:-1]) for county in irs_state['COUNTYNAME'].tolist()]

    # dict.get falls back to the name itself when no correction is listed
    cor_counties = [irs_misnamed.get(county, county) for county in irs_county]

    # reset_index(drop=True) returns a new frame with a fresh 0..n-1 index and no
    # leftover 'index' column, so the assignment below does not touch `irs`
    irs_copy = irs_state.reset_index(drop=True)
    irs_copy['COUNTYNAME'] = cor_counties

    # Check if any IRS county name is absent from the state data; if so, add to the
    # 'county_check' list. The house counties are collected once into a set rather
    # than being rebuilt as a list for every county tested.
    house_counties = set(house['County'])
    unmatched_counties = [county for county in cor_counties if county not in house_counties]
    if unmatched_counties:
        print(f"Counties mismatch in {house_file}: {unmatched_counties}")
        county_check.append('1')
    else:
        # Only reports that the county names line up; no merge is performed in this cell
        print(f"IRS data merged with {house_file}")
    name = f"{name}_irs"

    WI_FEC_IRS_merge.append((name, irs_copy))

IRS data merged with WI_2012.csv
IRS data merged with WI_2014.csv
IRS data merged with WI_2016.csv
IRS data merged with WI_2018.csv
IRS data merged with WI_2020.csv


In [None]:
# NOTE(review): this repeats the final statement of the transformation loop and relies on
# `transformed_data` and `k` still being bound from that loop's last iteration, so it only
# rewrites the CSV of the last file processed and raises NameError if that loop has not run.
transformed_data.to_csv(fr"../data/formatted_house_totals/{k[:7]}.csv", index=False)

In [33]:
# Report the shape of every IRS frame, then print the frame itself
for label, frame in WI_FEC_IRS_merge:
    shape = frame.shape
    print(f"{label} has a shape: {shape}")
    print(frame)

WI_2012_irs has a shape: (72, 74)
   STATE   COUNTYNAME        N1     MARS1     MARS2    MARS4      PREP  \
0     WI        adams    9000.0    4090.0    4070.0    780.0    5440.0   
1     WI      ashland    7460.0    3670.0    2910.0    820.0    3800.0   
2     WI       barron   22520.0   10580.0    9790.0   1990.0   13910.0   
3     WI     bayfield    7780.0    3540.0    3570.0    610.0    3950.0   
4     WI        brown  124780.0   60820.0   49900.0  13290.0   66700.0   
5     WI      buffalo    6630.0    3020.0    3050.0    510.0    5040.0   
6     WI      burnett    7680.0    3460.0    3460.0    700.0    4950.0   
7     WI      calumet   23450.0   10000.0   11650.0   1710.0   13260.0   
8     WI     chippewa   29840.0   13700.0   13360.0   2610.0   17580.0   
9     WI        clark   14760.0    6320.0    7010.0   1360.0    9940.0   
10    WI     columbia   29210.0   13860.0   12580.0   2530.0   17480.0   
11    WI     crawford    7710.0    3720.0    3290.0    610.0    5310.0   
12  

In [31]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line after every report.
for year, i in WI_FEC_IRS_merge:
    print(f"{year} has a shape: {i.shape}")
    i.info()

WI_2012_irs has a shape: (72, 74)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 74 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   STATE       72 non-null     object 
 1   COUNTYNAME  72 non-null     object 
 2   N1          72 non-null     float64
 3   MARS1       72 non-null     float64
 4   MARS2       72 non-null     float64
 5   MARS4       72 non-null     float64
 6   PREP        72 non-null     float64
 7   N2          72 non-null     float64
 8   NUMDEP      72 non-null     float64
 9   A00100      72 non-null     float64
 10  N00200      72 non-null     float64
 11  A00200      72 non-null     float64
 12  N00300      72 non-null     float64
 13  A00300      72 non-null     float64
 14  N00600      72 non-null     float64
 15  A00600      72 non-null     float64
 16  N00650      72 non-null     float64
 17  A00650      72 non-null     float64
 18  N00900      72 non-null     float64
 1

In [34]:
# Lift pandas' display limits on columns and rows.
# These options are global and stay in effect for every later cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line after every report.
for year, i in WI_FEC_IRS_merge:
    print(f"{year} has a shape: {i.shape}")
    i.info()

WI_2012_irs has a shape: (72, 74)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 74 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   STATE       72 non-null     object 
 1   COUNTYNAME  72 non-null     object 
 2   N1          72 non-null     float64
 3   MARS1       72 non-null     float64
 4   MARS2       72 non-null     float64
 5   MARS4       72 non-null     float64
 6   PREP        72 non-null     float64
 7   N2          72 non-null     float64
 8   NUMDEP      72 non-null     float64
 9   A00100      72 non-null     float64
 10  N00200      72 non-null     float64
 11  A00200      72 non-null     float64
 12  N00300      72 non-null     float64
 13  A00300      72 non-null     float64
 14  N00600      72 non-null     float64
 15  A00600      72 non-null     float64
 16  N00650      72 non-null     float64
 17  A00650      72 non-null     float64
 18  N00900      72 non-null     float64
 1

In [35]:
# For every IRS frame print its shape, followed by three labelled summaries:
# the column labels, the per-column non-null counts, and the per-column dtypes
for label, frame in WI_FEC_IRS_merge:
    print(f"{label} has a shape: {frame.shape}")
    summaries = (
        ("Column Names:", frame.columns),
        ("Number of Non-Null Entries:", frame.count()),
        ("Data Types:", frame.dtypes),
    )
    for heading, summary in summaries:
        print(heading)
        print(summary)


WI_2012_irs has a shape: (72, 74)
Column Names:
Index(['STATE', 'COUNTYNAME', 'N1', 'MARS1', 'MARS2', 'MARS4', 'PREP', 'N2',
       'NUMDEP', 'A00100', 'N00200', 'A00200', 'N00300', 'A00300', 'N00600',
       'A00600', 'N00650', 'A00650', 'N00900', 'A00900', 'SCHF', 'N01000',
       'A01000', 'N01400', 'A01400', 'N01700', 'A01700', 'N02300', 'A02300',
       'N02500', 'A02500', 'N03300', 'A03300', 'N04470', 'A00101', 'A04470',
       'N18425', 'A18425', 'N18450', 'A18450', 'N18500', 'A18500', 'N18300',
       'A18300', 'N19300', 'A19300', 'N19700', 'A19700', 'N04800', 'A04800',
       'N09600', 'A09600', 'N07100', 'A07100', 'N07180', 'A07180', 'N07220',
       'A07220', 'N07260', 'A07260', 'N59660', 'A59660', 'N59720', 'A59720',
       'N11070', 'A11070', 'N06500', 'A06500', 'N10300', 'A10300', 'N11901',
       'A11901', 'N11902', 'A11902'],
      dtype='object')
Number of Non-Null Entries:
STATE         72
COUNTYNAME    72
N1            72
MARS1         72
MARS2         72
MARS4      

In [None]:
WI_formatted_path = '../data/formatted_house_totals'

# List the formatted folder itself: these are the files that are read below.
# Listing and checking the raw election folder would produce names that need
# not exist in the formatted folder. Sorted for a stable order.
WI_files_f = sorted(
    file for file in os.listdir(WI_formatted_path)
    if os.path.isfile(os.path.join(WI_formatted_path, file)) and file.startswith('WI')
)

# One DataFrame per formatted WI file, in the same order as `WI_files_f`
formatted_WI = []

for formatted_name in WI_files_f:
    # Read each formatted WI file and add the dataframe to the list
    formatted_WI.append(pd.read_csv(f'{WI_formatted_path}/{formatted_name}'))