In [1]:
import numpy as np
import pandas as pd
from modules import *

import os #Used when reading/writing csv files programatically

In [2]:
fed_folder_path = '../data/FEC/'

# os.listdir() returns entries in arbitrary order. Sorting gives FEC_files a
# stable, reproducible order, which matters because the list is consumed
# positionally (zipped with other per-year lists) later in the notebook.
fec_files = sorted(
    file for file in os.listdir(fed_folder_path)
    if os.path.isfile(os.path.join(fed_folder_path, file))
)

# DataFrames read from the FEC folder, in the same order as fec_files
FEC_files = []

for fec_filename in fec_files:

    # Read each file from the FEC file list, using its first column as the index
    fec_frame = pd.read_csv(os.path.join(fed_folder_path, fec_filename), index_col=0)

    # Name for the dataframe: the filename without its extension
    # (splitext handles extensions of any length, unlike slicing off 4 characters)
    name = os.path.splitext(fec_filename)[0]

    # Also expose the dataframe as a module-level variable called `name`.
    # globals() returns the dictionary backing the notebook's global namespace,
    # so assigning into it creates/overwrites a global variable of that name.
    globals()[name] = fec_frame

    # Keep the dataframe in the ordered list as well
    FEC_files.append(fec_frame)

In [3]:
# All raw election data is stored in the same folder, ordered by the State abbreviation.
# NOTE(review): every other data path in this notebook starts with '../data/';
# confirm that this path (without '../') is intended.
elec_folder_path = 'data/raw_elec_totals'

# Sorted because os.listdir() makes no ordering guarantee; a stable order keeps
# the per-file results reproducible between runs and machines.
WI_files = sorted(
    file for file in os.listdir(elec_folder_path)
    if os.path.isfile(os.path.join(elec_folder_path, file)) and file.startswith('WI')
)

# Get a list of sheetnames in the WI data files
# NOTE(review): the list is replaced on every iteration, so after the loop it
# only holds the 'Sheet*' names of the last workbook in WI_files.
WI_sheetnames = []
for workbook_name in WI_files:
    # The context manager closes the workbook handle once the names are read
    with pd.ExcelFile(f'{elec_folder_path}/{workbook_name}') as workbook:
        WI_sheetnames = [sheet for sheet in workbook.sheet_names if sheet.startswith('Sheet')]

In [4]:
# List of (name, cleaned DataFrame) tuples, one per WI workbook
WI_cong = []

# Expected column names to reformat (identical for every workbook, so defined once)
col_dic = {'Unnamed: 0': 'County'}
drop_col = ['Unnamed: 1', 'Unnamed: 2', 'SCATTERING']

for workbook_name in WI_files:
    # Create name based on filename (its first 7 characters)
    name = workbook_name[:7]

    # Open the workbook once: list its sheets and read every 'Sheet*' sheet from
    # the same handle, which is closed when the with-block exits.
    with pd.ExcelFile(f'{elec_folder_path}/{workbook_name}') as workbook:
        WI_sheetnames = [sheet for sheet in workbook.sheet_names if sheet.startswith('Sheet')]
        sheets = [
            pd.read_excel(workbook, sheet_name=sheet, header=5)
            for sheet in WI_sheetnames
        ]

    if not sheets:
        raise ValueError(f"No sheets starting with 'Sheet' found in {workbook_name}")

    # Concatenate all sheets in a single call instead of growing a DataFrame
    # inside the loop (which re-copies the accumulated data on every iteration).
    data = pd.concat(sheets, ignore_index=True)

    # Drop the columns listed in drop_col (per-county vote totals and the empty
    # 2nd column, per the original author's note) and rename the first column to 'County'
    data_copy = data.drop(columns=drop_col).rename(columns=col_dic)

    # Remove the 'Total' rows; na=False keeps a missing county from breaking the
    # boolean mask (rows with a missing county are dropped by the groupby below).
    data_copy = data_copy[~data_copy['County'].str.contains('Total', na=False)].copy()

    # Normalise county names BEFORE grouping so that spellings differing only in
    # case or surrounding whitespace are summed into one row rather than
    # producing duplicate counties.
    data_copy['County'] = data_copy['County'].str.lower().str.strip()
    data_copy = data_copy.groupby('County').sum().reset_index()

    # To begin, candidate names will be rendered in all lowercase to match the FEC data
    cols = [col.lower() for col in data_copy.columns.tolist()]

    # Drop party/write-in labels and remove middle names (helpers come from `modules`)
    cols = trim_party(cols)
    cols = remove_middle_name(cols)
    # Keep only letters and whitespace in each column name
    cols = [''.join(char for char in col if char.isalpha() or char.isspace()) for col in cols]
    data_copy.columns = cols
    data_copy = data_copy.fillna(0)

    WI_cong.append((name, data_copy))

In [13]:
# Create zipped list of formatted State Election data, FEC list of candidates and parties, and the filenames found in raw_elec data.
# Materialised with list(): a bare zip() is a one-shot iterator, so the processing
# loop would silently do nothing if its cell were run a second time.
# NOTE(review): zip() stops at the shortest input, so the three lists must have
# the same length and ordering for the pairing to be correct.
zipped_WI_FEC = list(zip(formatted_WI, FEC_files, WI_names))

In [13]:
# Second processing stage for the election data: vote totals are grouped by
# party and incumbency (per the original author's note) so both can be analysed.
for state_df, fec_df, k in zipped_WI_FEC:

    # state_join_FEC combines one year's State data with its FEC data and also
    # hands back the list of counties.
    # An error raised here most likely points to a county mismatch between the two files.
    joined = state_join_FEC(state_df, fec_df)
    formatted_WI_FEC, counties = joined
    transformed_data = state_trans(formatted_WI_FEC, counties)

    # The output .csv is named after the first 7 characters of the original filename
    out_path = fr"../data/formatted_house_totals/{k[:7]}.csv"
    transformed_data.to_csv(out_path, index=False)

In [5]:
# Display the first entry of formatted_WI (the empty `[]` subscript was a syntax error)
formatted_WI[0]

Unnamed: 0,county,paul ryan,rob zerban,keith deschler,chad lee,mark pocan,joe kopsick,ray boland,ron kind,dan sebring,...,robert raymond,f. sensenbrenner,dave heaster,tom petri,joe kallas,sean duffy,pat kreitlow,dale lehner,reid ribble,jamie wall
0,adams,0.0,0.0,0.0,0.0,0.0,0.0,3799.0,5183.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ashland,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3172.0,5051.0,0.0,0.0,0.0
2,barron,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11621.0,9708.0,17.0,0.0,0.0
3,bayfield,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4026.0,5573.0,0.0,0.0,0.0
4,brown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67021.0,55730.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,waukesha,37026.0,13164.0,724.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,113469.0,44074.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69,waupaca,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14342.0,9919.0
70,waushara,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,7160.0,4009.0,0.0,0.0,0.0,0.0,0.0
71,winnebago,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,45824.0,32411.0,0.0,0.0,0.0,4141.0,2318.0


In [6]:
irs_folder_path = '../data/irs_data/'

# Regular files in the IRS data folder. Sorted because os.listdir() returns
# entries in arbitrary order, and a stable order keeps runs reproducible.
irs_files = sorted(
    file for file in os.listdir(irs_folder_path)
    if os.path.isfile(os.path.join(irs_folder_path, file))
)

In [7]:
house_data_path = '../data/formatted_house_totals/'

# Formatted house-total files whose names start with 'WI'. Sorted because
# os.listdir() returns entries in arbitrary order; this makes the processing
# (and reporting) order below deterministic.
WI_formatted = sorted(
    file for file in os.listdir(house_data_path)
    if os.path.isfile(os.path.join(house_data_path, file)) and file.startswith('WI')
)

In [8]:
# One '1' is appended for every house file whose counties do not all match the IRS data
county_check = []
for house_file in WI_formatted:
    state = house_file[0:2] # first two letters of the filename, corresponding to the state abbv.
    year = house_file[5:7] # two digit year

    # Import house file
    house = pd.read_csv(f'{house_data_path}{house_file}')

    # Import the IRS file for the same year
    irs = pd.read_csv(f'{irs_folder_path}irs_count_20{year}_f.csv')

    # Get IRS data only for the relevant state (this is done to ensure that states with identical county names do not have their data mixed)
    irs_state = irs.loc[irs['STATE']==state]
    irs_county = irs_state['COUNTYNAME'].tolist()

    # Due to the length of some county names and the character limit of the 'COUNTYNAME' field in the original IRS data
    # some counties have 'County' displayed as 'Count', 'Coun' or 'Co'
    # Thus, the current solution will instead be to drop the final word
    # NOTE(review): a single-word COUNTYNAME becomes an empty string here.
    irs_county = [' '.join(county_name.split()[:-1]) for county_name in irs_county]

    # These two counties are misnamed in the IRS data
    irs_misnamed = {'de witt':'dewitt','jo daviess':'jodaviess'}
    # Look up the corrected name, falling back to the original when no correction exists
    cor_counties = [irs_misnamed.get(item, item) for item in irs_county]

    irs_copy = irs_state.copy() # avoid setting a value on a copy of a slice
    irs_copy['COUNTYNAME'] = cor_counties

    # Build the set of house counties once per file; the original rebuilt the
    # full list for every IRS county it checked.
    house_counties = set(house['County'])

    # Check if any county names do not match between the state data and the IRS data, if so, add to the 'county_check' list
    unmatched_counties = [county for county in cor_counties if county not in house_counties]
    if len(unmatched_counties) > 0:
        print(f"Counties mismatch in {house_file}: {unmatched_counties}")
        county_check.append('1')
    else:
        # NOTE(review): no merge is performed in this cell; the message only
        # signals that every IRS county was found in the house data.
        print(f"IRS data merged with {house_file}")

Counties mismatch in WI_2012.csv: ['adams', 'ashland', 'barron', 'bayfield', 'brown', 'buffalo', 'burnett', 'calumet', 'chippewa', 'clark', 'columbia', 'crawford', 'dane', 'dodge', 'door', 'douglas', 'dunn', 'eau claire', 'florence', 'fond du lac', 'forest', 'grant', 'green', 'green lake', 'iowa', 'iron', 'jackson', 'jefferson', 'juneau', 'kenosha', 'kewaunee', 'la crosse', 'lafayette', 'langlade', 'lincoln', 'manitowoc', 'marathon', 'marinette', 'marquette', 'menominee', 'milwaukee', 'monroe', 'oconto', 'oneida', 'outagamie', 'ozaukee', 'pepin', 'pierce', 'polk', 'portage', 'price', 'racine', 'richland', 'rock', 'rusk', 'st. croix', 'sauk', 'sawyer', 'shawano', 'sheboygan', 'taylor', 'trempealeau', 'vernon', 'vilas', 'walworth', 'washburn', 'washington', 'waukesha', 'waupaca', 'waushara', 'winnebago', 'wood']
Counties mismatch in WI_2014.csv: ['adams', 'ashland', 'barron', 'bayfield', 'brown', 'buffalo', 'burnett', 'calumet', 'chippewa', 'clark', 'columbia', 'crawford', 'dane', 'dodge