In [1]:
import numpy as np
import pandas as pd
from src.modules import *
import os

In [2]:
# All raw election data is stored in the same folder, ordered by the State abbreviation.

elec_folder_path = 'data/raw_elec_totals'

WI_files = [file for file in os.listdir(elec_folder_path) if os.path.isfile(os.path.join(elec_folder_path, file)) and file.startswith('WI')]


In [3]:
WI_files

# Confirm the filetypes

['WI_2012_Cong_raw.xlsx',
 'WI_2014_Cong_raw.xlsx',
 'WI_2016county.xlsx',
 'WI_2018county.xlsx',
 'WI_2020county.xlsx']

In [4]:

# Get a list of sheetnames in the WI data files
for i in WI_files:
    file = pd.ExcelFile(f'{elec_folder_path}/{i}')
    
    print(file.sheet_names)

['Sheet2', 'Sheet3', 'Sheet4', 'Sheet5', 'Sheet6', 'Sheet7', 'Sheet8', 'Sheet9']
['Sheet2', 'Sheet3', 'Sheet4', 'Sheet5', 'Sheet6', 'Sheet7', 'Sheet8', 'Sheet9']
['Document map', 'Sheet2', 'Sheet3', 'Sheet4', 'Sheet5', 'Sheet6', 'Sheet7', 'Sheet8', 'Sheet9']
['Document map', 'Sheet2', 'Sheet3', 'Sheet4', 'Sheet5', 'Sheet6', 'Sheet7', 'Sheet8', 'Sheet9']
['Document map', 'Sheet2', 'Sheet3', 'Sheet4', 'Sheet5', 'Sheet6', 'Sheet7', 'Sheet8', 'Sheet9']


In [5]:
# Let's produce a list of these sheetnames, excluding the document map
WI_sheetnames = []
for i in WI_files:
    file = pd.ExcelFile(f'{elec_folder_path}/{i}')
    WI_sheetnames = [i for i in file.sheet_names if i.startswith('Sheet')]

In [6]:
WI_test = pd.read_excel(f'{elec_folder_path}/{WI_files[0]}',sheet_name=WI_sheetnames[0])

In [7]:
WI_test

Unnamed: 0.1,Unnamed: 0,G.A.B. Canvass Reporting System\nCounty by County Report,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,,2012 PRESIDENTIAL AND GENERAL ELECTION,,,,,
1,,,,,,,
2,CONGRESSIONAL - DISTRICT 1,,,,,,
3,County,,Total Votes Cast,REP,DEM,IND,
4,,,,PAUL RYAN,ROB ZERBAN,KEITH DESCHLER,SCATTERING
5,KENOSHA,,78698,36130,41121,1396,51
6,MILWAUKEE,,49828,30150,18904,751,23
7,RACINE,,99535,50294,47722,1494,25
8,ROCK,,40672,18888,20928,833,23
9,WALWORTH,,45391,27935,16575,856,25


In [8]:
# Assuming that this formatting is the same across the years of data, a function using the 5th row as the header will allow the easiest construction of dataframes adhering to the form we need:

# Our goals for formatting this data will be - 
## Build a dataframe with columns for each Candidate containing:

### Vote totals broken down by county

### Party ID for that Candidate

### A binary marker for Incumbant/Challenger
#### These last two will provide the keys to merging candidate vote totals, since our depended variables in the initially-proposed analysis will be 'Vote Total for Incumbents' and 'Vote Total for Challengers'
#### Tagging candidates by Party ID will allow future analysis to examine effects of party popularity on the IRS and vote-total data

## The list of columns should be reduced to:

### County names
#### This will be the Primary Key for merging the IRS data
#### These counties should be formatted in all lowercase to match the IRS data and to avoid any inconsistencies with naming conventions year-to-year

### Canddidate names
#### Each candidate should have their name formatted to match the FEC data while avoiding differences in formatting between States ('Lastname, Firstname' vs 'Firstname M. Lastname' vs. 'Firstname Middlename Lastname' vs. '(Title) Firsname Lastname(, Jr.)', etc etc) and year-to-year
#### Anticipated diffrerences include name suffixes ("Jr., Sr.") and titles ("Dr.") which are inculded inconsistently across the datasets
#### The formatting initially agreed on is 'Firstname Lastname' to balance legibility, consistency, and ease
##### EXPECTED ISSUES: It is conceivable that candidates will have the same first and last names within a given year
##### Because we are transforming the data for each State in each year before joining, replacing Candidate names with Party ID and Incumbancy means that data is unlikely to become polluted by merging vote totals for candidates with the same simplified name.
##### None the less, future analysis may benefit from developing more rigorous methods of ensuring this pollution cannot occur

In [9]:
# We will need a function that will concatinate these sheets so that we can examine all of their contents together and make sure the correct number of counties appear, and that the candidates match the data from the FEC

In [10]:
# Initial assumptions:
## The first column will contain our county names, as labelled in the 4th row
### Currently, using the 5th row as our header will cause it to be named 'Unnamed: 0' when it is read into a dataframe

## The 3rd column contains total votes cast, as labelled in the 4th row
### We should be good to drop this whole column

## Each sheet appears to be the results in individual congressional races
## Columns in the 4th row with string data will be Candidate names and the word 'Scattering'
## Subsequent rows appear to contain the vote totals

In [11]:
WI_cong = []

for i in WI_files:
    # Read file to get sheet names
    file = pd.ExcelFile(f'{elec_folder_path}/{i}')
    WI_sheetnames = [i for i in file.sheet_names if i.startswith('Sheet')]
    
    # expected column names to reformat
    col_dic = {'Unnamed: 0':'County'} 
    drop_col = ['Unnamed: 1', 'Unnamed: 2']
    
    # Empty dataframe to hold sheets as they are concatinated
    data = pd.DataFrame()
    
    for sheet in WI_sheetnames:
        current_sheet = pd.read_excel(f'{elec_folder_path}/{i}', sheet_name=sheet, header=5)
        data = pd.concat([data, current_sheet], ignore_index=True)
    
    # All sheets should be read in and concatinated
    
    # Drop the columns containing vote totals per county and the empty 2rd column
    data_copy = data.drop(columns=drop_col)
    # Rename 'County' column.. to 'County'
    data_copy = data_copy.rename(columns=col_dic) 
    # data = data.groupby('County').sum().reset_index()
    data_copy['County'] = [i.lower().strip() for i in data_copy['County'].tolist()]
    
    WI_cong.append((i,data_copy))

In [12]:
for year, i in WI_cong:
    print(i.head())
    print(i.columns)

      County  PAUL RYAN  ROB ZERBAN  KEITH DESCHLER  SCATTERING  CHAD LEE  \
0    kenosha    36130.0     41121.0          1396.0          51       NaN   
1  milwaukee    30150.0     18904.0           751.0          23       NaN   
2     racine    50294.0     47722.0          1494.0          25       NaN   
3       rock    18888.0     20928.0           833.0          23       NaN   
4   walworth    27935.0     16575.0           856.0          25       NaN   

   MARK POCAN  JOE KOPSICK (WRITE-IN)  RAY BOLAND  RON KIND  ...  \
0         NaN                     NaN         NaN       NaN  ...   
1         NaN                     NaN         NaN       NaN  ...   
2         NaN                     NaN         NaN       NaN  ...   
3         NaN                     NaN         NaN       NaN  ...   
4         NaN                     NaN         NaN       NaN  ...   

   ROBERT R. RAYMOND  F. JAMES SENSENBRENNER JR  DAVE HEASTER  TOM PETRI  \
0                NaN                        NaN     

In [13]:
# This is getting close to what we need.
# A quick glance shows that the candidate names can have middle initials, which should be removed
# and write-in candidates have (write-in) after their names, which will run afoul of the formatting in the FEC data
## We can re-use the 'trim_party' function used to clean up the Ohio data as well
# We will add 'SCATTERING' to the list of columns to drop

# Check the number of counties and see if there are duplicates or some other values in that column

In [14]:
def remove_middle_name(data):
    """this will return the start and end of a split item,
    built to remove middle names and titles from full name columns """
    no_middle = []
    for i in data:
        if len(i.split()) > 1:
            if i.split()[-1][-1] == '.': #will call the line below, skipping any titles appended to last names ending in '.'
                no_middle.append(i.split()[0] + ' ' + i.split()[-2])
            elif i.split()[-1].lower() == 'jr': #evauluating on .lower() ensures any changes in case between datasets will be ignored
                no_middle.append(i.split()[0] + ' ' + i.split()[-2])
            elif i.split()[-1].lower() == 'sr':
                no_middle.append(i.split()[0] + ' ' + i.split()[-2])
            else:
                no_middle.append(i.split()[0] + ' ' + i.split()[-1]) #default format is assumned to be 'Firstname Lastname'
        else:
            no_middle.append(i) #accounts for items without spaces, expected to be 'County' when applied to the assumed datasets.
    return no_middle

In [15]:
WI_cong = []

for i in WI_files:
    # Read file to get sheet names
    file = pd.ExcelFile(f'{elec_folder_path}/{i}')
    WI_sheetnames = [i for i in file.sheet_names if i.startswith('Sheet')]
    
    # expected column names to reformat
    col_dic = {'Unnamed: 0':'County'} 
    drop_col = ['Unnamed: 1', 'Unnamed: 2','SCATTERING']
    
    # Empty dataframe to hold sheets as they are concatinated
    data = pd.DataFrame()
    
    for sheet in WI_sheetnames:
        current_sheet = pd.read_excel(f'{elec_folder_path}/{i}', sheet_name=sheet, header=5)
        data = pd.concat([data, current_sheet], ignore_index=True)
    
    # All sheets should be read in and concatinated
    
    # Drop the columns containing vote totals per county and the empty 2rd column
    data_copy = data.drop(columns=drop_col)
    # Rename 'County' column.. to 'County'
    data_copy = data_copy.rename(columns=col_dic) 
    # data = data.groupby('County').sum().reset_index()
    data_copy['County'] = [i.lower().strip() for i in data_copy['County'].tolist()]
    
    # To begin, candidate names will be rendered in all lowers to match the FEC data
    cols = data_copy.columns.tolist()
    cols = [i.lower() for i in cols]
    
    # Drop party/write-in lables and remove middle names
    cols = trim_party(cols)
    cols = remove_middle_name(cols)
    data_copy.columns = cols
    data_copy = data_copy.fillna(0)
    
    WI_cong.append((i,data_copy))

In [16]:
for year, i in WI_cong:
    print(i.head())
    print(i.columns)

      county  paul ryan  rob zerban  keith deschler  chad lee  mark pocan  \
0    kenosha    36130.0     41121.0          1396.0       0.0         0.0   
1  milwaukee    30150.0     18904.0           751.0       0.0         0.0   
2     racine    50294.0     47722.0          1494.0       0.0         0.0   
3       rock    18888.0     20928.0           833.0       0.0         0.0   
4   walworth    27935.0     16575.0           856.0       0.0         0.0   

   joe kopsick  ray boland  ron kind  dan sebring  ...  robert raymond  \
0          0.0         0.0       0.0          0.0  ...             0.0   
1          0.0         0.0       0.0          0.0  ...             0.0   
2          0.0         0.0       0.0          0.0  ...             0.0   
3          0.0         0.0       0.0          0.0  ...             0.0   
4          0.0         0.0       0.0          0.0  ...             0.0   

   f. sensenbrenner  dave heaster  tom petri  joe kallas  sean duffy  \
0               0.0 

In [17]:
# As there are only 72 counties in Wisconsin, we should only have 72 rows

In [18]:
for year, i in WI_cong:
    sort_list = list(i['county'])
    sort_list.sort()
    
    print(f"{year} has {len(sort_list)} counties")
    print(sort_list)

WI_2012_Cong_raw.xlsx has 95 counties
['adams', 'ashland', 'barron', 'bayfield', 'brown', 'buffalo', 'burnett', 'calumet', 'chippewa', 'chippewa', 'clark', 'columbia', 'crawford', 'dane', 'dodge', 'dodge', 'door', 'douglas', 'dunn', 'eau claire', 'florence', 'fond du lac', 'forest', 'grant', 'green', 'green lake', 'iowa', 'iron', 'jackson', 'jackson', 'jefferson', 'juneau', 'juneau', 'kenosha', 'kewaunee', 'la crosse', 'lafayette', 'langlade', 'lincoln', 'manitowoc', 'marathon', 'marinette', 'marquette', 'menominee', 'milwaukee', 'milwaukee', 'milwaukee', 'milwaukee', 'monroe', 'monroe', 'oconto', 'office totals:', 'office totals:', 'office totals:', 'office totals:', 'office totals:', 'office totals:', 'office totals:', 'office totals:', 'oneida', 'outagamie', 'ozaukee', 'pepin', 'pierce', 'polk', 'portage', 'price', 'racine', 'richland', 'richland', 'rock', 'rock', 'rusk', 'sauk', 'sawyer', 'shawano', 'sheboygan', 'st. croix', 'taylor', 'trempealeau', 'vernon', 'vilas', 'walworth', '

In [19]:
# We can see things like 'office totals' present here, along with duplicate counties ('waukesha' appears multiple times, for example)
# The reason for the duplicate counties is clear: Our process for joining the multiple sheets in the original data did not account for multiple congressional districts (each sheet corresponded to a different district) containing the same counties
# This can be confirmed by counting the occurances of 'office totals' and seeing it equals the number of congressional districts

# Removing the 'office totals' is quiet simple
# Grouping the final dataframe by 'County' will reduce the duplicate counties without combining any data, so long as there are no duplicate column names

In [20]:
for year, i in WI_cong:
    if len(i.columns) == len(set(i.columns)):
        print(f"{year} has no duplicate candidates")
    else:
        print(f"{year} has DOES HAVE duplicate candidates")

WI_2012_Cong_raw.xlsx has no duplicate candidates
WI_2014_Cong_raw.xlsx has no duplicate candidates
WI_2016county.xlsx has no duplicate candidates
WI_2018county.xlsx has no duplicate candidates
WI_2020county.xlsx has no duplicate candidates


In [21]:
WI_cong = []

for i in WI_files:
    # Create name based on filename
    name = i[:7]
    
    # Read file to get sheet names
    file = pd.ExcelFile(f'{elec_folder_path}/{i}')
    WI_sheetnames = [i for i in file.sheet_names if i.startswith('Sheet')]
    
    # expected column names to reformat
    col_dic = {'Unnamed: 0':'County'} 
    drop_col = ['Unnamed: 1', 'Unnamed: 2','SCATTERING']
    
    # Empty dataframe to hold sheets as they are concatinated
    data = pd.DataFrame()
    
    for sheet in WI_sheetnames:
        current_sheet = pd.read_excel(f'{elec_folder_path}/{i}', sheet_name=sheet, header=5)
        data = pd.concat([data, current_sheet], ignore_index=True)
    
    # All sheets should be read in and concatinated
    
    # Drop the columns containing vote totals per county and the empty 2rd column
    data_copy = data.drop(columns=drop_col)
    # Rename 'County' column to 'County'
    data_copy = data_copy.rename(columns=col_dic) 
    data_copy = data_copy[~data_copy['County'].str.contains('Total')].copy()
    data_copy = data_copy.groupby('County').sum().reset_index()
    data_copy['County'] = [i.lower().strip() for i in data_copy['County'].tolist()]
    
    # To begin, candidate names will be rendered in all lowers to match the FEC data
    cols = data_copy.columns.tolist()
    cols = [i.lower() for i in cols]
    
    # Drop party/write-in lables and remove middle names
    cols = trim_party(cols)
    cols = remove_middle_name(cols)
    cols = [''.join(char for char in i if char.isalpha() or char.isspace()) for i in cols]
    data_copy.columns = cols
    data_copy = data_copy.fillna(0)
    
    WI_cong.append((name,data_copy))

In [22]:
for year, i in WI_cong:
    sort_list = list(i['county'])
    sort_list.sort()
    
    print(f"{year} has {len(sort_list)} counties")
    print(sort_list)

WI_2012 has 72 counties
['adams', 'ashland', 'barron', 'bayfield', 'brown', 'buffalo', 'burnett', 'calumet', 'chippewa', 'clark', 'columbia', 'crawford', 'dane', 'dodge', 'door', 'douglas', 'dunn', 'eau claire', 'florence', 'fond du lac', 'forest', 'grant', 'green', 'green lake', 'iowa', 'iron', 'jackson', 'jefferson', 'juneau', 'kenosha', 'kewaunee', 'la crosse', 'lafayette', 'langlade', 'lincoln', 'manitowoc', 'marathon', 'marinette', 'marquette', 'menominee', 'milwaukee', 'monroe', 'oconto', 'oneida', 'outagamie', 'ozaukee', 'pepin', 'pierce', 'polk', 'portage', 'price', 'racine', 'richland', 'rock', 'rusk', 'sauk', 'sawyer', 'shawano', 'sheboygan', 'st. croix', 'taylor', 'trempealeau', 'vernon', 'vilas', 'walworth', 'washburn', 'washington', 'waukesha', 'waupaca', 'waushara', 'winnebago', 'wood']
WI_2014 has 72 counties
['adams', 'ashland', 'barron', 'bayfield', 'brown', 'buffalo', 'burnett', 'calumet', 'chippewa', 'clark', 'columbia', 'crawford', 'dane', 'dodge', 'door', 'douglas'

In [23]:
# Lastly, let's check the candidate names here agains the formatted names in the FEC data

In [24]:
fec_folder_path = 'data/FEC/'

fec_files = [file for file in os.listdir(fec_folder_path) if os.path.isfile(os.path.join(fec_folder_path, file))]


# Empty dictionary to hold FEC files
FEC_files = {}

# format FEC data
print('Formatting FEC data')
for i in fec_files: # Call item in the file list
    
    # Read each file from the FEC file list
    file = pd.read_csv(fr'{fec_folder_path}{i}')
    
    
    # Generate a name for each dataframe based on the filename without the file extension
    name = i.split('.')[0]
    
    # Add the dataframe to a dictionary with the key being the filename
    FEC_files[name] = file
print("FEC files formatted")

Formatting FEC data
FEC files formatted


In [25]:
for year, i in FEC_files.items():
    print(year)
    print(list(i.columns))

FEC_2012
['STATE ABBREVIATION', 'D', 'CANDIDATE NAME (First)', 'CANDIDATE NAME (Last)', 'CANDIDATE NAME(f)', 'CANDIDATE NAME', 'PARTY', '(I)', 'GENERAL VOTES ']
FEC_2014
['STATE ABBREVIATION', 'D', 'CANDIDATE NAME (First)', 'CANDIDATE NAME (Last)', 'CANDIDATE NAME(f)', 'CANDIDATE NAME', 'PARTY', '(I)', 'GENERAL VOTES ']
FEC_2016
['STATE ABBREVIATION', 'D', 'CANDIDATE NAME (First)', 'CANDIDATE NAME (Last)', 'CANDIDATE NAME(f)', 'CANDIDATE NAME', 'PARTY', '(I)', 'GENERAL VOTES ']
FEC_2018
['STATE ABBREVIATION', 'D', 'CANDIDATE NAME (First)', 'CANDIDATE NAME (Last)', 'CANDIDATE NAME(f)', 'CANDIDATE NAME', 'PARTY', '(I)', 'GENERAL VOTES ']
FEC_2020
['STATE ABBREVIATION', 'D', 'CANDIDATE NAME (First)', 'CANDIDATE NAME (Last)', 'CANDIDATE NAME(f)', 'CANDIDATE NAME', 'PARTY', '(I)', 'GENERAL VOTES ']


In [26]:
WI_fec = {}
for year,i in FEC_files.items():
    name = f"{year}_WI"
    fec = i.loc[i['STATE ABBREVIATION']=='WI']
    WI_fec[name] = fec

In [27]:
for year, i in WI_fec.items():
    print(year)
    print(list(i.columns))

FEC_2012_WI
['STATE ABBREVIATION', 'D', 'CANDIDATE NAME (First)', 'CANDIDATE NAME (Last)', 'CANDIDATE NAME(f)', 'CANDIDATE NAME', 'PARTY', '(I)', 'GENERAL VOTES ']
FEC_2014_WI
['STATE ABBREVIATION', 'D', 'CANDIDATE NAME (First)', 'CANDIDATE NAME (Last)', 'CANDIDATE NAME(f)', 'CANDIDATE NAME', 'PARTY', '(I)', 'GENERAL VOTES ']
FEC_2016_WI
['STATE ABBREVIATION', 'D', 'CANDIDATE NAME (First)', 'CANDIDATE NAME (Last)', 'CANDIDATE NAME(f)', 'CANDIDATE NAME', 'PARTY', '(I)', 'GENERAL VOTES ']
FEC_2018_WI
['STATE ABBREVIATION', 'D', 'CANDIDATE NAME (First)', 'CANDIDATE NAME (Last)', 'CANDIDATE NAME(f)', 'CANDIDATE NAME', 'PARTY', '(I)', 'GENERAL VOTES ']
FEC_2020_WI
['STATE ABBREVIATION', 'D', 'CANDIDATE NAME (First)', 'CANDIDATE NAME (Last)', 'CANDIDATE NAME(f)', 'CANDIDATE NAME', 'PARTY', '(I)', 'GENERAL VOTES ']


In [28]:
for i in WI_fec:
    print(i)
    print(WI_fec[i]['CANDIDATE NAME(f)'])

FEC_2012_WI
1503          paul ryan
1504         rob zerban
1505     keith deschler
1506         mark pocan
1507           chad lee
1508        joe kopsick
1509           ron kind
1510         ray boland
1511         gwen moore
1512        dan sebring
1513     robert raymond
1514    f sensenbrenner
1515       dave heaster
1516          tom petri
1517         joe kallas
1518         sean duffy
1519       pat kreitlow
1520        dale lehner
1521        reid ribble
1522         jamie wall
1523      tammy baldwin
1524     tommy thompson
1525       joseph kexel
1526       allen nimrod
1527         riley hood
1528    diane lorbiecki
Name: CANDIDATE NAME(f), dtype: object
FEC_2014_WI
1238          paul ryan
1239         rob zerban
1240     keith deschler
1241         mark pocan
1242       peter theron
1243           ron kind
1244         tony kurtz
1245      ken van doren
1246         gwen moore
1247        dan sebring
1248     robert raymond
1249    f sensenbrenner
1250     chris rockwood
1

In [29]:
for year, i in WI_cong:
    print(year)
    print(list(i.columns))

WI_2012
['county', 'paul ryan', 'rob zerban', 'keith deschler', 'chad lee', 'mark pocan', 'joe kopsick', 'ray boland', 'ron kind', 'dan sebring', 'gwen moore', 'robert raymond', 'f sensenbrenner', 'dave heaster', 'tom petri', 'joe kallas', 'sean duffy', 'pat kreitlow', 'dale lehner', 'reid ribble', 'jamie wall']
WI_2014
['county', 'rob zerban', 'paul ryan', 'keith deschler', 'mark pocan', 'peter theron', 'ron kind', 'tony kurtz', 'ken doren', 'gwen moore', 'dan sebring', 'robert raymond', 'chris rockwood', 'f sensenbrenner', 'mark harris', 'glenn grothman', 'gus fahrendorf', 'kelly westlund', 'sean duffy', 'lawrence dale', 'john schiess', 'rob taylor', 'ron gruett', 'reid ribble']
WI_2016
['county', 'paul ryan', 'ryan solen', 'jason lebeck', 'spencer zimmerman', 'peter theron', 'mark pocan', 'ron kind', 'ryan peterson', 'gwen moore', 'andy craig', 'robert raymond', 'f sensenbrenner', 'khary penebaker', 'john arndt', 'glenn grothman', 'sarah lloyd', 'jeff dahlke', 'sean duffy', 'mary ho

In [30]:
WI_cong_df = []
for year, i in WI_cong:
    WI_cong_df.append(i)

In [43]:
for wi_cong, wi_fec in zip(WI_cong_df, WI_fec):
    cands = []
    fec_cands = []
    missing_cands = []
    missing_fec_cands = []
    cand_list = [col for col in wi_cong.columns if col != 'county']  # Corrected to use WI_cong_df
    fec_cands = WI_fec[wi_fec]['CANDIDATE NAME(f)'].tolist()  # Assuming 'CANDIDATE NAME(f)' is the correct column name
    
    for cand in cand_list:
        if cand in fec_cands:
            cands.append(cand)
        else:
            missing_cands.append(cand)
    
    for fec_cand in fec_cands:  # Corrected variable name to avoid conflict
        if fec_cand not in cand_list:  # Corrected condition
            missing_fec_cands.append(fec_cand)
    
    if len(cand_list) == len(cands):
        print(f'All candidates in {wi_fec} are present in the FEC data')
    else:
        print(f'{missing_cands} are missing from {wi_fec} FEC data')
        print(f'{missing_fec_cands} are present in {wi_fec} but not in the State data')


All candidates in FEC_2012_WI are present in the FEC data
['ken doren'] are missing from FEC_2014_WI FEC data
['ken van doren'] are present in FEC_2014_WI but not in the State data
['spencer zimmerman', 'robert raymond', 'jeff dahlke'] are missing from FEC_2016_WI FEC data
['w slattery'] are present in FEC_2016_WI but not in the State data
All candidates in FEC_2018_WI are present in the FEC data
['derrick orden'] are missing from FEC_2020_WI FEC data
['derrick van orden'] are present in FEC_2020_WI but not in the State data


In [None]:
# These errors indicate an issue with our processing up to this point: we have been forgetting that removing middle initials and name from our candidates also effects candidates with two-part last names:
## 'ken doren' should be 'ken van doren', etc.
## reducing the two-word lastnames to single words (most easily done by removing one of the words) will increase noise by raising the likelyhood that candidate vote-totals will be mistakenly combined, since candidates that would be been differenciated are not identicle 
## (if there happened to be a 'ken doren' and a 'ken van doren' in the same State, for example)
## In adherence to the principle of non-desctruction, we will add a line removing the first part of two part last names when processing the FEC data for Wisconsin

## The other thing to note here is the confirmation from our EDA of the FEC data  that the 2016 Wisconsin data has some potential errors to deal with

In [47]:
# We will add a line to our processing of the FEC data to remove the first part of two-part names for the WI data
# We can do this by applying the 'remove_middle_name' function also used in the Wisconsin data

WI_fec = {}
for year,i in FEC_files.items():
    name = f"{year}_WI"
    fec = i.loc[i['STATE ABBREVIATION']=='WI']
    cand_list = remove_middle_name(fec['CANDIDATE NAME(f)'].tolist())
    fec['CANDIDATE NAME(f)'] = cand_list
    WI_fec[name] = fec

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fec['CANDIDATE NAME(f)'] = cand_list


In [48]:
WI_cong_df = []
for year, i in WI_cong:
    WI_cong_df.append(i)

In [49]:
for wi_cong, wi_fec in zip(WI_cong_df, WI_fec):
    cands = []
    fec_cands = []
    missing_cands = []
    missing_fec_cands = []
    cand_list = [col for col in wi_cong.columns if col != 'county']  # Corrected to use WI_cong_df
    fec_cands = WI_fec[wi_fec]['CANDIDATE NAME(f)'].tolist()  # Assuming 'CANDIDATE NAME(f)' is the correct column name
    
    for cand in cand_list:
        if cand in fec_cands:
            cands.append(cand)
        else:
            missing_cands.append(cand)
    
    for fec_cand in fec_cands:  # Corrected variable name to avoid conflict
        if fec_cand not in cand_list:  # Corrected condition
            missing_fec_cands.append(fec_cand)
    
    if len(cand_list) == len(cands):
        print(f'All candidates in {wi_fec} are present in the FEC data')
    else:
        print(f'{missing_cands} are missing from {wi_fec} FEC data')
        print(f'{missing_fec_cands} are present in {wi_fec} but not in the State data')


All candidates in FEC_2012_WI are present in the FEC data
All candidates in FEC_2014_WI are present in the FEC data
['spencer zimmerman', 'robert raymond', 'jeff dahlke'] are missing from FEC_2016_WI FEC data
['w slattery'] are present in FEC_2016_WI but not in the State data
All candidates in FEC_2018_WI are present in the FEC data
All candidates in FEC_2020_WI are present in the FEC data


In [51]:
# Looking at the remaining anomalies:
## w slattery is 'W. Michael Slattery', who competed in the 2016 Dem. primary, but lost to Sarah Lloyd, who in turn lost to Glenn Grothman. Lloyd and Grothman do appear in both datasets.
### Unclear why Slattery would be listed in the FEC data.

## spencer zimmerman was a candidate running as an independent in the 2016 general election for the WI 1st district seat

## robert raymond was a candidate running as an independent in the 2016 general election for the WI 4th district seat

## jeff dahlke was a candidate running as an independent in the 2016 general election for the WI 6th district seat


In [52]:
# Since 'w slattery' does not have any votes recorded at in the state data, we will be ignoring him

# All other candidates missing from the FEC data can be tagged as 'Other' and not incumbents.

In [53]:
# Now, let's check each dataset for duplicate candidate names:

In [50]:
for year, i in WI_fec.items():
    cand_list = i['CANDIDATE NAME(f)'].tolist() # all candidate name data
    if len(cand_list) > i['CANDIDATE NAME(f)'].unique().shape[0]: # the number of uniques in the column 'CANDIDATE NAME(f)' should be the same as the length of the list of names, or else there are duplicates
            print(f"{year} has candidates with duplicate names")
            print([item for item in set(cand_list) if cand_list.count(item) > 1])
    else:
        print(f"{year} has all unique candidates")

for year, i in WI_cong:
    cand_list = [col for col in i.columns if col != 'county'] # for the election data our candidates are the column names, except for 'county'
    if len(cand_list) > (len(i.columns) -1): # the length of the columns list will be one larger than the unique candidates because of the 'county' column
            print(f"{year} has candidates with duplicate names")
            print([item for item in set(cand_list) if cand_list.count(item) > 1])
    else:
        print(f"{year} has all unique candidates")


FEC_2012_WI has all unique candidates
FEC_2014_WI has all unique candidates
FEC_2016_WI has candidates with duplicate names
['andy craig', 'jason lebeck']
FEC_2018_WI has all unique candidates
FEC_2020_WI has all unique candidates
WI_2012 has all unique candidates
WI_2014 has all unique candidates
WI_2016 has all unique candidates
WI_2018 has all unique candidates
WI_2020 has all unique candidates


In [None]:
# So! That's interesting:
## In the FEC data there are two duplicate candidate names.
## andy craig ran as an independent candidate in the 4th district in 2016, while jason lebeck ran as an independent in the 1st. 
## Those are the same districts where the FEC data was missing candidate names present in the State data
### (per ballotpedia)

# It is entirely possible that, given the 2016 FEC data is the only compromised dataset, that an error was made when the FEC data was compiled.

# In any case, we will unfortunately just brute-force the correct labelling for those missing candidates

In [None]:
WI_cong = []

for i in WI_files:
    # Create name based on filename
    name = i[:7]
    
    # Read file to get sheet names
    file = pd.ExcelFile(f'{elec_folder_path}/{i}')
    WI_sheetnames = [i for i in file.sheet_names if i.startswith('Sheet')]
    
    # expected column names to reformat
    col_dic = {'Unnamed: 0':'County'} 
    drop_col = ['Unnamed: 1', 'Unnamed: 2','SCATTERING']
    
    # Empty dataframe to hold sheets as they are concatinated
    data = pd.DataFrame()
    
    for sheet in WI_sheetnames:
        current_sheet = pd.read_excel(f'{elec_folder_path}/{i}', sheet_name=sheet, header=5)
        data = pd.concat([data, current_sheet], ignore_index=True)
    
    # All sheets should be read in and concatinated
    
    # Drop the columns containing vote totals per county and the empty 2rd column
    data_copy = data.drop(columns=drop_col)
    # Rename 'County' column to 'County'
    data_copy = data_copy.rename(columns=col_dic) 
    data_copy = data_copy[~data_copy['County'].str.contains('Total')].copy()
    data_copy = data_copy.groupby('County').sum().reset_index()
    data_copy['County'] = [i.lower().strip() for i in data_copy['County'].tolist()]
    
    # To begin, candidate names will be rendered in all lowers to match the FEC data
    cols = data_copy.columns.tolist()
    cols = [i.lower() for i in cols]
    
    # Drop party/write-in lables and remove middle names
    cols = trim_party(cols)
    cols = remove_middle_name(cols)
    cols = [''.join(char for char in i if char.isalpha() or char.isspace()) for i in cols]
    data_copy.columns = cols
    data_copy = data_copy.fillna(0)
    
    WI_cong.append((name,data_copy))

In [None]:
fec_folder_path = 'data/FEC/'

fec_files = [file for file in os.listdir(fec_folder_path) if os.path.isfile(os.path.join(fec_folder_path, file))]


# Empty dictionary to hold FEC files
FEC_files = {}

# format FEC data
print('Formatting FEC data')
for i in fec_files: # Call item in the file list
    
    # Read each file from the FEC file list
    file = pd.read_csv(fr'{fec_folder_path}{i}')
    
    
    # Generate a name for each dataframe based on the filename without the file extension
    name = i.split('.')[0]
    
    # Add the dataframe to a dictionary with the key being the filename
    FEC_files[name] = file
print("FEC files formatted")

WI_fec = {}
for year,i in FEC_files.items():
    name = f"{year}_WI"
    fec = i.loc[i['STATE ABBREVIATION']=='WI']
    WI_fec[name] = fec

In [78]:
def get_WI_data(sheet_names,filepath):
    
    """
    
    """

    col_dic = {'Unnamed: 0':'County'} #column names to reformat
    data = pd.DataFrame()

    for i in sheet_names:
        d = pd.read_excel(filepath, sheet_name=i, header=5)
        data = pd.concat([data, d], ignore_index=True)
    # # Due to source formatting, 'Counties' column will appear unnamed when first imported, rename to counties:
    data = data.rename(columns=col_dic) 
    data = data.groupby('County').sum().reset_index()
    data['County'] = [i.lower().strip() for i in data['County'].tolist()]
    data = data[~data['County'].str.contains('total')].copy()
    
    return data

In [79]:
cong_sheet = 'U.S. Congress'

In [80]:
WI_data = []

for i in WI_files:
    file = pd.read_excel(f'{elec_folder_path}/{i}', sheet_name=cong_sheet, header=5)
    
    # Generate a name for each dataframe based on the filename without the file extension
    name = f'{i}' 
    name = name[:-4]+'_f' 
    
    
    # Append both to the empty list
    WI_data.append((name, file))


ValueError: Worksheet named 'U.S. Congress' not found

In [None]:
for year, i in WI_data:
    print(i.columns)

In [None]:
def get_WI_data(sheet_names,filepath):
    
    """
    
    """

    col_dic = {'Unnamed: 0':'County'} #column names to reformat
    # bad_counties = ['total','percentage'] #rows with totals rather than county data
    data = pd.DataFrame()

    for i in sheet_names:
        d = pd.read_excel(filepath, sheet_name=i, header=5)
        data = pd.concat([data, d], ignore_index=True)
    # # Due to source formatting, 'Counties' column will appear unnamed when first imported, rename to counties:
    data = data.rename(columns=col_dic) 
    data = data.groupby('County').sum().reset_index()
    # lower_county  = [i.lower() for i in data['County'].tolist()]
    data['County'] = [i.lower().strip() for i in data['County'].tolist()]
    data = data[~data['County'].str.contains('total')].copy()
    
    return data