In [1]:
import numpy as np
import pandas as pd
from src.modules import * #contains functions used in common with processing election and IRS data

import os #Used when reading/writing csv files programatically

Read in each file in the raw IRS folder

In [2]:
irs_raw_folder_path = 'data/irs_data/raw' 

irs_raw_files = [file for file in os.listdir(irs_raw_folder_path ) if os.path.isfile(os.path.join(irs_raw_folder_path , file))]

For each file, get a list of the file name and the dataframe, this will be useful for tracing any inconsistencies in the datasets so they can be evaluated and corrected either with python or, if a programatic approach isn't appropriate, in the csv

In [3]:
#Create an empty list to hold filenames and dataframes

list_irs_files = []

for i in irs_raw_files:
    file = pd.read_csv(f'{irs_raw_folder_path}/{i}', encoding='latin-1') #the original; irs formatting requires this encoding, better to keep this in mind rather than reformatting each CSV file individually
    
    # Generate a name for each dataframe based on the filename without the file extension
    name = f'{i}' 
    name = name[:-4]+'_f' 
    
    # Assign the dataframe to the variable name
    
    # Append both to the empty list
    list_irs_files.append((name, file))


In [4]:
# check the shape of our data, if the number of rows differ, we will need to investigate further
for i in list_irs_files:
    print(f"{i[0]} has a shape of {i[1].shape}")

irs_count_2011_f has a shape of (3193, 74)
irs_count_2012_f has a shape of (3193, 77)
irs_count_2013_f has a shape of (3193, 115)
irs_count_2014_f has a shape of (3192, 128)
irs_count_2015_f has a shape of (3192, 132)
irs_count_2016_f has a shape of (3192, 148)
irs_count_2017_f has a shape of (3192, 154)
irs_count_2018_f has a shape of (3192, 154)
irs_count_2019_f has a shape of (3193, 153)
irs_count_2020_f has a shape of (3193, 166)


In [5]:
# differences in the number of columns is acceptable, but differences in the number of rows represents lost/gained counties, states, or some other geographic boundary

In [6]:
# let's treat the most recent data as the baseline and see what differences there are in the States and Countynames, to locate aany missing or extra entries which will need addressing, esp. with regards for the differences in row counts

# Extract unique pairs of 'STATE' and 'COUNTYNAME' from the first dataframe
base_state_county_pairs = list_irs_files[0][1].groupby('STATE')['COUNTYNAME'].agg(list).to_dict()

states = list(list_irs_files[0][1]['STATE'].unique())

In [7]:
print(len(states))
states

51


['AL',
 'AK',
 'AZ',
 'AR',
 'CA',
 'CO',
 'CT',
 'DE',
 'DC',
 'FL',
 'GA',
 'HI',
 'ID',
 'IL',
 'IN',
 'IA',
 'KS',
 'KY',
 'LA',
 'ME',
 'MD',
 'MA',
 'MI',
 'MN',
 'MS',
 'MO',
 'MT',
 'NE',
 'NV',
 'NH',
 'NJ',
 'NM',
 'NY',
 'NC',
 'ND',
 'OH',
 'OK',
 'OR',
 'PA',
 'RI',
 'SC',
 'SD',
 'TN',
 'TX',
 'UT',
 'VT',
 'VA',
 'WA',
 'WV',
 'WI',
 'WY']

In [8]:
# This does produce a list of State abbr. plus 'DC' for the Dist. of Columbia

In [9]:
# First, let's see if any States have varying numbers of counties by creating lists of each county with a common state name and checking the length of those lists of counties

In [10]:
# Iterate through the rest of the dataframes and compare with the expected pairs
for i, (name, df) in enumerate(list_irs_files[1:], start=2):
    # Extract unique pairs from the current dataframe
    current_state_counties = df.groupby('STATE')['COUNTYNAME'].agg(list).to_dict()
    
    for state in states:
        if len(current_state_counties[state]) != len(base_state_county_pairs[state]):
            print(f"There is a mismatch in {name} for the State {state}")
            print(f"{current_state_counties[state]}")
            for county in current_state_counties[state]:
                if county not in base_state_county_pairs[state]:
                    print(f"{county} in {name} is not in base data")
            for county_base in base_state_county_pairs[state]:
                if county_base not in current_state_counties[state]:
                    print(f"{county_base} is not in {name}")


There is a mismatch in irs_count_2014_f for the State VA
['Virginia', 'Accomack County', 'Albemarle County', 'Alleghany County', 'Amelia County', 'Amherst County', 'Appomattox County', 'Arlington County', 'Augusta County', 'Bath County', 'Bedford County', 'Bland County', 'Botetourt County', 'Brunswick County', 'Buchanan County', 'Buckingham County', 'Campbell County', 'Caroline County', 'Carroll County', 'Charles City Count', 'Charlotte County', 'Chesterfield Count', 'Clarke County', 'Craig County', 'Culpeper County', 'Cumberland County', 'Dickenson County', 'Dinwiddie County', 'Essex County', 'Fairfax County', 'Fauquier County', 'Floyd County', 'Fluvanna County', 'Franklin County', 'Frederick County', 'Giles County', 'Gloucester County', 'Goochland County', 'Grayson County', 'Greene County', 'Greensville County', 'Halifax County', 'Hanover County', 'Henrico County', 'Henry County', 'Highland County', 'Isle of Wight Coun', 'James City County', 'King and Queen Cou', 'King George County'

In [11]:
# Hey! This shows that for the data for 2014-2018, Virginia is missing a row (as shown when we checked the shape above), and for 2019 and 2020 Virgina is (likely) missing a row, but Alaska has an extra one (bringing the shapes in line), 

In [12]:
# Let's see if we can locate these differences, before expanding our search:

# We will alter the codeblock above to also print out the list of missing (or extra) counties in States where the number of counties differs from the group
for i, (name, df) in enumerate(list_irs_files[1:], start=2):
    # Extract unique pairs from the current dataframe
    current_state_counties = df.groupby('STATE')['COUNTYNAME'].agg(list).to_dict()
    
    for state in states:
        if len(current_state_counties[state]) != len(base_state_county_pairs[state]):
            print(f"There is a mismatch in {name} for the State {state}")
            for county in current_state_counties[state]:
                if county not in base_state_county_pairs[state]:
                    print(f"{county} in {name} is not in base data")
            for county_base in base_state_county_pairs[state]:
                if county_base not in current_state_counties[state]:
                    print(f"{county_base} is not in {name}")

There is a mismatch in irs_count_2014_f for the State VA
Charles City Count in irs_count_2014_f is not in base data
Chesterfield Count in irs_count_2014_f is not in base data
Isle of Wight Coun in irs_count_2014_f is not in base data
King and Queen Cou in irs_count_2014_f is not in base data
King William Count in irs_count_2014_f is not in base data
Northumberland Cou in irs_count_2014_f is not in base data
Pittsylvania Count in irs_count_2014_f is not in base data
Prince Edward Coun in irs_count_2014_f is not in base data
Prince George Coun in irs_count_2014_f is not in base data
Prince William Cou in irs_count_2014_f is not in base data
Rappahannock Count in irs_count_2014_f is not in base data
Spotsylvania Count in irs_count_2014_f is not in base data
Westmoreland Count in irs_count_2014_f is not in base data
Charlottesville ci in irs_count_2014_f is not in base data
Colonial Heights c in irs_count_2014_f is not in base data
Fredericksburg cit in irs_count_2014_f is not in base data

In [13]:
# That makes a lot of noise..
# A quick look at the data here suggests that for some years there is a character limit for County Names and that is contributing to our mismatches  

In [14]:
# Well! This raises a number of issues with this data and is a LOT more than we were hoping for
# While the data from 2011, 2012, and 2013 all seems to agree, we find mismatched county names across the other data

In [15]:
# Goals for this initial attempt to homogenize the County Names data:L
## ensure common typographic formatting (reduce all strings to lower case)
## remove non-distinct category names from teh strings ('County' and it's varients, city, Borough, etc.)
### This will have the added effect of (hopefully) reducing the mismatches when the length of strings has been trimmed in the original data
## At the same time, we should consider elliminating the statewide data, as many states have counties named for the State, which when we trim 'County' from them could produce errors
### (or another State, there are 31 Washington counties, which highlights another issue: Data must be grouped by County AND State to avoid contamination)

# See https://en.wikipedia.org/wiki/List_of_United_States_counties_and_county_equivalents for additional details


In [16]:

#Let's re-do the original function reading in the IRS data, including code to remove unneeded data and edit the list of counties to better harmonize across the years
#Create an empty list to hold filenames and dataframes

formatted_irs_files = []

# These states with two-word names will not fit our filter condition in the block below and must be removed by another means
two_word_states = ['New Hampshire', 'New Jersey', 'New York', 'New Mexico', 'North Carolina', 'North Dakota', 'Rhode Island', 'South Carolina', 'South Dakota', 'West Virginia']

for i in irs_raw_files:
    file = pd.read_csv(f'{irs_raw_folder_path}/{i}', encoding='latin-1') #the original; irs formatting requires this encoding, better to keep this in mind rather than reformatting each CSV file individually
    file = file.drop(['STATEFIPS','AGI_STUB','COUNTYFIPS'],axis=1).reset_index(drop=True) # drop columns not used for the current analysis, but raw files are presevered for future use.
        
    # Lambda function to generate a boolian mask filtering 'COUNTYNAME' values with a single word, 
    # removing all single name counties and counties from the two_name_state list (should be aggegated state data, rather than County or other sub-division) 
    file = file[~file['COUNTYNAME'].isin(two_word_states)]
    filter_counties = lambda row: len(row['COUNTYNAME'].split()) >=2
    data_counties = file[file.apply(filter_counties, axis=1)]
    
    # As seen in the EDA for our State election data, we should choose a format for the data that will be consistent, opting here for all lower case for counties
    # Here we are splitting the COUNTYNAME at each whitespace and then joining everything except the final word
    lower_countynames = [' '.join(i.lower().split()[:-1]) for i in data_counties['COUNTYNAME']] # As seen in the EDA for our State election data, we should choose a format for the data that will be consistent, opting here for all lower case for counties
    
    # To avoid working on a slice of the dataframe, we will substitute a copy of the original
    data_copy = data_counties.reset_index().copy()
    data_copy['COUNTYNAME'] = lower_countynames
    
    # Generate a name for each dataframe based on the filename without the file extension
    name = f'{i}' 
    name = name[:-4]+'_f' 
    
    # Assign the dataframe to the variable name
    
    # Append both to the empty list
    formatted_irs_files.append((name, data_copy))


In [17]:
# from this formatted data, let's rebuild our base
base_state_county_pairs = formatted_irs_files[0][1].groupby('STATE')['COUNTYNAME'].agg(list).to_dict()

states = list(formatted_irs_files[0][1]['STATE'].unique())

In [18]:
# Let's see if we can locate these differences, before expanding our search:

# We will alter the codeblock above to also print out the list of missing (or extra) counties in States where the number of counties differs from the group
for i, (name, df) in enumerate(formatted_irs_files[1:], start=2):
    # Extract unique pairs from the current dataframe
    current_state_counties = df.groupby('STATE')['COUNTYNAME'].agg(list).to_dict()
    
    for state in states:
        if len(current_state_counties[state]) != len(base_state_county_pairs[state]):
            print(f"There is a mismatch in {name} for the State {state}")
            for county in current_state_counties[state]:
                if county not in base_state_county_pairs[state]:
                    print(f"{county} in {name} is not in base data")
            for county_base in base_state_county_pairs[state]:
                if county_base not in current_state_counties[state]:
                    print(f"{county_base} is not in {name}")

There is a mismatch in irs_count_2014_f for the State AK
aleutians west in irs_count_2014_f is not in base data
dillingham in irs_count_2014_f is not in base data
fairbanks north in irs_count_2014_f is not in base data
hoonah-angoon in irs_count_2014_f is not in base data
ketchikan in irs_count_2014_f is not in base data
kusilvak census in irs_count_2014_f is not in base data
lake and in irs_count_2014_f is not in base data
petersburg in irs_count_2014_f is not in base data
prince of in irs_count_2014_f is not in base data
southeast in irs_count_2014_f is not in base data
valdez-cordova in irs_count_2014_f is not in base data
wrangell city in irs_count_2014_f is not in base data
yukon-koyukuk in irs_count_2014_f is not in base data
aleutians west census is not in irs_count_2014_f
dillingham census is not in irs_count_2014_f
fairbanks north star is not in irs_count_2014_f
hoonah-angoon census is not in irs_count_2014_f
ketchikan gateway is not in irs_count_2014_f
lake and peninsula is n

In [19]:
# So, the Virginia data accounts for the difference in the size of our dataframes, but this does not produce any extra countynames, only a note that the lengths disagree 
# The only reason this should be is if the extra row in the 2011-2013 data has the same name as another row in the Virginia data
# A closer read of some of the outputs above shows a 'Bedford City' missing from the 2014-2020 data, which when we apply the filter removing final words will match 'Bedford County'
# 
# We can also see that Alaska is flagged for the 2019 and 2020 data, suggesting that an extra county-level tax division is included in the data for those years.
#
# We will need to be diligent when adding the election data for these States, which we had already flagged when noting the complexities of their reporting as noted in the Wiki article linked above00

In [20]:
#### FROM CHATGPT


# from this formatted data, let's rebuild our base
base_state_county_pairs = formatted_irs_files[0][1].groupby('STATE')['COUNTYNAME'].agg(list).to_dict()

states = list(formatted_irs_files[0][1]['STATE'].unique())

# Let's see if we can locate these differences, before expanding our search:

# We will alter the codeblock above to also print out the list of missing (or extra) counties in States where the number of counties differs from the group
for i, (name, df) in enumerate(formatted_irs_files[1:], start=2):
    # Extract unique pairs from the current dataframe
    current_state_counties = df.groupby('STATE')['COUNTYNAME'].agg(list).to_dict()
    
    for state in states:
        if set(current_state_counties[state]) != set(base_state_county_pairs[state]):
            print(f"There is a mismatch in {name} for the State {state}")
            missing_counties = set(base_state_county_pairs[state]) - set(current_state_counties[state])
            extra_counties = set(current_state_counties[state]) - set(base_state_county_pairs[state])
            
            if missing_counties:
                print(f"Missing counties in {name}: {', '.join(missing_counties)}")
                
            if extra_counties:
                print(f"Extra counties in {name}: {', '.join(extra_counties)}")

There is a mismatch in irs_count_2014_f for the State AK
Missing counties in irs_count_2014_f: wrangell city and, aleutians west census, matanuska-susitna, hoonah-angoon census, ketchikan gateway, lake and peninsula, petersburg census, yukon-koyukuk census, prince of wales-hyder census, southeast fairbanks census, valdez-cordova census, fairbanks north star, wade hampton census, dillingham census
Extra counties in irs_count_2014_f: dillingham, fairbanks north, petersburg, lake and, ketchikan, prince of, valdez-cordova, hoonah-angoon, kusilvak census, southeast, yukon-koyukuk, wrangell city, aleutians west
There is a mismatch in irs_count_2014_f for the State LA
Missing counties in irs_count_2014_f: la salle, st. john the baptist
Extra counties in irs_count_2014_f: lasalle, st. john the
There is a mismatch in irs_count_2014_f for the State MN
Missing counties in irs_count_2014_f: lake of the woods
Extra counties in irs_count_2014_f: lake of the
There is a mismatch in irs_count_2014_f fo

In [21]:

#Let's re-do the original function reading in the IRS data, including code to remove unneeded data and edit the list of counties to better harmonize across the years
#Create an empty list to hold filenames and dataframes

formatted_irs_files = []

# These states with two-word names will not fit our filter condition in the block below and must be removed by another means
two_word_states = ['New Hampshire', 'New Jersey', 'New York', 'New Mexico', 'North Carolina', 'North Dakota', 'Rhode Island', 'South Carolina', 'South Dakota', 'West Virginia']

# IRS misnamed contains a dictionary corresponding to misnamed counties which are to be homogenized across years
irs_misnamed = {'Matanuska-Susitna':'Matanuska-Susitna Borough'}
sd_misnamed = {'shannon':'oglala lakota'}
nm_misnamed = {'dona ana':'doña ana'}
la_misnamed = {'lasalle':'la salle','st. john the':'st. john the baptist'} 
mn_misnamed = {'lake of the':'lake of the woods'}

for i in irs_raw_files:
    file = pd.read_csv(f'{irs_raw_folder_path}/{i}', encoding='latin-1') #the original; irs formatting requires this encoding, better to keep this in mind rather than reformatting each CSV file individually
    file = file.drop(['STATEFIPS','AGI_STUB','COUNTYFIPS'],axis=1).reset_index(drop=True) # drop columns not used for the current analysis, but raw files are presevered for future use.
        
    # Since Matanuska-Susitna is misnamed in the 2014 data, we should rename it
    # We will also most likely need to build this out further as additional misnamed counties are found:
    if 'SD' in file['STATE'].values:
        file['COUNTYNAME'] = file['COUNTYNAME'].replace(sd_misnamed, regex=True)
    elif 'NM' in file['STATE'].values:
        file['COUNTYNAME'] = file['COUNTYNAME'].replace(nm_misnamed, regex=True)
    elif 'LA' in file['STATE'].values:
        file['COUNTYNAME'] = file['COUNTYNAME'].replace(la_misnamed, regex=True)
    elif 'MN' in file['STATE'].values:
        file['COUNTYNAME'] = file['COUNTYNAME'].replace(mn_misnamed, regex=True)    
    
    file['COUNTYNAME'] = file['COUNTYNAME'].replace(irs_misnamed)
    
    # Drop the 'Bedford City' row, which only appears in the pre-2014 data, now:
    file = file[file['COUNTYNAME'] != 'Bedford city']
  
    # Lambda function to generate a boolian mask filtering 'COUNTYNAME' values with a single word, 
    # removing all single name counties and counties from the two_name_state list (should be aggegated state data) 
    file = file[~file['COUNTYNAME'].isin(two_word_states)]
    filter_counties = lambda row: len(row['COUNTYNAME'].split()) >=2
    data_counties = file[file.apply(filter_counties, axis=1)]
    
    # As seen in the EDA for our State election data, we should choose a format for the data that will be consistent, opting here for all lower case for counties
    # Here we are splitting the COUNTYNAME at each whitespace and then joining everything except the final word
    lower_countynames = [' '.join(i.lower().split()[:-1]) for i in data_counties['COUNTYNAME']] # As seen in the EDA for our State election data, we should choose a format for the data that will be consistent, opting here for all lower case for counties
    
    # To avoid working on a slice of the dataframe, we will substitute a copy of the original
    data_copy = data_counties.reset_index().copy()
    data_copy['COUNTYNAME'] = lower_countynames
    
    # Generate a name for each dataframe based on the filename without the file extension
    name = f'{i}' 
    name = name[:-4]+'_f' 
    
    # Assign the dataframe to the variable name
    
    # Append both to the empty list
    formatted_irs_files.append((name, data_copy))


In [22]:
# check the shape of our data, if the number of rows differ, we will need to investigate further
for i in formatted_irs_files:
    print(f"{i[0]} has a shape of {i[1].shape}")

irs_count_2011_f has a shape of (3142, 72)
irs_count_2012_f has a shape of (3142, 75)
irs_count_2013_f has a shape of (3142, 113)
irs_count_2014_f has a shape of (3142, 126)
irs_count_2015_f has a shape of (3142, 130)
irs_count_2016_f has a shape of (3142, 146)
irs_count_2017_f has a shape of (3142, 152)
irs_count_2018_f has a shape of (3142, 152)
irs_count_2019_f has a shape of (3143, 151)
irs_count_2020_f has a shape of (3143, 164)


In [23]:
# from this formatted data, let's rebuild our base
base_state_county_pairs = formatted_irs_files[0][1].groupby('STATE')['COUNTYNAME'].agg(list).to_dict()

states = list(formatted_irs_files[0][1]['STATE'].unique())

In [24]:
# Let's see if we can locate these differences, before expanding our search:

# We will alter the codeblock above to also print out the list of missing (or extra) counties in States where the number of counties differs from the group
for i, (name, df) in enumerate(formatted_irs_files[1:], start=2):
    # Extract unique pairs from the current dataframe
    current_state_counties = df.groupby('STATE')['COUNTYNAME'].agg(list).to_dict()
    
    for state in states:
        if len(current_state_counties[state]) != len(base_state_county_pairs[state]):
            print(f"There is a mismatch in {name} for the State {state}")
            for county in current_state_counties[state]:
                if county not in base_state_county_pairs[state]:
                    print(f"{county} in {name} is not in base data")
            for county_base in base_state_county_pairs[state]:
                if county_base not in current_state_counties[state]:
                    print(f"{county_base} is not in {name}")

There is a mismatch in irs_count_2019_f for the State AK
aleutians west in irs_count_2019_f is not in base data
chugach census in irs_count_2019_f is not in base data
copper river census in irs_count_2019_f is not in base data
fairbanks north in irs_count_2019_f is not in base data
hoonah-angoon in irs_count_2019_f is not in base data
kusilvak census in irs_count_2019_f is not in base data
prince of in irs_count_2019_f is not in base data
southeast in irs_count_2019_f is not in base data
yukon-koyukuk in irs_count_2019_f is not in base data
aleutians west census is not in irs_count_2019_f
fairbanks north star is not in irs_count_2019_f
hoonah-angoon census is not in irs_count_2019_f
prince of wales-hyder census is not in irs_count_2019_f
southeast fairbanks census is not in irs_count_2019_f
valdez-cordova census is not in irs_count_2019_f
wade hampton census is not in irs_count_2019_f
yukon-koyukuk census is not in irs_count_2019_f
There is a mismatch in irs_count_2020_f for the State 

In [None]:
for i in formatted_irs_files:
    va_counties = i[1].loc[i[1]['STATE']=='VA']['COUNTYNAME'].dropna().tolist()
    print(len(va_counties))

In [None]:
base_va_counties = formatted_irs_files[-1][1].loc[formatted_irs_files[-1][1]['STATE']=='VA']['COUNTYNAME'].tolist()
for i in formatted_irs_files[:-1]:
    va_counties = i[1].loc[i[1]['STATE']=='VA']['COUNTYNAME'].tolist()
    for i in va_counties:
        if i not in base_va_counties:
            print(i)

In [None]:
base_va_counties = formatted_irs_files[-1][1].loc[formatted_irs_files[-1][1]['STATE']=='VA']['COUNTYNAME'].tolist()
for i in formatted_irs_files[:-1]:
    va_counties = i[1].loc[i[1]['STATE']=='VA']['COUNTYNAME'].tolist()
    print(va_counties)

In [None]:
# Extract county lists for each dataframe
county_lists = [i[1].loc[i[1]['STATE']=='VA']['COUNTYNAME'].tolist() for i in formatted_irs_files]

# Find the set difference to identify the specific county causing the discrepancy
set_difference = set(county_lists[0])
for county_list in county_lists[1:]:
    set_difference.symmetric_difference_update(county_list)

print(set_difference)


In [None]:
base_va_counties = set(formatted_irs_files[-1][1].loc[formatted_irs_files[-1][1]['STATE']=='VA']['COUNTYNAME'].dropna().tolist())

for i in formatted_irs_files[:-1]:
    va_counties = set(i[1].loc[i[1]['STATE']=='VA']['COUNTYNAME'].dropna().tolist())
    set_difference = va_counties.symmetric_difference(base_va_counties)

    if set_difference:
        print(f"Differences in {i[0]} for the state VA:")
        print(set_difference)


In [None]:
# Iterate through the rest of the dataframes and compare with the expected pairs
for i, (name, df) in enumerate(list_irs_files[1:], start=2):
    # Extract unique pairs from the current dataframe
    current_state_counties = df.groupby('STATE')['COUNTYNAME'].agg(list).to_dict()

    # Identify missing or extra pairs
    missing_pairs = {state: set(base_state_county_pairs.get(state, [])) - set(current_state_counties.get(state, [])) for state in base_state_county_pairs.keys()}
    extra_pairs = {state: set(current_state_counties.get(state, [])) - set(base_state_county_pairs.get(state, [])) for state in current_state_counties.keys()}

    # Print information about missing or extra pairs
    for state, counties in missing_pairs.items():
        if counties:
            print(f"Missing pairs in dataframe {name} for state {state}:\n{counties}")

    for state, counties in extra_pairs.items():
        if counties:
            print(f"Extra pairs in dataframe {name} for state {state}:\n{counties}")

In [None]:
# First, let's get a list of all the common columns across all of our data
ne

# The columns in the first dataframe will for the base we will compare all other dataframes to
# Since we will only be keeping columns common to all dataframes, this starting point is arbitrary

common_col = set(list_irs_files[0][1].columns) 

for i in list_irs_files[1:]:
    # for each dataframe after the first, create a set of the column names and use that to update the original set
    new_col_list = set(i[1].columns) 
    common_col = common_col & new_col_list
    
# Since we might be altering and ordering this, let's transform it into a list

common_col = list(common_col)

In [None]:
print(common_col)
len(common_col)

In [None]:
# If we retain all of these common columns except for the data origin and non-metric ones (State and county names, AGI (tax bracket), and FIPS numbers), we will be adding 62 independent variables to any ML algorith, which... is a lot

In [None]:
# While we don't expect county names top change year-to-year, we should check both to ensure they are formatted identically over years and otherwise remain consistent

base_counties = list_irs_files[-1][1]['COUNTYNAME'].to_list()
for i in list_irs_files[0:-1]:
    unmatched_counties = [county for county in i[1]['COUNTYNAME'].tolist() if county not in base_counties]
    if len(unmatched_counties) > 0:
        print(f"Counties mismatch in {i[0]}")
        print(unmatched_counties)
    else:
        print(f"Counties in base list match {i[0]}")

In [None]:
# Many of these mismatches appear to be a result of different maximum character counts truncating the length of the data, we can try to resolve this by removing the final word, so that if 'County' has been rendered as 'Co', 'Count', or something similar
# the resulting string will match.
# At the same time, we should consider elliminating the statewide data, as many states have counties named for the State, which when we trim 'County' from them could produce errors
# (or another State, there are 31 Washington counties, which highlights another issue: Data must be grouped by County AND State to avoid contamination)

# See https://en.wikipedia.org/wiki/List_of_United_States_counties_and_county_equivalents for additional details

#### ####NOTE for future data - Virginia, Maryland, Missouri and Nevada data will contain county and city data (seen above in cases like 'Petersburg Census Area'), so relying on 'County' or similar to be the final word of 'COUNTYNAME' will not hold.
As well, Louisianna uses 'Parishes',
A brief examination of this data appears below
####

In [None]:
IRS_2012_data = list_irs_files[1][1]

In [None]:
print(IRS_2012_data.loc[IRS_2012_data['STATE'] == 'VA']['COUNTYNAME'].to_list())

In [None]:
print(IRS_2012_data.loc[IRS_2012_data['STATE'] == 'MO']['COUNTYNAME'].to_list())

In [None]:
print(IRS_2012_data.loc[IRS_2012_data['STATE'] == 'NV']['COUNTYNAME'].to_list())

In [None]:
print(IRS_2012_data.loc[IRS_2012_data['STATE'] == 'MD']['COUNTYNAME'].to_list())

In [None]:
print(IRS_2012_data.loc[IRS_2012_data['STATE'] == 'AK']['COUNTYNAME'].to_list())

In [None]:

#Let's re-do the original function reading in the IRS data, including code to remove unneeded data and edit the list of counties to better harmonize across the years
#Create an empty list to hold filenames and dataframes

formatted_irs_files = []

# These states with two-word names will not fit our filter condition in the block below and must be removed by another means
two_word_states = ['New Hampshire', 'New Jersey', 'New York', 'New Mexico', 'North Carolina', 'North Dakota', 'Rhode Island', 'South Carolina', 'South Dakota', 'West Virginia']

for i in irs_raw_files:
    file = pd.read_csv(f'{irs_raw_folder_path}/{i}', encoding='latin-1') #the original; irs formatting requires this encoding, better to keep this in mind rather than reformatting each CSV file individually
    file = file.drop(['STATEFIPS','AGI_STUB','COUNTYFIPS'],axis=1).reset_index(drop=True) # drop columns not used for the current analysis, but raw files are presevered for future use.
        
    # Lambda function to generate a boolian mask filtering 'COUNTYNAME' values with a single word, 
    # removing all single name counties and counties from the two_name_state list (should be aggegated state data) 
    file = file[~file['COUNTYNAME'].isin(two_word_states)]
    filter_counties = lambda row: len(row['COUNTYNAME'].split()) >=2
    data_counties = file[file.apply(filter_counties, axis=1)]
    
    # As seen in the EDA for our State election data, we should choose a format for the data that will be consistent, opting here for all lower case for counties
    # Here we are splitting the COUNTYNAME at each whitespace and then joining everything except the final word
    lower_countynames = [' '.join(i.lower().split()[:-1]) for i in data_counties['COUNTYNAME']] # As seen in the EDA for our State election data, we should choose a format for the data that will be consistent, opting here for all lower case for counties
    
    # To avoid working on a slice of the dataframe, we will substitute a copy of the original
    data_copy = data_counties.reset_index().copy()
    data_copy['COUNTYNAME'] = lower_countynames
    
    # Generate a name for each dataframe based on the filename without the file extension
    name = f'{i}' 
    name = name[:-4]+'_f' 
    
    # Assign the dataframe to the variable name
    
    # Append both to the empty list
    formatted_irs_files.append((name, data_copy))


In [None]:
for i in formatted_irs_files:
    print(i[1].shape)

In [None]:
# The differences here compared to above: If every state has the aggregate data included in the IRS sheets, we would expect the number of lines to be reduced by 50
# Since we see an anomanlous drop of 51 in the 2014 data, we should query the original data to see what exactly is happening

In [None]:
two_word_states = ['New Hampshire', 'New Jersey', 'New York', 'New Mexico', 'North Carolina', 'North Dakota', 'Rhode Island', 'South Carolina', 'South Dakota', 'West Virginia']

# Iterate through the dataframes
for name, df in list_irs_files:
    # Filter rows where 'COUNTYNAME' has fewer than 2 words or is in the list of two-word states
    filtered_rows = df[df['COUNTYNAME'].apply(lambda x: len(str(x).split()) < 2 or x in two_word_states)][['STATE','COUNTYNAME']]

    # Print all items which we belive should correspond to aggregate State data
    
    if not filtered_rows.empty:
        print(f"Filtered rows in dataframe {name}:\n{filtered_rows}")
        print(f"number of rows:{len(filtered_rows)}")

In [None]:
# Here we can see above that 'Matanuska-Susitna' in the 2014 data was being selected by the filter removing all single-word COUNTYNAME
# We can correct that by either creating an exception for that name, or altering the countyname in the dataframe so that 'Matanuska-Susitna' matches the other years' data

In [None]:
# let's run the block of code looking for missing or misformatted COUNTNAME data in our dataframes:

# Extract unique pairs of 'STATE' and 'COUNTYNAME' from the first dataframe
expected_pairs = formatted_irs_files[0][1].groupby('STATE')['COUNTYNAME'].agg(list).to_dict()

# Iterate through the rest of the dataframes and compare with the expected pairs
for i, (name, df) in enumerate(formatted_irs_files[1:], start=2):
    # Extract unique pairs from the current dataframe
    current_pairs = df.groupby('STATE')['COUNTYNAME'].agg(list).to_dict()

    # Identify missing or extra pairs
    missing_pairs = {state: set(expected_pairs.get(state, [])) - set(current_pairs.get(state, [])) for state in expected_pairs.keys()}
    extra_pairs = {state: set(current_pairs.get(state, [])) - set(expected_pairs.get(state, [])) for state in current_pairs.keys()}

    # Print information about missing or extra pairs
    for state, counties in missing_pairs.items():
        if counties:
            print(f"Missing pairs in dataframe {name} for state {state}:\n{counties}")

    for state, counties in extra_pairs.items():
        if counties:
            print(f"Extra pairs in dataframe {name} for state {state}:\n{counties}")

In [None]:
# Let's build up our dictionary of county names to replace
    irs_misnamed = {'Matanuska-Susitna':'Matanuska-Susitna Borough','dona ana':'doña ana'}

In [None]:
# Let's continue exploring though, before 
# While we don't expect county names top change year-to-year, we should check both to ensure they are formatted identically over years and otherwise remain consistent

base_counties = formatted_irs_files[-1][1]['COUNTYNAME'].to_list()
for i in formatted_irs_files[0:-1]:
    unmatched_counties = [county for county in i[1]['COUNTYNAME'].tolist() if county not in base_counties]
    if len(unmatched_counties) > 0:
        print(f"Counties mismatch in {i[0]}")
        print(i[1].loc[i[1]['COUNTYNAME'].isin(unmatched_counties)][['STATE','COUNTYNAME']])
    else:
        print(f"Counties in base list match {i[0]}")

In [None]:
# First, let's get a list of all the common columns across all of our data
# This will give us an idea of how large the final datasets we will be adding to the election data might be and allow us to anticipate issues and better imagine solutions down the line

# The columns in the first dataframe will for the base we will compare all other dataframes to
# Since we will only be keeping columns common to all dataframes, this starting point is arbitrary

common_col = set(formatted_irs_files[0][1].columns) 

for i in formatted_irs_files[1:]:
    # for each dataframe after the first, create a set of the column names and use that to update the original set
    new_col_list = set(i[1].columns) 
    common_col = common_col & new_col_list
    
# Since we might be altering and ordering this, let's transform it into a list

common_col = list(common_col)

In [None]:
print(common_col)
len(common_col)

In [None]:
# If we retain all of these common columns except for the data origin and non-metric ones (State and county names, AGI (tax bracket), and FIPS numbers), we will be adding 62 independent variables to any ML algorith, which... is a lot

In [None]:
formatted_irs_files[-1][1].loc[formatted_irs_files[-1][1]['STATE']=='AK'][['STATE', 'COUNTYNAME']]