In [1]:
import numpy as np
import pandas as pd
from modules import * 
#contains functions used in common with processing election and IRS data

import os #Used when reading/writing csv files programatically

In [3]:
irs_folder_path = '../data/irs_data/'

# Collect the processed year-over-year difference files (names ending in 'f_d.csv').
# os.listdir returns entries in arbitrary order, so sort the names to make the
# row order of anything built from this list reproducible across machines.
irs_files = sorted(
    fname
    for fname in os.listdir(irs_folder_path)
    if fname.endswith('f_d.csv') and os.path.isfile(os.path.join(irs_folder_path, fname))
)

In [4]:
# Display the filenames collected in irs_files as a quick sanity check
irs_files

['irs_count_2012_f_d.csv',
 'irs_count_2014_f_d.csv',
 'irs_count_2016_f_d.csv',
 'irs_count_2018_f_d.csv',
 'irs_count_2020_f_d.csv']

In [7]:
# Read every file listed in irs_files, then stack them with a single concat.
# Growing a DataFrame with pd.concat inside the loop re-copies all earlier rows
# on every iteration; collecting the frames first avoids that.
yearly_frames = [pd.read_csv(os.path.join(irs_folder_path, fname)) for fname in irs_files]

# ignore_index=True already yields a fresh 0..n-1 index, so no reset_index is needed.
# pd.concat raises on an empty list, so fall back to an empty frame in that case.
merged = pd.concat(yearly_frames, ignore_index=True) if yearly_frames else pd.DataFrame()

In [16]:
# Count the missing values in each column of the merged frame
null_counts = merged.isna().sum()
print(null_counts)

STATE             5
COUNTYNAME        5
N1                8
MARS1             8
MARS2             8
              ...  
A10970        12613
N10971        12613
A10971        12613
N10973        12613
A10973        12613
Length: 179, dtype: int64


In [19]:
# Inspect the rows whose N1 value is missing
merged[merged['N1'].isna()]

Unnamed: 0,STATE,COUNTYNAME,N1,MARS1,MARS2,MARS4,PREP,N2,NUMDEP,A00100,...,N02910,A02910,N11450,A11450,N10970,A10970,N10971,A10971,N10973,A10973
6114,WA,adams county,,,,,,,,,...,,,,,,,,,,
6209,WI,adams county,,,,,,,,,...,,,,,,,,,,
6281,WY,albany county,,,,,,,,,...,,,,,,,,,,
6304,,,,,,,,,,,...,,,,,,,,,,
6305,,,,,,,,,,,...,,,,,,,,,,
6306,,,,,,,,,,,...,,,,,,,,,,
6307,,,,,,,,,,,...,,,,,,,,,,
6308,,,,,,,,,,,...,,,,,,,,,,


In [20]:
irs_raw_folder_path = '../data/irs_data/raw'
irs_folder_path = '../data/irs_data/'

# List the raw CSV files with the newest data at the front of the list.
# We want growth to be positive and loss to be negative, so the newer year must
# come first in each pair that is later subtracted.
# os.listdir returns entries in arbitrary order, so reversing its result does not
# guarantee any particular order; sort explicitly in descending order instead.
# NOTE(review): this assumes the filenames embed the year so that lexical order
# matches chronological order - confirm against the raw folder.
# Non-CSV entries (e.g. hidden OS files) are skipped so they are never passed to read_csv.
irs_raw_files = sorted(
    (
        fname
        for fname in os.listdir(irs_raw_folder_path)
        if fname.lower().endswith('.csv')
        and os.path.isfile(os.path.join(irs_raw_folder_path, fname))
    ),
    reverse=True,
)

In [25]:
# Build a list of (name, dataframe) tuples, one per raw IRS file
processed_irs_files = []

print("Formatting raw IRS data")
for raw_name in irs_raw_files:
    # The raw IRS files need latin-1 decoding
    raw = pd.read_csv(os.path.join(irs_raw_folder_path, raw_name), encoding='latin-1')
    # Drop columns not used for the current analysis; the raw files themselves are preserved for future use
    raw = raw.drop(columns=['STATEFIPS', 'AGI_STUB', 'COUNTYFIPS']).reset_index(drop=True)

    # Keep only rows whose COUNTYNAME has at least two words; single-word names
    # should be the aggregated state rows. The vectorized .str accessor treats a
    # missing COUNTYNAME as "not a county" instead of crashing on .split().
    # NOTE(review): a multi-word state name would also pass this filter - confirm
    # that the raw data contains no such aggregate rows.
    name_words = raw['COUNTYNAME'].str.split()
    is_county = name_words.str.len() >= 2
    data_copy = raw.loc[is_county].copy()

    # Normalize the name: drop the trailing word, re-join the rest and lowercase it
    data_copy['COUNTYNAME'] = name_words[is_county].str[:-1].str.join(' ').str.lower()

    # Name each dataframe after its file (extension removed) with an '_f' suffix
    name = os.path.splitext(raw_name)[0] + '_f'

    processed_irs_files.append((name, data_copy))

Formatting raw IRS data


In [26]:
# Inspect the dataframe (second tuple element) of the first processed file
processed_irs_files[0][1]

Unnamed: 0,STATE,COUNTYNAME,N1,MARS1,MARS2,MARS4,ELF,CPREP,PREP,DIR_DEP,...,N85300,A85300,N11901,A11901,N11900,A11900,N11902,A11902,N12000,A12000
1,AL,autauga,26320.0,11410.0,10130.0,4100.0,24310.0,700.0,11760.0,19040.0,...,290.0,604.0,5020.0,19136.0,20570.0,64502.0,20310.0,60927.0,340.0,3181.0
2,AL,baldwin,112470.0,49740.0,46300.0,13310.0,102570.0,3060.0,55050.0,73130.0,...,3070.0,10401.0,24830.0,157074.0,82860.0,293170.0,79960.0,247067.0,3810.0,45965.0
3,AL,barbour,10760.0,5490.0,2790.0,2270.0,9340.0,250.0,6410.0,6810.0,...,100.0,273.0,1480.0,6578.0,8510.0,27152.0,8410.0,26521.0,140.0,529.0
4,AL,bibb,9330.0,4440.0,3170.0,1550.0,8180.0,250.0,4720.0,6490.0,...,50.0,124.0,1310.0,3650.0,7620.0,23846.0,7560.0,22534.0,90.0,1194.0
5,AL,blount,24670.0,10120.0,11400.0,2630.0,23210.0,530.0,14920.0,18100.0,...,190.0,357.0,4130.0,16496.0,19730.0,60471.0,19480.0,57848.0,340.0,2522.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3188,WY,sweetwater,19850.0,9070.0,8110.0,2320.0,18770.0,460.0,9440.0,14510.0,...,330.0,684.0,3030.0,13647.0,16320.0,56128.0,16020.0,52219.0,430.0,3431.0
3189,WY,teton,15130.0,8940.0,4910.0,930.0,14080.0,340.0,8770.0,7740.0,...,1910.0,136918.0,4260.0,109426.0,10420.0,271359.0,8820.0,60800.0,1960.0,207001.0
3190,WY,uinta,9510.0,4040.0,4320.0,990.0,8930.0,230.0,4640.0,6810.0,...,130.0,441.0,1570.0,7215.0,7640.0,27463.0,7500.0,24596.0,200.0,2597.0
3191,WY,washakie,3790.0,1700.0,1680.0,330.0,3540.0,80.0,1980.0,2550.0,...,50.0,303.0,710.0,4081.0,2900.0,8495.0,2800.0,7769.0,140.0,594.0


In [27]:
# Print the rows with a missing N1 value for every formatted dataframe
for _, frame in processed_irs_files:
    missing_n1 = frame['N1'].isna()
    print(frame.loc[missing_n1])

Empty DataFrame
Columns: [STATE, COUNTYNAME, N1, MARS1, MARS2, MARS4, ELF, CPREP, PREP, DIR_DEP, VRTCRIND, N2, TOTAL_VITA, VITA, TCE, VITA_EIC, RAC, ELDERLY, A00100, N02650, A02650, N00200, A00200, N00300, A00300, N00600, A00600, N00650, A00650, N00700, A00700, N00900, A00900, N01000, A01000, N01400, A01400, N01700, A01700, SCHF, N02300, A02300, N02500, A02500, N26270, A26270, N02900, A02900, N03220, A03220, N03300, A03300, N03270, A03270, N03150, A03150, N03210, A03210, N02910, A02910, N04450, A04450, N04100, A04100, N04200, A04200, N04470, A04470, A00101, N17000, A17000, N18425, A18425, N18450, A18450, N18500, A18500, N18800, A18800, N18460, A18460, N18300, A18300, N19300, A19300, N19500, A19500, N19530, A19530, N19550, A19550, N19570, A19570, N19700, A19700, N20950, A20950, N04475, A04475, N04800, ...]
Index: []

[0 rows x 163 columns]
Empty DataFrame
Columns: [STATE, COUNTYNAME, N1, MARS1, MARS2, MARS4, ELF, CPREP, PREP, DIR_DEP, N2, TOTAL_VITA, VITA, TCE, VITA_EIC, RAC, ELDERLY, A

In [None]:
def prev_year_change2(data1, data2):
    """
    Return the change between two years of IRS records, matched county by county.

    Rows are paired on ('STATE', 'COUNTYNAME') rather than on row position, so the
    two inputs may list counties in different orders or have different lengths.
    Only counties present in both inputs appear in the result.

    NOTE: data1 should be the more recent year, data2 should be the older year,
    so growth comes out positive and loss negative.

    Parameters
    ----------
    data1 : pandas.DataFrame
        The more recent year. Must contain 'STATE' and 'COUNTYNAME'.
    data2 : pandas.DataFrame
        The older year. Must contain 'STATE' and 'COUNTYNAME'.

    Returns
    -------
    pandas.DataFrame
        'STATE' and 'COUNTYNAME' followed by every other column common to both
        inputs, in data1's column order. Non-object columns hold data1 - data2;
        object columns keep data1's value.

    Raises
    ------
    KeyError
        If 'STATE' or 'COUNTYNAME' is missing from either input (raised by the merge).
    """
    key_cols = ['STATE', 'COUNTYNAME']

    # Columns common to both dataframes, as an ordered list: pandas does not accept
    # a set as an indexer, and a list keeps data1's column order deterministic.
    shared_cols = [col for col in data1.columns if col in data2.columns]
    value_cols = [col for col in shared_cols if col not in key_cols]

    # Pair each county's newer row with its older row.
    # NOTE(review): if a (STATE, COUNTYNAME) pair occurs more than once in an input,
    # the merge produces every combination of the duplicates - confirm keys are unique.
    paired = pd.merge(
        data1[shared_cols],
        data2[shared_cols],
        on=key_cols,
        suffixes=('_df1', '_df2'),
    )

    pieces = [paired[key_cols]]
    for col in value_cols:
        newer = paired[col + '_df1']
        older = paired[col + '_df2']
        if newer.dtype == 'O' or older.dtype == 'O':
            # Object columns cannot be subtracted; keep the more recent year's value
            pieces.append(newer.rename(col))
        else:
            pieces.append((newer - older).rename(col))

    return pd.concat(pieces, axis=1)

In [None]:
# Read IRS data
print("Reading IRS data")

irs_raw_folder_path = '../data/irs_data/raw'
irs_folder_path = '../data/irs_data/'

# List the raw CSV files with the newest data at the front of the list.
# We want growth to be positive and loss to be negative, so the newer year must
# come first in each pair that is later subtracted.
# os.listdir returns entries in arbitrary order, so reversing its result does not
# guarantee any particular order; sort explicitly in descending order instead.
# NOTE(review): this assumes the filenames embed the year so that lexical order
# matches chronological order - confirm against the raw folder.
# Non-CSV entries (e.g. hidden OS files) are skipped so they are never passed to read_csv.
irs_raw_files = sorted(
    (
        fname
        for fname in os.listdir(irs_raw_folder_path)
        if fname.lower().endswith('.csv')
        and os.path.isfile(os.path.join(irs_raw_folder_path, fname))
    ),
    reverse=True,
)

In [None]:
# Build a list of (name, dataframe) tuples, one per raw IRS file
processed_irs_files = []

print("Formatting raw IRS data")
for raw_name in irs_raw_files:
    # The raw IRS files need latin-1 decoding
    raw = pd.read_csv(os.path.join(irs_raw_folder_path, raw_name), encoding='latin-1')
    # Drop columns not used for the current analysis; the raw files themselves are preserved for future use
    raw = raw.drop(columns=['STATEFIPS', 'AGI_STUB', 'COUNTYFIPS']).reset_index(drop=True)

    # Keep only rows whose COUNTYNAME has at least two words; single-word names
    # should be the aggregated state rows. The vectorized .str accessor treats a
    # missing COUNTYNAME as "not a county" instead of crashing on .split().
    # NOTE(review): a multi-word state name would also pass this filter - confirm
    # that the raw data contains no such aggregate rows.
    is_county = raw['COUNTYNAME'].str.split().str.len() >= 2
    data_copy = raw.loc[is_county].copy()

    # Normalize the county names to lowercase (the full name is kept)
    data_copy['COUNTYNAME'] = data_copy['COUNTYNAME'].str.lower()

    # Name each dataframe after its file (extension removed) with an '_f' suffix
    name = os.path.splitext(raw_name)[0] + '_f'

    processed_irs_files.append((name, data_copy))

In [None]:
print("Processing IRS data, Writing IRS data to /irs_data")

# The files are consumed as consecutive (first, second) pairs, so an odd count
# leaves one file without a partner. Fail up front with a clear message, before
# any output is written, instead of passing None to prev_year_change2.
if len(processed_irs_files) % 2 != 0:
    raise ValueError(
        f"Expected an even number of IRS files to pair up, got {len(processed_irs_files)}"
    )

# Create the output directory once if it doesn't exist
os.makedirs(irs_folder_path, exist_ok=True)

for idx in range(0, len(processed_irs_files), 2):  # step through the list one pair at a time
    # Each entry is a (name, dataframe) tuple; the first of each pair names the outputs
    name, arg1 = processed_irs_files[idx]
    _, arg2 = processed_irs_files[idx + 1]

    # Compute the change between the two dataframes of the pair
    df_diff = prev_year_change2(arg1, arg2)

    # Save the 'original' formatted file and the difference file (suffix '_d')
    arg1.to_csv(os.path.join(irs_folder_path, f'{name}.csv'), index=False)
    df_diff.to_csv(os.path.join(irs_folder_path, f'{name}_d.csv'), index=False)