In [1]:
import numpy as np
import pandas as pd
from src.modules import *

In [2]:
import os

In [3]:
# Folder holding the raw IRS county CSV files; the cells below list and read it.
irs_raw_folder_path = 'data/irs_data/raw' 
# Folder the processed ("_f") and year-over-year ("_d") CSV files are written to.
irs_folder_path = 'data/irs_data/'

In [4]:
# Collect the plain files (sub-directories are skipped) found in the raw IRS folder.
# os.listdir() returns entries in arbitrary order, so the names are sorted explicitly:
# the reversal and the pairwise year-over-year cells further down rely on the list
# being in chronological order.
irs_raw_files = sorted(
    entry
    for entry in os.listdir(irs_raw_folder_path)
    if os.path.isfile(os.path.join(irs_raw_folder_path, entry))
)

In [5]:
# Flip the list so the most recent year comes first.
# Not strictly required, but with the newer year leading each pair a
# "newer minus older" difference reads the usual way: growth is positive, loss is negative.
irs_raw_files = list(reversed(irs_raw_files))

In [6]:
irs_raw_files # just checking...

['irs_count_2020.csv',
 'irs_count_2019.csv',
 'irs_count_2018.csv',
 'irs_count_2017.csv',
 'irs_count_2016.csv',
 'irs_count_2015.csv',
 'irs_count_2014.csv',
 'irs_count_2013.csv',
 'irs_count_2012.csv',
 'irs_count_2011.csv']

In [7]:

# Build a list of (name, dataframe) tuples, one per raw IRS file, in the order of irs_raw_files.
processed_irs_files = []

# Identifier columns that are not used in the current analysis.
# The raw files are preserved on disk, so these stay available for future use.
unused_columns = ['STATEFIPS', 'AGI_STUB', 'COUNTYFIPS']

# Row-wise form of the county filter. The loop below uses a vectorized equivalent,
# but later cells still call `filter_counties`, so the name stays defined here.
filter_counties = lambda row: len(row['COUNTYNAME'].split()) >=2

for raw_filename in irs_raw_files:
    # The IRS files need latin-1 decoding.
    file = pd.read_csv(os.path.join(irs_raw_folder_path, raw_filename), encoding='latin-1')
    file = file.drop(unused_columns, axis=1).reset_index(drop=True)

    # Keep only rows whose COUNTYNAME has at least two whitespace-separated words;
    # single-word names should be the aggregated state rows.
    # Done on the whole column at once instead of a per-row apply; a missing
    # COUNTYNAME yields NaN here and is filtered out rather than raising.
    has_multiword_name = file['COUNTYNAME'].str.split().str.len() >= 2
    data_counties = file[has_multiword_name]

    # Name for this frame: the filename without its extension, plus '_f'.
    # splitext handles any extension length, unlike slicing off a fixed 4 characters.
    name = os.path.splitext(raw_filename)[0] + '_f'

    # Also bind the frame as a module-level variable under that name
    # (e.g. irs_count_2011_f); later cells refer to these variables directly.
    globals()[name] = data_counties

    processed_irs_files.append((name, data_counties))

In [None]:
# def prev_year_change(data1, data2):
#     """
#     This function will substitute the difference between a year and the previous year's IRS records, 
#     Giving us the change-over-year and letting us use that as an indication of economic growth/loss
    
#     NOTE: data1 should be the more recent year, data2 should be the older year.
#     """
    
#     # Create a set containing columns common to both dataframes 
#     # Note for future use, the '&' here only keeps items that intersect both lists
#     col_common = set(data1.columns) & set(data2.columns)
    
#     #merge the dataframes, keeping data1 if the datatype is 'Object' or if the column is not in the set (all columns held in column should appear in the set..)
#     merged = pd.concat([
#         data1[c] if c not in col_common or data1[c].dtype == 'O' else data1[c] - data2[c]
#         for c in data1.columns
#     ], axis=1)
    
#     return merged

In [None]:
# Names only: the first element of each (name, dataframe) tuple, in processing order.
formatted_irs_files = [name for name, _frame in processed_irs_files]

In [None]:
# Dataframes only: the second element of each (name, dataframe) tuple, in processing order.
formatted_irs_dataframes = [frame for _name, frame in processed_irs_files]

In [9]:
# Pair consecutive entries of processed_irs_files -- (0, 1), (2, 3), ... -- and write
# two CSV files per pair into irs_folder_path:
#   <name>.csv    the first frame of the pair (the 'original' formatted file)
#   <name>_d.csv  the result of prev_year_change(first, second)

# The pairing needs an even number of files. Fail before anything is written
# instead of handing None to prev_year_change for an unpaired last entry.
if len(processed_irs_files) % 2 != 0:
    raise ValueError(
        f'Expected an even number of IRS files to pair up, got {len(processed_irs_files)}'
    )

# Create the output directory once, if it doesn't exist
os.makedirs(irs_folder_path, exist_ok=True)

for i in range(0, len(processed_irs_files), 2): # Step through the list two items at a time

    # Each entry is a (name, dataframe) tuple
    name, arg1 = processed_irs_files[i]
    arg2 = processed_irs_files[i + 1][1]

    # Subtract one year's data from the other (computed once per pair)
    df_diff = prev_year_change(arg1, arg2)
    new_df = f'{name}_d' # Name used for the difference file

    # Save the DataFrames as CSV files
    csv_filename_f = f'{name}.csv'
    csv_filename_d = f'{new_df}.csv'
    arg1.to_csv(os.path.join(irs_folder_path, csv_filename_f), index=False) # the 'original' formatted file
    df_diff.to_csv(os.path.join(irs_folder_path, csv_filename_d), index=False) # the file processed by prev_year_change

In [None]:
# Pair consecutive entries of processed_irs_files -- (0, 1), (2, 3), ... -- and write
# the result of prev_year_change(first, second) to <name>_d.csv in irs_folder_path.

# The pairing needs an even number of files. Fail before anything is written
# instead of handing None to prev_year_change for an unpaired last entry.
if len(processed_irs_files) % 2 != 0:
    raise ValueError(
        f'Expected an even number of IRS files to pair up, got {len(processed_irs_files)}'
    )

# Create the output directory once, if it doesn't exist
os.makedirs(irs_folder_path, exist_ok=True)

for i in range(0, len(processed_irs_files), 2): # Step through the list two items at a time
    # Each entry is a (name, dataframe) tuple; only the frames are passed on
    arg1 = processed_irs_files[i][1]
    arg2 = processed_irs_files[i + 1][1]

    # Subtract one year's data from the other (computed once per pair)
    df_diff = prev_year_change(arg1, arg2)
    new_df = f'{processed_irs_files[i][0]}_d' # Name used for the difference file

    # Save the DataFrame as a CSV file
    csv_filename = f'{new_df}.csv'
    df_diff.to_csv(os.path.join(irs_folder_path, csv_filename), index=False)

In [None]:
# NOTE(review): `filtered_irs_dataframes` and `prev_year_change2` are not defined in the
# cells visible here (the list built above is `formatted_irs_dataframes`) -- confirm they
# exist, e.g. via `from src.modules import *`, before relying on this cell.
# NOTE(review): the result is labelled 2012 and reordered to irs_count_2011_f's columns while
# list positions 0 and 1 are used -- check those positions really hold the 2012/2011 frames,
# given that the file list was reversed to newest-first.
irs_county_2012_d = prev_year_change2(filtered_irs_dataframes[0],filtered_irs_dataframes[1])
# Reorder the result's columns to follow irs_count_2011_f.
irs_county_2012_d = irs_county_2012_d[irs_count_2011_f.columns.tolist()]

In [None]:
# NOTE(review): `filtered_irs_dataframes` and `prev_year_change2` are not defined in the
# cells visible here -- confirm where they come from. Also check that list positions 2 and 3
# hold the 2014/2013 frames implied by the variable names.
irs_county_2014_d = prev_year_change2(filtered_irs_dataframes[2],filtered_irs_dataframes[3])
# Reorder the result's columns to follow irs_count_2013_f.
irs_county_2014_d = irs_county_2014_d[irs_count_2013_f.columns.tolist()]

In [None]:
# NOTE(review): `filtered_irs_dataframes` and `prev_year_change2` are not defined in the
# cells visible here -- confirm where they come from. Also check that list positions 4 and 5
# hold the 2016/2015 frames implied by the variable names.
irs_county_2016_d = prev_year_change2(filtered_irs_dataframes[4],filtered_irs_dataframes[5])
# Reorder the result's columns to follow irs_count_2015_f.
irs_county_2016_d = irs_county_2016_d[irs_count_2015_f.columns.tolist()]

In [None]:
# NOTE(review): `filtered_irs_dataframes` and `prev_year_change2` are not defined in the
# cells visible here -- confirm where they come from. Also check that list positions 6 and 7
# hold the 2018/2017 frames implied by the variable names.
irs_county_2018_d = prev_year_change2(filtered_irs_dataframes[6],filtered_irs_dataframes[7])
# Reorder the result's columns to follow irs_count_2017_f.
irs_county_2018_d = irs_county_2018_d[irs_count_2017_f.columns.tolist()]

In [None]:
# NOTE(review): `filtered_irs_dataframes` and `prev_year_change2` are not defined in the
# cells visible here -- confirm where they come from. Also check that list positions 8 and 9
# hold the 2020/2019 frames implied by the variable names.
irs_county_2020_d = prev_year_change2(filtered_irs_dataframes[8],filtered_irs_dataframes[9])
# Reorder the result's columns to follow irs_count_2019_f.
irs_county_2020_d = irs_county_2020_d[irs_count_2019_f.columns.tolist()]

In [None]:
irs_county_2016_d

In [None]:
filtered_irs_files

In [None]:
# Call prev_year_change on consecutive pairs of yearly IRS frames and bind each result
# as a module-level variable.
# NOTE(review): `filtered_irs_dataframes` is not defined in the cells visible here -- confirm where it comes from.
for i in range(0, len(filtered_irs_dataframes), 2): # Step through the list two items at a time
    arg1 = filtered_irs_dataframes[i]
    arg2 = filtered_irs_dataframes[i + 1] if i + 1 < len(filtered_irs_dataframes) else None # None when the list has an odd length; TODO: raise an error instead
    prev_year_change2(arg1, arg2) # NOTE(review): the return value is discarded, and the next line calls prev_year_change (no "2") -- confirm which function is intended
    df_diff = prev_year_change(arg1, arg2) # Difference frame for this pair
    new_df = f'{arg1}_d' # NOTE(review): arg1 is a list element (presumably a DataFrame), so this formats the object itself rather than a short name -- a filename-based name was probably intended
    globals()[new_df] = df_diff 
    


In [None]:
# Load the 2012 and 2011 raw county files directly, using latin-1 decoding.
irs_2012, irs_2011 = (
    pd.read_csv(raw_path, encoding='latin-1')
    for raw_path in (
        r'data/irs_data/raw/irs_count_2012.csv',
        r'data/irs_data/raw/irs_count_2011.csv',
    )
)

In [None]:
# Remove the three identifier columns from both frames and renumber the rows from zero.
# Each name is rebound to the trimmed frame.
irs_2011 = irs_2011.drop(columns=['STATEFIPS', 'AGI_STUB', 'COUNTYFIPS']).reset_index(drop=True)
irs_2012 = irs_2012.drop(columns=['STATEFIPS', 'AGI_STUB', 'COUNTYFIPS']).reset_index(drop=True)

In [None]:
# Evaluate filter_counties on every row and keep only the rows it accepts.
irs_2011_f = irs_2011[irs_2011.apply(filter_counties, axis='columns')]
irs_2012_f = irs_2012[irs_2012.apply(filter_counties, axis='columns')]

In [None]:
irs_2012_f

In [None]:
# Column names of the filtered 2011 frame, as a plain list, in their current order.
col_order = irs_2011_f.columns.to_list()

In [None]:
# Year-over-year change for 2012 vs 2011, with the 2012 frame passed first.
# NOTE(review): prev_year_change's live definition is not visible in this notebook;
# its commented-out draft expects the more recent year as the first argument -- confirm.
df_diff = prev_year_change(irs_2012_f, irs_2011_f)

In [None]:
# Put the difference frame's columns in the order recorded in col_order.
df_diff = df_diff.loc[:, col_order]

In [None]:
df_diff

In [None]:
# for i in irs_raw_files: # Call item in the file list
#     file = pd.read_csv('{irs_raw_folder_path}/{i}', encoding='latin-1')
#     file = file.drop(['STATEFIPS','AGI_STUB','COUNTYFIPS'],axis=1).reset_index(drop=True)
#     filter_counties = lambda row: len(row['COUNTYNAME'].split()) >=2
#     data_counties = file[file.apply(filter_counties, axis=1)]
#     name = f'{i}'
#     name = name[:-3]+'_f'
#     globals()[name] = data_counties

In [None]:
# for i in range(0, len(irs_raw_files), 2): # Call every-other item in the file list
#     arg1 = irs_raw_files[i]
#     arg2 = irs_raw_files[i + 1] if i + 1 < len(irs_raw_files) else None #if there is an odd number, it should throw an error, IMPLIMENT LATER
#     prev_year_change(arg1, arg2) # Run the function to subtract one year's data from the previous year's data
#     df_diff = prev_year_change(arg1, arg2) # Placeholder variable
#     new_df = f'{arg1}_d' #name of new variable
#     globals()[new_df] = df_diff # from the documentation: 'the globals() function is a built-in function that returns a dictionary representing the current global symbol table' only half understand this, but it works (#programming)
    


In [None]:
irs_2012.columns

In [None]:
# Rows of the 2012 frame whose STATE is 'IL', copied so the subset is independent of irs_2012.
il_irs_2012 = irs_2012[irs_2012['STATE'].eq('IL')].copy()

In [None]:
len(il_irs_2012)

In [None]:
# il_irs_2012['COUNTYNAME'] = remove_keyword(il_irs_2012['COUNTYNAME'],'County')

In [None]:
# Remove the identifier columns from the Illinois subset and renumber the rows from zero.
# errors='ignore' keeps this cell runnable when the columns are already gone: an earlier
# cell drops the same columns from irs_2012 (which this subset is taken from), and
# re-running this cell would otherwise raise KeyError as well.
il_irs_2012 = (
    il_irs_2012
    .drop(columns=['STATEFIPS', 'AGI_STUB', 'COUNTYFIPS'], errors='ignore')
    .reset_index(drop=True)
)

In [None]:
il_irs_2012

In [None]:
def filter_counties(row):
    """Return True when the row's COUNTYNAME holds at least two whitespace-separated words."""
    words = row['COUNTYNAME'].split()
    return len(words) >= 2

In [None]:
# Keep only the 2012 rows that filter_counties accepts.
data_counties = irs_2012[irs_2012.apply(filter_counties, axis='columns')]

In [None]:
data_counties

In [None]:
irs_2012

In [None]:
irs_2012['STATE'].unique()

In [None]:
data_counties['STATE'].unique()

In [None]:
# Worked example: merge two frames on a key and replace every other shared column
# with (first frame - second frame).

# Sample dataframes
data1 = {'ID': [1, 2, 3], 'Value': [10, 20, 30]}
data2 = {'ID': [1, 2, 3], 'Value': [5, 10, 15]}

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# Merge on 'ID'; every other shared column gets a suffix naming the frame it came from.
merged_df = pd.merge(df1, df2, on='ID', suffixes=('_df1', '_df2'))

# The merge key is shared by both frames as well, but it is never suffixed, so it is
# left out here: looking up 'ID_df1' would raise KeyError.
shared_value_columns = df1.columns.intersection(df2.columns).drop('ID')

# Calculate the difference for each shared value column
for column in shared_value_columns:
    merged_df[column] = merged_df[f'{column}_df1'] - merged_df[f'{column}_df2']

# Drop the suffixed helper columns now that the differences are in place
merged_df = merged_df.drop(
    columns=[
        f'{column}{suffix}'
        for column in shared_value_columns
        for suffix in ('_df1', '_df2')
    ]
)

print(merged_df)

In [None]:
irs_raw_files

In [None]:
# Reverse the file list again. Every run of this cell flips the order once more,
# so the resulting order depends on how many reversals have already been executed.
irs_raw_files = list(reversed(irs_raw_files))

In [None]:
# def prev_year_change(data1, data2):
#     """
#     This function will substitute the difference between a year and the previous year's IRS records, 
#     Giving us the change-over-year and letting us use that as an indication of economic growth/loss
    
#     NOTE: data1 should be the more recent year, data2 should be the older year.
#     """
    
#     #merge the dataframes, for columns with common names add a suffix to denote the source
#     data_diff = pd.merge(data1, data2, on='COUNTYNAME', suffixes=('_data1','_data2'))
#     common_col = set(data1.columns) & set(data2.columns)
#     for c in common_col:
#         if data1[c].dtype == data2[c].dtype and pd.api.types.is_numeric_dtype(data1[c]): # Lucky me there's such as thing as this 'types.is_numeric_dtype'...
#             data_diff = data_diff[f'{c}_data1']-data_diff[f'{c}_data2']
#     data_diff = data_diff.drop(columns=[c + '_data1' for c in common_columns] + [c + '_data2' for c in common_columns], inplace=True)
#     return data_diff

In [None]:
# def your_function(arg1, arg2):
#     # Your function logic here
#     print(f"Function called with arguments: {arg1}, {arg2}")

# # Example list of filenames
# filenames = ['file1.txt', 'file2.txt', 'file3.txt', 'file4.txt', 'file5.txt']



In [None]:
# NOTE(review): called with no arguments, while every other call in this notebook passes
# two frames -- presumably leftover scratch; confirm, then supply arguments or remove.
prev_year_change()