In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Census median-income table; the first CSV column is used as the row index.
income_csv_path = 'Public Data/census_med_income.csv'
income_raw = pd.read_csv(income_csv_path, index_col=0)

# Parish lookup workbook. Later cells read its 'zip_codes' and 'school_id'
# columns, so those are presumably present — verify against the file.
parish_workbook_path = 'parish_tracts_zips.xlsx'
parish_zip = pd.read_excel(parish_workbook_path)

In [None]:
# Verbose census headers -> short names used by the rest of the notebook.
# One dict drives both the column selection and the rename, so the two
# cannot drift apart.
income_column_names = {
    'Geographic Area Name': 'zip_code',
    'Estimate!!Median income (dollars)!!HOUSEHOLD INCOME BY RACE AND HISPANIC OR LATINO ORIGIN OF HOUSEHOLDER!!Households': 'all households',
    'Estimate!!Median income (dollars)!!FAMILIES!!Families': 'Med income families',
}

# Build a new frame from income_raw (which is never mutated), so re-running
# this cell always gives the same result. Column order is preserved:
# zip_code, the two income columns, then year.
income_clean = (
    income_raw[list(income_column_names) + ['year']]
    .rename(columns=income_column_names)
    # Pull the 5-digit ZIP out of the area label. expand=False returns a
    # Series instead of a one-column DataFrame.
    .assign(
        zip_code=lambda frame: frame['zip_code']
        .astype(str)
        .str.extract(r'(\b\d{5}\b)', expand=False)
    )
    # Labels without a 5-digit ZIP come back as NaN. Casting those to str
    # would store the literal text 'nan', which could later match a parish
    # row whose own ZIP list is missing. Such rows cannot be tied to a ZIP,
    # so drop them.
    .dropna(subset=['zip_code'])
)

In [None]:
# Prefix every income measure with 'Med inc_' while leaving the key columns
# alone. Selecting by name (rather than by a fixed positional slice) works
# for any number of columns, and the startswith guard makes the cell safe to
# re-run without stacking the prefix twice.
income_prefix = 'Med inc_'
income_key_columns = ('zip_code', 'year')

income_clean = income_clean.rename(
    columns=lambda name: name
    if name in income_key_columns or name.startswith(income_prefix)
    else income_prefix + name
)

# Downstream cells refer to the prepared table as income_parish.
income_parish = income_clean

In [None]:
def _parse_zip_field(raw_value):
    """Split a comma-separated ZIP string into trimmed, non-empty entries."""
    pieces = (piece.strip() for piece in raw_value.split(','))
    return [piece for piece in pieces if piece]


# Parse each row's ZIP text into a list of individual ZIP codes.
zip_text = parish_zip['zip_codes'].astype(str)
parish_zip['zip_codes_clean'] = zip_text.apply(_parse_zip_field)

# Union of every per-row list: the distinct ZIP codes across all rows.
all_unique_zips = set().union(*parish_zip['zip_codes_clean'])

In [None]:
# Year labels exactly as they appear in the 'year' column.
year_labels = ['2018_19', '2019_20', '2020_21', '2021_22', '2022_23']

# For each year: the parish ZIP codes that have no income row in that year.
# The size of each set is printed, one line per year, in the order above.
missing_zips_by_year = {}
for year_label in year_labels:
    present_zips = set(
        income_parish.loc[income_parish['year'] == year_label, 'zip_code'].unique()
    )
    missing_zips_by_year[year_label] = all_unique_zips - present_zips
    print(len(missing_zips_by_year[year_label]))

# Keep the per-year names that the following cell combines.
missing_zips_2018 = missing_zips_by_year['2018_19']
missing_zips_2019 = missing_zips_by_year['2019_20']
missing_zips_2020 = missing_zips_by_year['2020_21']
missing_zips_2021 = missing_zips_by_year['2021_22']
missing_zips_2022 = missing_zips_by_year['2022_23']

In [None]:
# A ZIP code is excluded if it lacks income data in at least one year.
yearly_gaps = [
    missing_zips_2018,
    missing_zips_2019,
    missing_zips_2020,
    missing_zips_2021,
    missing_zips_2022,
]
all_missing_zips = set().union(*yearly_gaps)

# Keep only the income rows whose ZIP code is not in the excluded set.
has_every_year = ~income_parish['zip_code'].isin(all_missing_zips)
income_parish = income_parish.loc[has_every_year]

In [None]:
def _split_zip_field(zip_field):
    """Break a comma-separated ZIP string into trimmed, non-empty pieces."""
    trimmed = (piece.strip() for piece in zip_field.split(','))
    return [piece for piece in trimmed if piece]


# Store the ZIP column as text, then derive a list-valued column from it.
parish_zip['zip_codes'] = parish_zip['zip_codes'].astype(str)
parish_zip['zip_codes_split'] = parish_zip['zip_codes'].apply(_split_zip_field)

# One output row per (parish row, ZIP) pair; the exploded column is renamed
# to 'zip_code' so it can serve as the merge key.
zips_exploded = (
    parish_zip
    .explode('zip_codes_split')
    .rename(columns={'zip_codes_split': 'zip_code'})
)



In [None]:
# Attach the income rows to every parish/ZIP pair. With a left join, pairs
# whose ZIP has no income data come through with NaN in the income columns.
merge = zips_exploded.merge(income_parish, on='zip_code', how='left')

# Drop only the rows that lack a value the aggregation actually needs.
# A bare dropna() would also discard rows because of a blank in any other
# column carried over from the parish workbook, silently shrinking the
# per-school averages.
required_cols = [
    'school_id',
    'year',
    'Med inc_all households',
    'Med inc_Med income families',
]
merge = merge.dropna(subset=required_cols)

#### Aggregation

In [None]:
def _to_dollar_amount(raw_values):
    """Convert income text (e.g. '250,000+') in a Series to floats.

    Commas and '+' characters are removed; the text 'nan' is turned into a
    real NaN before the float cast.
    """
    text = raw_values.astype(str)
    for marker in (',', '+'):
        text = text.str.replace(marker, '', regex=False)
    return text.replace('nan', np.nan).astype(float)


change_cols = ['Med inc_all households', 'Med inc_Med income families']

# Replace each income column with its numeric form.
for income_col in change_cols:
    merge[income_col] = _to_dollar_amount(merge[income_col])

In [None]:
group_cols = ['school_id', 'year']
cols_to_mean = ['Med inc_all households', 'Med inc_Med income families']

# Average the income measures per school and year. The columns are already
# float after the numeric cleaning cell, and mean() returns floats anyway,
# so no integer cast is applied first: such a cast would raise on any NaN
# and would overwrite `merge` in place.
grouped_df = merge.groupby(group_cols, as_index=False)[cols_to_mean].mean()



In [None]:
# Write the per-school, per-year averages. The row index is written as the
# first CSV column (to_csv default).
output_csv_path = 'census_med_income_by_parish.csv'
grouped_df.to_csv(output_csv_path)