In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the two inputs: the combined housing extract and the parish lookup workbook.
housing_raw = pd.read_csv('Public Data/combined_housing_2018_2023.csv')

# Only the parish identifier and its ZIP-code column are used from the workbook.
parish_lookup_columns = ['school_id', 'zip_codes']
parish_zip = pd.read_excel('parish_tracts_zips.xlsx')[parish_lookup_columns]

In [None]:
# Count the distinct parish identifiers (missing identifiers are not counted).
num_unique_parishes = len(parish_zip['school_id'].dropna().unique())
print(f"Number of unique parishes: {num_unique_parishes}")

In [None]:
# Distinct parish identifiers, intended as the basis for dummy variables.
# NOTE(review): no later cell visible in this notebook reads `parish_columns` --
# confirm it is still needed.
parish_columns = pd.unique(parish_zip['school_id'])


In [None]:
# Work on a copy so the raw frame stays untouched and this cell can be re-run.
housing_year = housing_raw.copy()

# Relabel calendar years as school years (e.g. 2018 -> '2018_19').
# These underscore-separated labels are the 'Year' values every later cell sees.
school_year_labels = {
    '2018': '2018_19',
    '2019': '2019_20',
    '2020': '2020_21',
    '2021': '2021_22',
    '2022': '2022_23',
}
housing_year['Year'] = housing_year['Year'].astype(str).replace(school_year_labels)

# 2023 has no school-year label in the mapping above, so those rows are removed.
housing_year = housing_year[housing_year['Year'] != '2023']

# Drop duplicates if there are any.
# `duplicates` keeps every repeated (Zip Code, Year) row for inspection; the frame
# itself then keeps only the first occurrence, so a repeated row cannot be counted
# twice when the data is summed per parish later on.
duplicates = housing_year[housing_year.duplicated(subset=['Zip Code', 'Year'], keep=False)]
print(f"Duplicate (Zip Code, Year) rows found: {len(duplicates)}")
housing_year = housing_year.drop_duplicates(subset=['Zip Code', 'Year'], keep='first').copy()

# Check year
housing_year['Year'].unique()

In [None]:
# Treat the ZIP list of every parish as text so it can be split on commas.
parish_zip['zip_codes'] = parish_zip['zip_codes'].astype(str)

# Split each comma-separated string into individual ZIP tokens.
# A missing cell becomes the literal text 'nan' after astype(str); that placeholder
# is skipped here because it is not a ZIP and would break the integer cast below.
parish_zip['zip_codes_split'] = parish_zip['zip_codes'].apply(
    lambda raw: [
        token.strip()
        for token in raw.split(',')
        if token.strip() and token.strip().lower() != 'nan'
    ]
)

# One row per (parish, ZIP) pair. explode() turns an empty list into NaN, so
# parishes without any usable ZIP are dropped before the integer cast.
zips_exploded = (
    parish_zip
    .explode('zip_codes_split')
    .rename(columns={'zip_codes_split': 'zip_code'})
    .dropna(subset=['zip_code'])
)

# Use the same integer dtype on both sides of the upcoming join.
housing_year['Zip Code'] = housing_year['Zip Code'].astype(int)
zips_exploded['zip_code'] = zips_exploded['zip_code'].astype(int)

In [None]:
# Left join from the parish/ZIP pairs: every pair is kept, including those whose
# ZIP has no matching row in the housing data.
merge = pd.merge(
    zips_exploded,
    housing_year,
    how='left',
    left_on='zip_code',
    right_on='Zip Code',
)

In [None]:
# Separate copy of the year-labelled housing data for the ZIP coverage check.
housing_parish = housing_year.copy()

# Compare ZIP codes as whitespace-free text on both frames.
for frame, column in ((housing_parish, 'Zip Code'), (parish_zip, 'zip_codes')):
    frame[column] = frame[column].astype(str).str.strip()

In [None]:
def _split_zip_list(raw_value):
    """Split a comma-separated ZIP string into stripped, non-empty tokens."""
    pieces = (piece.strip() for piece in raw_value.split(','))
    return [piece for piece in pieces if piece]


# Parse each parish's comma-separated ZIP string into a list of ZIP tokens.
parish_zip['zip_codes_clean'] = parish_zip['zip_codes'].astype(str).apply(_split_zip_list)

# Collect every ZIP token that appears for any parish into one set.
all_unique_zips = set()
for zip_list in parish_zip['zip_codes_clean']:
    all_unique_zips.update(zip_list)

In [None]:
# For each school year, find the parish ZIPs that have no housing row in that year
# and print how many there are.
# The labels must match the underscore-separated values assigned to 'Year' earlier
# in the notebook (e.g. '2018_19'); a hyphenated label such as '2018-19' matches no
# rows and would report every parish ZIP as missing.
missing_zips_by_year = {}
for year_label in ['2018_19', '2019_20', '2020_21', '2021_22', '2022_23']:
    in_year = housing_parish['Year'] == year_label
    present_zips = set(housing_parish.loc[in_year, 'Zip Code'].unique())
    missing_zips_by_year[year_label] = all_unique_zips - present_zips
    print(len(missing_zips_by_year[year_label]))

# Keep the per-year names that the next cell combines.
missing_zips_2018 = missing_zips_by_year['2018_19']
missing_zips_2019 = missing_zips_by_year['2019_20']
missing_zips_2020 = missing_zips_by_year['2020_21']
missing_zips_2021 = missing_zips_by_year['2021_22']
missing_zips_2022 = missing_zips_by_year['2022_23']

In [None]:
# A ZIP is excluded when it is missing from the housing data in any school year.
all_missing_zips = set().union(
    missing_zips_2018,
    missing_zips_2019,
    missing_zips_2020,
    missing_zips_2021,
    missing_zips_2022,
)

# Keep only the housing rows whose ZIP is not in the excluded set.
is_excluded = housing_parish['Zip Code'].isin(all_missing_zips)
housing_parish = housing_parish[~is_excluded]

### Aggregation

In [None]:
# Display the merged parish/ZIP/housing frame for a visual check before aggregating.
merge

In [None]:
# Columns that are summed per parish in the aggregation step.
cols_to_sum = [
    'Total Households',
    'Total Families',
    'Households with Children Under 18',
    'Owner-Occupied Units',
    'Renter-Occupied Units',
    'Nonfamily Total Households',
]

# Columns that are averaged per parish in the aggregation step.
cols_to_mean = [
    'Avg Household Size',
    'Avg Family Size',
    'Married Avg Household Size',
    'Nonfamily Avg Household Size',
    'Married Avg Family Size',
]

# In every numeric column, a bare '-' is treated as a missing value.
cols_to_fix = [*cols_to_sum, *cols_to_mean]
merge[cols_to_fix] = merge[cols_to_fix].replace('-', np.nan)



In [None]:
# Cast every column that will be aggregated to float; this raises if any value
# in those columns still cannot be parsed as a number.
merge = merge.astype(dict.fromkeys(cols_to_fix, float))

In [None]:

def _is_number_or_missing(value):
    """Return True when the value is a number or a missing-value marker."""
    return pd.api.types.is_number(value) or pd.isna(value)


# Report, per column, any value that is neither numeric nor missing.
# NOTE(review): in notebook order this runs after the columns were cast to float,
# so it can no longer surface non-numeric text -- consider running it before the cast.
for col in cols_to_fix:
    looks_ok = merge[col].apply(_is_number_or_missing)
    bad_vals = merge[~looks_ok]
    if len(bad_vals.index) > 0:
        print(f"\nNon-numeric values found in '{col}':")
        print(bad_vals[col].unique())


In [None]:
# Columns to sum
cols_to_sum = ['Total Households', 'Total Families', 'Households with Children Under 18', 'Owner-Occupied Units', 'Renter-Occupied Units', 'Nonfamily Total Households']

# Columns to average
cols_to_mean = ['Avg Household Size', 'Avg Family Size', 'Married Avg Household Size', 'Nonfamily Avg Household Size', 'Married Avg Family Size']

# Group the merged rows by parish and school year. Rows whose 'Year' is missing are
# left out, because groupby drops missing keys by default.
parish_year_groups = merge.groupby(['school_id', 'Year'])

# min_count=1 keeps a parish-year whose values are all missing as NaN. A plain sum
# would return 0 there, which reads as a real count of zero rather than "no data".
parish_sums = parish_year_groups[cols_to_sum].sum(min_count=1)

# NOTE(review): this is a simple unweighted mean across the ZIP rows of each parish;
# confirm that averaging the per-ZIP averages without household weights is intended.
parish_means = parish_year_groups[cols_to_mean].mean()

# Same layout as before: school_id, Year, the summed columns, then the averaged ones.
grouped_df = pd.concat([parish_sums, parish_means], axis=1).reset_index()

In [None]:
# Show which parish identifiers made it into the aggregated table.
pd.unique(grouped_df['school_id'])

In [None]:
# Write the parish-level table to disk.
# NOTE(review): index=True is the default and is spelled out here on purpose -- the
# row index is written as an unnamed first column. Switch to index=False only after
# confirming that no consumer of this file relies on that column.
grouped_df.to_csv('census_housing_by_parish.csv', index=True)