In [44]:
import pandas as pd
import numpy as np
from scipy import stats
import ast

# Data Loading
## cz-connect: demographics/income

In [None]:
# Make the GEOID 5 characters because the leading 0s have been removed
def char5(num):
    """Return ``num`` as a zero-padded, 5-character string.

    Parameters
    ----------
    num : anything accepted by ``int()`` (int, float, numeric string, ...)

    Returns
    -------
    str
        ``int(num)`` formatted with ``05d``; ``'00000'`` when ``num`` cannot be
        converted to an integer (NaN, None, non-numeric text, infinity).
    """
    # One of the values in meanMeds is NA, which int() rejects.
    # Only the conversion errors are caught, so KeyboardInterrupt and other
    # unrelated exceptions are no longer silently swallowed.
    try:
        return f'{int(num):05d}'
    except (TypeError, ValueError, OverflowError):
        return '00000'

# Load the Citizen Connect county extract.
# NOTE(review): the backslash-separated relative path only resolves on Windows.
czconnect = pd.read_csv(r'..\loadData\data\external\Citizen_Connect_-_County_data__live__20241210.csv')
# Filter for only commuting and demographic data
commuters = czconnect[czconnect['Category'] == 'Commuting']
# .copy() makes demographics an independent frame, so the GEOID assignment
# below does not write into a slice of czconnect (SettingWithCopyWarning).
demographics = czconnect[czconnect['Category'] == 'Demographics'].copy()
# Restore the zero-padded 5-character GEOID (see char5)
demographics['GEOID'] = demographics['GEOID'].apply(char5)
# The full extract is not needed once the two subsets exist
del czconnect

resPermits = pd.read_csv(r'..\loadData\data\external\Residential_Construction_Permits_by_County_5026727375813176131.csv')

# The file has one column per permit type per year, so build the list of
# column names for the applicable years (2018-2022) and keep only those.
permit_types = (
    'ALL_PERMITS',
    'SINGLE_FAMILY_PERMITS',
    'ALL_MULTIFAMILY_PERMITS',
    'MULTIFAMILY_PERMITS_2_UNITS',
    'MULTIFAMILY_PERMITS_3_4_UNITS',
    'MULTIFAMILY_PERMITS_5_OR_MORE_UNITS',
)
dataRange = ['GEOID']
for year in range(2018, 2023):
    dataRange.extend(f'{permit_type}_{year}' for permit_type in permit_types)
# The last two 2022 columns are labeled slightly differently in the file,
# so overwrite the generated names with the actual ones.
dataRange[-2:] = ['MULTIFAMILY_PERMITS_3_4_UNIT_2022', 'MULTIFAMILY_PERMITS_5_OR_MORE_2022']
resPermits = resPermits[dataRange]

# Index rows by the zero-padded 5-character GEOID
resPermits.index = resPermits['GEOID'].apply(char5)

In [46]:
# Keep only the 2018 and 2019 demographic records
demographics = demographics[demographics['Year'].isin([2018, 2019])]

# Wide table: one row per GEOID, columns keyed by (field, Year, Variable)
demo = demographics.pivot_table(
    index='GEOID',
    columns=['Year', 'Variable'],
    values=['Value', 'Denominator'],
)

# Pivot on Value alone and keep just the median household income columns
income_columns = [(2018, 'Median Household Income'), (2019, 'Median Household Income')]
income = demographics.pivot_table(
    index='GEOID',
    columns=['Year', 'Variable'],
    values='Value',
)[income_columns]

In [47]:
# Keep only the 2018 and 2019 commuting records, then pivot to one row per
# GEOID with (Year, Variable) columns holding Value.
# Note: commuters' GEOID is used as loaded (it is not passed through char5).
commuters = commuters[commuters['Year'].isin([2018, 2019])]
df = commuters.pivot_table(
    index='GEOID',
    columns=['Year', 'Variable'],
    values='Value',
)

def chi2com(year):
    pvalues = []
    for i, row in df.iterrows():

        table = [[row[(2018, 'Commute by car/ truck/van')],
                row[(2018, 'Commute by public transportation')],
                row[(2018, 'Work at home')]], 
                [row[(2019, 'Commute by car/ truck/van')],
                row[(2019, 'Commute by public transportation')],
                row[(2019, 'Work at home')]]]

        try:
            stat, p, dof, expected = stats.chi2_contingency(table)
            pvalues.append(p)
        except ValueError:
            pvalues.append(1.0)
            continue
    
    return pvalues

# Store one p-value per row of df: chi2com's chi-square comparison of the 2018
# and 2019 commute-mode values (1.0 where the test raised ValueError).
df['workerchange'] = chi2com(2019)

# quantify commuter change

## Meanmeds statistical analyses

In [48]:
meanMeds = pd.read_csv('meanMedsFiltered.csv')
# Index rows by the zero-padded 5-character county code
meanMeds.index = meanMeds['county'].apply(char5)

# Z-test for statistical significance of median increase
for year in range(2019, 2024):
    # Standardize this year's increase across all rows. nan_policy='omit'
    # keeps a single missing value from turning the whole column into NaN
    # (the default 'propagate' does exactly that); missing rows stay NaN.
    meanMeds[f'z-MedIncrease{year}'] = stats.zscore(
        meanMeds[f'medIncrease{year}'], nan_policy='omit'
    )

    # Upper-tail (one-tailed) p-value from the z-score: P(Z >= z).
    # NOTE(review): an earlier comment described a two-tailed test
    # ("multiplied by 2"), but the value has never been doubled; a two-tailed
    # p-value would be 2 * stats.norm.sf(abs(z)). Confirm which is intended.
    meanMeds[f'p-MedIncrease{year}'] = stats.norm.sf(meanMeds[f'z-MedIncrease{year}'])



In [49]:
def chi2(year, data=None):
    """Chi-square test of the change in race breakdown from ``year - 1`` to ``year``.

    Each row's ``race{year-1}`` and ``race{year}`` cells hold a dict stored as
    a string. The two dicts are aligned on the union of their keys (missing
    keys count as 0) and tested with ``scipy.stats.chi2_contingency``.

    Parameters
    ----------
    year : int
        Later year of the comparison.
    data : pandas.DataFrame, optional
        Frame containing the ``race{year-1}`` and ``race{year}`` columns.
        Defaults to the notebook-level ``meanMeds``.

    Returns
    -------
    list of float
        One p-value per row, in row order. Rows for which the test raises
        ``ValueError`` (e.g. a category with zero count in both years) get
        1.0, matching the fallback used by ``chi2com``.
    """
    if data is None:
        data = meanMeds

    pvalues = []
    for _, row in data.iterrows():
        # get dicts from the string they are stored as
        previous = ast.literal_eval(row[f'race{year-1}'])
        current = ast.literal_eval(row[f'race{year}'])

        # union of all keys, in first-seen order so the table is deterministic
        categories = list(dict.fromkeys([*previous, *current]))

        # one table row per category: [count in year-1, count in year]
        table = [[previous.get(c, 0), current.get(c, 0)] for c in categories]

        try:
            _, p, _, _ = stats.chi2_contingency(table)
        except ValueError:
            # Test could not be computed for this row; do not abort the whole loop
            p = 1.0
        pvalues.append(p)

    return pvalues

# Add one column of chi-square p-values per year 2019-2023; chi2(i) compares
# the race breakdown stored for year i-1 with the one stored for year i.
for i in range(2019,2024):
    meanMeds[f'chi{i}'] = chi2(i)

# Plots

In [50]:
# This cell defines the function that I used for graphing all of the maps.
# I find it neater to use one call with all of the fields I need than several repetitive calls.

from urllib.request import urlopen
import json
# Download the county GeoJSON from the plotly datasets repository and parse it
counties_url = 'https://raw.githubusercontent.com/plotly/datasets/refs/heads/master/geojson-counties-fips.json'
with urlopen(counties_url) as response:
    counties = json.loads(response.read())

import plotly.express as px

def choro(data, var, label):
    """Draw and display a choropleth of ``data[var]`` over the ``counties`` GeoJSON.

    Parameters
    ----------
    data : DataFrame whose index values are matched against each GeoJSON
        feature's ``id``.
    var : name of the column used to color the map.
    label : legend label shown in place of ``var``.

    Returns nothing; the figure is shown with ``fig.show()``.
    """
    figure = px.choropleth(
        data,
        geojson=counties,
        featureidkey="id",
        locations=data.index,
        color=var,
        labels={var: label},
        color_continuous_scale="Viridis",
        projection="mercator",
    )

    # Remove the default padding around the map
    zero_margins = {"r": 0, "t": 0, "l": 0, "b": 0}
    figure.update_layout(margin=zero_margins)

    figure.show()

In [None]:
# Copy the 2019 median household income into a column with a plain string
# label, then map it.
# NOTE(review): presumably done because choro's color= argument does not take
# the (year, variable) tuple column key directly — confirm.
income['test'] = income[(2019, 'Median Household Income')]
choro(data=income, var='test', label='Median Household Income 2019')

In [52]:
# Calculate the value to income ratio
# Bring meanMeds' yearly medians into income (aligned on the shared index),
# then divide each year's median by that same year's median household income.
income['med2018'] = meanMeds['median2018']
income['med2019'] = meanMeds['median2019']
income['percent2018'] = income['med2018'] / income[(2018, 'Median Household Income')]
# The 2019 ratio uses the 2019 median (it used to divide med2018 by 2019 income)
income['percent2019'] = income['med2019'] / income[(2019, 'Median Household Income')]

In [None]:
# Map the 'percent2018' and 'percent2019' columns of income, one figure each
choro(data=income, var='percent2018', label='Value to Income Ratio 2018')
choro(data=income, var='percent2019', label='Value to Income Ratio 2019')

In [None]:
# Graph the confidence of change in lending demographic
# Note that low value means high confidence of change
# (the chi{i} columns hold the p-values returned by chi2 for each year 2019-2023)
for i in range(2019, 2024):
    choro(data=meanMeds, var=f'chi{i}', label=f'Change in lending demographic {i}')