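This notebook reads socio-economic data, assigns it to Census block groups (BGs), and computes distribution measures (income quantiles, Gini coefficient, and Moran's I) at state, county, and US level.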

In [None]:
import os
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
import geopandas as geopd
import us
import matplotlib.pyplot as plt
from tqdm import tqdm
import state_name_crs_mappings_ML as crsm
from pysal.explore import esda
from pysal.lib import weights
from inequality.gini import Gini
from inequality.gini import Gini_Spatial

In [None]:
# Directory layout; every location is built from `root` by plain string
# concatenation, so `root` must be empty or end with a path separator.
root = ''
path = f'{root}Data/'                                      # raw input data
path_US_data = f'{root}Data/geodata/'                      # geographic input data
result_path = f'{root}final_data/'                         # outputs of this notebook
path_IRA = f'{root}Data/IRA/1.0-shapefile-codebook/usa/'   # IRA shapefile codebook

In [None]:
# Make sure the output folder for the per-state and combined BG-level files exists.
# exist_ok=True turns this into a no-op when the folder is already there and
# avoids the check-then-create race of testing os.path.exists() first.
folder = result_path + 'BGlevel/'
os.makedirs(folder, exist_ok=True)

# Read data

In [None]:
# Load the socio-economic table written by 10_assemble_socioeconomicdata.ipynb.
# The first CSV column is used as the index.
file_socioecon = f'{result_path}CENSUS_selected_cols_EV.csv'
df_socioecon = pd.read_csv(file_socioecon, index_col=0)
# Show the first rows as a quick sanity check
df_socioecon.head(3)

In [None]:
# Add FIPS codes
# The index is turned into a 12-character, zero-padded string (BGFP); its first
# five characters are stored as COUNTYFP and its first two as STATEFP.
bg_fips = df_socioecon.index.astype(str).str.zfill(12)
df_socioecon['BGFP'] = bg_fips
for column, n_chars in (('COUNTYFP', 5), ('STATEFP', 2)):
    df_socioecon[column] = df_socioecon['BGFP'].str[:n_chars]

# Assign socio-economic data to BGs and compute state-level distribution measures

In [None]:
# States of interest: the abbreviation of every entry in us.states.STATES,
# plus Washington, DC.
states = [state.abbr for state in us.states.STATES]
# The `us` package can be configured (DC_STATEHOOD) to list DC in STATES itself,
# so only append it when it is missing; a duplicate entry would make every loop
# over `states` handle DC twice.
if 'DC' not in states:
    states.append('DC')
print(states)

In [None]:
# For each state, read official BG shapefile, merge socio-economic data, and save separately.
# Per state this cell
#   1. reads the tl_2020 block-group shapefile and left-merges df_socioecon onto it,
#   2. derives population density and a population-weighted county income,
#   3. assigns state-level income quintiles (1 = lowest ... 5 = highest),
#   4. computes the Gini coefficient and Moran's I for the state and for each county,
#   5. writes the table (without geometry) to <result_path>BGlevel/level_BG_<state>.csv.
for state in states:
    print(state)

    # Read BG shapefile
    if state == 'DC':
        fips = '11'
    else:
        fips = us.states.lookup(state).fips
    file_bg = path + 'geodata/tl_bg/tl_2020_'+fips+'_bg/tl_2020_'+fips+'_bg.shp'
    gdf_bg = geopd.read_file(file_bg)
    # rename() returns a new frame, so the column selection is never modified in place
    df_bg = gdf_bg[['GEOID','ALAND','geometry']].rename(columns={'GEOID':'BGFIPS','ALAND':'BGALAND'})

    # Merge socio-economic data (left merge: BGs without data are kept, filled with NaN)
    df_bg = df_bg.merge(df_socioecon,how='left',left_on=['BGFIPS'],right_on=['BGFP'])
    no_NaN = int(df_bg['BGFP'].isna().sum())
    if no_NaN > 0:
        print('Warning: '+str(no_NaN)+' BGs in '+state+' have no socio-economic data')
    # NOTE(review): unmatched BGs keep NaN in BGFP/COUNTYFP and BGFIPS is dropped
    # before saving, so they end up in the CSV without an identifier - confirm intended.
    df_bg['PopDensity_byBG'] = df_bg['total_pop_byBG']/df_bg['BGALAND']

    # Compute population-weighted average of BG income by county (needed for quantile calculation)
    df_bg['median_household_income_byCNTY'] = np.nan
    df_socioecon_noNaN = df_bg.dropna(subset=['median_household_income_byBG','total_pop_byBG']).copy()
    df_socioecon_noNaN['income_weighted'] = df_socioecon_noNaN['total_pop_byBG'] * df_socioecon_noNaN['median_household_income_byBG']
    for countyfp in tqdm(df_socioecon_noNaN['COUNTYFP'].unique()):
        df = df_socioecon_noNaN.loc[df_socioecon_noNaN['COUNTYFP'] == countyfp]
        county_pop = df['total_pop_byBG'].sum()
        if county_pop == 0:
            # Weighted mean is undefined without population; leave the county at NaN
            # instead of silently recording an income of 0
            continue
        # Weigh income x population in each BG by total population in county, then sum up
        county_income = (df['income_weighted'] / county_pop).sum()
        df_bg.loc[df_bg['COUNTYFP'] == countyfp, 'median_household_income_byCNTY'] = int(np.round(county_income,0))

    # Compute state-level income quantiles
    # Each pass marks all rows at or above the next 20% threshold, so a row ends up
    # with the rank of the highest threshold it reaches (1 = lowest, 5 = highest).
    df_bg['income_quantile_county_state'] = np.nan
    df_bg['income_quantile_bg_state'] = np.nan
    # NOTE(review): the county thresholds are taken over BG rows, i.e. every county is
    # weighted by its number of BGs (the US-level cell de-duplicates counties first).
    county_income_valid = df_bg['median_household_income_byCNTY'].dropna()
    bg_income_valid = df_bg['median_household_income_byBG'].dropna()
    for rank, qu in enumerate(np.arange(0.,1.,0.2), start=1):
        # County
        quantile = np.quantile(county_income_valid,qu)
        df_bg.loc[(df_bg['median_household_income_byCNTY'] >= quantile), 'income_quantile_county_state'] = float(rank)
        # BG
        quantile = np.quantile(bg_income_valid,qu)
        df_bg.loc[(df_bg['median_household_income_byBG'] >= quantile), 'income_quantile_bg_state'] = float(rank)

    # Include Gini coefficient (over BG median incomes, NaNs excluded)
    df_bg['Gini_state'] = Gini(bg_income_valid.values).g
    df_bg['Gini_county'] = np.nan
    for county in tqdm(df_bg['COUNTYFP'].unique()):
        in_county = df_bg['COUNTYFP']==county
        county_incomes = df_bg.loc[in_county, 'median_household_income_byBG'].dropna()
        if len(county_incomes) > 1:
            gini_county = Gini(county_incomes.values).g
            # A Gini of exactly 0 seems to be a mistake (Moran fails in this case);
            # such counties keep the NaN default
            if gini_county != 0:
                df_bg.loc[in_county, 'Gini_county'] = gini_county

    # Include geographical segregation measure: Moran's I
    # Impute NaNs with the state-wide mean of the BG incomes
    df_bg['median_household_income_imputed'] = df_bg['median_household_income_byBG'].fillna(df_bg['median_household_income_byBG'].mean())
    # Compute spatial weights
    wq = weights.Queen.from_dataframe(df_bg, use_index=True) # https://pysal.org/notebooks/lib/libpysal/weights.html
    df_bg['Moran_state'] = esda.Moran(df_bg["median_household_income_imputed"], w=wq).I # value of Moran's I: perfectly dispersed: -1; perfectly clustered: 1; random: 0
    # County level
    # NOTE(review): counties with at most one BG (and rows without COUNTYFP) keep this
    # default of 0, whereas Gini_county uses NaN for the same case - confirm intended.
    df_bg['Moran_county'] = 0.
    for county in tqdm(df_bg['COUNTYFP'].unique()):
        in_county = df_bg['COUNTYFP']==county
        df_bg_county = df_bg.loc[in_county]
        if len(df_bg_county) > 1:
            wq = weights.Queen.from_dataframe(df_bg_county, use_index=True) # https://pysal.org/notebooks/lib/libpysal/weights.html
            try:
                moran_county = esda.Moran(df_bg_county["median_household_income_imputed"], w=wq).I
            except Exception:
                # Best effort: record NaN when Moran's I cannot be computed for this county.
                # `Exception` (not a bare except) keeps Ctrl-C / SystemExit working.
                moran_county = np.nan
            df_bg.loc[in_county,'Moran_county'] = moran_county

    # Drop columns
    df_bg = df_bg.drop(columns=['BGFIPS','geometry'])

    # Save
    file_bg = result_path + 'BGlevel/level_BG_'+state+'.csv'
    df_bg.to_csv(file_bg)

# Combine all states

In [None]:
# Combine
# Read every per-state file first and concatenate once at the end; calling
# pd.concat inside the loop would re-copy the accumulated frame for every state.
state_frames = []
for state in tqdm(states):
    # Read file
    file_bg = result_path + 'BGlevel/level_BG_'+state+'.csv'
    df_bg = pd.read_csv(file_bg,index_col=0)
    state_frames.append(df_bg)
# pd.concat raises on an empty list, so fall back to an empty frame if there are no states
df_bg_US = pd.concat(state_frames) if state_frames else pd.DataFrame()

In [None]:
# Use the block-group FIPS column (BGFP) as the row index of the combined table
df_bg_US = df_bg_US.set_index('BGFP')

# Compute US-level quantiles

In [None]:
# County quantiles US
df_bg_US['income_quantile_county_US'] = np.nan

# One row per distinct (county, county income) pair, so each county counts once
# rather than once per BG when the thresholds are computed
df_dataset_CNTY = df_bg_US[['COUNTYFP','median_household_income_byCNTY']].drop_duplicates()
county_incomes = df_dataset_CNTY['median_household_income_byCNTY'].dropna()

# Add county-level quantile: every pass marks the rows at or above the next 20%
# threshold, so a row keeps the rank of the highest threshold it reaches (1..5)
for rank, qu in enumerate(np.arange(0.,1.,0.2), start=1):
    print(qu)
    county_income_lower = np.quantile(county_incomes, qu)
    print(county_income_lower)
    at_or_above = df_bg_US['median_household_income_byCNTY'] >= county_income_lower
    df_bg_US.loc[at_or_above, 'income_quantile_county_US'] = float(rank)

In [None]:
# BG quantiles US
df_bg_US['income_quantile_bg_US'] = np.nan

# BG incomes with missing values removed; the thresholds are computed on these
bg_incomes = df_bg_US['median_household_income_byBG'].dropna()

# Add BG-level quantile: every pass marks the rows at or above the next 20%
# threshold, so a row keeps the rank of the highest threshold it reaches (1..5)
for rank, qu in enumerate(np.arange(0.,1.,0.2), start=1):
    print(qu)
    BG_income_lower = np.quantile(bg_incomes, qu)
    print(BG_income_lower)
    at_or_above = df_bg_US['median_household_income_byBG'] >= BG_income_lower
    df_bg_US.loc[at_or_above, 'income_quantile_bg_US'] = float(rank)

# Save

In [None]:
# Write the combined BG-level table for all states to a single CSV file
file_bg_US = result_path + 'BGlevel/level_BG.csv'
df_bg_US.to_csv(file_bg_US)