# Recalculate income quintiles using imputed data
Imputed BG-level income is stored in "median_household_income_imputed" and has no missing values, but the income quantiles used in some analyses still have missing values because the CT- and County-level income data have gaps. Here we use the imputed BG-level data to impute the CT- and County-level values, then recalculate the income quantiles so that none are missing. 

In [None]:
# Base directory of the project data; an empty string means paths are
# resolved relative to the current working directory.
root = ''
# All input and output tables live under this folder (trailing slash included).
folder = f'{root}final_data/'

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import us
import numpy as np

In [None]:
# Load the latest compiled tables: one row per block group (BG) and one row
# per census tract (CT). The first CSV column is used as the row index.
df_bg_level = pd.read_csv(f'{folder}BGlevel/22_level_BG_US_imputed_250702.csv', index_col=0)
df_ct_level = pd.read_csv(f'{folder}CTlevel/22_level_CT_US_compiled_wlabels_250702.csv', index_col=0)

# 1. Impute missing values

### Fill CTs with missing data based on imputed BG-level values

df_bg_level['median_household_income_imputed'] has no missing values.


In [None]:
# Impute tract-level (CT) income as the population-weighted mean of the
# imputed block-group (BG) incomes inside each tract.

# Total population per tract, summed over its block groups.
df_dataset_CT_pop = df_bg_level[['TRACTFP','total_pop_byBG']].groupby('TRACTFP').sum()
df_dataset_CT_pop.rename(columns={'total_pop_byBG':'total_pop_byCT'}, inplace=True)

# Attach the tract total to every block group of that tract.
df_dataset_BG_imputed_weigh = df_bg_level.loc[:, ['TRACTFP', 'total_pop_byBG', 'median_household_income_imputed']].copy().merge(df_dataset_CT_pop, on='TRACTFP', how='left')

# Weight of a block group = its share of the tract population.
is_zero_pop_ct = df_dataset_BG_imputed_weigh['total_pop_byCT'] == 0
pop_share = df_dataset_BG_imputed_weigh['total_pop_byBG'] / df_dataset_BG_imputed_weigh['total_pop_byCT'].where(~is_zero_pop_ct)

# Tracts with zero population have no meaningful population shares. Giving
# their block groups weight 0 would yield an imputed tract income of exactly
# 0, which is not a plausible income and would distort the lowest quintile.
# Fall back to equal weights there, i.e. the plain mean of the tract's
# imputed BG incomes.
bgs_per_ct = df_dataset_BG_imputed_weigh['TRACTFP'].map(df_dataset_BG_imputed_weigh['TRACTFP'].value_counts())
df_dataset_BG_imputed_weigh['weights'] = pop_share.where(~is_zero_pop_ct, 1.0 / bgs_per_ct)

df_dataset_BG_imputed_weigh['median_household_income_imputed_weighted'] = df_dataset_BG_imputed_weigh['median_household_income_imputed']*df_dataset_BG_imputed_weigh['weights']

# Sum the weighted incomes per tract; min_count=1 keeps a tract as NaN when
# none of its block groups contributes a valid value.
df_dataset_CT_weigh = df_dataset_BG_imputed_weigh[['TRACTFP', 'median_household_income_imputed_weighted']].groupby('TRACTFP').sum(min_count=1).reset_index()

# Add the imputed tract income to the CT-level table.
# NOTE(review): the outer merge also keeps tracts that appear in only one of
# the two tables - confirm this is intended.
df_ct_level = df_ct_level.merge(df_dataset_CT_weigh, on='TRACTFP', how='outer')
df_ct_level = df_ct_level.rename(columns={'median_household_income_imputed_weighted':'median_household_income_byCT_imputed'})

### Fill counties with missing data

#### First in BG-level dataset

In [None]:
# County codes (COUNTYFP) that have at least one BG row whose county-level
# median income is missing.
cnty_income_missing = df_bg_level['median_household_income_byCNTY'].isna()
counties_with_na = df_bg_level.loc[cnty_income_missing, 'COUNTYFP'].unique()
counties_with_na

In [None]:
# Start from the reported county income and overwrite it only for the
# counties flagged as having missing values.
df_bg_level['median_household_income_byCNTY_imputed'] = df_bg_level['median_household_income_byCNTY'].copy()
for county in counties_with_na:
    # Row labels of every block group in this county.
    inds = df_bg_level.index[df_bg_level['COUNTYFP'] == county]
    bg_pop = df_bg_level.loc[inds, 'total_pop_byBG']
    bg_income = df_bg_level.loc[inds, 'median_household_income_imputed']

    # Population-weighted mean of the imputed BG incomes.
    weights = bg_pop / bg_pop.sum()
    county_weighted = (bg_income * weights).sum()

    # Print the unweighted mean next to the weighted value for comparison.
    print(bg_income.mean(), county_weighted)
    df_bg_level.loc[inds, 'median_household_income_byCNTY_imputed'] = county_weighted

#### Repeat in CT-level dataset

In [None]:
# County codes (COUNTYFP) that have at least one CT row whose county-level
# median income is missing.
cnty_income_missing_ct = df_ct_level['median_household_income_byCNTY'].isna()
counties_with_na_ct = df_ct_level.loc[cnty_income_missing_ct, 'COUNTYFP'].unique()
counties_with_na_ct

In [None]:
# Same county imputation as for the BG table, but built from the imputed
# tract incomes and tract populations.
df_ct_level['median_household_income_byCNTY_imputed'] = df_ct_level['median_household_income_byCNTY'].copy()
for county in counties_with_na_ct:
    # Row labels of every tract in this county.
    inds = df_ct_level.index[df_ct_level['COUNTYFP'] == county]
    ct_pop = df_ct_level.loc[inds, 'total_pop_byCT']
    ct_income = df_ct_level.loc[inds, 'median_household_income_byCT_imputed']

    # Population-weighted mean of the imputed CT incomes.
    weights = ct_pop / ct_pop.sum()
    county_weighted = (ct_income * weights).sum()

    # Print the unweighted mean next to the weighted value for comparison.
    print(ct_income.mean(), county_weighted)
    df_ct_level.loc[inds, 'median_household_income_byCNTY_imputed'] = county_weighted

# 2. Re-compute income quantiles

## Re-compute US-level quantiles

In [None]:
# Quintile labels 1..5, from lowest to highest income.
quintile_labels = np.arange(1, 6)
county_income_cols = ['COUNTYFP', 'median_household_income_byCNTY_imputed']
county_quintile_cols = ['COUNTYFP', 'income_quantile_county_US_imputed']

# County quantiles US (BG table): rank the distinct county rows against each
# other, then copy each county's quintile onto all of its block groups.
df_dataset_CNTY = df_bg_level[county_income_cols].drop_duplicates()
df_dataset_CNTY = df_dataset_CNTY.assign(
    income_quantile_county_US_imputed=pd.qcut(
        df_dataset_CNTY['median_household_income_byCNTY_imputed'], q=5, labels=quintile_labels
    )
)
df_bg_level = df_bg_level.merge(df_dataset_CNTY[county_quintile_cols], on='COUNTYFP', how='outer')

# BG quantiles US: rank every block group against all block groups.
df_bg_level['income_quantile_bg_US_imputed'] = pd.qcut(
    df_bg_level['median_household_income_imputed'], q=5, labels=quintile_labels
)

# CT county quantiles US: same county ranking, computed from the CT table.
df_dataset_CNTY = df_ct_level[county_income_cols].drop_duplicates()
df_dataset_CNTY = df_dataset_CNTY.assign(
    income_quantile_county_US_imputed=pd.qcut(
        df_dataset_CNTY['median_household_income_byCNTY_imputed'], q=5, labels=quintile_labels
    )
)
df_ct_level = df_ct_level.merge(df_dataset_CNTY[county_quintile_cols], on='COUNTYFP', how='outer')

# CT quantiles US: rank every tract against all tracts.
df_ct_level['income_quantile_CT_US_imputed'] = pd.qcut(
    df_ct_level['median_household_income_byCT_imputed'], q=5, labels=quintile_labels
)


## Re-compute State-level Quantiles

In [None]:
# Quintile labels 1..5, from lowest to highest income.
quintile_labels = np.arange(1, 6)

# Rank block groups and tracts against the other units of the same state.
for state in df_bg_level['STATEFP'].unique():

    # Block groups of this state.
    bg_inds = df_bg_level.index[df_bg_level['STATEFP'] == state]
    bg_income = df_bg_level.loc[bg_inds, 'median_household_income_imputed']
    df_bg_level.loc[bg_inds, 'income_quantile_bg_state_imputed'] = pd.qcut(bg_income, q=5, labels=quintile_labels)

    # Census tracts of this state.
    ct_inds = df_ct_level.index[df_ct_level['STATEFP'] == state]
    ct_income = df_ct_level.loc[ct_inds, 'median_household_income_byCT_imputed']
    df_ct_level.loc[ct_inds, 'income_quantile_CT_state_imputed'] = pd.qcut(ct_income, q=5, labels=quintile_labels)
    

In [None]:
# Rank each county against the other counties of the same state and write the
# resulting quintile (1 = lowest income, 5 = highest) to every BG and CT row
# of that county.
for state in df_bg_level['STATEFP'].unique():
    print('--'*20)
    print(state)

    bg_inds = df_bg_level[df_bg_level['STATEFP']==state].index
    ct_inds = df_ct_level[df_ct_level['STATEFP']==state].index

    if state == 11:
        # pd.qcut cannot build five bins from a single county value, so DC is
        # assigned the middle quintile directly.
        print('Unique case with DC, as it has only one county. Map to 3.')
        df_bg_level.loc[bg_inds, 'income_quantile_county_state_imputed'] = 3
        df_ct_level.loc[ct_inds, 'income_quantile_county_state_imputed'] = 3
        continue

    # --- BG table: one row per county of this state, ranked within the state.
    df_bg_CNTY = df_bg_level.loc[bg_inds, ['COUNTYFP','median_household_income_byCNTY_imputed']].drop_duplicates().copy()
    df_bg_CNTY['income_quantile_county_state_imputed'] = pd.qcut(df_bg_CNTY['median_household_income_byCNTY_imputed'], q=5, labels=np.arange(1, 6))
    mapping = df_bg_CNTY.set_index('COUNTYFP')['income_quantile_county_state_imputed'].to_dict()
    # Write back only to this state's rows. Selecting by COUNTYFP over the
    # whole table would let a county code shared by two states overwrite the
    # other state's result, and would rescan the full table once per county.
    df_bg_level.loc[bg_inds, 'income_quantile_county_state_imputed'] = df_bg_level.loc[bg_inds, 'COUNTYFP'].map(mapping).astype(float)

    # --- CT table: same procedure on the tract-level rows.
    df_ct_CNTY = df_ct_level.loc[ct_inds, ['COUNTYFP','median_household_income_byCNTY_imputed']].drop_duplicates().copy()
    df_ct_CNTY['income_quantile_county_state_imputed'] = pd.qcut(df_ct_CNTY['median_household_income_byCNTY_imputed'], q=5, labels=np.arange(1, 6))
    mapping = df_ct_CNTY.set_index('COUNTYFP')['income_quantile_county_state_imputed'].to_dict()
    df_ct_level.loc[ct_inds, 'income_quantile_county_state_imputed'] = df_ct_level.loc[ct_inds, 'COUNTYFP'].map(mapping).astype(float)
        


# 3. Save

In [None]:
# Write both tables with the re-computed quantile columns. The target paths
# end in ".zip", so pandas infers zip compression from the extension.
df_bg_level.to_csv(f'{folder}BGlevel/23_level_BG_US_imputedquantiles_20250722.zip')
df_ct_level.to_csv(f'{folder}CTlevel/23_level_CT_US_imputedquantiles_20250722.zip')