# MCIB Replication from Jargowsky and Wheeler
paper can be found here: https://journals.sagepub.com/doi/pdf/10.1177/0081175018782579

the functions found here are utimately pretty specific to the data we were working with and did not end up providing accurate results

code in stata can be found here from authors: http://fmwww.bc.edu/repec/bocode/m/mcib.ado

In [1]:
import pandas as pd
import numpy as np
import array
import math
import matplotlib.pyplot as plt
import cmath

In [2]:
%matplotlib inline

In [3]:
acs_est = pd.read_csv("../../../data/block_group_census_estimates_wide_pc_income.csv")
acs_county = pd.read_csv("../../../data/block_group_census_estimates_wide_county.csv")
acs_county.head()

Unnamed: 0.1,Unnamed: 0,county_name,inc_000_010k,inc_010_015k,inc_015_020k,inc_020_025k,inc_025_030k,inc_030_035k,inc_035_040k,inc_040_045k,...,inc_050_060k,inc_060_075k,inc_075_100k,inc_100_125k,inc_125_150k,inc_150_200k,inc_200_infk,population,households,pc_income
0,1,King,0.050167,0.028469,0.028581,0.029249,0.030444,0.034254,0.032954,0.034701,...,0.06489,0.088212,0.125066,0.108286,0.077348,0.101171,0.133966,2118119,851077,66343139
1,2,Kitsap,0.05159,0.031547,0.036951,0.038693,0.037658,0.04126,0.040275,0.038285,...,0.080003,0.10948,0.156612,0.110664,0.066667,0.064548,0.057133,258903,100484,5607210
2,3,Kittitas,0.108284,0.054794,0.050714,0.051054,0.052584,0.039495,0.037795,0.044198,...,0.089529,0.115481,0.136163,0.081199,0.044424,0.039551,0.022042,43726,17648,702714
3,4,Pierce,0.056371,0.036635,0.035955,0.039742,0.040804,0.043128,0.039966,0.045554,...,0.087157,0.112751,0.145196,0.099447,0.06518,0.061172,0.048594,845193,312839,17476813
4,5,Skagit,0.062673,0.04231,0.042057,0.04776,0.042331,0.045415,0.04907,0.050675,...,0.084874,0.115862,0.137492,0.087789,0.057878,0.050738,0.044549,121725,47341,2415691


## Run this cell for ACS fips level data

In [6]:
col_names = list(acs_est.columns)
inc_groups = [col for col in col_names if col.startswith("inc")]
var_names = np.append(inc_groups, ["population", "pc_income", "households", "fips_code"])
widths = [10,
           5,
           5,
           5,
           5,
           5,
           5,
           5,
           5,
           10,
           15,
           25,
           25,
           25,
           50,
         0]
bins = acs_est[var_names]
locator = "fips_code"

## Run this cell for ACS county-level data

In [4]:
col_names = list(acs_county.columns)
inc_groups = [col for col in col_names if col.startswith("inc")]
var_names = np.append(inc_groups, ["population", "pc_income", "households", "county_name"])
widths = [10,
           5,
           5,
           5,
           5,
           5,
           5,
           5,
           5,
           10,
           15,
           25,
           25,
           25,
           50,
         0]
bins = acs_county[var_names]
locator = "county_name"

## Function for estimating bin means
This sample takes in a row from a dataframe of census information at some geographic level. It creates an object of arrays with the bins of each income group with an estimated mean using the replicated methodology from the paper above. 
inputs:
sample = a dataframe row

In [160]:
def mean_est(sample):
    sample = sample.to_frame()
    sample = sample.transpose()

    location = sample.iloc[0][locator]
    print(location)

    households = sample.iloc[0]["households"]
    population = sample.iloc[0]["population"]
    grand_mean = sample.iloc[0]["pc_income"]
    top_households = sample.iloc[0]["inc_200_infk"] * households # converts top income percentage to households
    sample[inc_groups] = sample[inc_groups] * households #converts income percentages to households
    sample = sample.drop(columns = ['population', 'pc_income', 'households', locator])
    
    # assign households widths and relative frequency
    freq_table = sample.transpose()
    freq_table.columns= ['households'] # renames column
    freq_table["households"] = freq_table.households.astype(int) #county estimates convert households to object - this converts it back to an int
    freq_table = freq_table.assign(width = widths)
    freq_table = freq_table.assign(rel_freq = lambda x: x.households / x.width)
    
    # initializes arrays with predetermined bin widths and zeros
    slopes = np.zeros([len(widths), 1])
    mean = np.zeros([len(widths), 1])
    mean_num = np.zeros([len(widths), 1])
    mean_den = np.zeros([len(widths), 1])
    y_left = np.empty([len(widths), 1])
    y_right = np.empty([len(widths), 1])
    constant = np.empty([len(widths), 1])
    x_left = np.array([0, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 60000, 75000, 100000, 125000, 150000, 0])
    x_mid = np.array([5000, 12500, 17500, 22500, 27500, 32500, 37500, 42500, 47500, 55000, 67500, 87500, 112500, 137500, 175000, 0])
    x_right = np.array([10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 60000, 75000, 100000, 125000, 150000, 200000, 0])

    # calculates density function of each bin
    closed_brackets = 15
    for i in (range(0,len(freq_table) - 1)):
        #for closed brackets, take the average of the slope of the bin before to the current bin and the current bin to the next bin
        if i == 0:
            slope = 0
        elif i < closed_brackets - 1:
            slope1 = (freq_table.iloc[i]["rel_freq"] - freq_table.iloc[i - 1]["rel_freq"]) / (x_mid[i] - x_mid[i - 1])
            slope2 = (freq_table.iloc[i + 1]["rel_freq"] - freq_table.iloc[i]["rel_freq"]) / (x_mid[i + 1] - x_mid[i])
            slope = (slope1 + slope2) / 2
        #for last closed bracket, take the slope of the bin before to the current bin since there is not next bin
        elif i == closed_brackets - 1:
            slope = (freq_table.iloc[i]["rel_freq"] - freq_table.iloc[i - 1]["rel_freq"]) / (x_mid[i] - x_mid[i - 1]) 

        slopes[i] = slope
        
        # calculates y-value of the far left and right of the bin using slope and frequency point
        y_left[i] = freq_table.iloc[i]["rel_freq"] + slopes[i] * (x_left[i] - x_mid[i])
        y_right[i] = y_left[i] - slopes[i] * (x_left[i] - x_right[i]) 
            
        constant[i] = y_left[i] - slopes[i] * x_left[i]

        # calculates means from integration
#         mean[i] = ( (slopes[i] / 3) * (math.pow(x_right[i], 3) - math.pow(x_left[i], 3)) + 
#                    ((y_left[i] - (slopes[i] * x_left[i])) / 2) * 
#                    (x_right[i] * x_right[i] - x_left[i] * x_left[i]) ) / np.trapz(np.concatenate([y_left[i], y_right[i]]), [x_left[i], x_right[i]])
               
        mean[i] = (1 / freq_table.iloc[i]["households"]) * ( ( slopes[i] * math.pow(x_right[i], 3) / 3 + constant[i] * math.pow(x_right[i], 2) / 2 ) - \
                                                            ( slopes[i] * math.pow(x_left[i], 3) / 3 + constant[i] * math.pow(x_left[i], 2) / 2) )
                                                                                                                                    
        if math.isnan(mean[i]) | math.isinf(mean[i]):
            mean[i] = 0
    
    # appends results to table
    freq_table = freq_table.assign(slope = slopes)
    freq_table = freq_table.assign(est_mean = mean)

    print(freq_table)
    # estimates open bracket top mean
    total_est_mean = 0
    # adds all previous closed bracket means
    for i in range(0, len(freq_table)):
        if (math.isnan(freq_table.iloc[i]["est_mean"]) == False | math.isinf(freq_table.iloc[i]["est_mean"]) == False):
            total_est_mean = total_est_mean + (freq_table.iloc[i]["est_mean"] * freq_table.iloc[i]["households"])
    
    # takes total income (grand_mean * population) and subracts already caluclated means of closed brackets and divides by households in the top
    # bracket to estimate the top mean
    top_est_mean = ( (grand_mean * population) -
                   total_est_mean ) / top_households
    
    # appends results to table
    freq_table.loc["inc_200_infk", "est_mean"] = top_est_mean
    
    # calculates median income
    freq_table = freq_table.assign(cum_sum = np.cumsum(np.array(freq_table["households"])))
    cum_sum = np.cumsum(np.array(freq_table["households"]))
    
    percentile = .5
#     percentile =  (freq_table.iloc[1]["households"] + freq_table.iloc[0]["households"] )/ households
    quant_household = households * percentile
    index = np.max(np.where(cum_sum < quant_household)) + 1
    diff = cum_sum[index] - quant_household
    
    a = slopes[index] / ( 2 * households)
    b = constant[index] / households
    c = - ( (slopes[index] * (x_left[index] * x_left[index]))  / ( 2 * households ) + ( ( constant[index] * x_left[index] ) / households)) \
              - ((percentile * 100 - (100 * (cum_sum[index] / households))) / 100)
    print(-(percentile * 100 - (100 * (cum_sum[index] / households))) / 100)
    d = math.pow(b, 2) - 4 * a * c
    sol1 = (-b-math.pow(d, .5))/(2*a)
    sol2 = (-b+math.pow(d, .5))/(2*a)
    print(sol1, sol2)    
    
    freq_table = freq_table.drop(columns = ["households", "width", "rel_freq", "slope", "cum_sum"])
    mean_array = freq_table.transpose()
    mean_array = np.array(mean_array)
    mean_array = np.append(mean_array, [[location]], axis = 1) 
    
    return mean_array

    
    
    

In [161]:
means_estimate = bins.apply(lambda x: mean_est(x), axis = 1)

King
              households  width     rel_freq     slope      est_mean
inc_000_010k       42696     10  4269.600000  0.000000  5.000000e+06
inc_010_015k       24228      5  4845.600000  0.040320  1.251734e+07
inc_015_020k       24324      5  4864.800000  0.013300  1.750570e+07
inc_020_025k       24893      5  4978.600000  0.031720  2.251327e+07
inc_025_030k       25910      5  5182.000000  0.085180  2.753425e+07
inc_030_035k       29152      5  5830.400000  0.042700  3.251526e+07
inc_035_040k       28045      5  5609.000000  0.007620  3.750283e+07
inc_040_045k       29533      5  5906.600000 -0.012060  4.249575e+07
inc_045_050k       27442      5  5488.400000 -0.039547  4.748499e+07
inc_050_060k       55225     10  5522.500000 -0.018427  5.497219e+07
inc_060_075k       75075     15  5005.000000 -0.039385  6.735245e+07
inc_075_100k      106440     25  4257.600000 -0.030109  8.713168e+07
inc_100_125k       92160     25  3686.400000 -0.032489  1.120410e+08
inc_125_150k       65829     