In [2]:
from policyengine_us import Microsimulation

# Load the congressional-district-stacked dataset.
# NOTE(review): the following cell rebuilds `sim` and repeats this same
# correction, so this cell looks redundant — confirm before relying on it.
sim = Microsimulation(
    dataset="hf://policyengine/test/sparse_cd_stacked_2023.h5"
)

# The district GEOID integer-divided by 100 is written back as state_fips
# for 2023 (e.g. 3406 -> 34).
cd_geoids = sim.calculate("congressional_district_geoid").values
correct_state_fips = cd_geoids // 100
sim.set_input("state_fips", 2023, correct_state_fips)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import pandas as pd
import numpy as np
from policyengine_us import Microsimulation
from policyengine_us.variables.input.geography import StateName

# Baseline simulation; YEAR is the period the state_fips override below is
# written to and the period whose cached arrays are cleared.
sim = Microsimulation(
    dataset="hf://policyengine/test/sparse_cd_stacked_2023.h5"
)
YEAR = 2023

# State FIPS code -> StateName member (51 entries: the states plus DC).
STATE_FIPS_TO_NAME = {
    1: StateName.AL, 2: StateName.AK, 4: StateName.AZ, 5: StateName.AR,
    6: StateName.CA, 8: StateName.CO, 9: StateName.CT, 10: StateName.DE,
    11: StateName.DC, 12: StateName.FL, 13: StateName.GA, 15: StateName.HI,
    16: StateName.ID, 17: StateName.IL, 18: StateName.IN, 19: StateName.IA,
    20: StateName.KS, 21: StateName.KY, 22: StateName.LA, 23: StateName.ME,
    24: StateName.MD, 25: StateName.MA, 26: StateName.MI, 27: StateName.MN,
    28: StateName.MS, 29: StateName.MO, 30: StateName.MT, 31: StateName.NE,
    32: StateName.NV, 33: StateName.NH, 34: StateName.NJ, 35: StateName.NM,
    36: StateName.NY, 37: StateName.NC, 38: StateName.ND, 39: StateName.OH,
    40: StateName.OK, 41: StateName.OR, 42: StateName.PA, 44: StateName.RI,
    45: StateName.SC, 46: StateName.SD, 47: StateName.TN, 48: StateName.TX,
    49: StateName.UT, 50: StateName.VT, 51: StateName.VA, 53: StateName.WA,
    54: StateName.WV, 55: StateName.WI, 56: StateName.WY,
}

# Derive the state FIPS code from each district GEOID (GEOID // 100).
cd_geoids = sim.calculate("congressional_district_geoid").values
correct_state_fips = cd_geoids // 100

# NOTE(review): correct_state_names is computed but never passed to the
# simulation in this cell — confirm whether a set_input("state_name", ...)
# was intended. FIPS codes missing from the mapping become NaN here.
fips_series = pd.Series(correct_state_fips)
correct_state_names = fips_series.map(STATE_FIPS_TO_NAME).values

sim.set_input("state_fips", YEAR, correct_state_fips)

# Drop any cached YEAR arrays for the state-derived variables so they are
# recalculated after the state_fips override.
for cached_variable in ("state_name", "state_code"):
    if cached_variable in sim.tax_benefit_system.variables:
        sim.delete_arrays(cached_variable, YEAR)



In [5]:
# household_id from the baseline simulation, mapped to households, for 2026.
household_id = sim.calculate(
    "household_id",
    period=2026,
    map_to="household",
)

In [6]:
# state_fips from the baseline simulation, mapped to households, for 2026.
# NOTE(review): the override above was written for 2023; this presumably
# relies on the value carrying forward to 2026 — confirm.
state_fips = sim.calculate(
    "state_fips",
    period=2026,
    map_to="household",
)


In [7]:
# District GEOID from the baseline simulation, mapped to households, for 2026.
congressional_district_geoid = sim.calculate(
    "congressional_district_geoid",
    period=2026,
    map_to="household",
)


In [8]:
# Baseline income_tax, mapped to households, for 2026.
income_tax = sim.calculate(
    "income_tax",
    period=2026,
    map_to="household",
)


In [9]:
# Baseline state_name, mapped to households, for 2026.
state_name = sim.calculate(
    "state_name",
    period=2026,
    map_to="household",
)


In [10]:
# Baseline state_code, mapped to households, for 2026; compared against "NJ"
# further down to build the New Jersey mask.
state_code = sim.calculate(
    "state_code",
    period=2026,
    map_to="household",
)


In [15]:
# Baseline household_weight, mapped to households, for 2026.
weights = sim.calculate(
    "household_weight",
    period=2026,
    map_to="household",
)

In [18]:
# Mask of households whose state_code equals "NJ"; used below to subset the
# per-household series before grouping by district.
in_nj = state_code == "NJ"

In [25]:
# Baseline nj_agi, mapped to households, for 2026.
Nj_agi = sim.calculate(
    "nj_agi",
    period=2026,
    map_to="household",
)

In [27]:
# Median nj_agi per congressional district for households flagged by in_nj.
nj_households = pd.DataFrame({
    "district": congressional_district_geoid[in_nj],
    "Nj_agi": Nj_agi[in_nj],
    # Filtered with the same mask as the other columns. Passing the
    # unfiltered series made pandas align the frame on the full household
    # index, leaving NaN district / Nj_agi values on every non-NJ row.
    "state": state_fips[in_nj],
})

# NOTE(review): this median is taken over the rows of a plain DataFrame, so
# household weights are presumably not applied — confirm whether a weighted
# median is what is wanted here.
median_nj_agi_by_district = (
    nj_households
    .groupby("district")["Nj_agi"]
    .median()
)

# Backward-compatible alias: the original name said "avg_tax", but the value
# has always been the per-district median of Nj_agi.
avg_tax_by_district = median_nj_agi_by_district

print(median_nj_agi_by_district)

district
3401.0    448377.273682
3402.0    474426.890625
3403.0    277227.515625
3404.0    267904.515625
3405.0    158957.747673
3406.0    462872.368164
3407.0    162854.945801
3408.0    450174.226074
3409.0    429199.406250
3410.0    477127.222656
3411.0    130707.304688
3412.0    178237.250000
Name: Nj_agi, dtype: float64


In [7]:
from policyengine_core.reforms import Reform


In [8]:


# Reform definition: each key is a parameter path and each value maps a
# "start.stop" date range to the value set for that range.
#   - mortgage interest cap: 1,000,000 (500,000 for SEPARATE) for 2023-2100
#   - SALT / real-estate cap: 10,000 (5,000 for SEPARATE) for 2025 and
#     1,000,000,000,000 for 2023-2100
#   - SALT phase-out switches: False for 2025-2029
# NOTE(review): every SALT cap entry lists the single-year 2025 range BEFORE
# the overlapping 2023-2100 range. If Reform.from_dict applies the ranges in
# insertion order, the wider range may overwrite the 2025 value — confirm
# which value is actually in force for 2025.
reform = Reform.from_dict({
  
  "gov.irs.deductions.itemized.interest.mortgage.cap.JOINT": {
    "2023-01-01.2100-12-31": 1000000
  },
  "gov.irs.deductions.itemized.interest.mortgage.cap.SINGLE": {
    "2023-01-01.2100-12-31": 1000000
  },
  "gov.irs.deductions.itemized.interest.mortgage.cap.SEPARATE": {
    "2023-01-01.2100-12-31": 500000
  },
  "gov.irs.deductions.itemized.salt_and_real_estate.cap.JOINT": {
    "2025-01-01.2025-12-31": 10000,
    "2023-01-01.2100-12-31": 1000000000000
  },
  "gov.irs.deductions.itemized.salt_and_real_estate.cap.SINGLE": {
    "2025-01-01.2025-12-31": 10000,
    "2023-01-01.2100-12-31": 1000000000000
  },
  "gov.irs.deductions.itemized.salt_and_real_estate.cap.SEPARATE": {
    "2025-01-01.2025-12-31": 5000,
    "2023-01-01.2100-12-31": 1000000000000
  },
  "gov.irs.deductions.itemized.interest.mortgage.cap.SURVIVING_SPOUSE": {
    "2023-01-01.2100-12-31": 1000000
  },
  "gov.irs.deductions.itemized.interest.mortgage.cap.HEAD_OF_HOUSEHOLD": {
    "2023-01-01.2100-12-31": 1000000
  },
  "gov.irs.deductions.itemized.salt_and_real_estate.phase_out.in_effect": {
    "2025-01-01.2029-12-31": False
  },
  "gov.irs.deductions.itemized.salt_and_real_estate.cap.SURVIVING_SPOUSE": {
    "2025-01-01.2025-12-31": 10000,
    "2023-01-01.2100-12-31": 1000000000000
  },
  "gov.irs.deductions.itemized.salt_and_real_estate.cap.HEAD_OF_HOUSEHOLD": {
    "2025-01-01.2025-12-31": 10000,
    "2023-01-01.2100-12-31": 1000000000000
  },
  "gov.irs.deductions.itemized.salt_and_real_estate.phase_out.floor.applies": {
    "2025-01-01.2029-12-31": False
  }
}, country_id="us")


# Reformed simulation over the same dataset as the baseline `sim`.
reformed = Microsimulation(reform=reform, dataset = "hf://policyengine/test/sparse_cd_stacked_2023.h5")

In [9]:
# Mirror the baseline fix on the reformed simulation: the state FIPS code is
# the district GEOID integer-divided by 100, written back for 2023.
cd_geoids_reform = reformed.calculate("congressional_district_geoid").values
correct_state_fips_reform = cd_geoids_reform // 100
reformed.set_input("state_fips", 2023, correct_state_fips_reform)

# Drop any cached 2023 arrays for the state-derived variables so they are
# recalculated after the override.
for cached_variable in ("state_name", "state_code"):
    if cached_variable in reformed.tax_benefit_system.variables:
        reformed.delete_arrays(cached_variable, 2023)

In [10]:
# One frame of reformed-simulation results for the listed variables.
# NOTE(review): no period is passed here, while the baseline cells above use
# period=2026 — confirm both are evaluated for the intended year.
r_df = reformed.calculate_dataframe(
    [
        "household_id",
        "state_fips",
        "congressional_district_geoid",
        "income_tax",
        "state_name",
        "state_code",
        "household_net_income",
        "household_weight",
    ]
)


In [11]:
# Keep only rows whose state_fips is 34 (mapped to StateName.NJ in
# STATE_FIPS_TO_NAME), then display the subset.
is_new_jersey = r_df["state_fips"] == 34
r_state_df = r_df.loc[is_new_jersey]
r_state_df

Unnamed: 0,household_id,state_fips,congressional_district_geoid,income_tax,state_name,state_code,household_net_income,household_weight
54,203,34,3406,3.611006e+05,NJ,NJ,254792.531250,21.920219
100,324,34,3410,8.984263e+05,NJ,NJ,520829.937500,38.141525
117,373,34,3402,3.622267e+04,NJ,NJ,116267.265625,179.311432
243,655,34,3401,1.157711e+04,NJ,NJ,181396.546875,42.934647
244,657,34,3402,1.157711e+04,NJ,NJ,181396.546875,2995.783203
...,...,...,...,...,...,...,...,...
88774,271829,34,3410,1.740626e+05,NJ,NJ,743414.687500,58.284195
88808,271914,34,3409,1.529304e+06,NJ,NJ,74466.750000,37.558510
88832,272046,34,3408,8.131955e+04,NJ,NJ,427765.562500,178.973404
88883,272263,34,3404,5.986858e+04,NJ,NJ,317212.906250,66.759209


In [12]:
def _weighted_mean_income_tax(group):
    """Return sum(income_tax * household_weight) / sum(household_weight) for one group.

    NOTE(review): if the frame carries its own microdf weights, `.sum()` may
    already be weighted — confirm the weights are not applied twice.
    """
    weighted_total = (group["income_tax"] * group["household_weight"]).sum()
    return weighted_total / group["household_weight"].sum()


# Weighted-average income_tax per congressional district, one row per district
# with columns `congressional_district_geoid` and `income_tax`.
r_avg_income_tax_by_cd = (
    r_state_df.groupby("congressional_district_geoid")
    .apply(_weighted_mean_income_tax)
    .reset_index(name="income_tax")
)

# Backward-compatible alias: the original name said "net_income", but the
# value has always been the weighted average of income_tax.
r_avg_net_income_by_cd = r_avg_income_tax_by_cd

In [13]:
# Print the per-district table built in the previous cell. Despite the
# "net_income" in the variable name, its value column is the weighted-average
# income_tax.
print(r_avg_net_income_by_cd)

    congressional_district_geoid    income_tax
0                           3401  37503.864332
1                           3402  30258.588773
2                           3403  51999.651513
3                           3404  68042.135731
4                           3405  55298.933111
5                           3406  49727.539093
6                           3407  60044.451366
7                           3408  32163.931612
8                           3409  45049.938094
9                           3410  41262.861869
10                          3411  66339.066182
11                          3412  62295.689690


In [None]:
# Let's examine the data from your notebook more carefully
import pandas as pd
import numpy as np

# Hand-copied sample of the rows displayed for `r_state_df` above: the first
# four and last four visible households, skipping household 657.
# The three lists are parallel: entry i of each describes the same household.
household_ids = [203, 324, 373, 655, 271829, 271914, 272046, 272263]
income_tax_values = [3.611006e+05, 8.984263e+05, 3.622267e+04, 1.157711e+04, 1.740626e+05,
                     1.529304e+06, 8.131955e+04, 5.986858e+04]
# Deliberately not named `weights`: that name already holds the household
# weights calculated from `sim` earlier in the notebook and must not be
# overwritten by this small sample.
sample_weights = [21.920219, 38.141525, 179.311432, 42.934647, 58.284195,
                  37.558510, 178.973404, 66.759209]

# Show the first five sampled values alongside their weights.
print("Sample income tax values from your data:")
for tax, weight in zip(income_tax_values[:5], sample_weights):
    print(f"  ${tax:,.0f} (weight: {weight:.1f})")

# Look up the largest value and its household in the sample instead of
# hard-coding them in the message.
max_income_tax = max(income_tax_values)
max_household_id = household_ids[income_tax_values.index(max_income_tax)]
print(f"\nMaximum value shown: ${max_income_tax:,.0f}")
print(f"That's household {max_household_id} with income tax of ${max_income_tax:,.0f}!")