In [27]:
from policyengine_us import Microsimulation
from policyengine_core.reforms import Reform
import pandas as pd
import numpy as np

# Create baseline simulation with the correct dataset
baseline = Microsimulation(dataset="hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5")

In [28]:
# Test that basic calculations are working correctly
year = 2023

# Get household-level variables with proper mapping
state_code = baseline.calculate("state_code", map_to="household", period=year)
household_weight = baseline.calculate("household_weight", period=year)
household_id = baseline.calculate("household_id", map_to="household", period=year)

# Check the data
print(f"Total households: {len(household_id)}")
print(f"Total weighted households: {household_weight.sum():,.0f}")
print(f"Unique states: {len(state_code.unique())}")
print(f"\nSample of state codes:")
print(state_code.value_counts().head(10))

Total households: 21108
Total weighted households: 0
Unique states: 1

Sample of state codes:
CA    21108
Name: count, dtype: int64


In [29]:
# Calculate baseline income tax and create a dataframe
income_tax = baseline.calculate("income_tax", map_to="household", period=year)

df_baseline = pd.DataFrame({
    "household_id": household_id.values,
    "state_code": state_code.values,
    "income_tax": income_tax.values,
    "weight": household_weight.values
})

print("Baseline data sample:")
print(df_baseline.head())
print(f"\nTotal weighted income tax: ${(df_baseline['income_tax'] * df_baseline['weight']).sum()/1e9:.1f} billion")

Baseline data sample:
   household_id state_code  income_tax  weight
0             0         CA         0.0     0.0
1             0         CA         0.0     0.0
2             0         CA         0.0     0.0
3             0         CA         0.0     0.0
4             0         CA         0.0     0.0

Total weighted income tax: $0.0 billion


In [30]:
# Get household-level variables for analysis
year = 2023

household_market_income = baseline.calculate("household_market_income", map_to="household", period=year)
household_net_income = baseline.calculate("household_net_income", map_to="household", period=year)
state_code = baseline.calculate("state_code", map_to="household", period=year)
household_weight = baseline.calculate("household_weight", period=year)

# Create analysis dataframe
df_household = pd.DataFrame({
    "state_code": state_code.values,
    "household_market_income": household_market_income.values,
    "household_net_income": household_net_income.values,
    "weight": household_weight.values
})

print(f"Data shape: {df_household.shape}")
print(f"\nState distribution (top 10 states by weighted count):")
state_summary = df_household.groupby('state_code')['weight'].sum().sort_values(ascending=False).head(10)
for state, weight in state_summary.items():
    print(f"  {state}: {weight:,.0f}")

Data shape: (21108, 4)

State distribution (top 10 states by weighted count):
  CA: 0


In [31]:
# Aggregate statistics by state
df_counts = df_household.groupby('state_code').agg({
    'weight': ['count', 'sum']
}).reset_index()

df_counts.columns = ['state_code', 'unweighted_households', 'weighted_households']

print("Household counts by state (top 10):")
print(df_counts.sort_values('weighted_households', ascending=False).head(10))

Household counts by state (top 10):
  state_code  unweighted_households  weighted_households
0         CA                  21108                  0.0


In [32]:
# Create SALT deduction cap reform (remove the $10,000 cap)
reform = Reform.from_dict({
    "gov.irs.deductions.itemized.salt_and_real_estate.cap.JOINT": {
        "2023-01-01.2100-12-31": 0
    },
    "gov.irs.deductions.itemized.salt_and_real_estate.cap.SINGLE": {
        "2023-01-01.2100-12-31": 0
    },
    "gov.irs.deductions.itemized.salt_and_real_estate.cap.SEPARATE": {
        "2023-01-01.2100-12-31": 0
    },
    "gov.irs.deductions.itemized.salt_and_real_estate.cap.SURVIVING_SPOUSE": {
        "2023-01-01.2100-12-31": 0
    },
    "gov.irs.deductions.itemized.salt_and_real_estate.cap.HEAD_OF_HOUSEHOLD": {
        "2023-01-01.2100-12-31": 0
    },
    "gov.irs.deductions.itemized.salt_and_real_estate.phase_out.floor.applies": {
        "2023-01-01.2029-12-31": False
    }
}, country_id="us")

print("Reform created: Removing SALT deduction cap")

Reform created: Removing SALT deduction cap


In [33]:
# Create reformed simulation
reformed = Microsimulation(
    reform=reform,
    dataset="hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5"
)

print("Reformed simulation created")

Reformed simulation created


In [34]:
# Calculate reformed variables
year = 2023

state_code_r = reformed.calculate("state_code", map_to="household", period=year)
household_market_income_r = reformed.calculate("household_market_income", map_to="household", period=year)
household_net_income_r = reformed.calculate("household_net_income", map_to="household", period=year)
household_weight_r = reformed.calculate("household_weight", period=year)

# Get baseline values for comparison
household_net_income_b = baseline.calculate("household_net_income", map_to="household", period=year)

print("Reformed variables calculated")
print(f"Total households: {len(household_net_income_r)}")
print(f"Total weighted households: {household_weight_r.sum():,.0f}")

Reformed variables calculated
Total households: 21108
Total weighted households: 0


In [35]:
# Create comprehensive analysis dataframe
df_analysis = pd.DataFrame({
    "state_code": state_code_r.values,
    "household_net_income_baseline": household_net_income_b.values,
    "household_net_income_reform": household_net_income_r.values,
    "weight": household_weight_r.values,
    "household_market_income": household_market_income_r.values
})

# Calculate net change
df_analysis['net_change'] = df_analysis['household_net_income_reform'] - df_analysis['household_net_income_baseline']

# Define weighted median function
def weighted_median(values, weights):
    # Remove NaN values
    mask = ~np.isnan(values)
    values = values[mask]
    weights = weights[mask]
    
    if len(values) == 0:
        return np.nan
    
    i = np.argsort(values)
    c = np.cumsum(weights[i])
    return values[i[np.searchsorted(c, 0.5 * c[-1])]]

# Calculate state-level statistics
df_state_summary = df_analysis.groupby('state_code').apply(
    lambda x: pd.Series({
        'median_baseline': weighted_median(x['household_net_income_baseline'].values, x['weight'].values),
        'median_reform': weighted_median(x['household_net_income_reform'].values, x['weight'].values),
        'mean_net_change': np.average(x['net_change'].values, weights=x['weight'].values),
        'total_weighted_households': x['weight'].sum(),
        'households_gaining': ((x['net_change'] > 0) * x['weight']).sum(),
        'households_losing': ((x['net_change'] < 0) * x['weight']).sum()
    })
).reset_index()

print("State-level impact of SALT cap removal (top 10 states by average gain):")
print(df_state_summary.sort_values('mean_net_change', ascending=False).head(10).to_string())

ZeroDivisionError: Weights sum to zero, can't be normalized

In [36]:
# Test the basic imports and setup from the working notebook
from policyengine_us import Microsimulation
baseline = Microsimulation(dataset="hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5")

# Check if basic calculation works
year = 2024
test_weight = baseline.calculate("household_weight", period=year)
print(f"Weight calculation works: {test_weight is not None}")
print(f"Total weighted households: {test_weight.sum():,.0f}")

Weight calculation works: True
Total weighted households: 17,907,623,107,028
