In [1]:
from policyengine_us import Microsimulation
import pandas as pd
import numpy as np

# Load the baseline simulation
baseline = Microsimulation(dataset="hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set the year for analysis
year = 2026

# Get household-level variables
household_id = baseline.calculate("household_id", map_to="household", period=year)
state = baseline.calculate("state_code", map_to="household", period=year)
num_dependents = baseline.calculate("tax_unit_dependents", map_to="household", period=year)
married = baseline.calculate("is_married", map_to="household", period=year)
employment_income = baseline.calculate("employment_income", map_to="household", period=year)
self_employment_income = baseline.calculate("self_employment_income", map_to="household", period=year)
aca_ptc = baseline.calculate("aca_ptc", map_to="household", period=year)
household_weight = baseline.calculate("household_weight", map_to="household", period=year)

# Calculate total income (employment + self-employment)
total_income = employment_income + self_employment_income

In [3]:
# Create a DataFrame with the outputs
df = pd.DataFrame({
    "household_id": household_id,
    "state": state,
    "married": married,
    "num_dependents": num_dependents,
    "employment_income": employment_income,
    "self_employment_income": self_employment_income,
    "total_income": total_income,
    "aca_ptc": aca_ptc,
    "weight": household_weight
})

In [4]:
# Filter for households that:
# 1. Have weight > 100 (reasonably representative)
# 2. Receive premium tax credit (aca_ptc > 0)
ptc_recipients = df[(df['weight'] > 100) & (df['aca_ptc'] > 0)].copy()

print(f"Total households with weight > 100 receiving PTC: {len(ptc_recipients)}")
print(f"Weighted count: {ptc_recipients['weight'].sum():,.0f}")
print(f"Average PTC amount: ${ptc_recipients['aca_ptc'].mean():,.2f}")
print(f"Weighted average PTC: ${(ptc_recipients['aca_ptc'] * ptc_recipients['weight']).sum() / ptc_recipients['weight'].sum():,.2f}")

Total households with weight > 100 receiving PTC: 1396
Weighted count: 10,098,276
Average PTC amount: $8,036.41
Weighted average PTC: $6,823.57


In [5]:
# Find the highest income households receiving PTC
# Sort by total income descending
highest_income_ptc = ptc_recipients.nlargest(20, 'total_income')

print("="*80)
print("TOP 20 HIGHEST INCOME HOUSEHOLDS RECEIVING PREMIUM TAX CREDIT")
print("(With weight > 100 for representativeness)")
print("="*80)

# Display relevant columns
display_cols = ['household_id', 'state', 'total_income', 'employment_income', 
                'self_employment_income', 'aca_ptc', 'weight', 'married', 'num_dependents']
highest_income_ptc[display_cols]

TOP 20 HIGHEST INCOME HOUSEHOLDS RECEIVING PREMIUM TAX CREDIT
(With weight > 100 for representativeness)


Unnamed: 0,household_id,state,total_income,employment_income,self_employment_income,aca_ptc,weight,married,num_dependents
20396,169627,CA,1855321.0,495434.0,1359887.0,5944.546387,171.459244,1.0,3.0
8766,58382,OK,1252050.0,1206578.0,45471.72,3546.526611,119.734879,0.0,0.0
417,4587,MA,1069470.0,1069470.0,0.0,14769.839844,7003.146484,0.0,0.0
1435,10592,NJ,637665.5,499085.0,138580.5,841.781494,230.419098,1.0,1.0
4347,25198,MO,547348.2,547348.2,0.0,5799.687012,11673.164062,1.0,0.0
12168,83562,CA,475012.7,312613.7,162399.0,2893.667969,256.603333,1.0,0.0
7376,47815,FL,435465.4,435465.4,0.0,6488.514648,1477.654663,1.0,0.0
8671,57583,LA,331260.8,331260.8,0.0,3354.018799,15023.986328,1.0,0.0
6449,42141,GA,331050.3,6252.273,324798.0,9192.425781,843.286621,1.0,2.0
3239,19933,IL,312613.7,312613.7,0.0,4493.443359,857.764832,1.0,0.0


In [6]:
# Analyze income distribution of PTC recipients
print("="*80)
print("INCOME DISTRIBUTION OF PTC RECIPIENTS (weight > 100)")
print("="*80)

# Calculate percentiles
percentiles = [25, 50, 75, 90, 95, 99]
income_pcts = np.percentile(ptc_recipients['total_income'], percentiles)

print("\nIncome percentiles among PTC recipients:")
for p, val in zip(percentiles, income_pcts):
    print(f"  {p}th percentile: ${val:,.0f}")

# Basic statistics
print("\nSummary statistics:")
print(ptc_recipients['total_income'].describe())

INCOME DISTRIBUTION OF PTC RECIPIENTS (weight > 100)

Income percentiles among PTC recipients:
  25th percentile: $32,923
  50th percentile: $57,038
  75th percentile: $89,882
  90th percentile: $142,596
  95th percentile: $176,355
  99th percentile: $295,393

Summary statistics:
count    1.396000e+03
mean     7.285001e+04
std      8.689812e+04
min     -1.082552e+04
25%      3.292305e+04
50%      5.703829e+04
75%      8.988183e+04
max      1.855321e+06
Name: total_income, dtype: float64


In [7]:
# Calculate FPL ratios for context
# 2026 FPL estimates (rough approximations)
fpl_2026 = {
    1: 15570,   # Single person
    2: 21130,   # Couple
    3: 26650,   # Family of 3
    4: 32200,   # Family of 4
    5: 37750,   # Family of 5
    6: 43300,   # Family of 6
    7: 48850,   # Family of 7
    8: 54400,   # Family of 8
}

# Calculate household size and FPL ratio
ptc_recipients['household_size'] = ptc_recipients.apply(
    lambda row: (1 + row['married'] + row['num_dependents']) if not pd.isna(row['married']) else 1,
    axis=1
)

# Map FPL based on household size
ptc_recipients['fpl_threshold'] = ptc_recipients['household_size'].map(
    lambda x: fpl_2026.get(min(int(x), 8), 54400)
)
ptc_recipients['fpl_ratio'] = (ptc_recipients['total_income'] / ptc_recipients['fpl_threshold']) * 100

# Show the highest income recipients with FPL context
print("="*80)
print("HIGHEST INCOME PTC RECIPIENTS WITH FPL CONTEXT")
print("="*80)

highest_with_fpl = ptc_recipients.nlargest(20, 'total_income')[[
    'household_id', 'state', 'total_income', 'fpl_ratio', 'aca_ptc', 
    'weight', 'household_size'
]]

# Format FPL ratio for display
highest_with_fpl['fpl_ratio'] = highest_with_fpl['fpl_ratio'].round(0).astype(int)
highest_with_fpl

HIGHEST INCOME PTC RECIPIENTS WITH FPL CONTEXT


Unnamed: 0,household_id,state,total_income,fpl_ratio,aca_ptc,weight,household_size
20396,169627,CA,1855321.0,4915,5944.546387,171.459244,5.0
8766,58382,OK,1252050.0,8041,3546.526611,119.734879,1.0
417,4587,MA,1069470.0,6869,14769.839844,7003.146484,1.0
1435,10592,NJ,637665.5,2393,841.781494,230.419098,3.0
4347,25198,MO,547348.2,2590,5799.687012,11673.164062,2.0
12168,83562,CA,475012.7,2248,2893.667969,256.603333,2.0
7376,47815,FL,435465.4,2061,6488.514648,1477.654663,2.0
8671,57583,LA,331260.8,1568,3354.018799,15023.986328,2.0
6449,42141,GA,331050.3,1028,9192.425781,843.286621,4.0
3239,19933,IL,312613.7,1479,4493.443359,857.764832,2.0


In [8]:
# Group by income ranges to see distribution
income_ranges = [
    (0, 50000, "$0-50K"),
    (50000, 100000, "$50K-100K"),
    (100000, 150000, "$100K-150K"),
    (150000, 200000, "$150K-200K"),
    (200000, 250000, "$200K-250K"),
    (250000, 300000, "$250K-300K"),
    (300000, float('inf'), "$300K+")
]

print("="*80)
print("PTC RECIPIENTS BY INCOME RANGE (weight > 100)")
print("="*80)

for low, high, label in income_ranges:
    mask = (ptc_recipients['total_income'] >= low) & (ptc_recipients['total_income'] < high)
    range_data = ptc_recipients[mask]
    
    if len(range_data) > 0:
        weighted_count = range_data['weight'].sum()
        avg_ptc = (range_data['aca_ptc'] * range_data['weight']).sum() / range_data['weight'].sum()
        
        print(f"\n{label}:")
        print(f"  Households: {len(range_data)}")
        print(f"  Weighted count: {weighted_count:,.0f}")
        print(f"  Average PTC: ${avg_ptc:,.2f}")
        print(f"  Max income in range: ${range_data['total_income'].max():,.0f}")

PTC RECIPIENTS BY INCOME RANGE (weight > 100)

$0-50K:
  Households: 571
  Weighted count: 4,168,472
  Average PTC: $8,059.05
  Max income in range: $49,360

$50K-100K:
  Households: 535
  Weighted count: 4,072,566
  Average PTC: $5,586.87
  Max income in range: $99,818

$100K-150K:
  Households: 172
  Weighted count: 1,339,190
  Average PTC: $6,625.66
  Max income in range: $149,835

$150K-200K:
  Households: 64
  Weighted count: 271,233
  Average PTC: $8,763.68
  Max income in range: $196,315

$200K-250K:
  Households: 23
  Weighted count: 59,271
  Average PTC: $6,760.74
  Max income in range: $244,168

$250K-300K:
  Households: 14
  Weighted count: 38,336
  Average PTC: $5,667.78
  Max income in range: $295,063

$300K+:
  Households: 14
  Weighted count: 43,780
  Average PTC: $6,057.11
  Max income in range: $1,855,321


In [9]:
# Find outliers - very high income households receiving substantial PTC
# Define "high income" as above $200K and "substantial PTC" as above $5K
outliers = ptc_recipients[
    (ptc_recipients['total_income'] > 200000) & 
    (ptc_recipients['aca_ptc'] > 5000)
].copy()

print("="*80)
print("HIGH-INCOME HOUSEHOLDS (>$200K) RECEIVING SUBSTANTIAL PTC (>$5K)")
print("="*80)

if len(outliers) > 0:
    outliers_sorted = outliers.sort_values('total_income', ascending=False)
    print(f"\nFound {len(outliers)} such households")
    print(f"Weighted count: {outliers['weight'].sum():,.0f}")
    
    # Show details
    outliers_display = outliers_sorted[[
        'household_id', 'state', 'total_income', 'aca_ptc', 
        'fpl_ratio', 'weight', 'married', 'num_dependents'
    ]].head(10)
    
    outliers_display
else:
    print("No households found meeting these criteria")

HIGH-INCOME HOUSEHOLDS (>$200K) RECEIVING SUBSTANTIAL PTC (>$5K)

Found 22 such households
Weighted count: 85,983
