In [8]:
from policyengine_us import Microsimulation
import pandas as pd
import numpy as np

In [9]:
# Load RI dataset
sim = Microsimulation(dataset="hf://policyengine/test/RI.h5")

In [10]:
# Check dataset size
household_weight = sim.calculate("household_weight", period=2025)
household_count = sim.calculate("household_count", period=2025, map_to="household")
person_count = sim.calculate("person_count", period=2025, map_to="household")

print(f"Number of households in dataset: {len(household_weight):,}")
print(f"Household count (mapped): {household_count.sum():,.0f}")
print(f"Person count (mapped): {person_count.sum():,.0f}")

Number of households in dataset: 2,368
Household count (mapped): 388,376
Person count (mapped): 1,106,390


In [11]:
# Check household income distribution (aggregate to household level using map_to)
agi = sim.calculate("adjusted_gross_income", period=2025, map_to="household")
print(f"Income distribution:")
print(f"  Median AGI: ${agi.median():,.0f}")
print(f"  75th percentile: ${agi.quantile(0.75):,.0f}")
print(f"  90th percentile: ${agi.quantile(0.90):,.0f}")
print(f"  95th percentile: ${agi.quantile(0.95):,.0f}")
print(f"  Max AGI: ${agi.max():,.0f}")
print(f"\nHouseholds by income threshold:")
print(f"  Households over $80k: {(agi > 80_000).sum():,}")
print(f"  Households over $120k: {(agi > 120_000).sum():,}")
print(f"  Households over $160k: {(agi > 160_000).sum():,}")
print(f"  Households over $240k: {(agi > 240_000).sum():,}")

Income distribution:
  Median AGI: $73,149
  75th percentile: $152,410
  90th percentile: $271,943
  95th percentile: $400,420
  Max AGI: $1,740,956

Households by income threshold:
  Households over $80k: 180,850.09423405278
  Households over $120k: 128,983.09166995375
  Households over $160k: 88,148.79416422347
  Households over $240k: 47,853.16035188042


In [12]:
# Check households with children (count at person level, aggregate to household)
is_child = sim.calculate("is_child", period=2025, map_to="person")
household_id = sim.calculate("household_id", period=2025, map_to="person")
household_weight = sim.calculate("household_weight", period=2025, map_to="person")

# Create DataFrame for easier manipulation
df_households = pd.DataFrame({
    'household_id': household_id,
    'is_child': is_child,
    'household_weight': household_weight
})

# Count children per household
children_per_household = df_households.groupby('household_id').agg({
    'is_child': 'sum',
    'household_weight': 'first'  # household_weight is same for all members
}).reset_index()

# Calculate weighted household counts
total_households_with_children = children_per_household[children_per_household['is_child'] > 0]['household_weight'].sum()
households_with_1_child = children_per_household[children_per_household['is_child'] == 1]['household_weight'].sum()
households_with_2_children = children_per_household[children_per_household['is_child'] == 2]['household_weight'].sum()
households_with_3plus_children = children_per_household[children_per_household['is_child'] >= 3]['household_weight'].sum()

print(f"\nHouseholds with children (weighted):")
print(f"  Total households with children: {total_households_with_children:,.0f}")
print(f"  Households with 1 child: {households_with_1_child:,.0f}")
print(f"  Households with 2 children: {households_with_2_children:,.0f}")
print(f"  Households with 3+ children: {households_with_3plus_children:,.0f}")


Households with children (weighted):
  Total households with children: 121,492
  Households with 1 child: 66,113
  Households with 2 children: 37,589
  Households with 3+ children: 17,790


In [13]:
# Check children by age groups using Ben's workaround
import pandas as pd
df = pd.DataFrame({
    "household_id": sim.calculate("household_id", map_to="person"),
    "tax_unit_id": sim.calculate("tax_unit_id", map_to="person"),
    "person_id": sim.calculate("person_id", map_to="person"),
    "age": sim.calculate("age", map_to="person"),
    "person_weight": sim.calculate("person_weight", map_to="person")
})

# Filter for children and apply weights
children_under_4_df = df[df['age'] < 4]
children_under_6_df = df[df['age'] < 6]
children_under_18_df = df[df['age'] < 18]
children_6_17_df = df[(df['age'] >= 6) & (df['age'] < 18)]

# Calculate weighted totals
is_child = sim.calculate("is_child", period=2025)
total_children = is_child.sum()
children_under_4 = children_under_4_df['person_weight'].sum()
children_under_6 = children_under_6_df['person_weight'].sum()
children_6_17 = children_6_17_df['person_weight'].sum()

print(f"\nChildren by age:")
print(f"  Total children under 18: {total_children:,.0f}")
print(f"  Children under 4: {children_under_4:,.0f}")
print(f"  Children under 6: {children_under_6:,.0f}")
print(f"  Children ages 6-17: {children_6_17:,.0f}")

print(f"\nSample of children under 4:")
print(children_under_4_df[['household_id', 'tax_unit_id', 'person_id', 'age']].head(10))


Children by age:
  Total children under 18: 203,860
  Children under 4: 41,525
  Children under 6: 61,595
  Children ages 6-17: 138,203

Sample of children under 4:
     household_id  tax_unit_id  person_id  age
7         2730011            5    7730007  1.0
16        2730006           10    7730016  2.0
37        2730018           26    7730037  3.0
83        2730035           55    7730083  0.0
101       2730041           66    7730101  3.0
102       2730041           66    7730102  2.0
103       2730041           66    7730103  0.0
108       2730043           69    7730108  1.0
111       2730045           71    7730111  1.0
115       2730045           71    7730115  3.0


In [14]:
# Create weighted summary table
weighted_summary_data = {
    'Metric': [
        'Household count (weighted)',
        'Person count (weighted)',
        'Median AGI',
        '75th percentile AGI',
        '90th percentile AGI',
        '95th percentile AGI',
        'Max AGI',
        'Households over $80k',
        'Households over $120k',
        'Households over $160k',
        'Households over $240k',
        'Total households with children',
        'Households with 1 child',
        'Households with 2 children',
        'Households with 3+ children',
        'Total children under 18',
        'Children under 4',
        'Children under 6',
        'Children ages 6-17'
    ],
    'Value': [
        f"{household_count.sum():,.0f}",
        f"{person_count.sum():,.0f}",
        f"${agi.median():,.0f}",
        f"${agi.quantile(0.75):,.0f}",
        f"${agi.quantile(0.90):,.0f}",
        f"${agi.quantile(0.95):,.0f}",
        f"${agi.max():,.0f}",
        f"{(agi > 80_000).sum():,.0f}",
        f"{(agi > 120_000).sum():,.0f}",
        f"{(agi > 160_000).sum():,.0f}",
        f"{(agi > 240_000).sum():,.0f}",
        f"{total_households_with_children:,.0f}",
        f"{households_with_1_child:,.0f}",
        f"{households_with_2_children:,.0f}",
        f"{households_with_3plus_children:,.0f}",
        f"{total_children:,.0f}",
        f"{children_under_4:,.0f}",
        f"{children_under_6:,.0f}",
        f"{children_6_17:,.0f}"
    ]
}

# Get unique counts for unweighted table
unique_households = df['household_id'].nunique()
unique_persons = len(df)

# Create unweighted summary table
unweighted_summary_data = {
    'Metric': [
        'Number of households in dataset',
        'Number of persons in dataset',
        'Households with children (unweighted)',
        'Households with 1 child (unweighted)',
        'Households with 2 children (unweighted)',
        'Households with 3+ children (unweighted)',
        'Children under 18 (unweighted)',
        'Children under 4 (unweighted)',
        'Children under 6 (unweighted)',
        'Children ages 6-17 (unweighted)'
    ],
    'Value': [
        f"{unique_households:,}",
        f"{unique_persons:,}",
        f"{(children_per_household['is_child'] > 0).sum():,}",
        f"{(children_per_household['is_child'] == 1).sum():,}",
        f"{(children_per_household['is_child'] == 2).sum():,}",
        f"{(children_per_household['is_child'] >= 3).sum():,}",
        f"{len(children_under_18_df):,}",
        f"{len(children_under_4_df):,}",
        f"{len(children_under_6_df):,}",
        f"{len(children_6_17_df):,}"
    ]
}

weighted_df = pd.DataFrame(weighted_summary_data)
unweighted_df = pd.DataFrame(unweighted_summary_data)

print("\n" + "="*60)
print("RI DATASET SUMMARY - WEIGHTED (Population Estimates)")
print("="*60)
print(weighted_df.to_string(index=False))
print("="*60)

print("\n" + "="*60)
print("RI DATASET SUMMARY - UNWEIGHTED (Sample Counts)")
print("="*60)
print(unweighted_df.to_string(index=False))
print("="*60)

# Save both tables
weighted_df.to_csv('ri_dataset_summary_weighted.csv', index=False)
unweighted_df.to_csv('ri_dataset_summary_unweighted.csv', index=False)
print("\nSummaries saved to:")
print("  - ri_dataset_summary_weighted.csv")
print("  - ri_dataset_summary_unweighted.csv")


RI DATASET SUMMARY - WEIGHTED (Population Estimates)
                        Metric      Value
    Household count (weighted)    388,376
       Person count (weighted)  1,106,390
                    Median AGI    $73,149
           75th percentile AGI   $152,410
           90th percentile AGI   $271,943
           95th percentile AGI   $400,420
                       Max AGI $1,740,956
          Households over $80k    180,850
         Households over $120k    128,983
         Households over $160k     88,149
         Households over $240k     47,853
Total households with children    121,492
       Households with 1 child     66,113
    Households with 2 children     37,589
   Households with 3+ children     17,790
       Total children under 18    203,860
              Children under 4     41,525
              Children under 6     61,595
            Children ages 6-17    138,203

RI DATASET SUMMARY - UNWEIGHTED (Sample Counts)
                                  Metric Value
         N

In [18]:
# Compare median AGI at different aggregation levels
agi_household = sim.calculate("adjusted_gross_income", period=2025, map_to="household")
agi_tax_unit = sim.calculate("adjusted_gross_income", period=2025, map_to="tax_unit")
agi_person = sim.calculate("adjusted_gross_income", period=2025, map_to="person")

print("Median AGI by aggregation level:")
print(f"  Household level: ${agi_household.median():,.0f}")
print(f"  Tax unit level:  ${agi_tax_unit.median():,.0f}")
print(f"  Person level:    ${agi_person.median():,.0f}")

# Calculate total AGI - just sum the values (weights are already built into the arrays)
total_agi_tax_unit = agi_tax_unit.sum()
total_agi_household = agi_household.sum()
total_agi_person = agi_person.sum()

print(f"\nTotal AGI for Rhode Island (by aggregation level):")
print(f"  Using tax unit level:  ${total_agi_tax_unit:,.0f}")
print(f"  Using household level: ${total_agi_household:,.0f}")
print(f"  Using person level:    ${total_agi_person:,.0f}")

Median AGI by aggregation level:
  Household level: $73,149
  Tax unit level:  $35,546
  Person level:    $47,592

Total AGI for Rhode Island (by aggregation level):
  Using tax unit level:  $43,501,430,523
  Using household level: $43,501,430,523
  Using person level:    $77,844,195,552


In [20]:
# Break down AGI components at tax unit level
print("AGI Component Breakdown (Tax Unit Level)")
print("="*60)

# Calculate key income components
employment_income = sim.calculate("employment_income", period=2025, map_to="tax_unit")
self_employment_income = sim.calculate("self_employment_income", period=2025, map_to="tax_unit")
capital_gains = sim.calculate("capital_gains", period=2025, map_to="tax_unit")
qualified_dividend_income = sim.calculate("qualified_dividend_income", period=2025, map_to="tax_unit")
interest_income = sim.calculate("interest_income", period=2025, map_to="tax_unit")
taxable_social_security = sim.calculate("taxable_social_security", period=2025, map_to="tax_unit")
pension_income = sim.calculate("pension_income", period=2025, map_to="tax_unit")
adjusted_gross_income = sim.calculate("adjusted_gross_income", period=2025, map_to="tax_unit")

print("\nTotal Income (Statewide):")
print(f"  Employment Income:           ${employment_income.sum():>15,.0f}")
print(f"  Self-Employment Income:      ${self_employment_income.sum():>15,.0f}")
print(f"  Capital Gains:               ${capital_gains.sum():>15,.0f}")
print(f"  Qualified Dividends:         ${qualified_dividend_income.sum():>15,.0f}")
print(f"  Interest Income:             ${interest_income.sum():>15,.0f}")
print(f"  Taxable Social Security:     ${taxable_social_security.sum():>15,.0f}")
print(f"  Pension Income:              ${pension_income.sum():>15,.0f}")
print(f"  Adjusted Gross Income (AGI): ${adjusted_gross_income.sum():>15,.0f}")

print("\nMedian Values:")
print(f"  Employment Income:           ${employment_income.median():>15,.0f}")
print(f"  Self-Employment Income:      ${self_employment_income.median():>15,.0f}")
print(f"  Capital Gains:               ${capital_gains.median():>15,.0f}")
print(f"  Qualified Dividends:         ${qualified_dividend_income.median():>15,.0f}")
print(f"  Interest Income:             ${interest_income.median():>15,.0f}")
print(f"  Taxable Social Security:     ${taxable_social_security.median():>15,.0f}")
print(f"  Pension Income:              ${pension_income.median():>15,.0f}")
print(f"  Adjusted Gross Income (AGI): ${adjusted_gross_income.median():>15,.0f}")

# Calculate sum of components to compare with AGI
total_components = (employment_income + self_employment_income + capital_gains + 
                   qualified_dividend_income + interest_income + taxable_social_security + pension_income)
print(f"\nSum of income components:    ${total_components.sum():>15,.0f}")
print(f"AGI (for comparison):        ${adjusted_gross_income.sum():>15,.0f}")
print(f"Difference (potential missing income or deductions): ${(total_components.sum() - adjusted_gross_income.sum()):>15,.0f}")

AGI Component Breakdown (Tax Unit Level)

Total Income (Statewide):
  Employment Income:           $ 31,034,426,346
  Self-Employment Income:      $  1,890,240,187
  Capital Gains:               $  1,086,347,982
  Qualified Dividends:         $  1,002,331,804
  Interest Income:             $    670,462,607
  Taxable Social Security:     $  1,123,366,624
  Pension Income:              $  1,384,610,313
  Adjusted Gross Income (AGI): $ 43,501,430,523

Median Values:
  Employment Income:           $         29,531
  Self-Employment Income:      $              0
  Capital Gains:               $              0
  Qualified Dividends:         $              0
  Interest Income:             $              0
  Taxable Social Security:     $              0
  Pension Income:              $              0
  Adjusted Gross Income (AGI): $         35,546

Sum of income components:    $ 38,191,785,863
AGI (for comparison):        $ 43,501,430,523
Difference (potential missing income or deductions): $ 