# PA Dataset Exploration

This notebook explores the Pennsylvania (PA) dataset to understand household counts, income distribution, and demographic characteristics.

In [1]:
from policyengine_us import Microsimulation
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load PA dataset
sim = Microsimulation(dataset='hf://policyengine/test/PA.h5')

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [3]:
# Check dataset size
household_weight = sim.calculate("household_weight", period=2025)
household_count = sim.calculate("household_count", period=2025, map_to="household")
person_count = sim.calculate("person_count", period=2025, map_to="household")

print(f"Number of households in dataset: {len(household_weight):,}")
print(f"Household count (weighted): {household_count.sum():,.0f}")
print(f"Person count (weighted): {person_count.sum():,.0f}")

Number of households in dataset: 20,180
Household count (weighted): 4,435,467
Person count (weighted): 12,863,313


In [4]:
# Check household income distribution
agi = sim.calculate("adjusted_gross_income", period=2025, map_to="household")
print(f"Income distribution:")
print(f"  Median AGI: ${agi.median():,.0f}")
print(f"  75th percentile: ${agi.quantile(0.75):,.0f}")
print(f"  90th percentile: ${agi.quantile(0.90):,.0f}")
print(f"  95th percentile: ${agi.quantile(0.95):,.0f}")
print(f"  Max AGI: ${agi.max():,.0f}")

Income distribution:
  Median AGI: $71,734
  75th percentile: $149,456
  90th percentile: $268,015
  95th percentile: $379,910
  Max AGI: $1,838,621


In [5]:
# Check households with children
is_child = sim.calculate("is_child", period=2025, map_to="person")
household_id = sim.calculate("household_id", period=2025, map_to="person")
household_weight = sim.calculate("household_weight", period=2025, map_to="person")

# Create DataFrame
df_households = pd.DataFrame({
    'household_id': household_id,
    'is_child': is_child,
    'household_weight': household_weight
})

# Count children per household
children_per_household = df_households.groupby('household_id').agg({
    'is_child': 'sum',
    'household_weight': 'first'
}).reset_index()

# Calculate weighted household counts
total_households_with_children = children_per_household[children_per_household['is_child'] > 0]['household_weight'].sum()
households_with_1_child = children_per_household[children_per_household['is_child'] == 1]['household_weight'].sum()
households_with_2_children = children_per_household[children_per_household['is_child'] == 2]['household_weight'].sum()
households_with_3plus_children = children_per_household[children_per_household['is_child'] >= 3]['household_weight'].sum()

print(f"\nHouseholds with children (weighted):")
print(f"  Total households with children: {total_households_with_children:,.0f}")
print(f"  Households with 1 child: {households_with_1_child:,.0f}")
print(f"  Households with 2 children: {households_with_2_children:,.0f}")
print(f"  Households with 3+ children: {households_with_3plus_children:,.0f}")


Households with children (weighted):
  Total households with children: 1,457,610
  Households with 1 child: 734,446
  Households with 2 children: 481,892
  Households with 3+ children: 241,273


In [6]:
# Check children by age groups
df = pd.DataFrame({
    "household_id": sim.calculate("household_id", map_to="person"),
    "tax_unit_id": sim.calculate("tax_unit_id", map_to="person"),
    "person_id": sim.calculate("person_id", map_to="person"),
    "age": sim.calculate("age", map_to="person"),
    "person_weight": sim.calculate("person_weight", map_to="person")
})

# Filter for children and apply weights
children_under_18_df = df[df['age'] < 18]
children_under_6_df = df[df['age'] < 6]

# Calculate weighted totals
total_children = children_under_18_df['person_weight'].sum()
children_under_6 = children_under_6_df['person_weight'].sum()

print(f"\nChildren by age:")
print(f"  Total children under 18: {total_children:,.0f}")
print(f"  Children under 6: {children_under_6:,.0f}")


Children by age:
  Total children under 18: 2,494,202
  Children under 6: 780,623


In [7]:
# Create weighted summary table
weighted_summary_data = {
    'Metric': [
        'Household count (weighted)',
        'Person count (weighted)',
        'Median AGI',
        '75th percentile AGI',
        '90th percentile AGI',
        '95th percentile AGI',
        'Max AGI',
        'Total households with children',
        'Households with 1 child',
        'Households with 2 children',
        'Households with 3+ children',
        'Total children under 18',
        'Children under 6'
    ],
    'Value': [
        f"{household_count.sum():,.0f}",
        f"{person_count.sum():,.0f}",
        f"${agi.median():,.0f}",
        f"${agi.quantile(0.75):,.0f}",
        f"${agi.quantile(0.90):,.0f}",
        f"${agi.quantile(0.95):,.0f}",
        f"${agi.max():,.0f}",
        f"{total_households_with_children:,.0f}",
        f"{households_with_1_child:,.0f}",
        f"{households_with_2_children:,.0f}",
        f"{households_with_3plus_children:,.0f}",
        f"{total_children:,.0f}",
        f"{children_under_6:,.0f}"
    ]
}

weighted_df = pd.DataFrame(weighted_summary_data)

print("\n" + "="*60)
print("PA DATASET SUMMARY - WEIGHTED (Population Estimates)")
print("="*60)
print(weighted_df.to_string(index=False))
print("="*60)

# Save table
weighted_df.to_csv('pa_dataset_summary_weighted.csv', index=False)
print("\nSummary saved to: pa_dataset_summary_weighted.csv")


PA DATASET SUMMARY - WEIGHTED (Population Estimates)
                        Metric      Value
    Household count (weighted)  4,435,467
       Person count (weighted) 12,863,313
                    Median AGI    $71,734
           75th percentile AGI   $149,456
           90th percentile AGI   $268,015
           95th percentile AGI   $379,910
                       Max AGI $1,838,621
Total households with children  1,457,610
       Households with 1 child    734,446
    Households with 2 children    481,892
   Households with 3+ children    241,273
       Total children under 18  2,494,202
              Children under 6    780,623

Summary saved to: pa_dataset_summary_weighted.csv


In [None]:
# Households with $0 income (uses agi and household_weight from earlier cells)
zero_income_mask = agi == 0
zero_income_weighted = household_weight[zero_income_mask].sum()
zero_income_unweighted = zero_income_mask.sum()

print("\n" + "="*70)
print("HOUSEHOLDS WITH $0 INCOME")
print("="*70)
print(f"Weighted count:   {zero_income_weighted:,.0f}")
print(f"Unweighted count: {zero_income_unweighted:,}")
print(f"\nPercentage of all households with $0 income:")
print(f"  {zero_income_weighted / household_weight.sum() * 100:.2f}%")
print("="*70)

In [13]:
# Household counts by income brackets (uses agi and household_weight from earlier cells)
# Define income brackets from $0-$10k up to $50k-$60k
income_brackets = [
    (0, 10000, "$0-$10k"),
    (10000, 20000, "$10k-$20k"),
    (20000, 30000, "$20k-$30k"),
    (30000, 40000, "$30k-$40k"),
    (40000, 50000, "$40k-$50k"),
    (50000, 60000, "$50k-$60k")
]

# Get total households for percentage calculation
total_households_weighted = household_weight.sum()

# Calculate weighted household counts for each bracket
bracket_data = []
for lower, upper, label in income_brackets:
    mask = (agi >= lower) & (agi < upper)
    weighted_count = household_weight[mask].sum()
    unweighted_count = mask.sum()
    pct_of_total = (weighted_count / total_households_weighted) * 100
    
    bracket_data.append({
        "Income Bracket": label,
        "Households (Weighted)": f"{weighted_count:,.0f}",
        "% of All Households": f"{pct_of_total:.2f}%",
        "Households (Unweighted)": f"{unweighted_count:,}"
    })

income_df = pd.DataFrame(bracket_data)

print("\n" + "="*90)
print("HOUSEHOLD COUNTS BY INCOME BRACKET")
print("="*90)
print(income_df.to_string(index=False))
print("="*90)

# Also calculate total across all brackets
total_weighted = sum([household_weight[(agi >= lower) & (agi < upper)].sum() for lower, upper, _ in income_brackets])
total_unweighted = sum([((agi >= lower) & (agi < upper)).sum() for lower, upper, _ in income_brackets])
print(f"\nTotal households in $0-$60k range:")
print(f"  Weighted: {total_weighted:,.0f}")
print(f"  Unweighted: {total_unweighted:,}")
print(f"\nPercentage of all households in $0-$60k range:")
print(f"  {total_weighted / total_households_weighted * 100:.2f}%")


HOUSEHOLD COUNTS BY INCOME BRACKET
Income Bracket Households (Weighted) % of All Households Households (Unweighted)
       $0-$10k           239,725,630              15.27%                   4,540
     $10k-$20k            37,046,016               2.36%                     765
     $20k-$30k            44,020,114               2.80%                     723
     $30k-$40k           108,601,465               6.92%                   1,264
     $40k-$50k            77,534,722               4.94%                   1,034
     $50k-$60k            66,831,837               4.26%                     937

Total households in $0-$60k range:
  Weighted: 573,759,784
  Unweighted: 9,263

Percentage of all households in $0-$60k range:
  36.56%
