# PA Dataset Exploration

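This notebook explores the Pennsylvania (PA) state dataset for the 2025 period, reporting weighted household and person counts, the distribution of household adjusted gross income (AGI), and the number of households with children and of children by age group.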

In [1]:
from policyengine_us import Microsimulation
import pandas as pd
import numpy as np

# URI of the Pennsylvania state dataset file handed to Microsimulation.
# NOTE(review): the hf:// scheme presumably resolves via the Hugging Face Hub — confirm.
PA_DATASET = "hf://policyengine/policyengine-us-data/states/PA.h5"

In [2]:
# Load PA dataset
# Builds the microsimulation from PA_DATASET; every later cell queries this `sim` object.
sim = Microsimulation(dataset=PA_DATASET)

PA.h5:   0%|          | 0.00/149M [00:00<?, ?B/s]

In [3]:
# Dataset size for 2025: number of household records versus the weighted totals.
household_weight = sim.calculate("household_weight", period=2025)
household_count, person_count = (
    sim.calculate(variable, period=2025, map_to="household")
    for variable in ("household_count", "person_count")
)

# len() counts the household records in the file; the .sum() values are the
# figures this notebook reports as weighted totals.
size_lines = [
    f"Number of households in dataset: {len(household_weight):,}",
    f"Household count (weighted): {household_count.sum():,.0f}",
    f"Person count (weighted): {person_count.sum():,.0f}",
]
print("\n".join(size_lines))

Number of households in dataset: 68,351
Household count (weighted): 4,662,650
Person count (weighted): 13,217,679


In [4]:
# Household-level adjusted gross income (AGI) for 2025 and its key quantiles.
agi = sim.calculate("adjusted_gross_income", period=2025, map_to="household")

# Label / quantile pairs keep the printed text and the requested quantile side by side.
agi_percentiles = (
    ("75th", 0.75),
    ("90th", 0.90),
    ("95th", 0.95),
)

print("Income distribution:")
print(f"  Median AGI: ${agi.median():,.0f}")
for percentile_label, quantile in agi_percentiles:
    print(f"  {percentile_label} percentile: ${agi.quantile(quantile):,.0f}")
print(f"  Max AGI: ${agi.max():,.0f}")

Income distribution:
  Median AGI: $73,962
  75th percentile: $169,351
  90th percentile: $404,412
  95th percentile: $511,573
  Max AGI: $3,229,514


In [5]:
# Weighted number of households by how many children they contain (2025).
# NOTE: `household_weight` is rebound here to the person-mapped series
# (map_to="person"), replacing the one computed without map_to earlier on.
is_child, household_id, household_weight = (
    sim.calculate(variable, period=2025, map_to="person")
    for variable in ("is_child", "household_id", "household_weight")
)

# One row per person, carrying the household id and household weight.
df_households = pd.DataFrame({
    'household_id': household_id,
    'is_child': is_child,
    'household_weight': household_weight,
})

# Collapse to one row per household: the number of children in it, plus the
# household weight taken from the first person row of that household.
children_per_household = (
    df_households
    .groupby('household_id')
    .agg({'is_child': 'sum', 'household_weight': 'first'})
    .reset_index()
)

# Summing household weights over a mask gives the weighted household count.
children_in_household = children_per_household['is_child']
weight_of_household = children_per_household['household_weight']

total_households_with_children = weight_of_household[children_in_household > 0].sum()
households_with_1_child = weight_of_household[children_in_household == 1].sum()
households_with_2_children = weight_of_household[children_in_household == 2].sum()
households_with_3plus_children = weight_of_household[children_in_household >= 3].sum()

print("\nHouseholds with children (weighted):")
print(f"  Total households with children: {total_households_with_children:,.0f}")
print(f"  Households with 1 child: {households_with_1_child:,.0f}")
print(f"  Households with 2 children: {households_with_2_children:,.0f}")
print(f"  Households with 3+ children: {households_with_3plus_children:,.0f}")


Households with children (weighted):
  Total households with children: 1,489,087
  Households with 1 child: 720,370
  Households with 2 children: 504,238
  Households with 3+ children: 264,479


In [6]:
# Check children by age groups
# Every variable is requested with period=2025, matching the other cells in this
# notebook. Without an explicit period the simulation presumably falls back to
# its own default period (confirm), so ages and weights could refer to a
# different year than the household figures they are reported next to.
df = pd.DataFrame({
    "household_id": sim.calculate("household_id", period=2025, map_to="person"),
    "tax_unit_id": sim.calculate("tax_unit_id", period=2025, map_to="person"),
    "person_id": sim.calculate("person_id", period=2025, map_to="person"),
    "age": sim.calculate("age", period=2025, map_to="person"),
    "person_weight": sim.calculate("person_weight", period=2025, map_to="person")
})

# Person rows below each age cut-off (upper bounds are exclusive).
children_under_18_df = df[df['age'] < 18]
children_under_6_df = df[df['age'] < 6]

# Summing person weights over the filtered rows gives the weighted head count.
total_children = children_under_18_df['person_weight'].sum()
children_under_6 = children_under_6_df['person_weight'].sum()

print(f"\nChildren by age:")
print(f"  Total children under 18: {total_children:,.0f}")
print(f"  Children under 6: {children_under_6:,.0f}")


Children by age:
  Total children under 18: 2,597,022
  Children under 6: 799,168


In [7]:
# Weighted summary table: gathers the figures computed in the cells above,
# prints them as a fixed-width table, and writes them to a CSV file.
# Each row pairs a metric label with its already-formatted value so the two
# cannot drift out of step.
summary_rows = [
    ('Household count (weighted)', f"{household_count.sum():,.0f}"),
    ('Person count (weighted)', f"{person_count.sum():,.0f}"),
    ('Median AGI', f"${agi.median():,.0f}"),
    ('75th percentile AGI', f"${agi.quantile(0.75):,.0f}"),
    ('90th percentile AGI', f"${agi.quantile(0.90):,.0f}"),
    ('95th percentile AGI', f"${agi.quantile(0.95):,.0f}"),
    ('Max AGI', f"${agi.max():,.0f}"),
    ('Total households with children', f"{total_households_with_children:,.0f}"),
    ('Households with 1 child', f"{households_with_1_child:,.0f}"),
    ('Households with 2 children', f"{households_with_2_children:,.0f}"),
    ('Households with 3+ children', f"{households_with_3plus_children:,.0f}"),
    ('Total children under 18', f"{total_children:,.0f}"),
    ('Children under 6', f"{children_under_6:,.0f}"),
]

# Column-oriented dict with the same 'Metric' / 'Value' layout the DataFrame is built from.
weighted_summary_data = {
    'Metric': [metric for metric, _ in summary_rows],
    'Value': [value for _, value in summary_rows],
}

weighted_df = pd.DataFrame(weighted_summary_data)

summary_rule = "=" * 60
print("\n" + summary_rule)
print("PA DATASET SUMMARY - WEIGHTED (Population Estimates)")
print(summary_rule)
print(weighted_df.to_string(index=False))
print(summary_rule)

# Save table (written relative to the current working directory)
summary_csv_path = 'pa_dataset_summary_weighted.csv'
weighted_df.to_csv(summary_csv_path, index=False)
print("\nSummary saved to: pa_dataset_summary_weighted.csv")


PA DATASET SUMMARY - WEIGHTED (Population Estimates)
                        Metric      Value
    Household count (weighted)  4,662,650
       Person count (weighted) 13,217,679
                    Median AGI    $73,962
           75th percentile AGI   $169,351
           90th percentile AGI   $404,412
           95th percentile AGI   $511,573
                       Max AGI $3,229,514
Total households with children  1,489,087
       Households with 1 child    720,370
    Households with 2 children    504,238
   Households with 3+ children    264,479
       Total children under 18  2,597,022
              Children under 6    799,168

Summary saved to: pa_dataset_summary_weighted.csv


In [14]:
# Weighted count and share of households whose 2025 AGI is exactly zero.
# Both series are converted to plain NumPy arrays so the boolean mask and the
# sums below are ordinary element-wise array operations.
agi_hh = np.array(
    sim.calculate("adjusted_gross_income", period=2025, map_to="household")
)
weights = np.array(sim.calculate("household_weight", period=2025))

# Exact equality: only households with AGI == 0 are selected (negative AGI is not).
zero_income_mask = agi_hh == 0
total_households = weights.sum()
zero_income_count = weights[zero_income_mask].sum()
zero_income_pct = zero_income_count / total_households * 100

zero_income_rule = "=" * 70
print("\n" + zero_income_rule)
print("HOUSEHOLDS WITH $0 INCOME")
print(zero_income_rule)
print(f"Household count: {zero_income_count:,.0f}")
print(f"Percentage of all households: {zero_income_pct:.2f}%")
print(zero_income_rule)


HOUSEHOLDS WITH $0 INCOME
Household count: 368,283
Percentage of all households: 7.90%


In [15]:
# Weighted household counts per $10k AGI bracket from $0 up to $60k.
# Each bracket is (inclusive lower bound, exclusive upper bound, display label).
# Any household with negative AGI falls outside every bracket listed here.
income_brackets = [
    (0, 10_000, "$0-$10k"),
    (10_000, 20_000, "$10k-$20k"),
    (20_000, 30_000, "$20k-$30k"),
    (30_000, 40_000, "$30k-$40k"),
    (40_000, 50_000, "$40k-$50k"),
    (50_000, 60_000, "$50k-$60k"),
]


def households_in_bracket(lower, upper):
    """Return the summed household weight where lower <= AGI < upper."""
    in_bracket = (agi_hh >= lower) & (agi_hh < upper)
    return weights[in_bracket].sum()


# One weighted count per bracket, in bracket order; reused for the table and the total.
bracket_counts = [
    households_in_bracket(lower, upper) for lower, upper, _ in income_brackets
]

bracket_data = [
    {
        "Income Bracket": label,
        "Households": f"{count:,.0f}",
        "% of All Households": f"{(count / total_households) * 100:.2f}%",
    }
    for (_, _, label), count in zip(income_brackets, bracket_counts)
]

income_df = pd.DataFrame(bracket_data)

bracket_rule = "=" * 70
print("\n" + bracket_rule)
print("HOUSEHOLD COUNTS BY INCOME BRACKET")
print(bracket_rule)
print(income_df.to_string(index=False))
print(bracket_rule)

# Total in $0-$60k range
total_in_range = sum(bracket_counts)
print(f"\nTotal households in $0-$60k range: {total_in_range:,.0f}")
print(f"Percentage of all households in $0-$60k range: {total_in_range / total_households * 100:.2f}%")


HOUSEHOLD COUNTS BY INCOME BRACKET
Income Bracket Households % of All Households
       $0-$10k    786,029              16.86%
     $10k-$20k    177,932               3.82%
     $20k-$30k    151,871               3.26%
     $30k-$40k    394,030               8.45%
     $40k-$50k    240,967               5.17%
     $50k-$60k    200,283               4.30%

Total households in $0-$60k range: 1,951,112
Percentage of all households in $0-$60k range: 41.85%
