# DC Dataset Exploration

This notebook explores the DC dataset to understand household counts and income distribution.

In [1]:
from policyengine_us import Microsimulation
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load DC dataset
sim = Microsimulation(dataset='hf://policyengine/test/DC.h5')

In [3]:
# Check dataset size
household_weight = sim.calculate("household_weight", period=2025)
household_count = sim.calculate("household_count", period=2025, map_to="household")
person_count = sim.calculate("person_count", period=2025, map_to="household")

print(f"Number of households in dataset: {len(household_weight):,}")
print(f"Household count (mapped): {household_count.sum():,.0f}")
print(f"Person count (mapped): {person_count.sum():,.0f}")

Number of households in dataset: 5,015
Household count (mapped): 220,449
Person count (mapped): 668,071


In [4]:
# Check household income distribution (aggregate to household level using map_to)
agi = sim.calculate("adjusted_gross_income", period=2025, map_to="household")
print(f"Income distribution:")
print(f"  Median AGI: ${agi.median():,.0f}")
print(f"  75th percentile: ${agi.quantile(0.75):,.0f}")
print(f"  90th percentile: ${agi.quantile(0.90):,.0f}")
print(f"  95th percentile: ${agi.quantile(0.95):,.0f}")
print(f"  Max AGI: ${agi.max():,.0f}")
print(f"\nHouseholds by income threshold:")
print(f"  Households over $80k: {(agi > 80_000).sum():,}")
print(f"  Households over $120k: {(agi > 120_000).sum():,}")
print(f"  Households over $160k: {(agi > 160_000).sum():,}")
print(f"  Households over $240k: {(agi > 240_000).sum():,}")

Income distribution:
  Median AGI: $120,221
  75th percentile: $327,906
  90th percentile: $546,746
  95th percentile: $915,476
  Max AGI: $3,229,514

Households by income threshold:
  Households over $80k: 135,495.6000130782
  Households over $120k: 110,301.79662270736
  Households over $160k: 91,709.29688587465
  Households over $240k: 68,994.44427188052


In [5]:
# Check households with children (count at person level, aggregate to household)
is_child = sim.calculate("is_child", period=2025)
household_id = sim.calculate("household_id", period=2025)

# Count children per household
children_per_household = pd.Series(is_child).groupby(household_id).sum()

# Map back to household array
household_children = children_per_household.reindex(household_id.unique()).fillna(0).values

print(f"\nHouseholds with children:")
print(f"  Total households with children: {(household_children > 0).sum():,}")
print(f"  Households with 1 child: {(household_children == 1).sum():,}")
print(f"  Households with 2 children: {(household_children == 2).sum():,}")
print(f"  Households with 3+ children: {(household_children >= 3).sum():,}")


Households with children:
  Total households with children: 984
  Households with 1 child: 984
  Households with 2 children: 0
  Households with 3+ children: 0


In [6]:
# Check children by age groups (count at person level, aggregate to household)
age = sim.calculate("age", period=2025)
is_child_age = age < 18

# Calculate totals
total_children = is_child.sum()
children_under_4 = (age < 4).sum()
children_under_6 = (age < 6).sum()
children_6_17 = ((age >= 6) & (age < 18)).sum()

print(f"\nChildren by age:")
print(f"  Total children under 18: {total_children:,}")
print(f"  Children under 4: {children_under_4:,}")
print(f"  Children under 6: {children_under_6:,}")
print(f"  Children ages 6-17: {children_6_17:,}")


Children by age:
  Total children under 18: 136,125.36159773025
  Children under 4: 32,540.336470582824
  Children under 6: 48,094.121104372265
  Children ages 6-17: 1,834


In [7]:
# Create summary table with all findings
summary_data = {
    'Metric': [
        'Number of households in dataset',
        'Household count (mapped to household)',
        'Person count (mapped to household)',
        'Median AGI',
        '75th percentile AGI',
        '90th percentile AGI',
        '95th percentile AGI',
        'Max AGI',
        'Households over $80k',
        'Households over $120k',
        'Households over $160k',
        'Households over $240k',
        'Total households with children',
        'Households with 1 child',
        'Households with 2 children',
        'Households with 3+ children',
        'Total children under 18',
        'Children under 4',
        'Children under 6',
        'Children ages 6-17'
    ],
    'Value': [
        f"{len(household_weight):,}",
        f"{household_count.sum():,.0f}",
        f"{person_count.sum():,.0f}",
        f"${agi.median():,.0f}",
        f"${agi.quantile(0.75):,.0f}",
        f"${agi.quantile(0.90):,.0f}",
        f"${agi.quantile(0.95):,.0f}",
        f"${agi.max():,.0f}",
        f"{(agi > 80_000).sum():,.0f}",
        f"{(agi > 120_000).sum():,.0f}",
        f"{(agi > 160_000).sum():,.0f}",
        f"{(agi > 240_000).sum():,.0f}",
        f"{(household_children > 0).sum():,.0f}",
        f"{(household_children == 1).sum():,.0f}",
        f"{(household_children == 2).sum():,.0f}",
        f"{(household_children >= 3).sum():,.0f}",
        f"{total_children:,.0f}",
        f"{children_under_4:,.0f}",
        f"{children_under_6:,.0f}",
        f"{children_6_17:,.0f}"
    ]
}

summary_df = pd.DataFrame(summary_data)
print("\n" + "="*60)
print("DC DATASET SUMMARY (Household Level)")
print("="*60)
print(summary_df.to_string(index=False))
print("="*60)

# Also save as CSV for easy sharing
summary_df.to_csv('dc_dataset_summary.csv', index=False)
print("\nSummary saved to: dc_dataset_summary.csv")


DC DATASET SUMMARY (Household Level)
                               Metric      Value
      Number of households in dataset      5,015
Household count (mapped to household)    220,449
   Person count (mapped to household)    668,071
                           Median AGI   $120,221
                  75th percentile AGI   $327,906
                  90th percentile AGI   $546,746
                  95th percentile AGI   $915,476
                              Max AGI $3,229,514
                 Households over $80k    135,496
                Households over $120k    110,302
                Households over $160k     91,709
                Households over $240k     68,994
       Total households with children        984
              Households with 1 child        984
           Households with 2 children          0
          Households with 3+ children          0
              Total children under 18    136,125
                     Children under 4     32,540
                     Children u