# DC Dataset Exploration

This notebook explores the DC dataset to understand household counts and income distribution.

In [1]:
from policyengine_us import Microsimulation
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load DC dataset (new version with 7,000+ households)
sim = Microsimulation(dataset="hf://policyengine/test/DC_0930_v2.h5")

In [3]:
# Check dataset size
household_weight = sim.calculate("household_weight", period=2025)
print(f"Number of households in dataset: {len(household_weight):,}")
print(f"Weighted number of households: {household_weight.sum():,.0f}")

Number of households in dataset: 3,157
Weighted number of households: 641,845,170


In [4]:
# Check income distribution
agi = sim.calculate("adjusted_gross_income", period=2025)
print(f"Income distribution:")
print(f"  Median AGI: ${agi.median():,.0f}")
print(f"  75th percentile: ${agi.quantile(0.75):,.0f}")
print(f"  90th percentile: ${agi.quantile(0.90):,.0f}")
print(f"  95th percentile: ${agi.quantile(0.95):,.0f}")
print(f"  Max AGI: ${agi.max():,.0f}")
print(f"\nHouseholds by income threshold:")
print(f"  Households over $80k: {(agi > 80_000).sum():,}")
print(f"  Households over $120k: {(agi > 120_000).sum():,}")
print(f"  Households over $160k: {(agi > 160_000).sum():,}")
print(f"  Households over $240k: {(agi > 240_000).sum():,}")

Income distribution:
  Median AGI: $0
  75th percentile: $15,034
  90th percentile: $65,105
  95th percentile: $334,613
  Max AGI: $2,484,202

Households by income threshold:
  Households over $80k: 59,724.248763724085
  Households over $120k: 52,324.09627961823
  Households over $160k: 51,840.848403842145
  Households over $240k: 38,579.07857731297


In [5]:
# Check households with children
num_children = sim.calculate("tax_unit_children", period=2025)
print(f"\nHouseholds with children:")
print(f"  Total tax units with children: {(num_children > 0).sum():,}")
print(f"  Tax units with 1 child: {(num_children == 1).sum():,}")
print(f"  Tax units with 2 children: {(num_children == 2).sum():,}")
print(f"  Tax units with 3+ children: {(num_children >= 3).sum():,}")


Households with children:
  Total tax units with children: 139,370.18707847144
  Tax units with 1 child: 139,370.18707847144
  Tax units with 2 children: 0.0
  Tax units with 3+ children: 0.0


In [6]:
# Check children by age groups
age = sim.calculate("age", period=2025)
is_child = age < 18
print(f"\nChildren by age:")
print(f"  Total children under 18: {is_child.sum():,}")
print(f"  Children under 4: {(age < 4).sum():,}")
print(f"  Children under 6: {(age < 6).sum():,}")
print(f"  Children ages 6-17: {((age >= 6) & (age < 18)).sum():,}")


Children by age:
  Total children under 18: 139,370.18707847144
  Children under 4: 34,262.70863087539
  Children under 6: 46,870.04093802362
  Children ages 6-17: 1,968
