# DC Dataset Exploration

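This notebook explores the DC dataset to understand household and person counts, the household income (AGI) distribution, and the number of households with children and of children by age group. It reports both weighted and unweighted figures and saves the summary tables as CSV files.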

In [1]:
from policyengine_us import Microsimulation
import pandas as pd
import numpy as np

In [2]:
# Load the DC dataset; the URI is kept in a named constant so the data source
# is easy to spot and change.
DC_DATASET_URI = "hf://policyengine/policyengine-us-data/states/DC.h5"
sim = Microsimulation(dataset=DC_DATASET_URI)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


DC.h5:   0%|          | 0.00/11.6M [00:00<?, ?B/s]

In [3]:
# Dataset size: number of household records versus the summed household and
# person counts (both mapped to household level) for 2025.
household_weight = sim.calculate("household_weight", period=2025)
household_count = sim.calculate(
    "household_count",
    period=2025,
    map_to="household",
)
person_count = sim.calculate(
    "person_count",
    period=2025,
    map_to="household",
)

# len() gives the number of rows in the household-level result; .sum() totals
# the mapped counts (presumably weighted by the result type -- TODO confirm).
print(
    f"Number of households in dataset: {len(household_weight):,}",
    f"Household count (mapped): {household_count.sum():,.0f}",
    f"Person count (mapped): {person_count.sum():,.0f}",
    sep="\n",
)

Number of households in dataset: 5,185
Household count (mapped): 242,183
Person count (mapped): 692,743


In [4]:
# Household income distribution: adjusted gross income for 2025, aggregated to
# household level using map_to.
agi = sim.calculate("adjusted_gross_income", period=2025, map_to="household")

print("Income distribution:")
print(f"  Median AGI: ${agi.median():,.0f}")
for pct in (75, 90, 95):
    print(f"  {pct}th percentile: ${agi.quantile(pct / 100):,.0f}")
print(f"  Max AGI: ${agi.max():,.0f}")

print("\nHouseholds by income threshold:")
# The threshold sums are fractional (the recorded run printed values such as
# 136,597.29157550895), so round them with .0f exactly as the summary table
# at the end of the notebook does.
for threshold in (80_000, 120_000, 160_000, 240_000):
    households_over = (agi > threshold).sum()
    print(f"  Households over ${threshold // 1_000}k: {households_over:,.0f}")

Income distribution:
  Median AGI: $103,817
  75th percentile: $355,850
  90th percentile: $679,833
  95th percentile: $1,013,736
  Max AGI: $3,035,056

Households by income threshold:
  Households over $80k: 136,597.29157550895
  Households over $120k: 113,646.88146775239
  Households over $160k: 95,798.15931612276
  Households over $240k: 76,501.23587184609


In [5]:
# Households with children: flag children at person level, count them per
# household, then total the household weights in each child-count bucket.
is_child = sim.calculate("is_child", period=2025, map_to="person")
household_id = sim.calculate("household_id", period=2025, map_to="person")
# Deliberately NOT named `household_weight`: that name already holds the
# household-level series from the dataset-size cell, and rebinding it to this
# person-level series would silently change what later or re-run cells see.
household_weight_by_person = sim.calculate(
    "household_weight", period=2025, map_to="person"
)

# Create DataFrame for easier manipulation (one row per person)
df_households = pd.DataFrame({
    'household_id': household_id,
    'is_child': is_child,
    'household_weight': household_weight_by_person,
})

# Collapse to one row per household: number of children and household weight
children_per_household = (
    df_households
    .groupby('household_id')
    .agg({
        'is_child': 'sum',
        'household_weight': 'first',  # household_weight is same for all members
    })
    .reset_index()
)

# Calculate weighted household counts: sum the weights of the households
# falling in each child-count bucket.
n_children = children_per_household['is_child']
hh_weights = children_per_household['household_weight']

total_households_with_children = hh_weights[n_children > 0].sum()
households_with_1_child = hh_weights[n_children == 1].sum()
households_with_2_children = hh_weights[n_children == 2].sum()
households_with_3plus_children = hh_weights[n_children >= 3].sum()

print("\nHouseholds with children (weighted):")
print(f"  Total households with children: {total_households_with_children:,.0f}")
print(f"  Households with 1 child: {households_with_1_child:,.0f}")
print(f"  Households with 2 children: {households_with_2_children:,.0f}")
print(f"  Households with 3+ children: {households_with_3plus_children:,.0f}")


Households with children (weighted):
  Total households with children: 72,062
  Households with 1 child: 33,300
  Households with 2 children: 25,153
  Households with 3+ children: 13,608


In [6]:
# Check children by age groups using Ben's workaround: pull person-level
# columns into a plain DataFrame and apply person_weight by hand.
#
# Every variable is requested for 2025 explicitly, like the rest of the
# notebook, rather than relying on the simulation's default period; otherwise
# the age buckets and the is_child total below may refer to different years.
# (pandas is already imported in the first cell, so it is not re-imported here.)
person_columns = ["household_id", "tax_unit_id", "person_id", "age", "person_weight"]
df = pd.DataFrame({
    column: sim.calculate(column, period=2025, map_to="person")
    for column in person_columns
})

# Filter for children by age band (unweighted person rows)
age = df['age']
children_under_4_df = df[age < 4]
children_under_6_df = df[age < 6]
children_under_18_df = df[age < 18]
children_6_17_df = df[(age >= 6) & (age < 18)]

# Calculate weighted totals. The overall total comes from the is_child flag
# (labelled "under 18" below -- assumes is_child means age < 18, TODO confirm);
# the age bands are totalled from person_weight.
is_child = sim.calculate("is_child", period=2025)
total_children = is_child.sum()
children_under_4 = children_under_4_df['person_weight'].sum()
children_under_6 = children_under_6_df['person_weight'].sum()
children_6_17 = children_6_17_df['person_weight'].sum()

print("\nChildren by age:")
print(f"  Total children under 18: {total_children:,.0f}")
print(f"  Children under 4: {children_under_4:,.0f}")
print(f"  Children under 6: {children_under_6:,.0f}")
print(f"  Children ages 6-17: {children_6_17:,.0f}")

print("\nSample of children under 4:")
print(children_under_4_df[['household_id', 'tax_unit_id', 'person_id', 'age']].head(10))


Children by age:
  Total children under 18: 131,759
  Children under 4: 32,332
  Children under 6: 47,446
  Children ages 6-17: 81,687

Sample of children under 4:
     household_id  tax_unit_id  person_id  age
46         200012           26    5200046  3.0
120        200044           61    5200120  3.0
131        200090           67    5200131  2.0
132        200090           67    5200132  1.0
155        200058           80    5200155  2.0
156        200058           80    5200156  1.0
174        200066           89    5200174  2.0
212        200077          102    5200212  2.0
248        200089          120    5200248  3.0
253        200092          122    5200253  3.0


In [7]:
# Weighted summary table, written as (metric label, formatted value) pairs so
# each label sits next to the expression that produces its value.
weighted_rows = [
    ('Household count (weighted)', f"{household_count.sum():,.0f}"),
    ('Person count (weighted)', f"{person_count.sum():,.0f}"),
    ('Median AGI', f"${agi.median():,.0f}"),
    ('75th percentile AGI', f"${agi.quantile(0.75):,.0f}"),
    ('90th percentile AGI', f"${agi.quantile(0.90):,.0f}"),
    ('95th percentile AGI', f"${agi.quantile(0.95):,.0f}"),
    ('Max AGI', f"${agi.max():,.0f}"),
    ('Households over $80k', f"{(agi > 80_000).sum():,.0f}"),
    ('Households over $120k', f"{(agi > 120_000).sum():,.0f}"),
    ('Households over $160k', f"{(agi > 160_000).sum():,.0f}"),
    ('Households over $240k', f"{(agi > 240_000).sum():,.0f}"),
    ('Total households with children', f"{total_households_with_children:,.0f}"),
    ('Households with 1 child', f"{households_with_1_child:,.0f}"),
    ('Households with 2 children', f"{households_with_2_children:,.0f}"),
    ('Households with 3+ children', f"{households_with_3plus_children:,.0f}"),
    ('Total children under 18', f"{total_children:,.0f}"),
    ('Children under 4', f"{children_under_4:,.0f}"),
    ('Children under 6', f"{children_under_6:,.0f}"),
    ('Children ages 6-17', f"{children_6_17:,.0f}"),
]

# Split the pairs back into the two-column layout the DataFrame is built from
weighted_summary_data = {
    'Metric': [label for label, value in weighted_rows],
    'Value': [value for label, value in weighted_rows],
}

# Raw sample sizes for the unweighted table: distinct household ids and the
# number of person rows in the person-level frame.
unique_households = df['household_id'].nunique()
unique_persons = len(df)

# Unweighted summary table as (metric label, formatted value) pairs; these are
# plain row counts, with no weights applied.
child_counts = children_per_household['is_child']
unweighted_rows = [
    ('Number of households in dataset', f"{unique_households:,}"),
    ('Number of persons in dataset', f"{unique_persons:,}"),
    ('Households with children (unweighted)', f"{(child_counts > 0).sum():,}"),
    ('Households with 1 child (unweighted)', f"{(child_counts == 1).sum():,}"),
    ('Households with 2 children (unweighted)', f"{(child_counts == 2).sum():,}"),
    ('Households with 3+ children (unweighted)', f"{(child_counts >= 3).sum():,}"),
    ('Children under 18 (unweighted)', f"{len(children_under_18_df):,}"),
    ('Children under 4 (unweighted)', f"{len(children_under_4_df):,}"),
    ('Children under 6 (unweighted)', f"{len(children_under_6_df):,}"),
    ('Children ages 6-17 (unweighted)', f"{len(children_6_17_df):,}"),
]

# Split the pairs back into the two-column layout the DataFrame is built from
unweighted_summary_data = {
    'Metric': [label for label, value in unweighted_rows],
    'Value': [value for label, value in unweighted_rows],
}

# Turn both summaries into DataFrames, print them as framed text tables, and
# save each one to a CSV file in the current working directory.
weighted_df = pd.DataFrame(weighted_summary_data)
unweighted_df = pd.DataFrame(unweighted_summary_data)

rule = "=" * 60  # horizontal rule framing each printed table
for heading, table in (
    ("DC DATASET SUMMARY - WEIGHTED (Population Estimates)", weighted_df),
    ("DC DATASET SUMMARY - UNWEIGHTED (Sample Counts)", unweighted_df),
):
    print("\n" + rule)
    print(heading)
    print(rule)
    print(table.to_string(index=False))
    print(rule)

# Save both tables
weighted_df.to_csv('dc_dataset_summary_weighted.csv', index=False)
unweighted_df.to_csv('dc_dataset_summary_unweighted.csv', index=False)
print("\nSummaries saved to:")
print("  - dc_dataset_summary_weighted.csv")
print("  - dc_dataset_summary_unweighted.csv")


DC DATASET SUMMARY - WEIGHTED (Population Estimates)
                        Metric      Value
    Household count (weighted)    242,183
       Person count (weighted)    692,743
                    Median AGI   $103,817
           75th percentile AGI   $355,850
           90th percentile AGI   $679,833
           95th percentile AGI $1,013,736
                       Max AGI $3,035,056
          Households over $80k    136,597
         Households over $120k    113,647
         Households over $160k     95,798
         Households over $240k     76,501
Total households with children     72,062
       Households with 1 child     33,300
    Households with 2 children     25,153
   Households with 3+ children     13,608
       Total children under 18    131,759
              Children under 4     32,332
              Children under 6     47,446
            Children ages 6-17     81,687

DC DATASET SUMMARY - UNWEIGHTED (Sample Counts)
                                  Metric  Value
         