# CPS 2024 Dataset Exploration

This notebook explores the CPS 2024 dataset to understand household counts, the household income (AGI) distribution, and the number and ages of children per household and tax unit, for comparison with the DC dataset.

In [7]:
from policyengine_us import Microsimulation
import pandas as pd
import numpy as np

In [9]:
# Location of the CPS 2024 microdata file on the Hugging Face hub.
# NOTE(review): the output recorded beneath this cell is a 404
# (EntryNotFoundError) for exactly this path, so no later cell that uses `sim`
# can run until the file name is confirmed against the repository listing.
DATASET_URI = 'hf://policyengine/policyengine-us-data/cps_2024.h5'

# Build the microsimulation on top of the new data source
sim = Microsimulation(dataset=DATASET_URI)

EntryNotFoundError: 404 Client Error. (Request ID: Root=1-68dd68f4-1f8ede284014812a17566523;48a4a1a7-d197-4e14-b249-0f8ffde66e50)

Entry Not Found for url: https://huggingface.co/policyengine/policyengine-us-data/resolve/main/cps_2024.h5.

In [None]:
# Dataset size: record count plus household/person totals for 2025.
# household_weight is requested without map_to; the two count variables are
# explicitly mapped to the household entity.
household_weight = sim.calculate("household_weight", period=2025)
household_count, person_count = (
    sim.calculate(variable, period=2025, map_to="household")
    for variable in ("household_count", "person_count")
)

# One print call, one line of output per metric
print(
    f"Number of households in dataset: {len(household_weight):,}",
    f"Household count (mapped): {household_count.sum():,.0f}",
    f"Person count (mapped): {person_count.sum():,.0f}",
    sep="\n",
)

In [None]:
# Check household income distribution (aggregate to household level using map_to)
agi = sim.calculate("adjusted_gross_income", period=2025, map_to="household")

# Central tendency and upper tail of household AGI
print("Income distribution:")
print(f"  Median AGI: ${agi.median():,.0f}")
for pct in (75, 90, 95):
    print(f"  {pct}th percentile: ${agi.quantile(pct / 100):,.0f}")
print(f"  Max AGI: ${agi.max():,.0f}")

# Number of households above each income threshold.
# The counts are formatted with ",.0f" (as the summary table does) rather than
# a bare ",": the sum may be a non-integer total — presumably a weighted
# microdf series, TODO confirm — which would otherwise print with a long
# decimal tail.
print("\nHouseholds by income threshold:")
for threshold in (80_000, 120_000, 160_000, 240_000):
    households_over = (agi > threshold).sum()
    print(f"  Households over ${threshold // 1000}k: {households_over:,.0f}")

In [None]:
# Check households with children (count at person level, aggregate to household)
is_child = sim.calculate("is_child", period=2025)  # person-level flag
household_id = sim.calculate("household_id", period=2025)

# Count children per household.
# The person -> household aggregation is delegated to the simulation via
# map_to="household", the same mechanism the household/person counts and AGI
# use elsewhere in this notebook. Grouping the person-level flag by
# `household_id` is not reliable here: that variable is requested without
# map_to, and (NOTE(review): presumably it is a household-entity variable —
# confirm) it then holds one entry per household rather than one per person,
# so pandas would align the two series by position and pair people with
# unrelated households without raising an error.
children_per_household = pd.Series(
    sim.calculate("is_child", period=2025, map_to="household").values,
    index=household_id.values,
)

# Plain array with one entry per household record, so the counts below are
# record counts (not weighted totals), as the original .values array was.
household_children = children_per_household.values

print(f"\nHouseholds with children:")
print(f"  Total households with children: {(household_children > 0).sum():,}")
print(f"  Households with 1 child: {(household_children == 1).sum():,}")
print(f"  Households with 2 children: {(household_children == 2).sum():,}")
print(f"  Households with 3+ children: {(household_children >= 3).sum():,}")

In [None]:
# Check children by age groups (count at person level)
age = sim.calculate("age", period=2025)
is_child_age = age < 18

# Calculate totals.
# The under-18 total is taken from the age mask built in this cell, so it
# matches its printed label and the age bands below, and the cell no longer
# depends on an `is_child` variable left over from another cell.
total_children = is_child_age.sum()
children_under_4 = (age < 4).sum()
children_under_6 = (age < 6).sum()
children_6_17 = ((age >= 6) & is_child_age).sum()

# ",.0f" matches the summary table and keeps non-integer totals readable
# (the sums may be weighted — TODO confirm).
print(f"\nChildren by age:")
print(f"  Total children under 18: {total_children:,.0f}")
print(f"  Children under 4: {children_under_4:,.0f}")
print(f"  Children under 6: {children_under_6:,.0f}")
print(f"  Children ages 6-17: {children_6_17:,.0f}")

In [None]:
# Distribution of children under 4 per tax unit
# period=2025 is passed explicitly so ages are evaluated for the same year as
# every other cell in this notebook, not for the simulation's default period.
df = sim.calculate_dataframe(
    ['household_id', 'tax_unit_id', 'person_id', 'age'],
    period=2025,
)

# Filter for children under 4
children_under_4_df = df[df['age'] < 4]

# Count children per tax unit (only tax units with at least one such child appear)
children_per_tax_unit = children_under_4_df.groupby('tax_unit_id').size()

# Get all unique tax units
all_tax_units = df['tax_unit_id'].unique()

# Expand to every tax unit, filling those without a child under 4 with 0.
# reindex(fill_value=0) builds the result in one step; unlike filling a zero
# Series with .update(), nothing is mutated in place, so the cell can be
# re-run safely and the counts stay integers.
distribution = pd.Series(children_per_tax_unit).reindex(all_tax_units, fill_value=0)

# Get value counts of the distribution
distribution_summary = distribution.value_counts().sort_index()

# Print results
print("Distribution of children under 4 per tax unit:")
print(distribution_summary)
print(f"\nTotal tax units: {len(all_tax_units):,}")
print(f"Tax units with at least one child under 4: {(distribution > 0).sum():,}")

In [None]:
# Collect every metric computed in the earlier cells into one table.
# Each entry pairs a label with its already-formatted value, so a label and
# its value cannot drift apart the way two parallel lists can.
summary_rows = [
    ('Number of households in dataset', f"{len(household_weight):,}"),
    ('Household count (mapped to household)', f"{household_count.sum():,.0f}"),
    ('Person count (mapped to household)', f"{person_count.sum():,.0f}"),
    ('Median AGI', f"${agi.median():,.0f}"),
    ('75th percentile AGI', f"${agi.quantile(0.75):,.0f}"),
    ('90th percentile AGI', f"${agi.quantile(0.90):,.0f}"),
    ('95th percentile AGI', f"${agi.quantile(0.95):,.0f}"),
    ('Max AGI', f"${agi.max():,.0f}"),
    ('Households over $80k', f"{(agi > 80_000).sum():,.0f}"),
    ('Households over $120k', f"{(agi > 120_000).sum():,.0f}"),
    ('Households over $160k', f"{(agi > 160_000).sum():,.0f}"),
    ('Households over $240k', f"{(agi > 240_000).sum():,.0f}"),
    ('Total households with children', f"{(household_children > 0).sum():,.0f}"),
    ('Households with 1 child', f"{(household_children == 1).sum():,.0f}"),
    ('Households with 2 children', f"{(household_children == 2).sum():,.0f}"),
    ('Households with 3+ children', f"{(household_children >= 3).sum():,.0f}"),
    ('Total children under 18', f"{total_children:,.0f}"),
    ('Children under 4', f"{children_under_4:,.0f}"),
    ('Children under 6', f"{children_under_6:,.0f}"),
    ('Children ages 6-17', f"{children_6_17:,.0f}"),
    ('Total tax units', f"{len(all_tax_units):,}"),
    ('Tax units with at least one child under 4', f"{(distribution > 0).sum():,}"),
]

# Same two-column layout as before: one list of labels, one list of values
summary_data = {
    'Metric': [label for label, _ in summary_rows],
    'Value': [value for _, value in summary_rows],
}
summary_df = pd.DataFrame(summary_data)

# Plain-text report framed by horizontal rules
rule = "=" * 60
print("\n" + rule)
print("CPS 2024 DATASET SUMMARY (Household Level)")
print(rule)
print(summary_df.to_string(index=False))
print(rule)

# Also save as CSV for easy sharing
summary_csv_path = 'cps_2024_dataset_summary.csv'
summary_df.to_csv(summary_csv_path, index=False)
print("\nSummary saved to: cps_2024_dataset_summary.csv")