# MN Dataset Exploration

This notebook explores the Minnesota (MN) dataset to understand household counts, income distribution, and demographic characteristics.

In [1]:
from policyengine_us import Microsimulation
import pandas as pd
import numpy as np

# Location of the Minnesota dataset: an `hf://` URI naming the MN.h5 file under
# states/ in the policyengine/policyengine-us-data repository.
MN_DATASET = "hf://policyengine/policyengine-us-data/states/MN.h5"

In [2]:
# Load MN dataset
# Builds the simulation object that every later cell queries via sim.calculate(...).
# NOTE(review): the captured output shows MN.h5 being downloaded here, so this cell
# presumably needs network access on a first run — confirm whether it is cached.
sim = Microsimulation(dataset=MN_DATASET)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


MN.h5:   0%|          | 0.00/45.5M [00:00<?, ?B/s]

In [3]:
# Dataset size: raw record count versus the totals reported as weighted.
# All three series are requested at household level for 2025; household_count
# and person_count are reused by the summary table further down.
household_weight = sim.calculate("household_weight", period=2025)
household_count = sim.calculate("household_count", period=2025, map_to="household")
person_count = sim.calculate("person_count", period=2025, map_to="household")

# len() counts rows in the returned series (one per household record);
# the .sum() totals are what the printed labels call "weighted".
n_household_records = len(household_weight)
weighted_households = household_count.sum()
weighted_persons = person_count.sum()

print(f"Number of households in dataset: {n_household_records:,}")
print(f"Household count (weighted): {weighted_households:,.0f}")
print(f"Person count (weighted): {weighted_persons:,.0f}")

Number of households in dataset: 32,518
Household count (weighted): 1,254,857
Person count (weighted): 4,066,311


In [4]:
# Household-level AGI distribution for 2025.
# `agi` is kept as a global because the summary table cell reads it again.
agi = sim.calculate("adjusted_gross_income", period=2025, map_to="household")

# (label, value) pairs in the order they are reported.
# NOTE(review): median()/quantile() are presumably weight-aware on the object
# sim.calculate returns — confirm against the library before quoting them.
agi_stats = [
    ("Median AGI", agi.median()),
    ("75th percentile", agi.quantile(0.75)),
    ("90th percentile", agi.quantile(0.90)),
    ("95th percentile", agi.quantile(0.95)),
    ("Max AGI", agi.max()),
]

print("Income distribution:")
for stat_label, stat_value in agi_stats:
    print(f"  {stat_label}: ${stat_value:,.0f}")

Income distribution:
  Median AGI: $96,581
  75th percentile: $379,259
  90th percentile: $650,436
  95th percentile: $854,192
  Max AGI: $3,229,514


In [5]:
# Average household income (AGI) per weighted decile
agi_hh = np.array(sim.calculate("adjusted_gross_income", period=2025, map_to="household"))
weights = np.array(sim.calculate("household_weight", period=2025))

# One row per household record: its AGI and its weight
df_decile = pd.DataFrame({
    'agi': agi_hh,
    'weight': weights
})

# Running total of weight in ascending-AGI order. The cumsum is taken on the
# sorted frame and assigned back by index label, so every row receives its own
# running total even though df_decile itself stays in the original order.
# A stable sort keeps rows with tied AGI in a fixed relative order, so decile
# membership is reproducible from run to run.
df_decile['cumweight'] = df_decile.sort_values('agi', kind='stable')['weight'].cumsum()
total_weight = df_decile['weight'].sum()

# Share of total weight at or below each household. Clipped at 1.0 because the
# sequential cumsum and the separately computed total can differ by a rounding
# error, which would otherwise push the top household outside the last bin.
cum_share = (df_decile['cumweight'] / total_weight).clip(upper=1.0)

# include_lowest=True keeps rows whose cumulative share is exactly 0 (zero-weight
# records at the bottom of the distribution); without it pd.cut labels them NaN
# and the groupby below silently drops them.
df_decile['decile'] = pd.cut(
    cum_share,
    bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    labels=['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th'],
    include_lowest=True
)

# Weighted mean per decile = sum(agi * weight) / sum(weight). Plain column sums
# avoid groupby.apply, which is slower and warns on recent pandas when the
# grouping column is part of the frame passed to the callback.
df_decile['weighted_agi'] = df_decile['agi'] * df_decile['weight']
decile_totals = df_decile.groupby('decile', observed=True)[['weighted_agi', 'weight']].sum()
decile_summary = pd.DataFrame({
    'Avg Household Income': decile_totals['weighted_agi'] / decile_totals['weight'],
    'Households': decile_totals['weight']
}).reset_index()

print("\n" + "="*70)
print("AVERAGE HOUSEHOLD INCOME BY DECILE")
print("="*70)
for _, row in decile_summary.iterrows():
    print(f"  {row['decile']:>5} Decile: ${row['Avg Household Income']:>12,.0f}  ({row['Households']:>10,.0f} households)")
print("="*70)


AVERAGE HOUSEHOLD INCOME BY DECILE
    1st Decile: $      -6,150  (   125,469 households)
    2nd Decile: $       1,104  (   125,399 households)
    3rd Decile: $      19,465  (   125,521 households)
    4th Decile: $      50,096  (   124,832 households)
    5th Decile: $      79,390  (   125,477 households)
    6th Decile: $     119,228  (   126,104 households)
    7th Decile: $     185,420  (   125,395 households)
    8th Decile: $     364,615  (   123,740 households)
    9th Decile: $     564,021  (   127,447 households)
   10th Decile: $   1,097,559  (   125,472 households)


In [6]:
# Households with children, broken down by number of children.
# Person-level inputs for 2025. Note that `household_weight` is rebound here to
# the person-mapped series, replacing the household-level one from the
# dataset-size cell.
is_child = sim.calculate("is_child", period=2025, map_to="person")
household_id = sim.calculate("household_id", period=2025, map_to="person")
household_weight = sim.calculate("household_weight", period=2025, map_to="person")

# One row per person
df_households = pd.DataFrame({
    'household_id': household_id,
    'is_child': is_child,
    'household_weight': household_weight
})

# Collapse to one row per household: number of children (sum of the is_child
# flags) and the household weight taken from the household's first person row.
children_per_household = (
    df_households
    .groupby('household_id')
    .agg({'is_child': 'sum', 'household_weight': 'first'})
    .reset_index()
)

# Column handles shared by the four weighted tallies below
n_children = children_per_household['is_child']
hh_weight = children_per_household['household_weight']

total_households_with_children = hh_weight[n_children > 0].sum()
households_with_1_child = hh_weight[n_children == 1].sum()
households_with_2_children = hh_weight[n_children == 2].sum()
households_with_3plus_children = hh_weight[n_children >= 3].sum()

print("\nHouseholds with children (weighted):")
for tally_label, tally_value in [
    ("Total households with children", total_households_with_children),
    ("Households with 1 child", households_with_1_child),
    ("Households with 2 children", households_with_2_children),
    ("Households with 3+ children", households_with_3plus_children),
]:
    print(f"  {tally_label}: {tally_value:,.0f}")


Households with children (weighted):
  Total households with children: 469,600
  Households with 1 child: 114,008
  Households with 2 children: 151,889
  Households with 3+ children: 203,703


In [7]:
# Check children by age groups
# One row per person. The period is passed explicitly so these figures are
# computed for the same year (2025) as every other cell; without it the call
# falls back to whatever default period the simulation uses, and the summary
# table would mix years.
df = pd.DataFrame({
    "household_id": sim.calculate("household_id", period=2025, map_to="person"),
    "tax_unit_id": sim.calculate("tax_unit_id", period=2025, map_to="person"),
    "person_id": sim.calculate("person_id", period=2025, map_to="person"),
    "age": sim.calculate("age", period=2025, map_to="person"),
    "person_weight": sim.calculate("person_weight", period=2025, map_to="person")
})

# Nested age bands: each narrower band is a subset of the wider one
children_under_18_df = df[df['age'] < 18]
children_under_6_df = df[df['age'] < 6]
children_under_3_df = df[df['age'] < 3]

# Weighted totals: sum of person weights within each band
total_children = children_under_18_df['person_weight'].sum()
children_under_6 = children_under_6_df['person_weight'].sum()
children_under_3 = children_under_3_df['person_weight'].sum()

print("\nChildren by age:")
print(f"  Total children under 18: {total_children:,.0f}")
print(f"  Children under 6: {children_under_6:,.0f}")
print(f"  Children under 3: {children_under_3:,.0f}")


Children by age:
  Total children under 18: 1,145,830
  Children under 6: 269,322
  Children under 3: 96,626


In [8]:
# Weighted summary table assembled from values computed in the earlier cells
# (household_count, person_count, agi, the households-with-children tallies and
# the children-by-age totals). Each row pairs a metric label with its
# pre-formatted value so the two can never drift out of alignment.
summary_rows = [
    ('Household count (weighted)', f"{household_count.sum():,.0f}"),
    ('Person count (weighted)', f"{person_count.sum():,.0f}"),
    ('Median AGI', f"${agi.median():,.0f}"),
    ('75th percentile AGI', f"${agi.quantile(0.75):,.0f}"),
    ('90th percentile AGI', f"${agi.quantile(0.90):,.0f}"),
    ('95th percentile AGI', f"${agi.quantile(0.95):,.0f}"),
    ('Max AGI', f"${agi.max():,.0f}"),
    ('Total households with children', f"{total_households_with_children:,.0f}"),
    ('Households with 1 child', f"{households_with_1_child:,.0f}"),
    ('Households with 2 children', f"{households_with_2_children:,.0f}"),
    ('Households with 3+ children', f"{households_with_3plus_children:,.0f}"),
    ('Total children under 18', f"{total_children:,.0f}"),
    ('Children under 6', f"{children_under_6:,.0f}"),
    ('Children under 3', f"{children_under_3:,.0f}"),
]

# Column-oriented form expected by the DataFrame constructor
weighted_summary_data = {
    'Metric': [metric for metric, _ in summary_rows],
    'Value': [value for _, value in summary_rows],
}

weighted_df = pd.DataFrame(weighted_summary_data)

summary_rule = "=" * 60
print("\n" + summary_rule)
print("MN DATASET SUMMARY - WEIGHTED (Population Estimates)")
print(summary_rule)
print(weighted_df.to_string(index=False))
print(summary_rule)

# Persist the table next to the notebook (relative path, no index column)
weighted_df.to_csv('mn_dataset_summary_weighted.csv', index=False)
print("\nSummary saved to: mn_dataset_summary_weighted.csv")


MN DATASET SUMMARY - WEIGHTED (Population Estimates)
                        Metric      Value
    Household count (weighted)  1,254,857
       Person count (weighted)  4,066,311
                    Median AGI    $96,581
           75th percentile AGI   $379,259
           90th percentile AGI   $650,436
           95th percentile AGI   $854,192
                       Max AGI $3,229,514
Total households with children    469,600
       Households with 1 child    114,008
    Households with 2 children    151,889
   Households with 3+ children    203,703
       Total children under 18  1,145,830
              Children under 6    269,322
              Children under 3     96,626

Summary saved to: mn_dataset_summary_weighted.csv


In [9]:
# Households whose AGI is exactly $0.
# agi_hh, weights and total_households are also read by the income-bracket cell.
agi_hh = np.array(sim.calculate("adjusted_gross_income", period=2025, map_to="household"))
weights = np.array(sim.calculate("household_weight", period=2025))

# Exact equality: households with negative AGI are not counted here
zero_income_mask = np.equal(agi_hh, 0)
zero_income_count = weights[zero_income_mask].sum()
total_households = weights.sum()
zero_income_pct = zero_income_count / total_households * 100

zero_income_rule = "=" * 70
print("\n" + zero_income_rule)
print("HOUSEHOLDS WITH $0 INCOME")
print(zero_income_rule)
print(f"Household count: {zero_income_count:,.0f}")
print(f"Percentage of all households: {zero_income_pct:.2f}%")
print(zero_income_rule)


HOUSEHOLDS WITH $0 INCOME
Household count: 116,679
Percentage of all households: 9.30%


In [10]:
# Weighted household counts in $10k AGI brackets from $0 up to $60k.
# Relies on agi_hh, weights and total_households from the zero-income cell.
# Each bracket is half-open: lower bound inclusive, upper bound exclusive.
income_brackets = [
    (0, 10000, "$0-$10k"),
    (10000, 20000, "$10k-$20k"),
    (20000, 30000, "$20k-$30k"),
    (30000, 40000, "$30k-$40k"),
    (40000, 50000, "$40k-$50k"),
    (50000, 60000, "$50k-$60k")
]

# Weighted count per bracket, computed once and reused for the table and total
bracket_counts = [
    weights[(agi_hh >= lower) & (agi_hh < upper)].sum()
    for lower, upper, _ in income_brackets
]

bracket_data = [
    {
        "Income Bracket": label,
        "Households": f"{count:,.0f}",
        "% of All Households": f"{(count / total_households) * 100:.2f}%"
    }
    for (_, _, label), count in zip(income_brackets, bracket_counts)
]

income_df = pd.DataFrame(bracket_data)

bracket_rule = "=" * 70
print("\n" + bracket_rule)
print("HOUSEHOLD COUNTS BY INCOME BRACKET")
print(bracket_rule)
print(income_df.to_string(index=False))
print(bracket_rule)

# Total in $0-$60k range (negative-AGI households fall outside every bracket)
total_in_range = sum(bracket_counts)
print(f"\nTotal households in $0-$60k range: {total_in_range:,.0f}")
print(f"Percentage of all households in $0-$60k range: {total_in_range / total_households * 100:.2f}%")


HOUSEHOLD COUNTS BY INCOME BRACKET
Income Bracket Households % of All Households
       $0-$10k    240,135              19.14%
     $10k-$20k     41,051               3.27%
     $20k-$30k     39,704               3.16%
     $30k-$40k     42,173               3.36%
     $40k-$50k     43,723               3.48%
     $50k-$60k     30,854               2.46%

Total households in $0-$60k range: 437,639
Percentage of all households in $0-$60k range: 34.88%
