diff --git a/ri_dataset_exploration.ipynb b/ri_dataset_exploration.ipynb new file mode 100644 index 0000000..0796319 --- /dev/null +++ b/ri_dataset_exploration.ipynb @@ -0,0 +1,519 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 8, + "id": "be1cea7a", + "metadata": {}, + "outputs": [], + "source": [ + "from policyengine_us import Microsimulation\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0d21b774", + "metadata": {}, + "outputs": [], + "source": [ + "# Load RI dataset\n", + "sim = Microsimulation(dataset=\"hf://policyengine/test/RI.h5\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "1870e7ac", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of households in dataset: 2,368\n", + "Household count (mapped): 388,376\n", + "Person count (mapped): 1,106,390\n" + ] + } + ], + "source": [ + "# Check dataset size\n", + "household_weight = sim.calculate(\"household_weight\", period=2025)\n", + "household_count = sim.calculate(\"household_count\", period=2025, map_to=\"household\")\n", + "person_count = sim.calculate(\"person_count\", period=2025, map_to=\"household\")\n", + "\n", + "print(f\"Number of households in dataset: {len(household_weight):,}\")\n", + "print(f\"Household count (mapped): {household_count.sum():,.0f}\")\n", + "print(f\"Person count (mapped): {person_count.sum():,.0f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f0c79a50", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Income distribution:\n", + " Median AGI: $73,149\n", + " 75th percentile: $152,410\n", + " 90th percentile: $271,943\n", + " 95th percentile: $400,420\n", + " Max AGI: $1,740,956\n", + "\n", + "Households by income threshold:\n", + " Households over $80k: 180,850.09423405278\n", + " Households over $120k: 128,983.09166995375\n", + " 
Households over $160k: 88,148.79416422347\n", + " Households over $240k: 47,853.16035188042\n" + ] + } + ], + "source": [ + "# Check household income distribution (aggregate to household level using map_to); threshold counts are weighted sums, so print them rounded with .0f\n", + "agi = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"household\")\n", + "print(\"Income distribution:\")\n", + "print(f\" Median AGI: ${agi.median():,.0f}\")\n", + "print(f\" 75th percentile: ${agi.quantile(0.75):,.0f}\")\n", + "print(f\" 90th percentile: ${agi.quantile(0.90):,.0f}\")\n", + "print(f\" 95th percentile: ${agi.quantile(0.95):,.0f}\")\n", + "print(f\" Max AGI: ${agi.max():,.0f}\")\n", + "print(\"\\nHouseholds by income threshold:\")\n", + "print(f\" Households over $80k: {(agi > 80_000).sum():,.0f}\")\n", + "print(f\" Households over $120k: {(agi > 120_000).sum():,.0f}\")\n", + "print(f\" Households over $160k: {(agi > 160_000).sum():,.0f}\")\n", + "print(f\" Households over $240k: {(agi > 240_000).sum():,.0f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "71b548db", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Households with children (weighted):\n", + " Total households with children: 121,492\n", + " Households with 1 child: 66,113\n", + " Households with 2 children: 37,589\n", + " Households with 3+ children: 17,790\n" + ] + } + ], + "source": [ + "# Check households with children (count at person level, aggregate to household)\n", + "is_child = sim.calculate(\"is_child\", period=2025, map_to=\"person\")\n", + "household_id = sim.calculate(\"household_id\", period=2025, map_to=\"person\")\n", + "household_weight = sim.calculate(\"household_weight\", period=2025, map_to=\"person\")\n", + "\n", + "# Create DataFrame for easier manipulation\n", + "df_households = pd.DataFrame({\n", + " 'household_id': household_id,\n", + " 'is_child': is_child,\n", + " 'household_weight': household_weight\n", + "})\n", + "\n", + "# Count children per household\n", +
"children_per_household = df_households.groupby('household_id').agg({\n", + " 'is_child': 'sum',\n", + " 'household_weight': 'first' # household_weight is same for all members\n", + "}).reset_index()\n", + "\n", + "# Calculate weighted household counts\n", + "total_households_with_children = children_per_household[children_per_household['is_child'] > 0]['household_weight'].sum()\n", + "households_with_1_child = children_per_household[children_per_household['is_child'] == 1]['household_weight'].sum()\n", + "households_with_2_children = children_per_household[children_per_household['is_child'] == 2]['household_weight'].sum()\n", + "households_with_3plus_children = children_per_household[children_per_household['is_child'] >= 3]['household_weight'].sum()\n", + "\n", + "print(f\"\\nHouseholds with children (weighted):\")\n", + "print(f\" Total households with children: {total_households_with_children:,.0f}\")\n", + "print(f\" Households with 1 child: {households_with_1_child:,.0f}\")\n", + "print(f\" Households with 2 children: {households_with_2_children:,.0f}\")\n", + "print(f\" Households with 3+ children: {households_with_3plus_children:,.0f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "a215302f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Children by age:\n", + " Total children under 18: 203,860\n", + " Children under 4: 41,525\n", + " Children under 6: 61,595\n", + " Children ages 6-17: 138,203\n", + "\n", + "Sample of children under 4:\n", + " household_id tax_unit_id person_id age\n", + "7 2730011 5 7730007 1.0\n", + "16 2730006 10 7730016 2.0\n", + "37 2730018 26 7730037 3.0\n", + "83 2730035 55 7730083 0.0\n", + "101 2730041 66 7730101 3.0\n", + "102 2730041 66 7730102 2.0\n", + "103 2730041 66 7730103 0.0\n", + "108 2730043 69 7730108 1.0\n", + "111 2730045 71 7730111 1.0\n", + "115 2730045 71 7730115 3.0\n" + ] + } + ], + "source": [ + "# Check children by age groups 
using Ben's workaround\n", + "# Pass period=2025 explicitly so ages and weights use the same year as every other cell\n", + "df = pd.DataFrame({\n", + " \"household_id\": sim.calculate(\"household_id\", period=2025, map_to=\"person\"),\n", + " \"tax_unit_id\": sim.calculate(\"tax_unit_id\", period=2025, map_to=\"person\"),\n", + " \"person_id\": sim.calculate(\"person_id\", period=2025, map_to=\"person\"),\n", + " \"age\": sim.calculate(\"age\", period=2025, map_to=\"person\"),\n", + " \"person_weight\": sim.calculate(\"person_weight\", period=2025, map_to=\"person\")\n", + "})\n", + "\n", + "# Filter for children and apply weights\n", + "children_under_4_df = df[df['age'] < 4]\n", + "children_under_6_df = df[df['age'] < 6]\n", + "children_under_18_df = df[df['age'] < 18]\n", + "children_6_17_df = df[(df['age'] >= 6) & (df['age'] < 18)]\n", + "\n", + "# Calculate weighted totals\n", + "# The under-18 total uses the same age-filtered frame as the other rows (and as the unweighted count) so they reconcile\n", + "total_children = children_under_18_df['person_weight'].sum()\n", + "children_under_4 = children_under_4_df['person_weight'].sum()\n", + "children_under_6 = children_under_6_df['person_weight'].sum()\n", + "children_6_17 = children_6_17_df['person_weight'].sum()\n", + "\n", + "print(\"\\nChildren by age:\")\n", + "print(f\" Total children under 18: {total_children:,.0f}\")\n", + "print(f\" Children under 4: {children_under_4:,.0f}\")\n", + "print(f\" Children under 6: {children_under_6:,.0f}\")\n", + "print(f\" Children ages 6-17: {children_6_17:,.0f}\")\n", + "\n", + "print(\"\\nSample of children under 4:\")\n", + "print(children_under_4_df[['household_id', 'tax_unit_id', 'person_id', 'age']].head(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9468033e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "============================================================\n", + "RI DATASET SUMMARY - WEIGHTED (Population Estimates)\n", + "============================================================\n", + " Metric Value\n", + " Household count (weighted) 388,376\n", + " Person count (weighted) 1,106,390\n",
+ " Median AGI $73,149\n", + " 75th percentile AGI $152,410\n", + " 90th percentile AGI $271,943\n", + " 95th percentile AGI $400,420\n", + " Max AGI $1,740,956\n", + " Households over $80k 180,850\n", + " Households over $120k 128,983\n", + " Households over $160k 88,149\n", + " Households over $240k 47,853\n", + "Total households with children 121,492\n", + " Households with 1 child 66,113\n", + " Households with 2 children 37,589\n", + " Households with 3+ children 17,790\n", + " Total children under 18 203,860\n", + " Children under 4 41,525\n", + " Children under 6 61,595\n", + " Children ages 6-17 138,203\n", + "============================================================\n", + "\n", + "============================================================\n", + "RI DATASET SUMMARY - UNWEIGHTED (Sample Counts)\n", + "============================================================\n", + " Metric Value\n", + " Number of households in dataset 2,368\n", + " Number of persons in dataset 5,888\n", + " Households with children (unweighted) 616\n", + " Households with 1 child (unweighted) 305\n", + " Households with 2 children (unweighted) 201\n", + "Households with 3+ children (unweighted) 110\n", + " Children under 18 (unweighted) 1,079\n", + " Children under 4 (unweighted) 254\n", + " Children under 6 (unweighted) 373\n", + " Children ages 6-17 (unweighted) 706\n", + "============================================================\n", + "\n", + "Summaries saved to:\n", + " - ri_dataset_summary_weighted.csv\n", + " - ri_dataset_summary_unweighted.csv\n" + ] + } + ], + "source": [ + "# Create weighted summary table\n", + "weighted_summary_data = {\n", + " 'Metric': [\n", + " 'Household count (weighted)',\n", + " 'Person count (weighted)',\n", + " 'Median AGI',\n", + " '75th percentile AGI',\n", + " '90th percentile AGI',\n", + " '95th percentile AGI',\n", + " 'Max AGI',\n", + " 'Households over $80k',\n", + " 'Households over $120k',\n", + " 'Households over $160k',\n", + " 
'Households over $240k',\n", + " 'Total households with children',\n", + " 'Households with 1 child',\n", + " 'Households with 2 children',\n", + " 'Households with 3+ children',\n", + " 'Total children under 18',\n", + " 'Children under 4',\n", + " 'Children under 6',\n", + " 'Children ages 6-17'\n", + " ],\n", + " 'Value': [\n", + " f\"{household_count.sum():,.0f}\",\n", + " f\"{person_count.sum():,.0f}\",\n", + " f\"${agi.median():,.0f}\",\n", + " f\"${agi.quantile(0.75):,.0f}\",\n", + " f\"${agi.quantile(0.90):,.0f}\",\n", + " f\"${agi.quantile(0.95):,.0f}\",\n", + " f\"${agi.max():,.0f}\",\n", + " f\"{(agi > 80_000).sum():,.0f}\",\n", + " f\"{(agi > 120_000).sum():,.0f}\",\n", + " f\"{(agi > 160_000).sum():,.0f}\",\n", + " f\"{(agi > 240_000).sum():,.0f}\",\n", + " f\"{total_households_with_children:,.0f}\",\n", + " f\"{households_with_1_child:,.0f}\",\n", + " f\"{households_with_2_children:,.0f}\",\n", + " f\"{households_with_3plus_children:,.0f}\",\n", + " f\"{total_children:,.0f}\",\n", + " f\"{children_under_4:,.0f}\",\n", + " f\"{children_under_6:,.0f}\",\n", + " f\"{children_6_17:,.0f}\"\n", + " ]\n", + "}\n", + "\n", + "# Get unique counts for unweighted table\n", + "unique_households = df['household_id'].nunique()\n", + "unique_persons = len(df)\n", + "\n", + "# Create unweighted summary table\n", + "unweighted_summary_data = {\n", + " 'Metric': [\n", + " 'Number of households in dataset',\n", + " 'Number of persons in dataset',\n", + " 'Households with children (unweighted)',\n", + " 'Households with 1 child (unweighted)',\n", + " 'Households with 2 children (unweighted)',\n", + " 'Households with 3+ children (unweighted)',\n", + " 'Children under 18 (unweighted)',\n", + " 'Children under 4 (unweighted)',\n", + " 'Children under 6 (unweighted)',\n", + " 'Children ages 6-17 (unweighted)'\n", + " ],\n", + " 'Value': [\n", + " f\"{unique_households:,}\",\n", + " f\"{unique_persons:,}\",\n", + " f\"{(children_per_household['is_child'] > 0).sum():,}\",\n", 
+ " f\"{(children_per_household['is_child'] == 1).sum():,}\",\n", + " f\"{(children_per_household['is_child'] == 2).sum():,}\",\n", + " f\"{(children_per_household['is_child'] >= 3).sum():,}\",\n", + " f\"{len(children_under_18_df):,}\",\n", + " f\"{len(children_under_4_df):,}\",\n", + " f\"{len(children_under_6_df):,}\",\n", + " f\"{len(children_6_17_df):,}\"\n", + " ]\n", + "}\n", + "\n", + "weighted_df = pd.DataFrame(weighted_summary_data)\n", + "unweighted_df = pd.DataFrame(unweighted_summary_data)\n", + "\n", + "print(\"\\n\" + \"=\"*60)\n", + "print(\"RI DATASET SUMMARY - WEIGHTED (Population Estimates)\")\n", + "print(\"=\"*60)\n", + "print(weighted_df.to_string(index=False))\n", + "print(\"=\"*60)\n", + "\n", + "print(\"\\n\" + \"=\"*60)\n", + "print(\"RI DATASET SUMMARY - UNWEIGHTED (Sample Counts)\")\n", + "print(\"=\"*60)\n", + "print(unweighted_df.to_string(index=False))\n", + "print(\"=\"*60)\n", + "\n", + "# Save both tables\n", + "weighted_df.to_csv('ri_dataset_summary_weighted.csv', index=False)\n", + "unweighted_df.to_csv('ri_dataset_summary_unweighted.csv', index=False)\n", + "print(\"\\nSummaries saved to:\")\n", + "print(\" - ri_dataset_summary_weighted.csv\")\n", + "print(\" - ri_dataset_summary_unweighted.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "dzvou2zqia4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Median AGI by aggregation level:\n", + " Household level: $73,149\n", + " Tax unit level: $35,546\n", + " Person level: $47,592\n", + "\n", + "Total AGI for Rhode Island (by aggregation level):\n", + " Using tax unit level: $43,501,430,523\n", + " Using household level: $43,501,430,523\n", + " Using person level: $77,844,195,552\n" + ] + } + ], + "source": [ + "# Compare median AGI at different aggregation levels\n", + "agi_household = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"household\")\n", + "agi_tax_unit = 
sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"tax_unit\")\n", + "agi_person = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"person\")\n", + "\n", + "print(\"Median AGI by aggregation level:\")\n", + "print(f\" Household level: ${agi_household.median():,.0f}\")\n", + "print(f\" Tax unit level: ${agi_tax_unit.median():,.0f}\")\n", + "print(f\" Person level: ${agi_person.median():,.0f}\")\n", + "\n", + "# Calculate total AGI - just sum the values (weights are already built into the arrays). NOTE: the person-level sum comes out far above the tax-unit and household sums, presumably because mapping to person repeats a tax unit's AGI for each member, so use the tax-unit or household figure as the statewide total\n", + "total_agi_tax_unit = agi_tax_unit.sum()\n", + "total_agi_household = agi_household.sum()\n", + "total_agi_person = agi_person.sum()\n", + "\n", + "print(f\"\\nTotal AGI for Rhode Island (by aggregation level):\")\n", + "print(f\" Using tax unit level: ${total_agi_tax_unit:,.0f}\")\n", + "print(f\" Using household level: ${total_agi_household:,.0f}\")\n", + "print(f\" Using person level: ${total_agi_person:,.0f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "gispfkxpnph", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AGI Component Breakdown (Tax Unit Level)\n", + "============================================================\n", + "\n", + "Total Income (Statewide):\n", + " Employment Income: $ 31,034,426,346\n", + " Self-Employment Income: $ 1,890,240,187\n", + " Capital Gains: $ 1,086,347,982\n", + " Qualified Dividends: $ 1,002,331,804\n", + " Interest Income: $ 670,462,607\n", + " Taxable Social Security: $ 1,123,366,624\n", + " Pension Income: $ 1,384,610,313\n", + " Adjusted Gross Income (AGI): $ 43,501,430,523\n", + "\n", + "Median Values:\n", + " Employment Income: $ 29,531\n", + " Self-Employment Income: $ 0\n", + " Capital Gains: $ 0\n", + " Qualified Dividends: $ 0\n", + " Interest Income: $ 0\n", + " Taxable Social Security: $ 0\n", + " Pension Income: $ 0\n", + " Adjusted Gross Income (AGI): $ 35,546\n", + "\n", + "Sum of income components: $ 
38,191,785,863\n", + "AGI (for comparison): $ 43,501,430,523\n", + "Difference (potential missing income or deductions): $ -5,309,644,660\n" + ] + } + ], + "source": [ + "# Break down AGI components at tax unit level\n", + "print(\"AGI Component Breakdown (Tax Unit Level)\")\n", + "print(\"=\"*60)\n", + "\n", + "# Calculate key income components\n", + "employment_income = sim.calculate(\"employment_income\", period=2025, map_to=\"tax_unit\")\n", + "self_employment_income = sim.calculate(\"self_employment_income\", period=2025, map_to=\"tax_unit\")\n", + "capital_gains = sim.calculate(\"capital_gains\", period=2025, map_to=\"tax_unit\")\n", + "qualified_dividend_income = sim.calculate(\"qualified_dividend_income\", period=2025, map_to=\"tax_unit\")\n", + "interest_income = sim.calculate(\"interest_income\", period=2025, map_to=\"tax_unit\")\n", + "taxable_social_security = sim.calculate(\"taxable_social_security\", period=2025, map_to=\"tax_unit\")\n", + "pension_income = sim.calculate(\"pension_income\", period=2025, map_to=\"tax_unit\")\n", + "adjusted_gross_income = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"tax_unit\")\n", + "\n", + "print(\"\\nTotal Income (Statewide):\")\n", + "print(f\" Employment Income: ${employment_income.sum():>15,.0f}\")\n", + "print(f\" Self-Employment Income: ${self_employment_income.sum():>15,.0f}\")\n", + "print(f\" Capital Gains: ${capital_gains.sum():>15,.0f}\")\n", + "print(f\" Qualified Dividends: ${qualified_dividend_income.sum():>15,.0f}\")\n", + "print(f\" Interest Income: ${interest_income.sum():>15,.0f}\")\n", + "print(f\" Taxable Social Security: ${taxable_social_security.sum():>15,.0f}\")\n", + "print(f\" Pension Income: ${pension_income.sum():>15,.0f}\")\n", + "print(f\" Adjusted Gross Income (AGI): ${adjusted_gross_income.sum():>15,.0f}\")\n", + "\n", + "print(\"\\nMedian Values:\")\n", + "print(f\" Employment Income: ${employment_income.median():>15,.0f}\")\n", + "print(f\" Self-Employment Income: 
${self_employment_income.median():>15,.0f}\")\n", + "print(f\" Capital Gains: ${capital_gains.median():>15,.0f}\")\n", + "print(f\" Qualified Dividends: ${qualified_dividend_income.median():>15,.0f}\")\n", + "print(f\" Interest Income: ${interest_income.median():>15,.0f}\")\n", + "print(f\" Taxable Social Security: ${taxable_social_security.median():>15,.0f}\")\n", + "print(f\" Pension Income: ${pension_income.median():>15,.0f}\")\n", + "print(f\" Adjusted Gross Income (AGI): ${adjusted_gross_income.median():>15,.0f}\")\n", + "\n", + "# Calculate sum of components to compare with AGI\n", + "total_components = (employment_income + self_employment_income + capital_gains + \n", + " qualified_dividend_income + interest_income + taxable_social_security + pension_income)\n", + "print(f\"\\nSum of income components: ${total_components.sum():>15,.0f}\")\n", + "print(f\"AGI (for comparison): ${adjusted_gross_income.sum():>15,.0f}\")\n", + "print(f\"Difference (potential missing income or deductions): ${(total_components.sum() - adjusted_gross_income.sum()):>15,.0f}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pe", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ri_dataset_summary_unweighted.csv b/ri_dataset_summary_unweighted.csv new file mode 100644 index 0000000..8567486 --- /dev/null +++ b/ri_dataset_summary_unweighted.csv @@ -0,0 +1,11 @@ +Metric,Value +Number of households in dataset,"2,368" +Number of persons in dataset,"5,888" +Households with children (unweighted),616 +Households with 1 child (unweighted),305 +Households with 2 children (unweighted),201 +Households with 3+ children (unweighted),110 +Children under 18 
(unweighted),"1,079" +Children under 4 (unweighted),254 +Children under 6 (unweighted),373 +Children ages 6-17 (unweighted),706 diff --git a/ri_dataset_summary_weighted.csv b/ri_dataset_summary_weighted.csv new file mode 100644 index 0000000..c4597a9 --- /dev/null +++ b/ri_dataset_summary_weighted.csv @@ -0,0 +1,20 @@ +Metric,Value +Household count (weighted),"388,376" +Person count (weighted),"1,106,390" +Median AGI,"$73,149" +75th percentile AGI,"$152,410" +90th percentile AGI,"$271,943" +95th percentile AGI,"$400,420" +Max AGI,"$1,740,956" +Households over $80k,"180,850" +Households over $120k,"128,983" +Households over $160k,"88,149" +Households over $240k,"47,853" +Total households with children,"121,492" +Households with 1 child,"66,113" +Households with 2 children,"37,589" +Households with 3+ children,"17,790" +Total children under 18,"203,860" +Children under 4,"41,525" +Children under 6,"61,595" +Children ages 6-17,"138,203"