# NYC Dataset Exploration

This notebook explores the New York City (NYC) dataset to understand household counts, income distribution, and demographic characteristics relevant to the Mamdani Millionaire Income Tax analysis.

In [10]:
from policyengine_us import Microsimulation
import pandas as pd
import numpy as np

NYC_DATASET = "hf://policyengine/policyengine-us-data/cities/NYC.h5"
YEAR = 2026

In [11]:
# Load NYC dataset
sim = Microsimulation(dataset=NYC_DATASET)

In [12]:
# Check dataset size
household_weight = sim.calculate("household_weight", period=YEAR)
household_count = sim.calculate("household_count", period=YEAR, map_to="household")
person_count = sim.calculate("person_count", period=YEAR, map_to="household")

print(f"Number of households in dataset: {len(household_weight):,}")
print(f"Household count (weighted): {household_count.sum():,.0f}")
print(f"Person count (weighted): {person_count.sum():,.0f}")

# Check county info (determines in_nyc)
county_str = sim.calculate("county_str", period=YEAR, map_to="household")
print(f"\nCounty values in dataset: {np.unique(county_str)[:10]}...")  # First 10

# Check if in_nyc is set
in_nyc = sim.calculate("in_nyc", period=YEAR, map_to="household")
in_nyc_weighted = (np.array(in_nyc) * np.array(household_weight)).sum()
print(f"\nHouseholds in NYC (in_nyc=True, weighted): {in_nyc_weighted:,.0f}")
print(f"in_nyc unique values: {np.unique(in_nyc)}")

# Check state
state_name = sim.calculate("state_name", period=YEAR, map_to="household")
print(f"State names in dataset: {np.unique(state_name)}")

# Check NY taxable income
ny_taxable = sim.calculate("ny_taxable_income", period=YEAR, map_to="household")
print(f"\nNY taxable income - median: ${np.median(ny_taxable):,.0f}")
print(f"NY taxable income - max: ${np.max(ny_taxable):,.0f}")

Number of households in dataset: 51,495
Household count (weighted): 2,353,653
Person count (weighted): 6,891,060

County values in dataset: ['ALBANY_COUNTY_NY']...

Households in NYC (in_nyc=True, weighted): 0
in_nyc unique values: [False]
State names in dataset: ['NY']

NY taxable income - median: $183,056
NY taxable income - max: $3,067,754


In [13]:
# Check household income distribution
agi = sim.calculate("adjusted_gross_income", period=YEAR, map_to="household")
print(f"Income distribution:")
print(f"  Median AGI: ${agi.median():,.0f}")
print(f"  75th percentile: ${agi.quantile(0.75):,.0f}")
print(f"  90th percentile: ${agi.quantile(0.90):,.0f}")
print(f"  95th percentile: ${agi.quantile(0.95):,.0f}")
print(f"  99th percentile: ${agi.quantile(0.99):,.0f}")
print(f"  Max AGI: ${agi.max():,.0f}")

Income distribution:
  Median AGI: $40,031
  75th percentile: $207,486
  90th percentile: $821,751
  95th percentile: $1,864,716
  99th percentile: $2,118,524
  Max AGI: $3,152,176


In [14]:
# Check NYC taxable income distribution (relevant for Mamdani tax)
nyc_taxable_income = sim.calculate("nyc_taxable_income", period=YEAR, map_to="household")
print(f"NYC Taxable Income distribution:")
print(f"  Median: ${nyc_taxable_income.median():,.0f}")
print(f"  75th percentile: ${nyc_taxable_income.quantile(0.75):,.0f}")
print(f"  90th percentile: ${nyc_taxable_income.quantile(0.90):,.0f}")
print(f"  95th percentile: ${nyc_taxable_income.quantile(0.95):,.0f}")
print(f"  99th percentile: ${nyc_taxable_income.quantile(0.99):,.0f}")
print(f"  Max: ${nyc_taxable_income.max():,.0f}")

NYC Taxable Income distribution:
  Median: $0
  75th percentile: $0
  90th percentile: $0
  95th percentile: $0
  99th percentile: $0
  Max: $0


In [15]:
# High income households (relevant for millionaire tax)
weights = np.array(sim.calculate("household_weight", period=YEAR))
nyc_taxable = np.array(nyc_taxable_income)
total_households = weights.sum()

# Households above $1M (Mamdani threshold)
above_1m_mask = nyc_taxable >= 1_000_000
above_1m_count = weights[above_1m_mask].sum()

# Households in various high-income brackets
above_500k_mask = nyc_taxable >= 500_000
above_500k_count = weights[above_500k_mask].sum()

above_2m_mask = nyc_taxable >= 2_000_000
above_2m_count = weights[above_2m_mask].sum()

above_5m_mask = nyc_taxable >= 5_000_000
above_5m_count = weights[above_5m_mask].sum()

print("\n" + "="*70)
print("HIGH INCOME HOUSEHOLDS (NYC Taxable Income)")
print("="*70)
print(f"Households with income >= $500K:  {above_500k_count:,.0f} ({above_500k_count/total_households*100:.2f}%)")
print(f"Households with income >= $1M:    {above_1m_count:,.0f} ({above_1m_count/total_households*100:.2f}%)")
print(f"Households with income >= $2M:    {above_2m_count:,.0f} ({above_2m_count/total_households*100:.2f}%)")
print(f"Households with income >= $5M:    {above_5m_count:,.0f} ({above_5m_count/total_households*100:.2f}%)")
print("="*70)


HIGH INCOME HOUSEHOLDS (NYC Taxable Income)
Households with income >= $500K:  0 (0.00%)
Households with income >= $1M:    0 (0.00%)
Households with income >= $2M:    0 (0.00%)
Households with income >= $5M:    0 (0.00%)


In [16]:
# Household counts by income brackets (focus on high income for millionaire tax)
income_brackets = [
    (0, 50000, "$0-$50k"),
    (50000, 100000, "$50k-$100k"),
    (100000, 200000, "$100k-$200k"),
    (200000, 500000, "$200k-$500k"),
    (500000, 1000000, "$500k-$1M"),
    (1000000, 2000000, "$1M-$2M"),
    (2000000, 5000000, "$2M-$5M"),
    (5000000, float('inf'), "$5M+")
]

bracket_data = []
for lower, upper, label in income_brackets:
    if upper == float('inf'):
        mask = nyc_taxable >= lower
    else:
        mask = (nyc_taxable >= lower) & (nyc_taxable < upper)
    count = weights[mask].sum()
    pct_of_total = (count / total_households) * 100
    
    bracket_data.append({
        "Income Bracket": label,
        "Households": f"{count:,.0f}",
        "% of All Households": f"{pct_of_total:.2f}%"
    })

income_df = pd.DataFrame(bracket_data)

print("\n" + "="*70)
print("HOUSEHOLD COUNTS BY NYC TAXABLE INCOME BRACKET")
print("="*70)
print(income_df.to_string(index=False))
print("="*70)


HOUSEHOLD COUNTS BY NYC TAXABLE INCOME BRACKET
Income Bracket Households % of All Households
       $0-$50k  2,353,654             100.00%
    $50k-$100k          0               0.00%
   $100k-$200k          0               0.00%
   $200k-$500k          0               0.00%
     $500k-$1M          0               0.00%
       $1M-$2M          0               0.00%
       $2M-$5M          0               0.00%
          $5M+          0               0.00%


In [17]:
# Create weighted summary table
weighted_summary_data = {
    'Metric': [
        'Household count (weighted)',
        'Person count (weighted)',
        'Median AGI',
        '75th percentile AGI',
        '90th percentile AGI',
        '95th percentile AGI',
        '99th percentile AGI',
        'Max AGI',
        'Median NYC Taxable Income',
        '99th percentile NYC Taxable Income',
        'Households with income >= $1M',
        'Pct of households with income >= $1M'
    ],
    'Value': [
        f"{household_count.sum():,.0f}",
        f"{person_count.sum():,.0f}",
        f"${agi.median():,.0f}",
        f"${agi.quantile(0.75):,.0f}",
        f"${agi.quantile(0.90):,.0f}",
        f"${agi.quantile(0.95):,.0f}",
        f"${agi.quantile(0.99):,.0f}",
        f"${agi.max():,.0f}",
        f"${nyc_taxable_income.median():,.0f}",
        f"${nyc_taxable_income.quantile(0.99):,.0f}",
        f"{above_1m_count:,.0f}",
        f"{above_1m_count/total_households*100:.2f}%"
    ]
}

weighted_df = pd.DataFrame(weighted_summary_data)

print("\n" + "="*60)
print("NYC DATASET SUMMARY - WEIGHTED (Population Estimates)")
print("="*60)
print(weighted_df.to_string(index=False))
print("="*60)

# Save table
weighted_df.to_csv('nyc_dataset_summary_weighted.csv', index=False)
print("\nSummary saved to: nyc_dataset_summary_weighted.csv")


NYC DATASET SUMMARY - WEIGHTED (Population Estimates)
                              Metric      Value
          Household count (weighted)  2,353,653
             Person count (weighted)  6,891,060
                          Median AGI    $40,031
                 75th percentile AGI   $207,486
                 90th percentile AGI   $821,751
                 95th percentile AGI $1,864,716
                 99th percentile AGI $2,118,524
                             Max AGI $3,152,176
           Median NYC Taxable Income         $0
  99th percentile NYC Taxable Income         $0
       Households with income >= $1M          0
Pct of households with income >= $1M      0.00%

Summary saved to: nyc_dataset_summary_weighted.csv


## County Discrepancy Investigation

Ben reported seeing different county results when running the same dataset. His results showed multiple NYC counties (Queens, Bronx, etc.) instead of just Albany County. This may be related to local branch differences.

In [18]:
# Ben's results (from his local environment - may be related to local branches):
#
# In [5]: from policyengine_us import Microsimulation
#
# In [6]: sim = Microsimulation(dataset="hf://policyengine/policyengine-us-data/cities/NYC.h5")
#
# In [7]: sim.calculate("county")
# Out[7]: 
#                   value      weight
# 0      QUEENS_COUNTY_NY    0.052011
# 1      QUEENS_COUNTY_NY    0.021776
# 2      QUEENS_COUNTY_NY    0.063064
# 3      QUEENS_COUNTY_NY    0.013337
# 4      QUEENS_COUNTY_NY   29.656933
# ...                 ...         ...
# 51490   BRONX_COUNTY_NY    0.050521
# 51491   BRONX_COUNTY_NY  249.701233
# 51492   BRONX_COUNTY_NY   10.212130
# 51493   BRONX_COUNTY_NY    0.592128
# 51494   BRONX_COUNTY_NY    0.095578
#
# [51495 rows x 2 columns]

# Compare with our results
county = sim.calculate("county", period=YEAR)
print("County value counts:")
print(county.value_counts())

# Filtering demonstration (from Ben's testing)
df = sim.calculate_dataframe(['household_id', 'household_weight', 'congressional_district_geoid', 'state_fips', 'county', 'county_str', 'in_nyc'])
pdf = pd.DataFrame(df)
print("\nDataFrame columns and sample:")
print(pdf.head())
print(f"\nUnique counties: {pdf['county'].unique()}")
print(f"in_nyc values: {pdf['in_nyc'].unique()}")

County value counts:
ALBANY_COUNTY_NY    51495
Name: count, dtype: int64

DataFrame columns and sample:
   household_id  household_weight  congressional_district_geoid  state_fips  \
0       4550000          0.052011                          3603          36   
1       4550001          0.021776                          3603          36   
2       4550002          0.063064                          3603          36   
3       4550003          0.013337                          3603          36   
4       4550004         29.656933                          3603          36   

             county        county_str  in_nyc  
0  QUEENS_COUNTY_NY  QUEENS_COUNTY_NY    True  
1  QUEENS_COUNTY_NY  QUEENS_COUNTY_NY    True  
2  QUEENS_COUNTY_NY  QUEENS_COUNTY_NY    True  
3  QUEENS_COUNTY_NY  QUEENS_COUNTY_NY    True  
4  QUEENS_COUNTY_NY  QUEENS_COUNTY_NY    True  

Unique counties: ['QUEENS_COUNTY_NY' 'KINGS_COUNTY_NY' 'NEW_YORK_COUNTY_NY'
 'RICHMOND_COUNTY_NY' 'BRONX_COUNTY_NY']
in_nyc values: 