diff --git a/.claude/policyengine-guide.md b/.claude/policyengine-guide.md new file mode 100644 index 00000000..922c542c --- /dev/null +++ b/.claude/policyengine-guide.md @@ -0,0 +1,568 @@ +# PolicyEngine.py - Claude Guide + +This guide helps you use the policyengine.py package to perform tax-benefit microsimulation analysis. + +## Core workflow + +1. **Create or load a dataset** with microdata (person, household, etc.) +2. **Run a simulation** applying tax-benefit rules to the dataset +3. **Extract results** using output classes (Aggregate, ChangeAggregate) +4. **Visualise** using built-in plotting utilities + +## Package structure + +``` +policyengine +├── core/ +│ ├── Dataset, YearData # Data containers +│ ├── Simulation # Runs tax-benefit calculations +│ ├── Policy, Parameter # Define reforms +│ └── map_to_entity() # Entity mapping utility +├── outputs/ +│ ├── Aggregate # Calculate statistics +│ └── ChangeAggregate # Analyse reforms +├── tax_benefit_models/ +│ ├── uk/ # UK-specific models +│ └── us/ # US-specific models +└── utils/ + └── plotting # Visualisation tools +``` + +## Quick start patterns + +### Pattern 1: Synthetic scenario analysis + +Use when: User wants to analyse specific household scenarios + +```python +import pandas as pd +from microdf import MicroDataFrame +from policyengine.tax_benefit_models.uk import ( + PolicyEngineUKDataset, + UKYearData, + uk_latest +) +from policyengine.core import Simulation + +# Create synthetic person data +person_df = MicroDataFrame( + pd.DataFrame({ + "person_id": [0, 1, 2], + "person_household_id": [0, 0, 1], + "person_benunit_id": [0, 0, 1], + "age": [35, 8, 40], + "employment_income": [30000, 0, 50000], + "person_weight": [1.0, 1.0, 1.0], + }), + weights="person_weight" +) + +# Create household data +household_df = MicroDataFrame( + pd.DataFrame({ + "household_id": [0, 1], + "region": ["LONDON", "SOUTH_EAST"], + "rent": [15000, 12000], + "household_weight": [1.0, 1.0], + }), + weights="household_weight" +) + 
+# Create benunit data (UK only) +benunit_df = MicroDataFrame( + pd.DataFrame({ + "benunit_id": [0, 1], + "would_claim_uc": [True, True], + "benunit_weight": [1.0, 1.0], + }), + weights="benunit_weight" +) + +# Package into dataset +dataset = PolicyEngineUKDataset( + name="Custom scenario", + description="Analysis scenario", + filepath="./custom.h5", + year=2026, + data=UKYearData( + person=person_df, + household=household_df, + benunit=benunit_df, + ) +) + +# Run simulation +simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, +) +simulation.run() + +# Access results +output = simulation.output_dataset.data +print(output.household[["household_id", "household_net_income"]]) +``` + +### Pattern 2: US synthetic scenario + +```python +from policyengine.tax_benefit_models.us import ( + PolicyEngineUSDataset, + USYearData, + us_latest +) + +# Create person data (note: US has more entity types) +person_df = MicroDataFrame( + pd.DataFrame({ + "person_id": [0, 1, 2, 3], + "person_household_id": [0, 0, 0, 0], + "person_tax_unit_id": [0, 0, 0, 0], + "person_spm_unit_id": [0, 0, 0, 0], + "person_family_id": [0, 0, 0, 0], + "person_marital_unit_id": [0, 0, 1, 2], + "age": [35, 33, 8, 5], + "employment_income": [60000, 40000, 0, 0], + "person_weight": [1.0, 1.0, 1.0, 1.0], + }), + weights="person_weight" +) + +# Create entity dataframes (tax_unit, spm_unit, family, marital_unit, household) +# ... 
(see examples/employment_income_variation_us.py for full pattern) + +dataset = PolicyEngineUSDataset( + name="US scenario", + year=2024, + filepath="./us_scenario.h5", + data=USYearData( + person=person_df, + tax_unit=tax_unit_df, + spm_unit=spm_unit_df, + family=family_df, + marital_unit=marital_unit_df, + household=household_df, + ) +) +``` + +### Pattern 3: Parameter sweep analysis + +Use when: User wants to vary one parameter across many values + +```python +import numpy as np + +# Create N scenarios with varying parameter +n_scenarios = 43 +income_values = np.linspace(0, 100000, n_scenarios) + +# Create person data with all scenarios +person_df = MicroDataFrame( + pd.DataFrame({ + "person_id": range(n_scenarios), + "person_household_id": range(n_scenarios), + "person_benunit_id": range(n_scenarios), + "age": [35] * n_scenarios, + "employment_income": income_values, + "person_weight": [1.0] * n_scenarios, + }), + weights="person_weight" +) + +# Create matching household/benunit data +household_df = MicroDataFrame( + pd.DataFrame({ + "household_id": range(n_scenarios), + "region": ["LONDON"] * n_scenarios, + "rent": [15000] * n_scenarios, + "household_weight": [1.0] * n_scenarios, + }), + weights="household_weight" +) + +# ... 
create dataset and run simulation once for all scenarios +``` + +### Pattern 4: Policy reform analysis + +Use when: User wants to compare baseline vs reform + +```python +from policyengine.core import Policy, Parameter, ParameterValue +import datetime + +# Define reform +parameter = Parameter( + name="gov.hmrc.income_tax.allowances.personal_allowance.amount", + tax_benefit_model_version=uk_latest, + description="Personal allowance", + data_type=float, +) + +policy = Policy( + name="Increase personal allowance", + description="Raises PA to £15,000", + parameter_values=[ + ParameterValue( + parameter=parameter, + start_date=datetime.date(2026, 1, 1), + end_date=datetime.date(2026, 12, 31), + value=15000, + ) + ], +) + +# Run baseline +baseline_sim = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, +) +baseline_sim.run() + +# Run reform +reform_sim = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, + policy=policy, +) +reform_sim.run() + +# Analyse impact +from policyengine.outputs.change_aggregate import ( + ChangeAggregate, + ChangeAggregateType +) + +winners = ChangeAggregate( + baseline_simulation=baseline_sim, + reform_simulation=reform_sim, + variable="household_net_income", + aggregate_type=ChangeAggregateType.COUNT, + change_geq=1, # Gain at least £1 +) +winners.run() +print(f"Winners: {winners.result:,.0f}") +``` + +### Pattern 5: Extract aggregates + +Use when: User wants statistics from simulation results + +```python +from policyengine.outputs.aggregate import Aggregate, AggregateType + +# Total spending on a benefit +total_uc = Aggregate( + simulation=simulation, + variable="universal_credit", + entity="benunit", + aggregate_type=AggregateType.SUM, +) +total_uc.run() +print(f"Total UC: £{total_uc.result / 1e9:.1f}bn") + +# Mean income in top decile +top_decile_income = Aggregate( + simulation=simulation, + variable="household_net_income", + entity="household", + aggregate_type=AggregateType.MEAN, + 
filter_variable="household_net_income", + quantile=10, + quantile_eq=10, # 10th decile only +) +top_decile_income.run() +print(f"Top decile mean income: £{top_decile_income.result:,.0f}") + +# Count households below poverty line +poverty_count = Aggregate( + simulation=simulation, + variable="household_id", + entity="household", + aggregate_type=AggregateType.COUNT, + filter_variable="in_absolute_poverty_bhc", + filter_eq=True, +) +poverty_count.run() +print(f"Households in poverty: {poverty_count.result:,.0f}") +``` + +### Pattern 6: Entity mapping + +Use when: User needs to map data between entity levels + +```python +# Map person income to household level (sum) +household_income = dataset.data.map_to_entity( + source_entity="person", + target_entity="household", + columns=["employment_income"], + how="sum" +) + +# Map household rent to person level (broadcast) +person_rent = dataset.data.map_to_entity( + source_entity="household", + target_entity="person", + columns=["rent"], + how="project" +) + +# Split household savings equally per person +person_savings_share = dataset.data.map_to_entity( + source_entity="household", + target_entity="person", + columns=["total_savings"], + how="divide" +) + +# Map custom values +import numpy as np +custom_values = np.array([100, 200, 150]) +household_totals = dataset.data.map_to_entity( + source_entity="person", + target_entity="household", + values=custom_values, + how="sum" +) +``` + +### Pattern 7: Visualisation + +```python +from policyengine.utils.plotting import format_fig, COLORS +import plotly.graph_objects as go + +fig = go.Figure() +fig.add_trace(go.Scatter( + x=income_values, + y=net_income_values, + mode='lines', + name='Net income', + line=dict(color=COLORS["primary"], width=3) +)) + +format_fig( + fig, + title="Net income by employment income", + xaxis_title="Employment income (£)", + yaxis_title="Net income (£)", + height=600, + width=1000, +) +fig.show() +``` + +## Entity structures + +### UK entities +``` 
+household + └── benunit (benefit unit - family claiming benefits together) + └── person +``` + +### US entities +``` +household + ├── tax_unit (federal tax filing unit) + ├── spm_unit (Supplemental Poverty Measure unit) + ├── family (Census definition) + └── marital_unit (married couple or single) + └── person +``` + +## Key concepts + +### 1. MicroDataFrame +All entity data uses `MicroDataFrame` which automatically handles survey weights: +```python +df = MicroDataFrame(pd_dataframe, weights="weight_column_name") +df.sum() # Automatically weighted +``` + +### 2. Entity mapping +When variables are at different entity levels, automatic mapping occurs: +- **Person → Group**: Sum values within each group +- **Group → Person**: Replicate group value to all members + +### 3. Required fields + +**UK person:** +- `person_id`, `person_household_id`, `person_benunit_id` +- `age`, `employment_income` +- `person_weight` + +**UK household:** +- `household_id` +- `region` (e.g., "LONDON", "SOUTH_EAST") +- `rent` (annual) +- `household_weight` + +**UK benunit:** +- `benunit_id` +- `would_claim_uc` (boolean - CRITICAL for UC calculations) +- `benunit_weight` + +**US person:** +- `person_id`, `person_household_id`, `person_tax_unit_id`, `person_spm_unit_id`, `person_family_id`, `person_marital_unit_id` +- `age`, `employment_income` +- `person_weight` + +**US household:** +- `household_id` +- `state_code` (e.g., "CA", "NY") +- `household_weight` + +### 4. 
Common pitfalls + +**Always set would_claim variables:** +```python +"would_claim_uc": [True] * n_benunits # UK +``` + +**Set disability variables to avoid spikes:** +```python +"is_disabled_for_benefits": [False] * n_people +"uc_limited_capability_for_WRA": [False] * n_people +``` + +**Use consistent ID linkages:** +```python +# Person 0 must link to valid household_id and benunit_id +person_df["person_household_id"] = [0, 0, 1] # Persons 0,1 in household 0 +``` + +## Finding parameters + +### UK common parameters +``` +gov.hmrc.income_tax.allowances.personal_allowance.amount +gov.hmrc.national_insurance.class_1.rates.main +gov.dwp.universal_credit.means_test.reduction_rate +gov.dwp.universal_credit.elements.child.first_child +gov.dwp.child_benefit.amount.first_child +``` + +### US common parameters +``` +gov.irs.income.standard_deduction.single +gov.irs.income.standard_deduction.joint +gov.irs.credits.ctc.amount.base +gov.irs.credits.ctc.refundable.amount.max +gov.irs.credits.eitc.max[0] # 0 children +gov.usda.snap.normal_allotment.max[1] # 1 person +``` + +## Aggregation methods for entity mapping + +- `how='sum'`: Aggregate by summing (person → group default) +- `how='first'`: Take first value in group +- `how='project'`: Broadcast group value to members (group → person default) +- `how='divide'`: Split equally among members + +## Response patterns + +When user asks to: + +1. **"Analyse a family with £X income"** → Use Pattern 1 (synthetic scenario) +2. **"How does income vary from £0 to £100k"** → Use Pattern 3 (parameter sweep) +3. **"What if we increased personal allowance?"** → Use Pattern 4 (policy reform) +4. **"How many people benefit?"** → Use Pattern 5 (extract aggregates) +5. **"Compare US vs UK"** → Create both datasets, run separately +6. **"Show me the phase-out"** → Use Pattern 3 + Pattern 7 (sweep + visualise) + +## Debugging tips + +1. **Check dataset shape**: `len(dataset.data.person)` should match expectations +2. 
**Verify linkages**: All person IDs should map to valid household IDs +3. **Check weights**: `dataset.data.household["household_weight"].sum()` +4. **Inspect output columns**: `list(simulation.output_dataset.data.person.columns)` +5. **Test small first**: Use 3-5 scenarios before scaling to 100+ + +## Example responses + +**User: "What's the net income of a single person earning £30k in London?"** + +```python +# I'll create a synthetic dataset with one person earning £30k in London +# and run a UK simulation to calculate their net income. + +import pandas as pd +from microdf import MicroDataFrame +from policyengine.tax_benefit_models.uk import ( + PolicyEngineUKDataset, UKYearData, uk_latest +) +from policyengine.core import Simulation + +# Create person data +person_df = MicroDataFrame( + pd.DataFrame({ + "person_id": [0], + "person_household_id": [0], + "person_benunit_id": [0], + "age": [30], + "employment_income": [30000], + "person_weight": [1.0], + }), + weights="person_weight" +) + +# Create household data +household_df = MicroDataFrame( + pd.DataFrame({ + "household_id": [0], + "region": ["LONDON"], + "rent": [12000], # Typical London rent + "household_weight": [1.0], + }), + weights="household_weight" +) + +# Create benunit data +benunit_df = MicroDataFrame( + pd.DataFrame({ + "benunit_id": [0], + "would_claim_uc": [True], + "benunit_weight": [1.0], + }), + weights="benunit_weight" +) + +# Create and run simulation +dataset = PolicyEngineUKDataset( + name="Single person £30k", + filepath="./temp_scenario.h5", + year=2026, + data=UKYearData( + person=person_df, + household=household_df, + benunit=benunit_df, + ) +) + +simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, +) +simulation.run() + +# Extract results +output = simulation.output_dataset.data +net_income = output.household["household_net_income"].iloc[0] +income_tax = output.person["income_tax"].iloc[0] +ni = output.person["national_insurance"].iloc[0] + 
+print(f"Employment income: £30,000") +print(f"Income tax: £{income_tax:,.0f}") +print(f"National Insurance: £{ni:,.0f}") +print(f"Net income: £{net_income:,.0f}") +``` + +## Additional resources + +- Full examples in `examples/` directory +- Core concepts: `docs/core-concepts.md` +- UK model: `docs/country-models-uk.md` +- US model: `docs/country-models-us.md` diff --git a/.claude/quick-reference.md b/.claude/quick-reference.md new file mode 100644 index 00000000..d5b5c872 --- /dev/null +++ b/.claude/quick-reference.md @@ -0,0 +1,367 @@ +# PolicyEngine.py Quick Reference + +## Imports cheat sheet + +```python +# Core +from policyengine.core import Simulation, Policy, Parameter, ParameterValue + +# UK +from policyengine.tax_benefit_models.uk import ( + PolicyEngineUKDataset, + UKYearData, + uk_latest +) + +# US +from policyengine.tax_benefit_models.us import ( + PolicyEngineUSDataset, + USYearData, + us_latest +) + +# Outputs +from policyengine.outputs.aggregate import Aggregate, AggregateType +from policyengine.outputs.change_aggregate import ChangeAggregate, ChangeAggregateType + +# Utilities +from policyengine.utils.plotting import format_fig, COLORS +from microdf import MicroDataFrame +import pandas as pd +import numpy as np +``` + +## Minimal working example (UK) + +```python +import pandas as pd +from microdf import MicroDataFrame +from policyengine.tax_benefit_models.uk import ( + PolicyEngineUKDataset, UKYearData, uk_latest +) +from policyengine.core import Simulation + +# Person data +person_df = MicroDataFrame(pd.DataFrame({ + "person_id": [0], + "person_household_id": [0], + "person_benunit_id": [0], + "age": [30], + "employment_income": [30000], + "person_weight": [1.0], +}), weights="person_weight") + +# Household data +household_df = MicroDataFrame(pd.DataFrame({ + "household_id": [0], + "region": ["LONDON"], + "rent": [12000], + "household_weight": [1.0], +}), weights="household_weight") + +# Benunit data +benunit_df = MicroDataFrame(pd.DataFrame({ + 
"benunit_id": [0], + "would_claim_uc": [True], + "benunit_weight": [1.0], +}), weights="benunit_weight") + +# Create dataset +dataset = PolicyEngineUKDataset( + name="Example", + filepath="./temp.h5", + year=2026, + data=UKYearData(person=person_df, household=household_df, benunit=benunit_df) +) + +# Run simulation +sim = Simulation(dataset=dataset, tax_benefit_model_version=uk_latest) +sim.run() + +# Get results +output = sim.output_dataset.data +print(output.household[["household_net_income"]]) +``` + +## Minimal working example (US) + +```python +import pandas as pd +from microdf import MicroDataFrame +from policyengine.tax_benefit_models.us import ( + PolicyEngineUSDataset, USYearData, us_latest +) +from policyengine.core import Simulation + +# Person data (US requires more entity links) +person_df = MicroDataFrame(pd.DataFrame({ + "person_id": [0, 1], + "person_household_id": [0, 0], + "person_tax_unit_id": [0, 0], + "person_spm_unit_id": [0, 0], + "person_family_id": [0, 0], + "person_marital_unit_id": [0, 0], + "age": [35, 33], + "employment_income": [60000, 40000], + "person_weight": [1.0, 1.0], +}), weights="person_weight") + +# Create minimal entity dataframes +entities = {} +for entity in ["tax_unit", "spm_unit", "family", "marital_unit"]: + entities[entity] = MicroDataFrame(pd.DataFrame({ + f"{entity}_id": [0], + f"{entity}_weight": [1.0], + }), weights=f"{entity}_weight") + +household_df = MicroDataFrame(pd.DataFrame({ + "household_id": [0], + "state_code": ["CA"], + "household_weight": [1.0], +}), weights="household_weight") + +# Create dataset +dataset = PolicyEngineUSDataset( + name="Example", + filepath="./temp.h5", + year=2024, + data=USYearData( + person=person_df, + tax_unit=entities["tax_unit"], + spm_unit=entities["spm_unit"], + family=entities["family"], + marital_unit=entities["marital_unit"], + household=household_df, + ) +) + +# Run simulation +sim = Simulation(dataset=dataset, tax_benefit_model_version=us_latest) +sim.run() + +# Get 
results +print(sim.output_dataset.data.household[["household_net_income"]]) +``` + +## Common patterns + +### Parameter sweep (vary one input) +```python +n = 50 +incomes = np.linspace(0, 100000, n) + +person_df = MicroDataFrame(pd.DataFrame({ + "person_id": range(n), + "person_household_id": range(n), + "person_benunit_id": range(n), + "age": [30] * n, + "employment_income": incomes, + "person_weight": [1.0] * n, +}), weights="person_weight") + +# Create matching household/benunit data with n rows +# ... then run simulation once for all scenarios +``` + +### Policy reform +```python +import datetime +from policyengine.core import Policy, Parameter, ParameterValue + +parameter = Parameter( + name="gov.hmrc.income_tax.allowances.personal_allowance.amount", + tax_benefit_model_version=uk_latest, + description="Personal allowance", + data_type=float, +) + +policy = Policy( + name="Reform", + description="Change PA", + parameter_values=[ParameterValue( + parameter=parameter, + start_date=datetime.date(2026, 1, 1), + end_date=datetime.date(2026, 12, 31), + value=15000, + )] +) + +# Run with policy +reform_sim = Simulation(dataset=dataset, tax_benefit_model_version=uk_latest, policy=policy) +``` + +### Extract aggregate statistics +```python +from policyengine.outputs.aggregate import Aggregate, AggregateType + +# Sum +total = Aggregate( + simulation=sim, + variable="universal_credit", + entity="benunit", + aggregate_type=AggregateType.SUM, +) +total.run() + +# Mean +avg = Aggregate( + simulation=sim, + variable="household_net_income", + entity="household", + aggregate_type=AggregateType.MEAN, +) +avg.run() + +# Count with filter +count = Aggregate( + simulation=sim, + variable="person_id", + entity="person", + aggregate_type=AggregateType.COUNT, + filter_variable="age", + filter_geq=65, # Age >= 65 +) +count.run() +``` + +### Compare baseline vs reform +```python +from policyengine.outputs.change_aggregate import ChangeAggregate, ChangeAggregateType + +winners = 
ChangeAggregate( + baseline_simulation=baseline_sim, + reform_simulation=reform_sim, + variable="household_net_income", + aggregate_type=ChangeAggregateType.COUNT, + change_geq=1, +) +winners.run() + +revenue = ChangeAggregate( + baseline_simulation=baseline_sim, + reform_simulation=reform_sim, + variable="household_tax", + aggregate_type=ChangeAggregateType.SUM, +) +revenue.run() +``` + +### Entity mapping +```python +# Sum person income to household +household_income = dataset.data.map_to_entity( + source_entity="person", + target_entity="household", + columns=["employment_income"], + how="sum" +) + +# Broadcast household rent to persons +person_rent = dataset.data.map_to_entity( + source_entity="household", + target_entity="person", + columns=["rent"], + how="project" +) + +# Divide household value equally per person +per_person = dataset.data.map_to_entity( + source_entity="household", + target_entity="person", + columns=["total_savings"], + how="divide" +) + +# Map custom values +custom_totals = dataset.data.map_to_entity( + source_entity="person", + target_entity="household", + values=custom_array, + how="sum" +) +``` + +## Critical fields + +### UK +- **Person**: `person_id`, `person_household_id`, `person_benunit_id`, `age`, `employment_income`, `person_weight` +- **Household**: `household_id`, `region`, `rent`, `household_weight` +- **Benunit**: `benunit_id`, `would_claim_uc`, `benunit_weight` + +### US +- **Person**: `person_id`, `person_household_id`, `person_tax_unit_id`, `person_spm_unit_id`, `person_family_id`, `person_marital_unit_id`, `age`, `employment_income`, `person_weight` +- **Household**: `household_id`, `state_code`, `household_weight` +- **Other entities**: Each needs `{entity}_id` and `{entity}_weight` + +## Common UK regions +```python +["LONDON", "SOUTH_EAST", "SOUTH_WEST", "EAST_OF_ENGLAND", + "WEST_MIDLANDS", "EAST_MIDLANDS", "YORKSHIRE", + "NORTH_WEST", "NORTH_EAST", "WALES", "SCOTLAND", "NORTHERN_IRELAND"] +``` + +## Common US state 
codes +```python +["CA", "NY", "TX", "FL", "PA", "IL", "OH", "GA", "NC", "MI", ...] +``` + +## Aggregate filter options +```python +# Exact match +filter_eq=value + +# Greater than/equal +filter_geq=value + +# Less than/equal +filter_leq=value + +# Quantile filtering (deciles) +quantile=10 # Split into 10 groups +quantile_eq=1 # First decile only +quantile_geq=9 # Top two deciles +quantile_leq=2 # Bottom two deciles +``` + +## Common parameters + +### UK +``` +gov.hmrc.income_tax.allowances.personal_allowance.amount +gov.hmrc.income_tax.rates.uk[0] # Basic rate +gov.hmrc.national_insurance.class_1.rates.main +gov.dwp.universal_credit.means_test.reduction_rate +gov.dwp.universal_credit.elements.child.first_child +gov.dwp.child_benefit.amount.first_child +``` + +### US +``` +gov.irs.income.standard_deduction.single +gov.irs.income.standard_deduction.joint +gov.irs.credits.ctc.amount.base +gov.irs.credits.eitc.max[0] +gov.ssa.payroll.rate.employee +gov.usda.snap.normal_allotment.max[1] +``` + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| No UC calculated | Set `would_claim_uc=True` | +| Random UC spikes | Set `is_disabled_for_benefits=False`, `uc_limited_capability_for_WRA=False` | +| KeyError on column | Check variable name in docs, may be different entity level | +| Empty results | Check weights sum correctly, verify ID linkages | +| Slow performance | Use parameter sweep pattern (one simulation for N scenarios) | + +## Visualisation template +```python +from policyengine.utils.plotting import format_fig, COLORS +import plotly.graph_objects as go + +fig = go.Figure() +fig.add_trace(go.Scatter(x=x_vals, y=y_vals, line=dict(color=COLORS["primary"]))) +format_fig(fig, title="Title", xaxis_title="X", yaxis_title="Y") +fig.show() +``` diff --git a/docs/core-concepts.md b/docs/core-concepts.md index bdba1f39..52c3f290 100644 --- a/docs/core-concepts.md +++ b/docs/core-concepts.md @@ -327,6 +327,119 @@ agg = Aggregate( When you request a 
household-level variable at person level: 1. Replicates household values to all persons in that household (expansion) +### Direct entity mapping + +You can also map data between entities directly using the `map_to_entity` method: + +```python +# Map person income to household level (sum) +household_income = dataset.data.map_to_entity( + source_entity="person", + target_entity="household", + columns=["employment_income"], + how="sum" +) + +# Map household rent to person level (project/broadcast) +person_rent = dataset.data.map_to_entity( + source_entity="household", + target_entity="person", + columns=["rent"], + how="project" +) +``` + +#### Mapping with custom values + +You can map a custom array of values instead of existing columns. The array must contain exactly one value per row of the source entity; otherwise a `ValueError` is raised: + +```python +# Map custom per-person values to household level +import numpy as np + +# Create custom values (e.g., imputed data) +custom_values = np.array([100, 200, 150, 300]) + +household_totals = dataset.data.map_to_entity( + source_entity="person", + target_entity="household", + values=custom_values, + how="sum" +) +``` + +#### Aggregation methods + +The `how` parameter controls how values are mapped: + +**Person → Group (aggregation):** +- `how='sum'` (default): Sum values within each group +- `how='first'`: Take first person's value in each group + +```python +# Sum person incomes to household level +household_income = data.map_to_entity( + source_entity="person", + target_entity="household", + columns=["employment_income"], + how="sum" +) + +# Take first person's age as household reference +household_age = data.map_to_entity( + source_entity="person", + target_entity="household", + columns=["age"], + how="first" +) +``` + +**Group → Person (expansion):** +- `how='project'` (default): Broadcast group value to all members +- `how='divide'`: Split group value equally among members + +```python +# Broadcast household rent to each person +person_rent = data.map_to_entity( + source_entity="household", + target_entity="person", + 
columns=["rent"], + how="project" +) + +# Split household savings equally per person +person_savings = data.map_to_entity( + source_entity="household", + target_entity="person", + columns=["total_savings"], + how="divide" +) +``` + +**Group → Group (via person entity):** +- `how='sum'` (default): Sum through person entity +- `how='first'`: Take first source group's value +- `how='project'`: Broadcast first source group's value +- `how='divide'`: Split proportionally based on person counts + +```python +# UK: Sum benunit benefits to household level +household_benefits = data.map_to_entity( + source_entity="benunit", + target_entity="household", + columns=["universal_credit"], + how="sum" +) + +# US: Map tax unit income to household, splitting by members +household_from_tax = data.map_to_entity( + source_entity="tax_unit", + target_entity="household", + columns=["taxable_income"], + how="divide" +) +``` + ## Visualisation The package includes utilities for creating PolicyEngine-branded visualisations: diff --git a/docs/country-models-uk.md b/docs/country-models-uk.md index 27d7dae7..bd9d1fbd 100644 --- a/docs/country-models-uk.md +++ b/docs/country-models-uk.md @@ -295,6 +295,42 @@ Valid region values: - `SCOTLAND` - `NORTHERN_IRELAND` +## Entity mapping + +The UK model has a simpler entity structure than the US, with three levels: person → benunit → household. 
+ +### Direct entity mapping + +You can map data between entities using the `map_to_entity` method: + +```python +# Map person income to benunit level +benunit_income = dataset.data.map_to_entity( + source_entity="person", + target_entity="benunit", + columns=["employment_income"], + how="sum" +) + +# Split household rent equally among persons +person_rent_share = dataset.data.map_to_entity( + source_entity="household", + target_entity="person", + columns=["rent"], + how="divide" +) + +# Map benunit UC to household level +household_uc = dataset.data.map_to_entity( + source_entity="benunit", + target_entity="household", + columns=["universal_credit"], + how="sum" +) +``` + +See the [Entity mapping section](core-concepts.md#entity-mapping) in Core Concepts for full documentation on aggregation methods. + ## Data sources The UK model can use several data sources: diff --git a/docs/country-models-us.md b/docs/country-models-us.md index 927966ee..547a4f3b 100644 --- a/docs/country-models-us.md +++ b/docs/country-models-us.md @@ -368,6 +368,38 @@ Household variables are replicated to all household members: # Each person in household gets the same household_net_income value ``` +### Direct entity mapping + +For complex multi-entity scenarios, you can use `map_to_entity` directly: + +```python +# Map SPM unit SNAP benefits to household level +household_snap = dataset.data.map_to_entity( + source_entity="spm_unit", + target_entity="household", + columns=["snap"], + how="sum" +) + +# Split tax unit income equally among persons +person_tax_income = dataset.data.map_to_entity( + source_entity="tax_unit", + target_entity="person", + columns=["taxable_income"], + how="divide" +) + +# Map custom analysis values +custom_analysis = dataset.data.map_to_entity( + source_entity="person", + target_entity="tax_unit", + values=custom_values_array, + how="sum" +) +``` + +See the [Entity mapping section](core-concepts.md#entity-mapping) in Core Concepts for full documentation on 
aggregation methods. + ## Data sources The US model can use several data sources: diff --git a/src/policyengine/core/__init__.py b/src/policyengine/core/__init__.py index b96e8edd..630620a0 100644 --- a/src/policyengine/core/__init__.py +++ b/src/policyengine/core/__init__.py @@ -1,4 +1,5 @@ from .dataset import Dataset +from .dataset import YearData as YearData from .dataset import map_to_entity as map_to_entity from .dataset_version import DatasetVersion as DatasetVersion from .dynamic import Dynamic as Dynamic diff --git a/src/policyengine/core/dataset.py b/src/policyengine/core/dataset.py index a79c0b6d..35b74570 100644 --- a/src/policyengine/core/dataset.py +++ b/src/policyengine/core/dataset.py @@ -8,6 +8,63 @@ from .tax_benefit_model import TaxBenefitModel +class YearData(BaseModel): + """Base class for entity-level data for a single year.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + @property + def entity_data(self) -> dict[str, MicroDataFrame]: + """Return a dictionary of entity names to their data. + + This should be implemented by subclasses to return the appropriate entities. + """ + raise NotImplementedError( + "Subclasses must implement entity_data property" + ) + + @property + def person_entity(self) -> str: + """Return the name of the person-level entity. + + Defaults to 'person' but can be overridden by subclasses. + """ + return "person" + + def map_to_entity( + self, + source_entity: str, + target_entity: str, + columns: list[str] = None, + values: list = None, + how: str = "sum", + ) -> MicroDataFrame: + """Map data from source entity to target entity using join keys. + + Args: + source_entity (str): The source entity name. + target_entity (str): The target entity name. + columns (list[str], optional): List of column names to map. If None, maps all columns. + values (list, optional): List of values to use instead of column data. 
+ how (str): Mapping method (default 'sum'): 'sum' or 'first' for person → group; 'project' or 'divide' for group → person ('sum' is treated as 'project'); 'sum', 'first', 'project' or 'divide' for group → group. + + Returns: + MicroDataFrame: The mapped data at the target entity level. + + Raises: + ValueError: If source or target entity is invalid, if `how` is not supported for the mapping direction, or if the length of `values` does not match the source entity. + """ + return map_to_entity( + entity_data=self.entity_data, + source_entity=source_entity, + target_entity=target_entity, + person_entity=self.person_entity, + columns=columns, + values=values, + how=how, + ) + + class Dataset(BaseModel): """Base class for datasets. @@ -43,6 +100,8 @@ def map_to_entity( target_entity: str, person_entity: str = "person", columns: list[str] | None = None, + values: list | None = None, + how: str = "sum", ) -> MicroDataFrame: """Map data from source entity to target entity using join keys. @@ -58,12 +117,17 @@ def map_to_entity( target_entity: The target entity name person_entity: The name of the person entity (default "person") columns: List of column names to map. If None, maps all columns + values: List of values to use instead of column data. 
If provided, creates a single column named '__mapped_value' + how: Aggregation method (default 'sum') + - For person → group: 'sum' (aggregate), 'first' (take first value) + - For group → person: 'project' (broadcast), 'divide' (split equally) + - For group → group: 'sum', 'first', 'project', 'divide' Returns: MicroDataFrame: The mapped data at the target entity level Raises: - ValueError: If source or target entity is invalid + ValueError: If source or target entity is invalid, the aggregation method is unsupported, or the length of values does not match the source entity """ valid_entities = set(entity_data.keys()) @@ -79,6 +143,18 @@ def map_to_entity( # Get source data (convert to plain DataFrame to avoid weighted operations during mapping) source_df = pd.DataFrame(entity_data[source_entity]) + # Handle values parameter - create a temporary column with the provided values + if values is not None: + if len(values) != len(source_df): + raise ValueError( + f"Length of values ({len(values)}) must match source entity length ({len(source_df)})" + ) + # Create a temporary DataFrame with just ID columns and the values column + id_cols = {col for col in source_df.columns if col.endswith("_id")} + source_df = source_df[[col for col in id_cols]] + source_df["__mapped_value"] = values + columns = ["__mapped_value"] + if columns: # Select only requested columns (keep all ID columns for joins) id_cols = {col for col in source_df.columns if col.endswith("_id")} @@ -118,10 +194,17 @@ def map_to_entity( if c not in id_cols and c not in weight_cols ] - # Group by join key and sum - aggregated = source_df.groupby(join_key, as_index=False)[ - agg_cols - ].sum() + # Group by join key and aggregate + if how == "sum": + aggregated = source_df.groupby(join_key, as_index=False)[ + agg_cols + ].sum() + elif how == "first": + aggregated = source_df.groupby(join_key, as_index=False)[ + agg_cols + ].first() + else: + raise ValueError(f"Unsupported aggregation method: {how}") # Rename join key to target key if needed if join_key != target_key: @@ -146,6 +229,10 @@ 
def map_to_entity( # Group entity to person: expand group-level data to person level if source_entity != person_entity and target_entity == person_entity: + # Default to 'project' (broadcast) for group -> person if 'sum' was provided + if how == "sum": + how = "project" + source_key = f"{source_entity}_id" # Check for both naming patterns person_source_key = f"{person_entity}_{source_entity}_id" @@ -163,6 +250,40 @@ def map_to_entity( source_df = source_df.rename(columns={source_key: join_key}) result = target_pd.merge(source_df, on=join_key, how="left") + + # Handle divide operation + if how == "divide": + # Get columns to divide (exclude ID and weight columns) + id_cols = { + col for col in result.columns if col.endswith("_id") + } + weight_cols = { + col for col in result.columns if col.endswith("_weight") + } + value_cols = [ + c + for c in result.columns + if c not in id_cols and c not in weight_cols + ] + + # Count members in each group + group_counts = ( + target_pd.groupby(join_key, as_index=False) + .size() + .rename(columns={"size": "__group_count"}) + ) + result = result.merge(group_counts, on=join_key, how="left") + + # Divide values by group count + for col in value_cols: + result[col] = result[col] / result["__group_count"] + + result = result.drop(columns=["__group_count"]) + elif how not in ["project"]: + raise ValueError( + f"Unsupported aggregation method for group->person: {how}. Use 'project' or 'divide'." 
+ ) + return MicroDataFrame(result, weights=target_weight) # Group to group: go through person table @@ -228,9 +349,43 @@ def map_to_entity( if c not in id_cols and c not in weight_cols ] - aggregated = source_with_target.groupby( - target_link_key, as_index=False - )[agg_cols].sum() + if how == "sum": + aggregated = source_with_target.groupby( + target_link_key, as_index=False + )[agg_cols].sum() + elif how == "first": + aggregated = source_with_target.groupby( + target_link_key, as_index=False + )[agg_cols].first() + elif how == "project": + # Just take first value (broadcast to target groups) + aggregated = source_with_target.groupby( + target_link_key, as_index=False + )[agg_cols].first() + elif how == "divide": + # Count persons in each source group + source_group_counts = ( + person_df.groupby(source_link_key, as_index=False) + .size() + .rename(columns={"size": "__source_count"}) + ) + source_with_target = source_with_target.merge( + source_group_counts, on=source_link_key, how="left" + ) + + # Divide values by source group count (per-person share) + for col in agg_cols: + source_with_target[col] = ( + source_with_target[col] + / source_with_target["__source_count"] + ) + + # Now aggregate (sum of per-person shares) to target level + aggregated = source_with_target.groupby( + target_link_key, as_index=False + )[agg_cols].sum() + else: + raise ValueError(f"Unsupported aggregation method: {how}") # Rename target link key to target key if needed if target_link_key != target_key: diff --git a/src/policyengine/core/dynamic.py b/src/policyengine/core/dynamic.py index 3b6ba553..81ef62b7 100644 --- a/src/policyengine/core/dynamic.py +++ b/src/policyengine/core/dynamic.py @@ -23,7 +23,10 @@ def __add__(self, other: "Dynamic") -> "Dynamic": # Combine simulation modifiers combined_modifier = None - if self.simulation_modifier is not None and other.simulation_modifier is not None: + if ( + self.simulation_modifier is not None + and other.simulation_modifier is not None 
+ ): def combined_modifier(sim): sim = self.simulation_modifier(sim) diff --git a/src/policyengine/core/parameter.py b/src/policyengine/core/parameter.py index 54e3e116..79c4f175 100644 --- a/src/policyengine/core/parameter.py +++ b/src/policyengine/core/parameter.py @@ -8,6 +8,7 @@ class Parameter(BaseModel): id: str = Field(default_factory=lambda: str(uuid4())) name: str + label: str | None = None description: str | None = None data_type: type | None = None tax_benefit_model_version: TaxBenefitModelVersion diff --git a/src/policyengine/core/policy.py b/src/policyengine/core/policy.py index 3aeb19b9..bfb4ca9e 100644 --- a/src/policyengine/core/policy.py +++ b/src/policyengine/core/policy.py @@ -23,7 +23,10 @@ def __add__(self, other: "Policy") -> "Policy": # Combine simulation modifiers combined_modifier = None - if self.simulation_modifier is not None and other.simulation_modifier is not None: + if ( + self.simulation_modifier is not None + and other.simulation_modifier is not None + ): def combined_modifier(sim): sim = self.simulation_modifier(sim) diff --git a/src/policyengine/core/tax_benefit_model_version.py b/src/policyengine/core/tax_benefit_model_version.py index 8555c6f6..b03c73eb 100644 --- a/src/policyengine/core/tax_benefit_model_version.py +++ b/src/policyengine/core/tax_benefit_model_version.py @@ -29,6 +29,44 @@ def run(self, simulation: "Simulation") -> "Simulation": "The TaxBenefitModel class must define a method to execute simulations." ) + def get_parameter(self, name: str) -> "Parameter": + """Get a parameter by name. 
+ + Args: + name: The parameter name (e.g., "gov.hmrc.income_tax.allowances.personal_allowance.amount") + + Returns: + Parameter: The matching parameter + + Raises: + ValueError: If parameter not found + """ + for param in self.parameters: + if param.name == name: + return param + raise ValueError( + f"Parameter '{name}' not found in {self.model.id} version {self.version}" + ) + + def get_variable(self, name: str) -> "Variable": + """Get a variable by name. + + Args: + name: The variable name (e.g., "income_tax", "household_net_income") + + Returns: + Variable: The matching variable + + Raises: + ValueError: If variable not found + """ + for var in self.variables: + if var.name == name: + return var + raise ValueError( + f"Variable '{name}' not found in {self.model.id} version {self.version}" + ) + def __repr__(self) -> str: # Give the id and version, and the number of variables, parameters, parameter values return f"" diff --git a/src/policyengine/tax_benefit_models/uk/datasets.py b/src/policyengine/tax_benefit_models/uk/datasets.py index 113d4b57..bdf89d9e 100644 --- a/src/policyengine/tax_benefit_models/uk/datasets.py +++ b/src/policyengine/tax_benefit_models/uk/datasets.py @@ -2,12 +2,12 @@ import pandas as pd from microdf import MicroDataFrame -from pydantic import BaseModel, ConfigDict +from pydantic import ConfigDict -from policyengine.core import Dataset, map_to_entity +from policyengine.core import Dataset, YearData -class UKYearData(BaseModel): +class UKYearData(YearData): """Entity-level data for a single year.""" model_config = ConfigDict(arbitrary_types_allowed=True) @@ -16,34 +16,14 @@ class UKYearData(BaseModel): benunit: MicroDataFrame household: MicroDataFrame - def map_to_entity( - self, source_entity: str, target_entity: str, columns: list[str] = None - ) -> MicroDataFrame: - """Map data from source entity to target entity using join keys. - - Args: - source_entity (str): The source entity name ('person', 'benunit', 'household'). 
- target_entity (str): The target entity name ('person', 'benunit', 'household'). - columns (list[str], optional): List of column names to map. If None, maps all columns. - - Returns: - MicroDataFrame: The mapped data at the target entity level. - - Raises: - ValueError: If source or target entity is invalid. - """ - entity_data = { + @property + def entity_data(self) -> dict[str, MicroDataFrame]: + """Return a dictionary of entity names to their data.""" + return { "person": self.person, "benunit": self.benunit, "household": self.household, } - return map_to_entity( - entity_data=entity_data, - source_entity=source_entity, - target_entity=target_entity, - person_entity="person", - columns=columns, - ) class PolicyEngineUKDataset(Dataset): diff --git a/src/policyengine/tax_benefit_models/uk/model.py b/src/policyengine/tax_benefit_models/uk/model.py index 18f1ef25..ee0aa08a 100644 --- a/src/policyengine/tax_benefit_models/uk/model.py +++ b/src/policyengine/tax_benefit_models/uk/model.py @@ -83,6 +83,7 @@ def __init__(self, **kwargs: dict): parameter = Parameter( id=self.id + "-" + param_node.name, name=param_node.name, + label=param_node.metadata.get("label", param_node.name), tax_benefit_model_version=self, description=param_node.description, data_type=type( diff --git a/src/policyengine/tax_benefit_models/us/datasets.py b/src/policyengine/tax_benefit_models/us/datasets.py index 676e08e3..53643cc5 100644 --- a/src/policyengine/tax_benefit_models/us/datasets.py +++ b/src/policyengine/tax_benefit_models/us/datasets.py @@ -3,12 +3,12 @@ import pandas as pd from microdf import MicroDataFrame -from pydantic import BaseModel, ConfigDict +from pydantic import ConfigDict -from policyengine.core import Dataset, map_to_entity +from policyengine.core import Dataset, YearData -class USYearData(BaseModel): +class USYearData(YearData): """Entity-level data for a single year.""" model_config = ConfigDict(arbitrary_types_allowed=True) @@ -20,23 +20,10 @@ class 
USYearData(BaseModel): tax_unit: MicroDataFrame household: MicroDataFrame - def map_to_entity( - self, source_entity: str, target_entity: str, columns: list[str] = None - ) -> MicroDataFrame: - """Map data from source entity to target entity using join keys. - - Args: - source_entity (str): The source entity name. - target_entity (str): The target entity name. - columns (list[str], optional): List of column names to map. If None, maps all columns. - - Returns: - MicroDataFrame: The mapped data at the target entity level. - - Raises: - ValueError: If source or target entity is invalid. - """ - entity_data = { + @property + def entity_data(self) -> dict[str, MicroDataFrame]: + """Return a dictionary of entity names to their data.""" + return { "person": self.person, "marital_unit": self.marital_unit, "family": self.family, @@ -44,13 +31,6 @@ def map_to_entity( "tax_unit": self.tax_unit, "household": self.household, } - return map_to_entity( - entity_data=entity_data, - source_entity=source_entity, - target_entity=target_entity, - person_entity="person", - columns=columns, - ) class PolicyEngineUSDataset(Dataset): diff --git a/src/policyengine/utils/plotting.py b/src/policyengine/utils/plotting.py index 77aed94f..c3c0ff28 100644 --- a/src/policyengine/utils/plotting.py +++ b/src/policyengine/utils/plotting.py @@ -1,6 +1,5 @@ """Plotting utilities for PolicyEngine visualisations.""" - import plotly.graph_objects as go # PolicyEngine brand colours diff --git a/tests/test_get_parameter_variable.py b/tests/test_get_parameter_variable.py new file mode 100644 index 00000000..9c2b48d5 --- /dev/null +++ b/tests/test_get_parameter_variable.py @@ -0,0 +1,141 @@ +"""Tests for get_parameter and get_variable methods on TaxBenefitModelVersion.""" + +import pytest + +from policyengine.tax_benefit_models.uk import uk_latest +from policyengine.tax_benefit_models.us import us_latest + + +def test_uk_get_variable(): + """Test getting a variable by name from UK model.""" + # Get a known 
variable + var = uk_latest.get_variable("income_tax") + + assert var is not None + assert var.name == "income_tax" + assert var.entity == "person" + assert var.tax_benefit_model_version == uk_latest + + +def test_uk_get_variable_not_found(): + """Test error handling when variable doesn't exist.""" + with pytest.raises( + ValueError, match="Variable 'nonexistent_variable' not found" + ): + uk_latest.get_variable("nonexistent_variable") + + +def test_uk_get_parameter(): + """Test getting a parameter by name from UK model.""" + # Get a known parameter + param = uk_latest.get_parameter( + "gov.hmrc.income_tax.allowances.personal_allowance.amount" + ) + + assert param is not None + assert ( + param.name + == "gov.hmrc.income_tax.allowances.personal_allowance.amount" + ) + assert param.tax_benefit_model_version == uk_latest + + +def test_uk_get_parameter_not_found(): + """Test error handling when parameter doesn't exist.""" + with pytest.raises( + ValueError, match="Parameter 'nonexistent.parameter' not found" + ): + uk_latest.get_parameter("nonexistent.parameter") + + +def test_us_get_variable(): + """Test getting a variable by name from US model.""" + # Get a known variable + var = us_latest.get_variable("income_tax") + + assert var is not None + assert var.name == "income_tax" + assert var.entity == "tax_unit" + assert var.tax_benefit_model_version == us_latest + + +def test_us_get_variable_not_found(): + """Test error handling when variable doesn't exist.""" + with pytest.raises( + ValueError, match="Variable 'nonexistent_variable' not found" + ): + us_latest.get_variable("nonexistent_variable") + + +def test_us_get_parameter(): + """Test getting a parameter by name from US model.""" + # Get a known parameter + param = us_latest.get_parameter( + "gov.irs.investment.net_investment_income_tax.rate" + ) + + assert param is not None + assert param.name == "gov.irs.investment.net_investment_income_tax.rate" + assert param.tax_benefit_model_version == us_latest + + +def 
test_us_get_parameter_not_found(): + """Test error handling when parameter doesn't exist.""" + with pytest.raises( + ValueError, match="Parameter 'nonexistent.parameter' not found" + ): + us_latest.get_parameter("nonexistent.parameter") + + +def test_uk_multiple_variables(): + """Test getting multiple different variables.""" + vars_to_test = [ + "income_tax", + "national_insurance", + "universal_credit", + "household_net_income", + ] + + for var_name in vars_to_test: + var = uk_latest.get_variable(var_name) + assert var.name == var_name + + +def test_us_multiple_variables(): + """Test getting multiple different variables.""" + vars_to_test = [ + "income_tax", + "employee_payroll_tax", + "eitc", + "household_net_income", + ] + + for var_name in vars_to_test: + var = us_latest.get_variable(var_name) + assert var.name == var_name + + +def test_uk_multiple_parameters(): + """Test getting multiple different parameters.""" + params_to_test = [ + "gov.hmrc.income_tax.allowances.personal_allowance.amount", + "gov.hmrc.income_tax.rates.uk[0].rate", + "gov.dwp.universal_credit.means_test.reduction_rate", + ] + + for param_name in params_to_test: + param = uk_latest.get_parameter(param_name) + assert param.name == param_name + + +def test_us_multiple_parameters(): + """Test getting multiple different parameters.""" + params_to_test = [ + "gov.irs.investment.net_investment_income_tax.rate", + "gov.irs.self_employment.rate.social_security", + "gov.irs.vita.eligibility.income_limit", + ] + + for param_name in params_to_test: + param = us_latest.get_parameter(param_name) + assert param.name == param_name