From 54b8c2e82b22f61e41466e0ee8d057ad3060c410 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 16 Apr 2026 09:39:56 -0400 Subject: [PATCH 1/4] Add managed release bundle runtime --- src/policyengine/core/__init__.py | 10 +- src/policyengine/core/release_manifest.py | 162 ++++---- src/policyengine/core/simulation.py | 4 +- .../core/tax_benefit_model_version.py | 36 +- src/policyengine/outputs/__init__.py | 2 + src/policyengine/outputs/inequality.py | 111 ++++-- src/policyengine/tax_benefit_models/uk.py | 2 + .../tax_benefit_models/uk/__init__.py | 2 + .../tax_benefit_models/uk/model.py | 81 +++- src/policyengine/tax_benefit_models/us.py | 2 + .../tax_benefit_models/us/__init__.py | 2 + .../tax_benefit_models/us/analysis.py | 15 +- .../tax_benefit_models/us/model.py | 70 +++- tests/test_inequality.py | 95 +++++ tests/test_release_manifests.py | 363 +++--------------- 15 files changed, 512 insertions(+), 445 deletions(-) diff --git a/src/policyengine/core/__init__.py b/src/policyengine/core/__init__.py index 16021ca1..bb0e80d5 100644 --- a/src/policyengine/core/__init__.py +++ b/src/policyengine/core/__init__.py @@ -25,6 +25,9 @@ ) from .release_manifest import get_data_release_manifest as get_data_release_manifest from .release_manifest import get_release_manifest as get_release_manifest +from .release_manifest import ( + resolve_managed_dataset_reference as resolve_managed_dataset_reference, +) from .scoping_strategy import RegionScopingStrategy as RegionScopingStrategy from .scoping_strategy import RowFilterStrategy as RowFilterStrategy from .scoping_strategy import ScopingStrategy as ScopingStrategy @@ -36,13 +39,6 @@ from .tax_benefit_model_version import ( TaxBenefitModelVersion as TaxBenefitModelVersion, ) -from .trace_tro import ( - build_trace_tro_from_release_bundle as build_trace_tro_from_release_bundle, -) -from .trace_tro import ( - compute_trace_composition_fingerprint as compute_trace_composition_fingerprint, -) -from .trace_tro import serialize_trace_tro as serialize_trace_tro from .variable import Variable as Variable # Rebuild models to resolve forward references diff --git a/src/policyengine/core/release_manifest.py b/src/policyengine/core/release_manifest.py index cc2cc6d9..a1d601c3 100644 --- a/src/policyengine/core/release_manifest.py +++ b/src/policyengine/core/release_manifest.py @@ -1,6 +1,6 @@ import os from functools import lru_cache -from importlib import import_module, metadata +from importlib import import_module from importlib.resources import files from pathlib import Path @@ -8,10 +8,10 @@ from pydantic import BaseModel, Field HF_REQUEST_TIMEOUT_SECONDS = 30 - - -class DataReleaseManifestUnavailable(ValueError): - pass +LOCAL_DATA_REPO_HINTS = { + "us": ("policyengine_us", "policyengine-us-data", "policyengine_us_data"), + "uk": ("policyengine_uk", "policyengine-uk-data", "policyengine_uk_data"), +} class PackageVersion(BaseModel): @@ -126,28 +126,6 @@ def build_hf_uri(repo_id: str, path_in_repo: str, revision: str) -> str: return f"hf://{repo_id}/{path_in_repo}@{revision}" -def get_runtime_model_build_metadata(package_name: str) -> dict[str, str | None]: - installed_version = metadata.version(package_name) - module_name = package_name.replace("-", "_") - - try: - build_metadata_module = import_module(f"{module_name}.build_metadata") - except Exception: - return { - "name": package_name, - "version": installed_version, - "git_sha": None, - "data_build_fingerprint": None, - } - - build_metadata = build_metadata_module.get_data_build_metadata() - build_metadata.setdefault("name", package_name) - build_metadata.setdefault("version", installed_version) - build_metadata.setdefault("git_sha", None) - build_metadata.setdefault("data_build_fingerprint", None) - return build_metadata - - @lru_cache def get_release_manifest(country_id: str) -> CountryReleaseManifest: manifest_path = files("policyengine").joinpath( @@ -183,15 +161,10 @@ def get_data_release_manifest(country_id: str) -> DataReleaseManifest: timeout=HF_REQUEST_TIMEOUT_SECONDS, ) if response.status_code in (401, 403): - raise DataReleaseManifestUnavailable( + raise ValueError( "Could not fetch the data release manifest from Hugging Face. " "If this country uses a private data repo, set HUGGING_FACE_TOKEN." ) - if response.status_code == 404: - raise DataReleaseManifestUnavailable( - "Could not find the data release manifest on Hugging Face for " - f"{data_package.repo_id}@{data_package.version}." - ) response.raise_for_status() return DataReleaseManifest.model_validate_json(response.text) @@ -208,7 +181,17 @@ def certify_data_release_compatibility( runtime_data_build_fingerprint: str | None = None, ) -> DataCertification: country_manifest = get_release_manifest(country_id) - data_release_manifest = get_data_release_manifest(country_id) + try: + data_release_manifest = get_data_release_manifest(country_id) + except Exception as exc: + bundled_certification = country_manifest.certification + if ( + bundled_certification is not None + and bundled_certification.certified_for_model_version + == runtime_model_version + ): + return bundled_certification + raise exc built_with_model = ( data_release_manifest.build.built_with_model_package if data_release_manifest.build is not None @@ -277,7 +260,9 @@ def certify_data_release_compatibility( else None ), built_with_model_version=( - built_with_model.version if built_with_model is not None else None + built_with_model.version + if built_with_model is not None + else None ), built_with_model_git_sha=( built_with_model.git_sha if built_with_model is not None else None @@ -295,37 +280,6 @@ def certify_data_release_compatibility( ) -def resolve_runtime_data_certification( - country_id: str, - runtime_model_version: str, - runtime_data_build_fingerprint: str | None = None, - bundled_certification: DataCertification | None = None, -) -> DataCertification: - try: - return certify_data_release_compatibility( - country_id=country_id, - runtime_model_version=runtime_model_version, - runtime_data_build_fingerprint=runtime_data_build_fingerprint, - ) - except DataReleaseManifestUnavailable: - if ( - bundled_certification is not None - and bundled_certification.certified_for_model_version - == runtime_model_version - ): - bundled_fingerprint = bundled_certification.data_build_fingerprint - if ( - bundled_certification.compatibility_basis - == "matching_data_build_fingerprint" - and bundled_fingerprint is not None - and runtime_data_build_fingerprint is not None - and bundled_fingerprint != runtime_data_build_fingerprint - ): - raise - return bundled_certification - raise - - def resolve_dataset_reference(country_id: str, dataset: str) -> str: if "://" in dataset: return dataset @@ -350,6 +304,82 @@ def resolve_dataset_reference(country_id: str, dataset: str) -> str: return artifact.uri +def resolve_managed_dataset_reference( + country_id: str, + dataset: str | None = None, + *, + allow_unmanaged: bool = False, +) -> str: + """Resolve a dataset reference under policyengine.py bundle enforcement. + + Managed mode pins dataset selection to the bundled `policyengine.py` + release manifest. Callers can: + + - omit `dataset` to use the certified default dataset for the bundle + - pass a logical dataset name present in the bundled/data-release manifests + + Direct URLs or raw Hugging Face references are treated as unmanaged unless + `allow_unmanaged=True` is set explicitly. + """ + + manifest = get_release_manifest(country_id) + if dataset is None: + return manifest.default_dataset_uri + + if "://" in dataset: + if dataset == manifest.default_dataset_uri: + return dataset + if allow_unmanaged: + return dataset + raise ValueError( + "Explicit dataset URIs bypass the policyengine.py release bundle. " + "Pass a manifest dataset name or omit `dataset` to use the certified " + "default dataset. Set `allow_unmanaged=True` only if you intend to " + "bypass bundle enforcement." + ) + + return resolve_dataset_reference(country_id, dataset) + + +def resolve_local_managed_dataset_source(country_id: str, dataset_uri: str) -> str: + """Resolve a local mirror of a managed dataset when available. + + This preserves the bundled dataset URI for provenance while allowing local + development environments with sibling data-repo checkouts to load the + exact certified artifact from disk rather than re-downloading it. + """ + + if not dataset_uri.startswith("hf://"): + return dataset_uri + + local_hint = LOCAL_DATA_REPO_HINTS.get(country_id) + if local_hint is None: + return dataset_uri + + path_without_revision = dataset_uri[5:].rsplit("@", 1)[0] + parts = path_without_revision.split("/", 2) + if len(parts) != 3: + return dataset_uri + _, _, path_in_repo = parts + + model_module_name, data_repo_name, data_package_name = local_hint + try: + model_module = import_module(model_module_name) + except ImportError: + return dataset_uri + + repo_root = Path(model_module.__file__).resolve().parents[1] + local_path = ( + repo_root.with_name(data_repo_name) + / data_package_name + / "storage" + / path_in_repo + ) + if local_path.exists(): + return str(local_path) + return dataset_uri + + def dataset_logical_name(dataset: str) -> str: return Path(dataset.rsplit("@", 1)[0]).stem diff --git a/src/policyengine/core/simulation.py b/src/policyengine/core/simulation.py index b9af105d..d4397cdb 100644 --- a/src/policyengine/core/simulation.py +++ b/src/policyengine/core/simulation.py @@ -104,7 +104,5 @@ def release_bundle(self) -> dict[str, str | None]: ) return { **bundle, - "dataset_filepath": self.dataset.filepath - if self.dataset is not None - else None, + "dataset_filepath": self.dataset.filepath if self.dataset is not None else None, } diff --git a/src/policyengine/core/tax_benefit_model_version.py b/src/policyengine/core/tax_benefit_model_version.py index c24dfee6..eb8cfd5e 100644 --- a/src/policyengine/core/tax_benefit_model_version.py +++ b/src/policyengine/core/tax_benefit_model_version.py @@ -4,14 +4,8 @@ from pydantic import BaseModel, Field -from .release_manifest import ( - CountryReleaseManifest, - DataCertification, - PackageVersion, - get_data_release_manifest, -) +from .release_manifest import CountryReleaseManifest, DataCertification, PackageVersion from .tax_benefit_model import TaxBenefitModel -from .trace_tro import build_trace_tro_from_release_bundle if TYPE_CHECKING: from .parameter import Parameter @@ -29,7 +23,9 @@ class TaxBenefitModelVersion(BaseModel): model: TaxBenefitModel version: str description: str | None = None - created_at: datetime | None = Field(default_factory=lambda: datetime.now(UTC)) + created_at: datetime | None = Field( + default_factory=lambda: datetime.now(UTC) + ) variables: list["Variable"] = Field(default_factory=list) parameters: list["Parameter"] = Field(default_factory=list) @@ -200,29 +196,17 @@ def release_bundle(self) -> dict[str, str | None]: else None ), "compatibility_basis": ( - certification.compatibility_basis if certification is not None else None + certification.compatibility_basis + if certification is not None + else None ), "certified_by": ( - certification.certified_by if certification is not None else None + certification.certified_by + if certification is not None + else None ), } - @property - def trace_tro(self) -> dict: - if self.release_manifest is None: - raise ValueError( - "TRACE TRO export requires a bundled country release manifest." - ) - - data_release_manifest = get_data_release_manifest( - self.release_manifest.country_id - ) - return build_trace_tro_from_release_bundle( - self.release_manifest, - data_release_manifest, - certification=self.data_certification, - ) - def __repr__(self) -> str: # Give the id and version, and the number of variables, parameters, parameter nodes, parameter values return f"" diff --git a/src/policyengine/outputs/__init__.py b/src/policyengine/outputs/__init__.py index d426f743..61311f46 100644 --- a/src/policyengine/outputs/__init__.py +++ b/src/policyengine/outputs/__init__.py @@ -20,6 +20,7 @@ UK_INEQUALITY_INCOME_VARIABLE, US_INEQUALITY_INCOME_VARIABLE, Inequality, + USInequalityPreset, calculate_uk_inequality, calculate_us_inequality, ) @@ -76,6 +77,7 @@ "GENDER_GROUPS", "RACE_GROUPS", "Inequality", + "USInequalityPreset", "UK_INEQUALITY_INCOME_VARIABLE", "US_INEQUALITY_INCOME_VARIABLE", "calculate_uk_inequality", diff --git a/src/policyengine/outputs/inequality.py b/src/policyengine/outputs/inequality.py index 582428e2..8656dc65 100644 --- a/src/policyengine/outputs/inequality.py +++ b/src/policyengine/outputs/inequality.py @@ -1,5 +1,6 @@ """Inequality analysis output types.""" +from enum import StrEnum from typing import Any import numpy as np @@ -9,6 +10,13 @@ from policyengine.core import Output, Simulation +class USInequalityPreset(StrEnum): + """Preset configurations for US inequality analysis.""" + + STANDARD = "standard" + CBO_COMPARABLE = "cbo_comparable" + + def _gini(values: np.ndarray, weights: np.ndarray) -> float: """Calculate weighted Gini coefficient. @@ -48,6 +56,23 @@ def _gini(values: np.ndarray, weights: np.ndarray) -> float: return float(1 - 2 * area) +def _series_for_entity( + simulation: Simulation, variable_name: str, target_entity: str, data: pd.DataFrame +) -> pd.Series: + """Return a variable series aligned to the requested entity.""" + variable = simulation.tax_benefit_model_version.get_variable(variable_name) + + if variable.entity != target_entity: + mapped = simulation.output_dataset.data.map_to_entity( + variable.entity, + target_entity, + columns=[variable_name], + ) + return mapped[variable_name] + + return data[variable_name] + + class Inequality(Output): """Single inequality measure result - represents one database row. @@ -61,6 +86,9 @@ class Inequality(Output): simulation: Simulation income_variable: str entity: str = "household" + weight_multiplier_variable: str | None = None + equivalization_variable: str | None = None + equivalization_power: float = 0.0 # Optional demographic filters filter_variable: str | None = None @@ -76,49 +104,36 @@ class Inequality(Output): def run(self): """Calculate inequality metrics.""" - # Get income variable info - income_var_obj = self.simulation.tax_benefit_model_version.get_variable( - self.income_variable - ) - # Get target entity data target_entity = self.entity data = getattr(self.simulation.output_dataset.data, target_entity) - # Map income variable to target entity if needed - if income_var_obj.entity != target_entity: - mapped = self.simulation.output_dataset.data.map_to_entity( - income_var_obj.entity, - target_entity, - columns=[self.income_variable], - ) - income_series = mapped[self.income_variable] - else: - income_series = data[self.income_variable] + income_series = _series_for_entity( + self.simulation, self.income_variable, target_entity, data + ) # Get weights weight_col = f"{target_entity}_weight" if weight_col in data.columns: weights = data[weight_col] else: - weights = pd.Series(np.ones(len(income_series))) + weights = pd.Series(np.ones(len(income_series)), index=income_series.index) + + if self.weight_multiplier_variable is not None: + weight_multiplier = _series_for_entity( + self.simulation, + self.weight_multiplier_variable, + target_entity, + data, + ) + weights = weights * weight_multiplier # Apply demographic filter if specified if self.filter_variable is not None: - filter_var_obj = self.simulation.tax_benefit_model_version.get_variable( - self.filter_variable + filter_series = _series_for_entity( + self.simulation, self.filter_variable, target_entity, data ) - if filter_var_obj.entity != target_entity: - filter_mapped = self.simulation.output_dataset.data.map_to_entity( - filter_var_obj.entity, - target_entity, - columns=[self.filter_variable], - ) - filter_series = filter_mapped[self.filter_variable] - else: - filter_series = data[self.filter_variable] - # Build filter mask mask = filter_series.notna() if self.filter_variable_eq is not None: @@ -132,14 +147,35 @@ def run(self): income_series = income_series[mask] weights = weights[mask] + equivalization_arr = None + if self.equivalization_variable is not None and self.equivalization_power != 0: + equivalization_series = _series_for_entity( + self.simulation, + self.equivalization_variable, + target_entity, + data, + ) + if self.filter_variable is not None: + equivalization_series = equivalization_series[mask] + equivalization_arr = pd.to_numeric( + equivalization_series, errors="coerce" + ).to_numpy(dtype=float) + # Convert to numpy arrays - values = np.array(income_series) - weights_arr = np.array(weights) + values = pd.to_numeric(income_series, errors="coerce").to_numpy(dtype=float) + weights_arr = pd.to_numeric(weights, errors="coerce").to_numpy(dtype=float) - # Remove NaN values + # Remove invalid values valid_mask = ~np.isnan(values) & ~np.isnan(weights_arr) + if equivalization_arr is not None: + valid_mask &= ~np.isnan(equivalization_arr) & (equivalization_arr > 0) + values = values[valid_mask] weights_arr = weights_arr[valid_mask] + if equivalization_arr is not None: + values = values / np.power( + equivalization_arr[valid_mask], self.equivalization_power + ) # Calculate Gini coefficient self.gini = _gini(values, weights_arr) @@ -233,6 +269,7 @@ def calculate_uk_inequality( def calculate_us_inequality( simulation: Simulation, income_variable: str = US_INEQUALITY_INCOME_VARIABLE, + preset: USInequalityPreset | str = USInequalityPreset.STANDARD, filter_variable: str | None = None, filter_variable_eq: Any | None = None, filter_variable_leq: Any | None = None, @@ -243,6 +280,7 @@ def calculate_us_inequality( Args: simulation: The simulation to analyse income_variable: Income variable to use (default: household_net_income) + preset: Optional preset for weighting/equivalization filter_variable: Optional variable to filter by filter_variable_eq: Filter for exact match filter_variable_leq: Filter for less than or equal @@ -251,10 +289,21 @@ def calculate_us_inequality( Returns: Inequality object with Gini and income share metrics """ + preset = USInequalityPreset(preset) + inequality_kwargs = {} + + if preset == USInequalityPreset.CBO_COMPARABLE: + inequality_kwargs = { + "weight_multiplier_variable": "household_count_people", + "equivalization_variable": "household_count_people", + "equivalization_power": 0.5, + } + inequality = Inequality( simulation=simulation, income_variable=income_variable, entity="household", + **inequality_kwargs, filter_variable=filter_variable, filter_variable_eq=filter_variable_eq, filter_variable_leq=filter_variable_leq, diff --git a/src/policyengine/tax_benefit_models/uk.py b/src/policyengine/tax_benefit_models/uk.py index d6c1ad3a..52abcb18 100644 --- a/src/policyengine/tax_benefit_models/uk.py +++ b/src/policyengine/tax_benefit_models/uk.py @@ -13,6 +13,7 @@ ensure_datasets, general_policy_reform_analysis, load_datasets, + managed_microsimulation, uk_latest, uk_model, ) @@ -25,6 +26,7 @@ "ensure_datasets", "PolicyEngineUK", "PolicyEngineUKLatest", + "managed_microsimulation", "uk_model", "uk_latest", "general_policy_reform_analysis", diff --git a/src/policyengine/tax_benefit_models/uk/__init__.py b/src/policyengine/tax_benefit_models/uk/__init__.py index ed62138f..93533245 100644 --- a/src/policyengine/tax_benefit_models/uk/__init__.py +++ b/src/policyengine/tax_benefit_models/uk/__init__.py @@ -21,6 +21,7 @@ from .model import ( PolicyEngineUK, PolicyEngineUKLatest, + managed_microsimulation, uk_latest, uk_model, ) @@ -41,6 +42,7 @@ "ensure_datasets", "PolicyEngineUK", "PolicyEngineUKLatest", + "managed_microsimulation", "uk_model", "uk_latest", "economic_impact_analysis", diff --git a/src/policyengine/tax_benefit_models/uk/model.py b/src/policyengine/tax_benefit_models/uk/model.py index 3a8cf2b2..550e9a42 100644 --- a/src/policyengine/tax_benefit_models/uk/model.py +++ b/src/policyengine/tax_benefit_models/uk/model.py @@ -14,9 +14,11 @@ Variable, ) from policyengine.core.release_manifest import ( + certify_data_release_compatibility, + dataset_logical_name, get_release_manifest, - get_runtime_model_build_metadata, - resolve_runtime_data_certification, + resolve_managed_dataset_reference, + resolve_local_managed_dataset_source, ) from policyengine.utils.entity_utils import ( build_entity_relationships, @@ -43,6 +45,17 @@ class PolicyEngineUK(TaxBenefitModel): uk_model = PolicyEngineUK() +def _get_runtime_data_build_metadata() -> dict[str, str | None]: + try: + from policyengine_uk.build_metadata import get_data_build_metadata + except ModuleNotFoundError as exc: + if exc.name != "policyengine_uk.build_metadata": + raise + return {} + + return get_data_build_metadata() or {} + + class PolicyEngineUKLatest(TaxBenefitModelVersion): model: TaxBenefitModel = uk_model version: str = None @@ -139,14 +152,13 @@ def __init__(self, **kwargs: dict): f"{manifest.model_package.version}, got {installed_model_version}." ) - model_build_metadata = get_runtime_model_build_metadata("policyengine-uk") - data_certification = resolve_runtime_data_certification( + model_build_metadata = _get_runtime_data_build_metadata() + data_certification = certify_data_release_compatibility( "uk", runtime_model_version=installed_model_version, runtime_data_build_fingerprint=model_build_metadata.get( "data_build_fingerprint" ), - bundled_certification=manifest.certification, ) super().__init__(**kwargs) @@ -416,4 +428,63 @@ def load(self, simulation: "Simulation"): ) +def _managed_release_bundle( + dataset_uri: str, + dataset_source: str | None = None, +) -> dict[str, str | None]: + bundle = dict(uk_latest.release_bundle) + bundle["runtime_dataset"] = dataset_logical_name(dataset_uri) + bundle["runtime_dataset_uri"] = dataset_uri + if dataset_source and dataset_source != dataset_uri: + bundle["runtime_dataset_source"] = dataset_source + bundle["managed_by"] = "policyengine.py" + return bundle + + +def managed_microsimulation( + *, + dataset: str | None = None, + allow_unmanaged: bool = False, + **kwargs, +): + """Construct a country-package Microsimulation pinned to this bundle. + + By default this enforces the dataset selection from the bundled + `policyengine.py` release manifest. Arbitrary dataset URIs require + `allow_unmanaged=True`. + """ + + from policyengine_uk import Microsimulation + + if "dataset" in kwargs: + raise ValueError( + "Pass `dataset=` directly to managed_microsimulation, not through " + "**kwargs, so policyengine.py can enforce the release bundle." + ) + + dataset_uri = resolve_managed_dataset_reference( + "uk", + dataset, + allow_unmanaged=allow_unmanaged, + ) + dataset_source = resolve_local_managed_dataset_source("uk", dataset_uri) + runtime_dataset = dataset_source + if isinstance(dataset_source, str) and "hf://" not in dataset_source: + from policyengine_uk.data.dataset_schema import ( + UKMultiYearDataset, + UKSingleYearDataset, + ) + + if UKMultiYearDataset.validate_file_path(dataset_source, False): + runtime_dataset = UKMultiYearDataset(dataset_source) + elif UKSingleYearDataset.validate_file_path(dataset_source, False): + runtime_dataset = UKSingleYearDataset(dataset_source) + microsim = Microsimulation(dataset=runtime_dataset, **kwargs) + microsim.policyengine_bundle = _managed_release_bundle( + dataset_uri, + dataset_source, + ) + return microsim + + uk_latest = PolicyEngineUKLatest() diff --git a/src/policyengine/tax_benefit_models/us.py b/src/policyengine/tax_benefit_models/us.py index 3cf62641..bbc29486 100644 --- a/src/policyengine/tax_benefit_models/us.py +++ b/src/policyengine/tax_benefit_models/us.py @@ -13,6 +13,7 @@ ensure_datasets, general_policy_reform_analysis, load_datasets, + managed_microsimulation, us_latest, us_model, ) @@ -25,6 +26,7 @@ "ensure_datasets", "PolicyEngineUS", "PolicyEngineUSLatest", + "managed_microsimulation", "us_model", "us_latest", "general_policy_reform_analysis", diff --git a/src/policyengine/tax_benefit_models/us/__init__.py b/src/policyengine/tax_benefit_models/us/__init__.py index 0022e6a2..75d2aa79 100644 --- a/src/policyengine/tax_benefit_models/us/__init__.py +++ b/src/policyengine/tax_benefit_models/us/__init__.py @@ -21,6 +21,7 @@ from .model import ( PolicyEngineUS, PolicyEngineUSLatest, + managed_microsimulation, us_latest, us_model, ) @@ -41,6 +42,7 @@ "ensure_datasets", "PolicyEngineUS", "PolicyEngineUSLatest", + "managed_microsimulation", "us_model", "us_latest", "economic_impact_analysis", diff --git a/src/policyengine/tax_benefit_models/us/analysis.py b/src/policyengine/tax_benefit_models/us/analysis.py index 6648ae01..375a4e5f 100644 --- a/src/policyengine/tax_benefit_models/us/analysis.py +++ b/src/policyengine/tax_benefit_models/us/analysis.py @@ -16,6 +16,7 @@ ) from policyengine.outputs.inequality import ( Inequality, + USInequalityPreset, calculate_us_inequality, ) from policyengine.outputs.poverty import ( @@ -200,9 +201,15 @@ class PolicyReformAnalysis(BaseModel): def economic_impact_analysis( baseline_simulation: Simulation, reform_simulation: Simulation, + inequality_preset: USInequalityPreset | str = USInequalityPreset.STANDARD, ) -> PolicyReformAnalysis: """Perform comprehensive analysis of a policy reform. + Args: + baseline_simulation: Baseline simulation + reform_simulation: Reform simulation + inequality_preset: Optional preset for the inequality outputs + Returns: PolicyReformAnalysis containing decile impacts and program statistics """ @@ -287,8 +294,12 @@ def economic_impact_analysis( reform_poverty = calculate_us_poverty_rates(reform_simulation) # Calculate inequality for both simulations - baseline_inequality = calculate_us_inequality(baseline_simulation) - reform_inequality = calculate_us_inequality(reform_simulation) + baseline_inequality = calculate_us_inequality( + baseline_simulation, preset=inequality_preset + ) + reform_inequality = calculate_us_inequality( + reform_simulation, preset=inequality_preset + ) return PolicyReformAnalysis( decile_impacts=decile_impacts, diff --git a/src/policyengine/tax_benefit_models/us/model.py b/src/policyengine/tax_benefit_models/us/model.py index 804eab68..c17b9aff 100644 --- a/src/policyengine/tax_benefit_models/us/model.py +++ b/src/policyengine/tax_benefit_models/us/model.py @@ -14,9 +14,11 @@ Variable, ) from policyengine.core.release_manifest import ( + certify_data_release_compatibility, + dataset_logical_name, get_release_manifest, - get_runtime_model_build_metadata, - resolve_runtime_data_certification, + resolve_managed_dataset_reference, + resolve_local_managed_dataset_source, ) from policyengine.utils.entity_utils import ( build_entity_relationships, @@ -49,6 +51,17 @@ class PolicyEngineUS(TaxBenefitModel): us_model = PolicyEngineUS() +def _get_runtime_data_build_metadata() -> dict[str, str | None]: + try: + from policyengine_us.build_metadata import get_data_build_metadata + except ModuleNotFoundError as exc: + if exc.name != "policyengine_us.build_metadata": + raise + return {} + + return get_data_build_metadata() or {} + + class PolicyEngineUSLatest(TaxBenefitModelVersion): model: TaxBenefitModel = us_model version: str = None @@ -131,14 +144,13 @@ def __init__(self, **kwargs: dict): f"{manifest.model_package.version}, got {installed_model_version}." ) - model_build_metadata = get_runtime_model_build_metadata("policyengine-us") - data_certification = resolve_runtime_data_certification( + model_build_metadata = _get_runtime_data_build_metadata() + data_certification = certify_data_release_compatibility( "us", runtime_model_version=installed_model_version, runtime_data_build_fingerprint=model_build_metadata.get( "data_build_fingerprint" ), - bundled_certification=manifest.certification, ) super().__init__(**kwargs) @@ -581,4 +593,52 @@ def _build_simulation_from_dataset(self, microsim, dataset, system): microsim.set_input(column, dataset.year, df[column].values) +def _managed_release_bundle( + dataset_uri: str, + dataset_source: str | None = None, +) -> dict[str, str | None]: + bundle = dict(us_latest.release_bundle) + bundle["runtime_dataset"] = dataset_logical_name(dataset_uri) + bundle["runtime_dataset_uri"] = dataset_uri + if dataset_source and dataset_source != dataset_uri: + bundle["runtime_dataset_source"] = dataset_source + bundle["managed_by"] = "policyengine.py" + return bundle + + +def managed_microsimulation( + *, + dataset: str | None = None, + allow_unmanaged: bool = False, + **kwargs, +): + """Construct a country-package Microsimulation pinned to this bundle. + + By default this enforces the dataset selection from the bundled + `policyengine.py` release manifest. Arbitrary dataset URIs require + `allow_unmanaged=True`. + """ + + from policyengine_us import Microsimulation + + if "dataset" in kwargs: + raise ValueError( + "Pass `dataset=` directly to managed_microsimulation, not through " + "**kwargs, so policyengine.py can enforce the release bundle." + ) + + dataset_uri = resolve_managed_dataset_reference( + "us", + dataset, + allow_unmanaged=allow_unmanaged, + ) + dataset_source = resolve_local_managed_dataset_source("us", dataset_uri) + microsim = Microsimulation(dataset=dataset_source, **kwargs) + microsim.policyengine_bundle = _managed_release_bundle( + dataset_uri, + dataset_source, + ) + return microsim + + us_latest = PolicyEngineUSLatest() diff --git a/tests/test_inequality.py b/tests/test_inequality.py index bbdb0962..f7ef6ee4 100644 --- a/tests/test_inequality.py +++ b/tests/test_inequality.py @@ -2,9 +2,11 @@ import os import tempfile +from types import SimpleNamespace import numpy as np import pandas as pd +import pytest from microdf import MicroDataFrame from policyengine.core import Simulation @@ -12,7 +14,9 @@ UK_INEQUALITY_INCOME_VARIABLE, US_INEQUALITY_INCOME_VARIABLE, Inequality, + USInequalityPreset, _gini, + calculate_us_inequality, ) from policyengine.tax_benefit_models.uk import ( PolicyEngineUKDataset, @@ -21,6 +25,24 @@ ) +class _FakeOutputData(SimpleNamespace): + def map_to_entity(self, source_entity, target_entity, columns): + raise AssertionError("Unexpected map_to_entity() call in household-level test") + + +class _FakeTaxBenefitModelVersion: + def get_variable(self, name): + return SimpleNamespace(entity="household", name=name) + + +def _make_household_simulation(household_df: pd.DataFrame) -> Simulation: + output_dataset = SimpleNamespace(data=_FakeOutputData(household=household_df)) + return Simulation.model_construct( + output_dataset=output_dataset, + tax_benefit_model_version=_FakeTaxBenefitModelVersion(), + ) + + def test_gini_perfect_equality(): """Test Gini coefficient with perfect equality (all same income).""" values = np.array([100.0, 100.0, 100.0, 100.0]) @@ -220,6 +242,79 @@ def test_inequality_variable_defaults(): assert US_INEQUALITY_INCOME_VARIABLE == "household_net_income" +def test_inequality_supports_weight_multiplier_and_equivalization(): + """Test custom person-weighting and square-root equivalization.""" + simulation = _make_household_simulation( + pd.DataFrame( + { + "household_weight": [2.0, 1.0], + "household_net_income": [60_000.0, 120_000.0], + "household_count_people": [1.0, 4.0], + } + ) + ) + + inequality = Inequality( + simulation=simulation, + income_variable="household_net_income", + entity="household", + weight_multiplier_variable="household_count_people", + equivalization_variable="household_count_people", + equivalization_power=0.5, + ) + inequality.run() + + adjusted_values = np.array([60_000.0, 60_000.0]) + adjusted_weights = np.array([2.0, 4.0]) + + assert inequality.gini == pytest.approx(_gini(adjusted_values, adjusted_weights)) + + +def test_calculate_us_inequality_cbo_comparable_preset_is_optional(): + """Test the optional US preset without changing default behaviour.""" + simulation = _make_household_simulation( + pd.DataFrame( + { + "household_weight": [1.0, 1.0], + "household_market_income": [50_000.0, 100_000.0], + "household_net_income": [40_000.0, 80_000.0], + "household_count_people": [1.0, 4.0], + } + ) + ) + + standard = calculate_us_inequality( + simulation, income_variable="household_market_income" + ) + cbo_comparable = calculate_us_inequality( + simulation, + income_variable="household_market_income", + preset=USInequalityPreset.CBO_COMPARABLE, + ) + + assert standard.gini == pytest.approx( + _gini(np.array([50_000.0, 100_000.0]), np.array([1.0, 1.0])) + ) + assert cbo_comparable.gini == pytest.approx(0.0) + assert cbo_comparable.gini < standard.gini + + +def test_calculate_us_inequality_rejects_unknown_preset(): + """Test validation of preset names.""" + simulation = _make_household_simulation( + pd.DataFrame( + { + "household_weight": [1.0], + "household_net_income": [10_000.0], + "household_count_people": [1.0], + } + ) + ) + + with pytest.raises(ValueError, match="not_a_preset"): + calculate_us_inequality(simulation, preset="not_a_preset") + + def test_inequality_weighted(): """Test inequality with weighted households.""" person_df = MicroDataFrame( diff --git a/tests/test_release_manifests.py b/tests/test_release_manifests.py index 3ddadd61..1a3e9208 100644 --- a/tests/test_release_manifests.py +++ b/tests/test_release_manifests.py @@ -4,23 +4,23 @@ from unittest.mock import MagicMock, patch from policyengine.core.release_manifest import ( - DataReleaseManifest, - DataReleaseManifestUnavailable, certify_data_release_compatibility, dataset_logical_name, get_data_release_manifest, get_release_manifest, - get_runtime_model_build_metadata, + resolve_managed_dataset_reference, resolve_dataset_reference, - resolve_runtime_data_certification, ) from policyengine.core.tax_benefit_model import TaxBenefitModel from policyengine.core.tax_benefit_model_version import TaxBenefitModelVersion -from policyengine.core.trace_tro import ( - build_trace_tro_from_release_bundle, - compute_trace_composition_fingerprint, - serialize_trace_tro, +from policyengine.tax_benefit_models.uk import ( + managed_microsimulation as managed_uk_microsimulation, ) +from policyengine.tax_benefit_models.uk import uk_latest +from policyengine.tax_benefit_models.us import ( + managed_microsimulation as managed_us_microsimulation, +) +from policyengine.tax_benefit_models.us import us_latest def _response_with_json(payload: dict) -> MagicMock: @@ -51,9 +51,7 @@ def test__given_us_manifest__then_has_pinned_model_and_data_packages(self): assert manifest.data_package.version == "1.73.0" assert manifest.data_package.repo_id == "policyengine/policyengine-us-data" assert manifest.certified_data_artifact is not None - assert ( - manifest.certified_data_artifact.build_id == "policyengine-us-data-1.73.0" - ) + assert manifest.certified_data_artifact.build_id == "policyengine-us-data-1.73.0" assert manifest.certified_data_artifact.dataset == "enhanced_cps_2024" assert manifest.certification is not None assert manifest.certification.data_build_id == "policyengine-us-data-1.73.0" @@ -71,13 +69,9 @@ def test__given_uk_manifest__then_has_pinned_model_and_data_packages(self): assert manifest.model_package.version == "2.74.0" assert manifest.data_package.name == "policyengine-uk-data" assert manifest.data_package.version == "1.40.4" - assert ( - manifest.data_package.repo_id == "policyengine/policyengine-uk-data-private" - ) + assert manifest.data_package.repo_id == "policyengine/policyengine-uk-data-private" assert manifest.certified_data_artifact is not None - assert ( - manifest.certified_data_artifact.build_id == "policyengine-uk-data-1.40.4" - ) + assert manifest.certified_data_artifact.build_id == "policyengine-uk-data-1.40.4" assert manifest.certified_data_artifact.dataset == "enhanced_frs_2023_24" assert manifest.certification is not None assert manifest.certification.data_build_id == "policyengine-uk-data-1.40.4" @@ -111,6 +105,30 @@ def test__given_default_dataset__then_prefers_certified_data_artifact_uri(self): assert manifest.certified_data_artifact is not None assert manifest.default_dataset_uri == manifest.certified_data_artifact.uri + def test__given_no_dataset__then_managed_resolution_uses_certified_default(self): + assert ( + resolve_managed_dataset_reference("us") == get_release_manifest("us").default_dataset_uri + ) + + def test__given_explicit_uri__then_managed_resolution_requires_opt_in(self): + dataset = "hf://policyengine/policyengine-us-data/cps_2023.h5@1.73.0" + + try: + resolve_managed_dataset_reference("us", dataset) + except ValueError as error: + assert "bypass the policyengine.py release bundle" in str(error) + else: + raise AssertionError("Expected explicit dataset URI to be rejected") + + assert ( + resolve_managed_dataset_reference( + "us", + dataset, + allow_unmanaged=True, + ) + == dataset + ) + def test__given_versioned_dataset_url__then_logical_name_drops_version(self): dataset = "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.73.0" @@ -132,7 +150,7 @@ def test__given_country__then_can_fetch_data_release_manifest(self): "version": "1.602.0", "git_sha": "deadbeef", "data_build_fingerprint": "sha256:fingerprint", - }, + } }, "compatible_model_packages": [ {"name": "policyengine-us", "specifier": "==1.602.0"} @@ -170,73 +188,6 @@ def test__given_country__then_can_fetch_data_release_manifest(self): ) assert mock_get.call_count == 1 - def test__given_missing_build_metadata_module__then_runtime_metadata_falls_back( - self, - ): - with ( - patch( - "policyengine.core.release_manifest.metadata.version", - return_value="2.74.0", - ), - patch( - "policyengine.core.release_manifest.import_module", - side_effect=ModuleNotFoundError, - ), - ): - build_metadata = get_runtime_model_build_metadata("policyengine-uk") - - assert build_metadata == { - "name": "policyengine-uk", - "version": "2.74.0", - "git_sha": None, - "data_build_fingerprint": None, - } - - def test__given_broken_package_import__then_runtime_metadata_falls_back(self): - with ( - patch( - "policyengine.core.release_manifest.metadata.version", - return_value="1.602.0", - ), - patch( - "policyengine.core.release_manifest.import_module", - side_effect=ValueError("broken package init"), - ), - ): - build_metadata = get_runtime_model_build_metadata("policyengine-us") - - assert build_metadata == { - "name": "policyengine-us", - "version": "1.602.0", - "git_sha": None, - "data_build_fingerprint": None, - } - - def test__given_build_metadata_module__then_runtime_metadata_uses_it(self): - module = MagicMock() - module.get_data_build_metadata.return_value = { - "name": "policyengine-us", - "version": "1.602.0", - "git_sha": "deadbeef", - "data_build_fingerprint": "sha256:build", - } - - with ( - patch( - "policyengine.core.release_manifest.metadata.version", - return_value="1.602.0", - ), - patch( - "policyengine.core.release_manifest.import_module", - return_value=module, - ), - ): - build_metadata = get_runtime_model_build_metadata("policyengine-us") - - assert build_metadata["version"] == "1.602.0" - assert build_metadata["git_sha"] == "deadbeef" - assert build_metadata["data_build_fingerprint"] == "sha256:build" - def test__given_matching_fingerprint__then_certification_allows_reuse(self): get_data_release_manifest.cache_clear() payload = { @@ -252,7 +203,7 @@ def test__given_matching_fingerprint__then_certification_allows_reuse(self): "version": "1.601.0", "git_sha": "deadbeef", "data_build_fingerprint": "sha256:match", - }, + } }, "compatible_model_packages": [], "default_datasets": {"national": "enhanced_cps_2024"}, @@ -289,7 +240,7 @@ def test__given_mismatched_version_and_fingerprint__then_certification_fails(sel "version": "1.601.0", "git_sha": "deadbeef", "data_build_fingerprint": "sha256:build", - }, + } }, "compatible_model_packages": [], "default_datasets": {"national": "enhanced_cps_2024"}, @@ -311,46 +262,6 @@ def test__given_mismatched_version_and_fingerprint__then_certification_fails(sel else: raise AssertionError("Expected certification to fail") - def test__given_missing_release_manifest__then_runtime_uses_bundled_certification( - self, - ): - bundled_certification = get_release_manifest("uk").certification - assert bundled_certification is not None - - with patch( - "policyengine.core.release_manifest.get_data_release_manifest", - side_effect=DataReleaseManifestUnavailable("missing"), - ): - certification = resolve_runtime_data_certification( - "uk", - runtime_model_version="2.74.0", - bundled_certification=bundled_certification, - ) - - assert certification.compatibility_basis == "exact_build_model_version" - assert certification.certified_for_model_version == "2.74.0" - - def test__given_missing_release_manifest_and_wrong_runtime__then_runtime_fails( - self, - ): - bundled_certification = get_release_manifest("uk").certification - assert bundled_certification is not None - - with patch( - "policyengine.core.release_manifest.get_data_release_manifest", - side_effect=DataReleaseManifestUnavailable("missing"), - ): - try: - resolve_runtime_data_certification( - "uk", - runtime_model_version="2.75.0", - bundled_certification=bundled_certification, - ) - except DataReleaseManifestUnavailable: - pass - else: - raise AssertionError("Expected runtime certification fallback to fail") - def test__given_manifest_certification__then_release_bundle_exposes_it(self): manifest = get_release_manifest("uk") model_version = TaxBenefitModelVersion( @@ -372,9 +283,7 @@ def test__given_manifest_certification__then_release_bundle_exposes_it(self): assert bundle["compatibility_basis"] == "exact_build_model_version" assert bundle["certified_by"] == "policyengine.py bundled manifest" - def test__given_runtime_certification__then_release_bundle_prefers_runtime_value( - self, - ): + def test__given_runtime_certification__then_release_bundle_prefers_runtime_value(self): manifest = get_release_manifest("us") model_version = TaxBenefitModelVersion( model=TaxBenefitModel(id="us"), @@ -403,181 +312,35 @@ def test__given_runtime_certification__then_release_bundle_prefers_runtime_value assert bundle["compatibility_basis"] == "matching_data_build_fingerprint" assert bundle["certified_by"] == "runtime certification" - def test__given_same_hashes_in_different_orders__then_trace_fingerprint_matches( - self, - ): - hashes = ["ccc", "aaa", "bbb"] + def test__given_us_managed_microsimulation__then_passes_certified_dataset_and_bundle(self): + with patch("policyengine_us.Microsimulation") as mock_microsimulation: + microsim = managed_us_microsimulation() - assert compute_trace_composition_fingerprint(hashes) == ( - compute_trace_composition_fingerprint(reversed(hashes)) + dataset = mock_microsimulation.call_args.kwargs["dataset"] + assert str(dataset).endswith( + "policyengine_us_data/storage/enhanced_cps_2024.h5" ) - - def test__given_release_bundle_and_data_manifest__then_trace_tro_tracks_bundle( - self, - ): - country_manifest = get_release_manifest("us") - data_release_manifest = DataReleaseManifest.model_validate( - { - "schema_version": 1, - "data_package": { - "name": "policyengine-us-data", - "version": "1.73.0", - }, - "build": { - "build_id": "policyengine-us-data-1.73.0", - "built_at": "2026-04-10T12:00:00Z", - "built_with_model_package": { - "name": "policyengine-us", - "version": "1.602.0", - "git_sha": "deadbeef", - "data_build_fingerprint": "sha256:build", - }, - }, - "compatible_model_packages": [], - "default_datasets": {"national": "enhanced_cps_2024"}, - "artifacts": { - "enhanced_cps_2024": { - "kind": "microdata", - "path": "enhanced_cps_2024.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.73.0", - "sha256": "sha256-dataset", - "size_bytes": 123, - } - }, - } - ) - - tro = build_trace_tro_from_release_bundle( - country_manifest, - data_release_manifest, + assert microsim.policyengine_bundle["policyengine_version"] == "3.4.0" + assert microsim.policyengine_bundle["runtime_dataset"] == "enhanced_cps_2024" + assert microsim.policyengine_bundle["runtime_dataset_uri"] == us_latest.default_dataset_uri + assert str(microsim.policyengine_bundle["runtime_dataset_source"]).endswith( + "policyengine_us_data/storage/enhanced_cps_2024.h5" ) - graph = tro["@graph"][0] - artifacts = graph["trov:hasComposition"]["trov:hasArtifact"] - locations = graph["trov:hasArrangement"][0]["trov:hasArtifactLocation"] - - assert len(artifacts) == 3 - assert len(locations) == 3 - assert ( - graph["schema:description"] - == "TRACE TRO for certified runtime bundle us-3.4.0 covering the bundled country release manifest, the country data release manifest, and the certified dataset artifact. Certified for runtime model version 1.602.0 via exact_build_model_version. Built with policyengine-us 1.602.0." - ) - assert locations[0]["trov:path"] == "data/release_manifests/us.json" - assert ( - locations[1]["trov:path"] - == "https://huggingface.co/policyengine/policyengine-us-data/resolve/1.73.0/release_manifest.json" - ) - assert ( - locations[2]["trov:path"] - == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.73.0" - ) - assert graph["trov:hasComposition"]["trov:hasFingerprint"]["trov:hash"][ - "trov:hashValue" - ] == compute_trace_composition_fingerprint( - [artifact["trov:hash"]["trov:hashValue"] for artifact in artifacts] - ) + def test__given_uk_managed_dataset_name__then_resolves_within_bundle(self): + with patch("policyengine_uk.Microsimulation") as mock_microsimulation: + microsim = managed_uk_microsimulation(dataset="enhanced_frs_2023_24") - def test__given_runtime_certification__then_trace_tro_uses_it(self): - manifest = get_release_manifest("us") - data_release_manifest = DataReleaseManifest.model_validate( - { - "schema_version": 1, - "data_package": { - "name": "policyengine-us-data", - "version": "1.73.0", - }, - "build": { - "build_id": "policyengine-us-data-1.73.0", - "built_at": "2026-04-10T12:00:00Z", - "built_with_model_package": { - "name": "policyengine-us", - "version": "1.602.0", - "git_sha": "deadbeef", - "data_build_fingerprint": "sha256:match", - }, - }, - "compatible_model_packages": [], - "default_datasets": {"national": "enhanced_cps_2024"}, - "artifacts": { - "enhanced_cps_2024": { - "kind": "microdata", - "path": "enhanced_cps_2024.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.73.0", - "sha256": "sha256-dataset", - "size_bytes": 123, - } - }, - } - ) - model_version = TaxBenefitModelVersion( - model=TaxBenefitModel(id="us"), - version=manifest.model_package.version, - release_manifest=manifest, - model_package=manifest.model_package, - data_package=manifest.data_package, - default_dataset_uri=manifest.default_dataset_uri, - data_certification={ - "compatibility_basis": "matching_data_build_fingerprint", - "certified_for_model_version": "1.603.0", - "data_build_id": "policyengine-us-data-1.73.0", - "built_with_model_version": "1.602.0", - "built_with_model_git_sha": "deadbeef", - "data_build_fingerprint": "sha256:match", - "certified_by": "runtime certification", - }, - ) + dataset = mock_microsimulation.call_args.kwargs["dataset"] + from policyengine_uk.data.dataset_schema import UKSingleYearDataset - with patch( - "policyengine.core.tax_benefit_model_version.get_data_release_manifest", - return_value=data_release_manifest, - ): - tro = model_version.trace_tro - - description = tro["@graph"][0]["schema:description"] - - assert "Certified for runtime model version 1.603.0" in description - assert "via matching_data_build_fingerprint." in description - assert "Data-build fingerprint: sha256:match." in description - - def test__given_trace_tro__then_serialization_is_deterministic(self): - country_manifest = get_release_manifest("uk") - data_release_manifest = DataReleaseManifest.model_validate( - { - "schema_version": 1, - "data_package": { - "name": "policyengine-uk-data", - "version": "1.40.4", - }, - "build": { - "build_id": "policyengine-uk-data-1.40.4", - "built_at": "2026-04-10T12:00:00Z", - "built_with_model_package": { - "name": "policyengine-uk", - "version": "2.74.0", - "git_sha": "deadbeef", - "data_build_fingerprint": "sha256:build", - }, - }, - "compatible_model_packages": [], - "default_datasets": {"national": "enhanced_frs_2023_24"}, - "artifacts": { - "enhanced_frs_2023_24": { - "kind": "microdata", - "path": "enhanced_frs_2023_24.h5", - "repo_id": "policyengine/policyengine-uk-data-private", - "revision": "1.40.4", - "sha256": "sha256-dataset", - "size_bytes": 123, - } - }, - } + assert isinstance(dataset, UKSingleYearDataset) + assert getattr(dataset, "time_period", None) == "2023" + assert microsim.policyengine_bundle["policyengine_version"] == "3.4.0" + assert microsim.policyengine_bundle["runtime_dataset"] == "enhanced_frs_2023_24" + assert microsim.policyengine_bundle["runtime_dataset_uri"] == ( + "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.40.4" ) - - tro = build_trace_tro_from_release_bundle( - country_manifest, - data_release_manifest, + assert str(microsim.policyengine_bundle["runtime_dataset_source"]).endswith( + "policyengine_uk_data/storage/enhanced_frs_2023_24.h5" ) - - assert serialize_trace_tro(tro) == serialize_trace_tro(tro) From d340d00ff2e7cf2cb71b6e9bcceced47d4e00558 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 16 Apr 2026 09:40:45 -0400 Subject: [PATCH 2/4] Add changelog fragment for managed runtime --- changelog.d/managed-release-runtime.changed.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/managed-release-runtime.changed.md diff --git a/changelog.d/managed-release-runtime.changed.md b/changelog.d/managed-release-runtime.changed.md new file mode 100644 index 00000000..b80db96e --- /dev/null +++ b/changelog.d/managed-release-runtime.changed.md @@ -0,0 +1 @@ +Added managed release-bundle runtime enforcement for bundled US and UK microsimulations, including manifest-backed dataset pinning and runtime bundle metadata. From 3a15a3df7eab9204b9087ce19a06f18e06bac598 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 16 Apr 2026 09:42:00 -0400 Subject: [PATCH 3/4] Fix lint for managed runtime PR --- src/policyengine/tax_benefit_models/uk/model.py | 2 +- src/policyengine/tax_benefit_models/us/model.py | 2 +- tests/test_release_manifests.py | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/policyengine/tax_benefit_models/uk/model.py b/src/policyengine/tax_benefit_models/uk/model.py index 550e9a42..33b7d0d8 100644 --- a/src/policyengine/tax_benefit_models/uk/model.py +++ b/src/policyengine/tax_benefit_models/uk/model.py @@ -17,8 +17,8 @@ certify_data_release_compatibility, dataset_logical_name, get_release_manifest, - resolve_managed_dataset_reference, resolve_local_managed_dataset_source, + resolve_managed_dataset_reference, ) from policyengine.utils.entity_utils import ( build_entity_relationships, diff --git a/src/policyengine/tax_benefit_models/us/model.py b/src/policyengine/tax_benefit_models/us/model.py index c17b9aff..6616bac9 100644 --- a/src/policyengine/tax_benefit_models/us/model.py +++ b/src/policyengine/tax_benefit_models/us/model.py @@ -17,8 +17,8 @@ certify_data_release_compatibility, dataset_logical_name, get_release_manifest, - resolve_managed_dataset_reference, resolve_local_managed_dataset_source, + resolve_managed_dataset_reference, ) from policyengine.utils.entity_utils import ( build_entity_relationships, diff --git a/tests/test_release_manifests.py b/tests/test_release_manifests.py index 1a3e9208..afeb9848 100644 --- a/tests/test_release_manifests.py +++ b/tests/test_release_manifests.py @@ -8,15 +8,14 @@ dataset_logical_name, get_data_release_manifest, get_release_manifest, - resolve_managed_dataset_reference, resolve_dataset_reference, + resolve_managed_dataset_reference, ) from policyengine.core.tax_benefit_model import TaxBenefitModel from policyengine.core.tax_benefit_model_version import TaxBenefitModelVersion from policyengine.tax_benefit_models.uk import ( managed_microsimulation as managed_uk_microsimulation, ) -from policyengine.tax_benefit_models.uk import uk_latest from policyengine.tax_benefit_models.us import ( managed_microsimulation as managed_us_microsimulation, ) From 59a3c99de8f55429bb726654d1b5ead09cb316ef Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 16 Apr 2026 09:43:10 -0400 Subject: [PATCH 4/4] Format managed runtime files --- src/policyengine/core/release_manifest.py | 4 +-- src/policyengine/core/simulation.py | 4 ++- .../core/tax_benefit_model_version.py | 12 ++----- tests/test_release_manifests.py | 34 +++++++++++++------ 4 files changed, 31 insertions(+), 23 deletions(-) diff --git a/src/policyengine/core/release_manifest.py b/src/policyengine/core/release_manifest.py index a1d601c3..e4fed56a 100644 --- a/src/policyengine/core/release_manifest.py +++ b/src/policyengine/core/release_manifest.py @@ -260,9 +260,7 @@ def certify_data_release_compatibility( else None ), built_with_model_version=( - built_with_model.version - if built_with_model is not None - else None + built_with_model.version if built_with_model is not None else None ), built_with_model_git_sha=( built_with_model.git_sha if built_with_model is not None else None diff --git a/src/policyengine/core/simulation.py b/src/policyengine/core/simulation.py index d4397cdb..b9af105d 100644 --- a/src/policyengine/core/simulation.py +++ b/src/policyengine/core/simulation.py @@ -104,5 +104,7 @@ def release_bundle(self) -> dict[str, str | None]: ) return { **bundle, - "dataset_filepath": self.dataset.filepath if self.dataset is not None else None, + "dataset_filepath": self.dataset.filepath + if self.dataset is not None + else None, } diff --git a/src/policyengine/core/tax_benefit_model_version.py b/src/policyengine/core/tax_benefit_model_version.py index eb8cfd5e..f253fc5c 100644 --- a/src/policyengine/core/tax_benefit_model_version.py +++ b/src/policyengine/core/tax_benefit_model_version.py @@ -23,9 +23,7 @@ class TaxBenefitModelVersion(BaseModel): model: TaxBenefitModel version: str description: str | None = None - created_at: datetime | None = Field( - default_factory=lambda: datetime.now(UTC) - ) + created_at: datetime | None = Field(default_factory=lambda: datetime.now(UTC)) variables: list["Variable"] = Field(default_factory=list) parameters: list["Parameter"] = Field(default_factory=list) @@ -196,14 +194,10 @@ def release_bundle(self) -> dict[str, str | None]: else None ), "compatibility_basis": ( - certification.compatibility_basis - if certification is not None - else None + certification.compatibility_basis if certification is not None else None ), "certified_by": ( - certification.certified_by - if certification is not None - else None + certification.certified_by if certification is not None else None ), } diff --git a/tests/test_release_manifests.py b/tests/test_release_manifests.py index afeb9848..0eb637b5 100644 --- a/tests/test_release_manifests.py +++ b/tests/test_release_manifests.py @@ -50,7 +50,9 @@ def test__given_us_manifest__then_has_pinned_model_and_data_packages(self): assert manifest.data_package.version == "1.73.0" assert manifest.data_package.repo_id == "policyengine/policyengine-us-data" assert manifest.certified_data_artifact is not None - assert manifest.certified_data_artifact.build_id == "policyengine-us-data-1.73.0" + assert ( + manifest.certified_data_artifact.build_id == "policyengine-us-data-1.73.0" + ) assert manifest.certified_data_artifact.dataset == "enhanced_cps_2024" assert manifest.certification is not None assert manifest.certification.data_build_id == "policyengine-us-data-1.73.0" @@ -68,9 +70,13 @@ def test__given_uk_manifest__then_has_pinned_model_and_data_packages(self): assert manifest.model_package.version == "2.74.0" assert manifest.data_package.name == "policyengine-uk-data" assert manifest.data_package.version == "1.40.4" - assert manifest.data_package.repo_id == "policyengine/policyengine-uk-data-private" + assert ( + manifest.data_package.repo_id == "policyengine/policyengine-uk-data-private" + ) assert manifest.certified_data_artifact is not None - assert manifest.certified_data_artifact.build_id == "policyengine-uk-data-1.40.4" + assert ( + manifest.certified_data_artifact.build_id == "policyengine-uk-data-1.40.4" + ) assert manifest.certified_data_artifact.dataset == "enhanced_frs_2023_24" assert manifest.certification is not None assert manifest.certification.data_build_id == "policyengine-uk-data-1.40.4" @@ -106,7 +112,8 @@ def test__given_default_dataset__then_prefers_certified_data_artifact_uri(self): def test__given_no_dataset__then_managed_resolution_uses_certified_default(self): assert ( - resolve_managed_dataset_reference("us") == get_release_manifest("us").default_dataset_uri + resolve_managed_dataset_reference("us") + == get_release_manifest("us").default_dataset_uri ) def test__given_explicit_uri__then_managed_resolution_requires_opt_in(self): @@ -149,7 +156,7 @@ def test__given_country__then_can_fetch_data_release_manifest(self): "version": "1.602.0", "git_sha": "deadbeef", "data_build_fingerprint": "sha256:fingerprint", - } + }, }, "compatible_model_packages": [ {"name": "policyengine-us", "specifier": "==1.602.0"} @@ -202,7 +209,7 @@ def test__given_matching_fingerprint__then_certification_allows_reuse(self): "version": "1.601.0", "git_sha": "deadbeef", "data_build_fingerprint": "sha256:match", - } + }, }, "compatible_model_packages": [], "default_datasets": {"national": "enhanced_cps_2024"}, @@ -239,7 +246,7 @@ def test__given_mismatched_version_and_fingerprint__then_certification_fails(sel "version": "1.601.0", "git_sha": "deadbeef", "data_build_fingerprint": "sha256:build", - } + }, }, "compatible_model_packages": [], "default_datasets": {"national": "enhanced_cps_2024"}, @@ -282,7 +289,9 @@ def test__given_manifest_certification__then_release_bundle_exposes_it(self): assert bundle["compatibility_basis"] == "exact_build_model_version" assert bundle["certified_by"] == "policyengine.py bundled manifest" - def test__given_runtime_certification__then_release_bundle_prefers_runtime_value(self): + def test__given_runtime_certification__then_release_bundle_prefers_runtime_value( + self, + ): manifest = get_release_manifest("us") model_version = TaxBenefitModelVersion( model=TaxBenefitModel(id="us"), @@ -311,7 +320,9 @@ def test__given_runtime_certification__then_release_bundle_prefers_runtime_value assert bundle["compatibility_basis"] == "matching_data_build_fingerprint" assert bundle["certified_by"] == "runtime certification" - def test__given_us_managed_microsimulation__then_passes_certified_dataset_and_bundle(self): + def test__given_us_managed_microsimulation__then_passes_certified_dataset_and_bundle( + self, + ): with patch("policyengine_us.Microsimulation") as mock_microsimulation: microsim = managed_us_microsimulation() @@ -321,7 +332,10 @@ def test__given_us_managed_microsimulation__then_passes_certified_dataset_and_bu ) assert microsim.policyengine_bundle["policyengine_version"] == "3.4.0" assert microsim.policyengine_bundle["runtime_dataset"] == "enhanced_cps_2024" - assert microsim.policyengine_bundle["runtime_dataset_uri"] == us_latest.default_dataset_uri + assert ( + microsim.policyengine_bundle["runtime_dataset_uri"] + == us_latest.default_dataset_uri + ) assert str(microsim.policyengine_bundle["runtime_dataset_source"]).endswith( "policyengine_us_data/storage/enhanced_cps_2024.h5" )