PolicyEngine · MaxGhenis · Apr 12, 2026 · Apr 11, 2026 · Apr 11, 2026 · Apr 11, 2026
diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py
@@ -7,10 +7,16 @@
 modelling and policy analysis.
 """
 
-from policyengine_uk.data import UKSingleYearDataset
+from functools import lru_cache
 from pathlib import Path
-import pandas as pd
+
 import numpy as np
+import pandas as pd
+from policyengine_uk import CountryTaxBenefitSystem
+from policyengine_uk.data import UKSingleYearDataset
+from policyengine_uk.variables.household.income.employment_status import (
+    EmploymentStatus,
+)
 from policyengine_uk_data.utils.datasets import (
     sum_to_entity,
     categorical,
@@ -22,6 +28,192 @@
 from policyengine_uk_data.parameters import load_take_up_rate, load_parameter
 
 
+LEGACY_JOBSEEKER_MIN_AGE = 18
+HOURS_WORKED_WEEKS_PER_YEAR = 52
+ESA_MIN_AGE = 16
+ESA_HEALTH_EMPLOYMENT_STATUSES = (
+    EmploymentStatus.LONG_TERM_DISABLED.name,
+    EmploymentStatus.SHORT_TERM_DISABLED.name,
+)
+
+
+@lru_cache(maxsize=None)
+def load_legacy_jobseeker_max_annual_hours(year: int) -> int:
+    """Read the JSA single-claimant hours rule from policyengine-uk."""
+
+    system = CountryTaxBenefitSystem()
+    max_weekly_hours = int(system.parameters.gov.dwp.JSA.hours.single(str(year)))
+    return max_weekly_hours * HOURS_WORKED_WEEKS_PER_YEAR
+
+
+def derive_legacy_jobseeker_proxy(
+    age,
+    employment_status,
+    hours_worked,
+    current_education,
+    employment_status_reported,
+    state_pension_age,
+    max_annual_hours,
+) -> np.ndarray:
+    """Approximate legacy JSA claimant-state from observed survey data.
+
+    This is intentionally a proxy, not a legislative determination. It
+    identifies person-level working-age adults who report being unemployed
+    and working less than the legacy JSA 16-hour weekly limit. The
+    ``hours_worked`` input is the annualised FRS-derived measure used in the
+    dataset, so the threshold is converted to annual hours here.
+    """
+
+    age = np.asarray(age)
+    employment_status = np.asarray(employment_status)
+    hours_worked = np.asarray(hours_worked)
+    current_education = np.asarray(current_education)
+    employment_status_reported = np.asarray(employment_status_reported)
+    state_pension_age = np.asarray(state_pension_age)
+
+    return (
+        employment_status_reported
+        & (age >= LEGACY_JOBSEEKER_MIN_AGE)
+        & (age < state_pension_age)
+        & (employment_status == "UNEMPLOYED")
+        & (hours_worked < max_annual_hours)
+        & (current_education == "NOT_IN_EDUCATION")
+    )
+
+
+def derive_esa_health_condition_proxy(
+    age,
+    employment_status,
+    employment_status_reported,
+    state_pension_age,
+) -> np.ndarray:
+    """Approximate working-age ESA health-related claimant-state.
+
+    This proxy relies only on person-level labour market status, not on
+    current disability or incapacity benefit receipt. It is a dataset-side
+    approximation for future modelling, not a direct observation of ESA
+    legal entitlement or LCW/LCWRA status.
+    """
+
+    age = np.asarray(age)
+    employment_status = np.asarray(employment_status)
+    employment_status_reported = np.asarray(employment_status_reported)
+    state_pension_age = np.asarray(state_pension_age)
+    disability_labour_market_state = np.isin(
+        employment_status, ESA_HEALTH_EMPLOYMENT_STATUSES
+    )
+
+    return (
+        employment_status_reported
+        & (age >= ESA_MIN_AGE)
+        & (age < state_pension_age)
+        & disability_labour_market_state
+    )
+
+
+def derive_esa_support_group_proxy(
+    age,
+    employment_status,
+    hours_worked,
+    esa_health_condition_proxy,
+    employment_status_reported,
+    state_pension_age,
+) -> np.ndarray:
+    """Approximate a severe-health ESA subgroup akin to support group.
+
+    This is a stricter subset of ``esa_health_condition_proxy`` intended
+    for future legacy ESA approximation work. It uses only non-receipt
+    labour market signals already available in the survey.
+    """
+
+    age = np.asarray(age)
+    employment_status = np.asarray(employment_status)
+    hours_worked = np.asarray(hours_worked)
+    esa_health_condition_proxy = np.asarray(esa_health_condition_proxy)
+    employment_status_reported = np.asarray(employment_status_reported)
+    state_pension_age = np.asarray(state_pension_age)
+    severe_health_evidence = (employment_status == "LONG_TERM_DISABLED") & (
+        hours_worked <= 0
+    )
+
+    return (
+        employment_status_reported
+        & (age >= ESA_MIN_AGE)
+        & (age < state_pension_age)
+        & esa_health_condition_proxy
+        & severe_health_evidence
+    )
+
+
+def add_legacy_benefit_proxies(
+    pe_person: pd.DataFrame,
+    employment_status_reported,
+    state_pension_age,
+    legacy_jobseeker_max_annual_hours,
+) -> pd.DataFrame:
+    """Populate person-scoped ESA/JSA proxy columns on the person frame.
+
+    These remain person-level by design because the claimant-state inputs
+    they approximate attach to individuals. Downstream benunit-level legacy
+    benefit models should aggregate them explicitly rather than assuming the
+    raw survey contains a benunit claimant-state field.
+    """
+
+    pe_person["legacy_jobseeker_proxy"] = derive_legacy_jobseeker_proxy(
+        age=pe_person.age,
+        employment_status=pe_person.employment_status,
+        hours_worked=pe_person.hours_worked,
+        current_education=pe_person.current_education,
+        employment_status_reported=employment_status_reported,
+        state_pension_age=state_pension_age,
+        max_annual_hours=legacy_jobseeker_max_annual_hours,
+    )
+    pe_person["esa_health_condition_proxy"] = derive_esa_health_condition_proxy(
+        age=pe_person.age,
+        employment_status=pe_person.employment_status,
+        employment_status_reported=employment_status_reported,
+        state_pension_age=state_pension_age,
+    )
+    pe_person["esa_support_group_proxy"] = derive_esa_support_group_proxy(
+        age=pe_person.age,
+        employment_status=pe_person.employment_status,
+        hours_worked=pe_person.hours_worked,
+        esa_health_condition_proxy=pe_person.esa_health_condition_proxy,
+        employment_status_reported=employment_status_reported,
+        state_pension_age=state_pension_age,
+    )
+    return pe_person
+
+
+def apply_legacy_benefit_proxies(
+    pe_person: pd.DataFrame, sim, year: int, employment_status_reported
+) -> pd.DataFrame:
+    """Attach legacy ESA/JSA proxies using post-build simulation context."""
+
+    state_pension_age = sim.calculate("state_pension_age", year).values
+    legacy_jobseeker_max_annual_hours = load_legacy_jobseeker_max_annual_hours(year)
+    return add_legacy_benefit_proxies(
+        pe_person,
+        employment_status_reported=employment_status_reported,
+        state_pension_age=state_pension_age,
+        legacy_jobseeker_max_annual_hours=legacy_jobseeker_max_annual_hours,
+    )
+
+
+def attach_legacy_benefit_proxies_from_frs_person(
+    pe_person: pd.DataFrame, person: pd.DataFrame, sim, year: int
+) -> pd.DataFrame:
+    """Bridge raw FRS person fields into the proxy derivation hook."""
+
+    employment_status_reported = person.empstati.fillna(0).to_numpy() > 0
+    return apply_legacy_benefit_proxies(
+        pe_person,
+        sim,
+        year,
+        employment_status_reported=employment_status_reported,
+    )
+
+
 def create_frs(
     raw_frs_folder: str,
     year: int,
@@ -744,7 +936,6 @@ def determine_education_level(fted_val, typeed2_val, age_val):
     sim = Microsimulation(dataset=dataset)
     region = sim.populations["benunit"].household("region", dataset.time_period)
     lha_category = sim.calculate("LHA_category", year)
-
     brma = np.empty(len(region), dtype=object)
 
     # Sample from a random BRMA in the region, weighted by the number of observations in each BRMA
@@ -808,6 +999,13 @@ def determine_education_level(fted_val, typeed2_val, age_val):
         paragraph_3 | paragraph_4 | paragraph_5
     )
 
+    # Dataset-side claimant-state approximations for future legacy ESA/JSA
+    # modelling. These are explicit proxies based on observed survey
+    # conditions, not legislative determinations.
+    pe_person = attach_legacy_benefit_proxies_from_frs_person(
+        pe_person, person, sim, year
+    )
+
     # Generate stochastic take-up decisions
     # All randomness is generated here in the data package using take-up rates
     # stored in YAML parameter files. This keeps the country package purely