Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
204 changes: 201 additions & 3 deletions policyengine_uk_data/datasets/frs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,16 @@
modelling and policy analysis.
"""

from policyengine_uk.data import UKSingleYearDataset
from functools import lru_cache
from pathlib import Path
import pandas as pd

import numpy as np
import pandas as pd
from policyengine_uk import CountryTaxBenefitSystem
from policyengine_uk.data import UKSingleYearDataset
from policyengine_uk.variables.household.income.employment_status import (
EmploymentStatus,
)
from policyengine_uk_data.utils.datasets import (
sum_to_entity,
categorical,
Expand All @@ -22,6 +28,192 @@
from policyengine_uk_data.parameters import load_take_up_rate, load_parameter


LEGACY_JOBSEEKER_MIN_AGE = 18
HOURS_WORKED_WEEKS_PER_YEAR = 52
ESA_MIN_AGE = 16
ESA_HEALTH_EMPLOYMENT_STATUSES = (
EmploymentStatus.LONG_TERM_DISABLED.name,
EmploymentStatus.SHORT_TERM_DISABLED.name,
)


@lru_cache(maxsize=None)
def load_legacy_jobseeker_max_annual_hours(year: int) -> int:
"""Read the JSA single-claimant hours rule from policyengine-uk."""

system = CountryTaxBenefitSystem()
max_weekly_hours = int(system.parameters.gov.dwp.JSA.hours.single(str(year)))
return max_weekly_hours * HOURS_WORKED_WEEKS_PER_YEAR


def derive_legacy_jobseeker_proxy(
age,
employment_status,
hours_worked,
current_education,
employment_status_reported,
state_pension_age,
max_annual_hours,
) -> np.ndarray:
"""Approximate legacy JSA claimant-state from observed survey data.

This is intentionally a proxy, not a legislative determination. It
identifies person-level working-age adults who report being unemployed
and working less than the legacy JSA 16-hour weekly limit. The
``hours_worked`` input is the annualised FRS-derived measure used in the
dataset, so the threshold is converted to annual hours here.
"""

age = np.asarray(age)
employment_status = np.asarray(employment_status)
hours_worked = np.asarray(hours_worked)
current_education = np.asarray(current_education)
employment_status_reported = np.asarray(employment_status_reported)
state_pension_age = np.asarray(state_pension_age)

return (
employment_status_reported
& (age >= LEGACY_JOBSEEKER_MIN_AGE)
& (age < state_pension_age)
& (employment_status == "UNEMPLOYED")
& (hours_worked < max_annual_hours)
& (current_education == "NOT_IN_EDUCATION")
)


def derive_esa_health_condition_proxy(
age,
employment_status,
employment_status_reported,
state_pension_age,
) -> np.ndarray:
"""Approximate working-age ESA health-related claimant-state.

This proxy relies only on person-level labour market status, not on
current disability or incapacity benefit receipt. It is a dataset-side
approximation for future modelling, not a direct observation of ESA
legal entitlement or LCW/LCWRA status.
"""

age = np.asarray(age)
employment_status = np.asarray(employment_status)
employment_status_reported = np.asarray(employment_status_reported)
state_pension_age = np.asarray(state_pension_age)
disability_labour_market_state = np.isin(
employment_status, ESA_HEALTH_EMPLOYMENT_STATUSES
)

return (
employment_status_reported
& (age >= ESA_MIN_AGE)
& (age < state_pension_age)
& disability_labour_market_state
)


def derive_esa_support_group_proxy(
age,
employment_status,
hours_worked,
esa_health_condition_proxy,
employment_status_reported,
state_pension_age,
) -> np.ndarray:
"""Approximate a severe-health ESA subgroup akin to support group.

This is a stricter subset of ``esa_health_condition_proxy`` intended
for future legacy ESA approximation work. It uses only non-receipt
labour market signals already available in the survey.
"""

age = np.asarray(age)
employment_status = np.asarray(employment_status)
hours_worked = np.asarray(hours_worked)
esa_health_condition_proxy = np.asarray(esa_health_condition_proxy)
employment_status_reported = np.asarray(employment_status_reported)
state_pension_age = np.asarray(state_pension_age)
severe_health_evidence = (employment_status == "LONG_TERM_DISABLED") & (
hours_worked <= 0
)

return (
employment_status_reported
& (age >= ESA_MIN_AGE)
& (age < state_pension_age)
& esa_health_condition_proxy
& severe_health_evidence
)


def add_legacy_benefit_proxies(
pe_person: pd.DataFrame,
employment_status_reported,
state_pension_age,
legacy_jobseeker_max_annual_hours,
) -> pd.DataFrame:
"""Populate person-scoped ESA/JSA proxy columns on the person frame.

These remain person-level by design because the claimant-state inputs
they approximate attach to individuals. Downstream benunit-level legacy
benefit models should aggregate them explicitly rather than assuming the
raw survey contains a benunit claimant-state field.
"""

pe_person["legacy_jobseeker_proxy"] = derive_legacy_jobseeker_proxy(
age=pe_person.age,
employment_status=pe_person.employment_status,
hours_worked=pe_person.hours_worked,
current_education=pe_person.current_education,
employment_status_reported=employment_status_reported,
state_pension_age=state_pension_age,
max_annual_hours=legacy_jobseeker_max_annual_hours,
)
pe_person["esa_health_condition_proxy"] = derive_esa_health_condition_proxy(
age=pe_person.age,
employment_status=pe_person.employment_status,
employment_status_reported=employment_status_reported,
state_pension_age=state_pension_age,
)
pe_person["esa_support_group_proxy"] = derive_esa_support_group_proxy(
age=pe_person.age,
employment_status=pe_person.employment_status,
hours_worked=pe_person.hours_worked,
esa_health_condition_proxy=pe_person.esa_health_condition_proxy,
employment_status_reported=employment_status_reported,
state_pension_age=state_pension_age,
)
return pe_person


def apply_legacy_benefit_proxies(
pe_person: pd.DataFrame, sim, year: int, employment_status_reported
) -> pd.DataFrame:
"""Attach legacy ESA/JSA proxies using post-build simulation context."""

state_pension_age = sim.calculate("state_pension_age", year).values
legacy_jobseeker_max_annual_hours = load_legacy_jobseeker_max_annual_hours(year)
return add_legacy_benefit_proxies(
pe_person,
employment_status_reported=employment_status_reported,
state_pension_age=state_pension_age,
legacy_jobseeker_max_annual_hours=legacy_jobseeker_max_annual_hours,
)


def attach_legacy_benefit_proxies_from_frs_person(
pe_person: pd.DataFrame, person: pd.DataFrame, sim, year: int
) -> pd.DataFrame:
"""Bridge raw FRS person fields into the proxy derivation hook."""

employment_status_reported = person.empstati.fillna(0).to_numpy() > 0
return apply_legacy_benefit_proxies(
pe_person,
sim,
year,
employment_status_reported=employment_status_reported,
)


def create_frs(
raw_frs_folder: str,
year: int,
Expand Down Expand Up @@ -744,7 +936,6 @@ def determine_education_level(fted_val, typeed2_val, age_val):
sim = Microsimulation(dataset=dataset)
region = sim.populations["benunit"].household("region", dataset.time_period)
lha_category = sim.calculate("LHA_category", year)

brma = np.empty(len(region), dtype=object)

# Sample from a random BRMA in the region, weighted by the number of observations in each BRMA
Expand Down Expand Up @@ -808,6 +999,13 @@ def determine_education_level(fted_val, typeed2_val, age_val):
paragraph_3 | paragraph_4 | paragraph_5
)

# Dataset-side claimant-state approximations for future legacy ESA/JSA
# modelling. These are explicit proxies based on observed survey
# conditions, not legislative determinations.
pe_person = attach_legacy_benefit_proxies_from_frs_person(
pe_person, person, sim, year
)

# Generate stochastic take-up decisions
# All randomness is generated here in the data package using take-up rates
# stored in YAML parameter files. This keeps the country package purely
Expand Down
Loading
Loading