In [1]:
from policyengine_us_data.utils.minimise import minimise_dataset, random_sampling_minimization, candidate_loss_contribution
from policyengine_us_data.storage import STORAGE_FOLDER
from policyengine_us import Microsimulation
from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024
from policyengine_us_data.utils import build_loss_matrix
import numpy as np
import os

In [None]:
# Household counts in the ECPS 2024 dataset, before and after minimisation:
# - Original dataset: 41310 households
# - After "candidate_loss_contribution" with a max error change (loss_rel_change_max) of 1.0: 20655 households
# - After "candidate_loss_contribution" with a max error change (loss_rel_change_max) of 0.001: 24786 households


In [3]:
## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation

# Input datasets to minimise; each one is written to its own output file.
files = [
    STORAGE_FOLDER / "enhanced_cps_2024.h5",
]

# Minimisation strategy. `approach` is also used as the output sub-folder name
# under STORAGE_FOLDER, so keep it in sync with `minimization_function`.
#   - "candidate_loss_contribution": tuned via `loss_rel_change_max`
#     (tolerance for the relative change in loss), as used in the call below.
#   - "random_sampling_minimization": tuned via `target_fractions`
#     (the fraction(s) of the dataset to keep).
approach = "candidate_loss_contribution"
minimization_function = candidate_loss_contribution

for file in files:
    # Derive the output name from the input file so that several inputs do not
    # overwrite one another; "enhanced_cps_2024.h5" still maps to
    # "enhanced_cps_2024_minimised.h5".
    output_path = STORAGE_FOLDER / approach / f"{file.stem}_minimised.h5"
    output_path.parent.mkdir(parents=True, exist_ok=True)

    minimise_dataset(
        file,
        output_path,
        minimization_function=minimization_function,
        # target_fractions=[0.5],  # use this instead when switching to random_sampling_minimization
        loss_rel_change_max=0.0001,  # remove when switching to random_sampling_minimization
    )

Targeting Medicaid enrollment for AK with target 231577k
Targeting Medicaid enrollment for AL with target 766009k
Targeting Medicaid enrollment for AR with target 733561k
Targeting Medicaid enrollment for AZ with target 1778734k
Targeting Medicaid enrollment for CA with target 12172695k
Targeting Medicaid enrollment for CO with target 1058326k
Targeting Medicaid enrollment for CT with target 904321k
Targeting Medicaid enrollment for DC with target 240020k
Targeting Medicaid enrollment for DE with target 236840k
Targeting Medicaid enrollment for FL with target 3568648k
Targeting Medicaid enrollment for GA with target 1699279k
Targeting Medicaid enrollment for HI with target 376318k
Targeting Medicaid enrollment for IA with target 586748k
Targeting Medicaid enrollment for ID with target 296968k
Targeting Medicaid enrollment for IL with target 2918179k
Targeting Medicaid enrollment for IN with target 1623361k
Targeting Medicaid enrollment for KS with target 335902k
Targeting Medicaid enro

In [None]:
## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation

input_dataset = ExtendedCPS_2024

approach = "l0_sigmoid"
# other options are "l0_log", "l0_exp", "l1"

# Both the calibration log and the output dataset are written under this
# folder; create it up front so neither write fails on a fresh checkout.
output_dir = STORAGE_FOLDER / approach
output_dir.mkdir(parents=True, exist_ok=True)

sim = Microsimulation(dataset=input_dataset)
data = sim.dataset.load_dataset()
# Reset the weights entry; it is refilled per year in the loop below.
data["household_weight"] = {}

# Starting weights: the simulated household weights plus random noise drawn
# from N(1, 0.1). A locally seeded generator keeps the jitter reproducible
# across "Restart & Run All" without touching NumPy's global RNG state.
# NOTE(review): `reweight` may have its own sources of randomness - confirm
# if fully deterministic output is required.
rng = np.random.default_rng(0)
original_weights = sim.calculate("household_weight")
original_weights = original_weights.values + rng.normal(
    1, 0.1, len(original_weights)
)

for year in range(2024, 2025):
    loss_matrix, targets_array = build_loss_matrix(
        input_dataset, year
    )
    optimised_weights = reweight(
        original_weights,
        loss_matrix,
        targets_array,
        log_path=output_dir / "calibration_log.csv",
        penalty_approach=approach,
    )
    data["household_weight"][year] = optimised_weights

output_path = output_dir / "enhanced_cps_2024_minimised.h5"

# `data` is the mapping returned by `load_dataset()`, not a dataset object, so
# the save has to go through the dataset instance.
# NOTE(review): assumes `Dataset.save_dataset(data, file_path)` as in
# policyengine-core - confirm against the installed version.
sim.dataset.save_dataset(data, output_path)