In [5]:
from policyengine_us_data.utils.minimise import minimise_dataset, random_sampling_minimization, candidate_loss_contribution
from policyengine_us_data.storage import STORAGE_FOLDER
from policyengine_us import Microsimulation
from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024
from policyengine_us_data.utils import build_loss_matrix
import numpy as np

In [None]:
## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation
#
# Runs `minimise_dataset` over every input file with the selected minimisation
# function and writes each result to
# STORAGE_FOLDER/<approach>/<input stem>_minimised.h5.

files = [
    STORAGE_FOLDER / "enhanced_cps_2024.h5",
]

# `approach` only names the output sub-folder; keep it in sync with
# `minimization_function`. The other available approach is
# "candidate_loss_contribution".
approach = "random_sampling_minimization"
minimization_function = random_sampling_minimization

# `target_fractions` is specific to the random-sampling approach, so it is
# passed only when that function is selected; switching approach then needs
# no further edits to the call below.
extra_kwargs = (
    {"target_fractions": [0.5]}
    if minimization_function is random_sampling_minimization
    else {}
)

# Create the per-approach folder up front so writing the output cannot fail
# on a missing directory.
output_dir = STORAGE_FOLDER / approach
output_dir.mkdir(parents=True, exist_ok=True)

for file in files:
    # Name the output after its input so several inputs do not overwrite
    # one another (identical to the previous hard-coded name for the
    # single enhanced_cps_2024.h5 input).
    output_path = output_dir / f"{file.stem}_minimised.h5"
    minimise_dataset(
        file,
        output_path,
        loss_rel_change_max=10,
        minimization_function=minimization_function,
        **extra_kwargs,
    )

In [None]:
## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation
#
# Re-runs the enhanced-CPS reweighting with a chosen sparsity penalty and
# stores the calibration log and the resulting dataset under
# STORAGE_FOLDER/<approach>/.

input_dataset = ExtendedCPS_2024

approach = "l0_sigmoid"
# other options are "l0_log", "l0_exp", "l1"

# Single folder for both the calibration log and the output file; created up
# front so neither write fails on a missing directory.
output_dir = STORAGE_FOLDER / approach
output_dir.mkdir(parents=True, exist_ok=True)

sim = Microsimulation(dataset=input_dataset)
data = sim.dataset.load_dataset()
data["household_weight"] = {}

# Jitter the starting weights with N(1, 0.1) noise. A seeded generator keeps
# the starting point identical across runs and across penalty approaches.
# NOTE(review): `reweight` may have its own sources of randomness; this seed
# only pins the initial jitter.
rng = np.random.default_rng(0)
base_weights = sim.calculate("household_weight").values
original_weights = base_weights + rng.normal(1, 0.1, len(base_weights))

for year in range(2024, 2025):
    loss_matrix, targets_array = build_loss_matrix(input_dataset, year)
    optimised_weights = reweight(
        original_weights,
        loss_matrix,
        targets_array,
        log_path=output_dir / "calibration_log.csv",
        penalty_approach=approach,
    )
    data["household_weight"][year] = optimised_weights

output_path = output_dir / "enhanced_cps_2024_minimised.h5"

# NOTE(review): `data` is whatever `load_dataset()` returned and is used as a
# mapping above. Confirm it really exposes `save_dataset`; if it is a plain
# dict this call raises AttributeError and the save has to go through a
# dataset object instead.
data.save_dataset(output_path)