In [2]:
from policyengine_us_data.utils.minimise import minimise_dataset, random_sampling_minimization, candidate_loss_contribution
from policyengine_us_data.storage import STORAGE_FOLDER
from policyengine_us import Microsimulation
from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024
from policyengine_us_data.utils import build_loss_matrix
import numpy as np
import os
import h5py

# Names of calibration targets that the reweighting cells in this notebook
# drop from the loss matrix before calling reweight. Every name follows the
# pattern nation/irs/<metric>/AGI in <band>/taxable/<filing status>; the list
# is ordered by metric first, then filing status, then AGI band.
bad_targets = [
    f"nation/irs/{metric}/AGI in {band}/taxable/{filing_status}"
    for metric in ("adjusted gross income/total", "count/count")
    for filing_status in (
        "Head of Household",
        "Married Filing Jointly/Surviving Spouse",
    )
    for band in ("10k-15k", "15k-20k")
]

In [None]:
# Number of households in the dataset, measured as the length of household_weight:

# Original ECPS 2024 dataset size: 41310
# Through "random_sampling_minimization" with 0.5 of the dataset being pruned: 20655
# Through "random_sampling_minimization" with 0.2 of the dataset being pruned: 33408
#   NOTE(review): 0.8 * 41310 = 33048, so 33408 may be a digit transposition -- confirm.
# After minimization through "candidate_loss_contribution" and a 1.0 max error change: 20655
# After minimization through "candidate_loss_contribution" and a 0.001 max error change: 24786

In [None]:
## ALL TESTS

## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation
##
## For each penalty approach, this reweights the household weights of the
## input dataset against the loss-matrix targets (with the `bad_targets`
## columns removed) and writes the resulting dataset to
## STORAGE_FOLDER/<approach>/enhanced_cps_2024_minimised.h5.

input_dataset = ExtendedCPS_2024

approaches = ["l0_sigmoid", "l0_log", "l0_exp", "l1"]

for approach in approaches:
    sim = Microsimulation(dataset=input_dataset)
    data = sim.dataset.load_dataset()
    # Reset the weights entry; it is refilled per year with the optimised weights.
    data["household_weight"] = {}
    original_weights = sim.calculate("household_weight")
    # Perturb the starting weights with normal noise (mean 1, std 0.1).
    # The generator is re-created with a fixed seed for every approach so
    # that all approaches start from the same perturbed weights and a rerun
    # draws the same noise. This only pins the NumPy noise drawn here; any
    # randomness inside reweight is not controlled from this cell.
    rng = np.random.default_rng(0)
    original_weights = original_weights.values + rng.normal(
        1, 0.1, len(original_weights)
    )
    for year in range(2024, 2025):
        loss_matrix, targets_array = build_loss_matrix(input_dataset, year)

        # Keep only the columns (and matching target values) whose names are
        # not listed in bad_targets.
        keep_idx = np.where(~loss_matrix.columns.isin(bad_targets))[0]
        loss_matrix_clean = loss_matrix.iloc[:, keep_idx]
        targets_array_clean = targets_array[keep_idx]
        assert loss_matrix_clean.shape[1] == targets_array_clean.size
        # Same guard as the baseline cell: fail loudly if none of the
        # bad_targets names matched a loss-matrix column.
        assert loss_matrix_clean.shape[1] != loss_matrix.shape[1]

        optimised_weights = reweight(
            original_weights,
            loss_matrix_clean,
            targets_array_clean,
            # One log path per approach (same "<prefix>_calibration_log.csv"
            # naming as the baseline cell) so the approaches do not all share
            # a single log file path.
            log_path=f"{approach}_calibration_log.csv",
            penalty_approach=approach,
            epochs=250,  # Reduced epochs for faster processing
        )
        data["household_weight"][year] = optimised_weights

    output_path = STORAGE_FOLDER / approach / "enhanced_cps_2024_minimised.h5"
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Save to HDF5 file, one dataset per "<variable>/<year>" key.
    with h5py.File(output_path, "w") as f:
        for variable, values in data.items():
            for year, value in values.items():
                f.create_dataset(f"{variable}/{year}", data=value)


## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation
##
## Runs minimise_dataset once per (approach, parameter value) combination and
## writes each result to
## STORAGE_FOLDER/<approach>/<value>_enhanced_cps_2024_minimised.h5.

files = [
        STORAGE_FOLDER / "enhanced_cps_2024.h5",
    ]

approaches = {
        "random_sampling_minimization": random_sampling_minimization,
        "candidate_loss_contribution": candidate_loss_contribution,
}

optional_params = {
        "random_sampling_minimization": {
            "target_fractions": [0.5, 0.6, 0.7, 0.8, 0.9],  # fractions of the dataset to keep
        },
        "candidate_loss_contribution": {
            # maximum relative change in loss (going by the parameter name -- TODO confirm against minimise_dataset)
            "loss_rel_change_max": [0.00001, 0.000001, 0.0000001]
        }
}

# How a single swept value is handed to minimise_dataset for each parameter:
# target_fractions is passed as a one-element list, loss_rel_change_max as
# the bare value.
kwarg_builders = {
    "target_fractions": lambda swept_value: [swept_value],
    "loss_rel_change_max": lambda swept_value: swept_value,
}

for approach, minimization_function in approaches.items():
    # random_sampling_minimization is swept over target_fractions;
    # candidate_loss_contribution is swept over loss_rel_change_max.
    for params, values in optional_params[approach].items():
        if params not in kwarg_builders:
            # Previously an unrecognised parameter name was skipped silently.
            raise ValueError(
                f"Unsupported minimisation parameter {params!r} for approach {approach!r}"
            )
        for value in values:
            output_path = STORAGE_FOLDER / approach / f"{value}_enhanced_cps_2024_minimised.h5"
            output_path.parent.mkdir(parents=True, exist_ok=True)
            # NOTE(review): output_path does not depend on `file`, so with more
            # than one entry in `files` every input would be written to the
            # same output path.
            for file in files:
                minimise_dataset(
                    file,
                    output_path,
                    minimization_function=minimization_function,
                    **{params: kwarg_builders[params](value)},
                )

In [None]:
## SMALL CHECKS BELOW -- IGNORE ---

## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation

files = [STORAGE_FOLDER / "enhanced_cps_2024.h5"]

# This check runs random_sampling_minimization with target_fractions=[1.0].
# The other minimization function imported in this notebook is
# candidate_loss_contribution, which is called with loss_rel_change_max
# instead of target_fractions.
minimization_function = random_sampling_minimization

check_dir = STORAGE_FOLDER / "random_sampling_minimization"
for file in files:
    output_path = check_dir / f"{1.0}_enhanced_cps_2024_minimised.h5"
    # Create the approach folder if it does not exist yet.
    check_dir.mkdir(parents=True, exist_ok=True)
    minimise_dataset(
        file,
        output_path,
        target_fractions=[1.0],
        minimization_function=minimization_function,
    )

In [None]:
## Baseline run: reweight the household weights without passing a
## penalty_approach (so reweight uses whatever its default is) and write the
## result to STORAGE_FOLDER/baseline/enhanced_cps_2024_baseline.h5.

input_dataset = ExtendedCPS_2024

sim = Microsimulation(dataset=input_dataset)
data = sim.dataset.load_dataset()
# Reset the weights entry; it is refilled per year with the optimised weights.
data["household_weight"] = {}
original_weights = sim.calculate("household_weight")
# Perturb the starting weights with normal noise (mean 1, std 0.1), drawn from
# a generator with a fixed seed so that rerunning this cell starts from the
# same perturbed weights. This only pins the NumPy noise drawn here; any
# randomness inside reweight is not controlled from this cell.
rng = np.random.default_rng(0)
original_weights = original_weights.values + rng.normal(
    1, 0.1, len(original_weights)
)
for year in range(2024, 2025):
    loss_matrix, targets_array = build_loss_matrix(input_dataset, year)

    # Keep only the columns (and matching target values) whose names are not
    # listed in bad_targets.
    keep_idx = np.where(~loss_matrix.columns.isin(bad_targets))[0]
    loss_matrix_clean = loss_matrix.iloc[:, keep_idx]
    targets_array_clean = targets_array[keep_idx]
    assert loss_matrix_clean.shape[1] == targets_array_clean.size
    # Fail loudly if none of the bad_targets names matched a column.
    assert loss_matrix_clean.shape[1] != loss_matrix.shape[1]

    optimised_weights = reweight(
        original_weights,
        loss_matrix_clean,
        targets_array_clean,
        log_path="baseline_calibration_log.csv",
        epochs=250,  # Reduced epochs for faster processing
    )
    data["household_weight"][year] = optimised_weights

output_path = STORAGE_FOLDER / "baseline" / "enhanced_cps_2024_baseline.h5"
output_path.parent.mkdir(parents=True, exist_ok=True)

# Save to HDF5 file, one dataset per "<variable>/<year>" key.
with h5py.File(output_path, "w") as f:
    for variable, values in data.items():
        for year, value in values.items():
            f.create_dataset(f"{variable}/{year}", data=value)

100%|██████████| 250/250 [01:24<00:00,  2.98it/s, loss=3.37e-5, loss_rel_change=-0.92] 
