In [2]:
from policyengine_us_data.utils.minimise import minimise_dataset, random_sampling_minimization, candidate_loss_contribution
from policyengine_us_data.storage import STORAGE_FOLDER
from policyengine_us import Microsimulation
from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024
from policyengine_us_data.utils import build_loss_matrix
import numpy as np
import os
import h5py
import pandas as pd


bad_targets = [
    "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household",
    "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household",
    "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse",
    "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse",
    "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household",
    "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household",
    "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse",
    "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse",
]

In [None]:
# Length of household entity in the dataset measured through household_weight:

# Original ECPS 2024 dataset size: 41310
# Through "random_sampling_minimization" with 0.5 of the dataset being pruned: 20655
# Through "random_sampling_minimization" with 0.2 of the dataset being pruned: 33408
# After minimization through "candidate_loss_contribution" and a 1.0 max error change: 20655 
# After minimization through "candidate_loss_contribution" and a 0.001 max error change: 24786

In [None]:
## ALL TESTS

## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation

input_dataset = ExtendedCPS_2024

approaches = ["l0_sigmoid", "l0_log", "l0_exp", "l1"]
penalty_weights = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]

def get_output_path(approach, file_name):
    output_path = STORAGE_FOLDER / approach / file_name
    output_path.parent.mkdir(parents=True, exist_ok=True)
    return output_path

results = []

for approach in approaches:
            for penalty_weight in penalty_weights:
                # Storing files in correct locations
                cal_log_name = f"calibration_log_{approach}_{penalty_weight}.csv"
                h5_name = f"enhanced_cps_2024_{approach}_{penalty_weight}_minimised.h5"
                cal_log_path = get_output_path(approach, cal_log_name)
                h5_path = get_output_path(approach, h5_name)

                sim = Microsimulation(dataset=input_dataset)
                data = sim.dataset.load_dataset()
                data["household_weight"] = {}
                original_weights = sim.calculate("household_weight")
                original_weights = original_weights.values + np.random.normal(
                    1, 0.1, len(original_weights)
                )
                for year in range(2024, 2025):
                    loss_matrix, targets_array = build_loss_matrix(
                        input_dataset, year
                    )

                    bad_mask = loss_matrix.columns.isin(bad_targets)
                    keep_mask_bool = ~bad_mask
                    keep_idx = np.where(keep_mask_bool)[0]
                    loss_matrix_clean = loss_matrix.iloc[:, keep_idx]
                    targets_array_clean = targets_array[keep_idx]
                    assert loss_matrix_clean.shape[1] == targets_array_clean.size

                    optimised_weights = reweight(
                        original_weights,
                        loss_matrix_clean,
                        targets_array_clean,
                        log_path=cal_log_path, 
                        penalty_approach=approach,
                        penalty_weight=penalty_weight, 
                        epochs=10,  # Reduced epochs for faster processing
                    )
                    data["household_weight"][year] = optimised_weights

                # Save to HDF5 file
                with h5py.File(h5_path, "w") as f:
                    for variable, values in data.items():
                        for year, value in values.items():
                            f.create_dataset(f"{variable}/{year}", data=value)


## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation

files = [
        STORAGE_FOLDER / "enhanced_cps_2024.h5",
    ]

approaches = {
        "random_sampling_minimization": random_sampling_minimization,
        "candidate_loss_contribution": candidate_loss_contribution,
}

optional_params = {
        "random_sampling_minimization": {
            "target_fractions": [0.5, 0.6, 0.7, 0.8, 0.9],  # fractions of the dataset to keep
        },
        "candidate_loss_contribution": {
            "loss_rel_change_max": [0.001, 0.0001, 0.00001, 0.000001, 0.0000001] # maximum relative change in loss
        }
}

for approach, function in approaches.items():
    minimization_function = function
    # other minimization function approach is "random_sampling_minimization", for which you can specify the tolerance for loss relative change.

    for params, values in optional_params[approach].items():
        for value in values:
            if params == "target_fractions":
                for file in files:
                    output_path = STORAGE_FOLDER / approach / f"{value}_enhanced_cps_2024_minimised.h5"
                    output_path.parent.mkdir(parents=True, exist_ok=True)
                    minimise_dataset(
                        file,
                        output_path,
                        minimization_function=minimization_function, 
                        target_fractions=[value]
                    )
            elif params == "loss_rel_change_max":
                for file in files:
                    output_path = STORAGE_FOLDER / approach / f"{value}_enhanced_cps_2024_minimised.h5"
                    output_path.parent.mkdir(parents=True, exist_ok=True)
                    minimise_dataset(
                        file,
                        output_path,
                        minimization_function=minimization_function, 
                        loss_rel_change_max=value
                    )

100%|██████████| 10/10 [00:03<00:00,  3.00it/s, loss=9.1e-5, loss_rel_change=-0.809] 
100%|██████████| 10/10 [00:03<00:00,  2.96it/s, loss=0.000181, loss_rel_change=-0.679]
100%|██████████| 10/10 [00:03<00:00,  2.98it/s, loss=0.00108, loss_rel_change=-0.273]


In [22]:
## SMALL CHECKS BELOW -- IGNORE ---

## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation

files = [
        STORAGE_FOLDER / "enhanced_cps_2024.h5",
    ]

minimization_function = random_sampling_minimization
# other minimization function approach is "random_sampling_minimization", for which you can specify the tolerance for loss relative change.

for file in files:
    output_path = STORAGE_FOLDER / "random_sampling_minimization" / f"{1.0}_enhanced_cps_2024_random_sampling_minimization_minimised.h5"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    minimise_dataset(
        file,
        output_path,
        minimization_function=minimization_function, 
        target_fractions=[1.0]
    )

Targeting Medicaid enrollment for AK with target 231577k
Targeting Medicaid enrollment for AL with target 766009k
Targeting Medicaid enrollment for AR with target 733561k
Targeting Medicaid enrollment for AZ with target 1778734k
Targeting Medicaid enrollment for CA with target 12172695k
Targeting Medicaid enrollment for CO with target 1058326k
Targeting Medicaid enrollment for CT with target 904321k
Targeting Medicaid enrollment for DC with target 240020k
Targeting Medicaid enrollment for DE with target 236840k
Targeting Medicaid enrollment for FL with target 3568648k
Targeting Medicaid enrollment for GA with target 1699279k
Targeting Medicaid enrollment for HI with target 376318k
Targeting Medicaid enrollment for IA with target 586748k
Targeting Medicaid enrollment for ID with target 296968k
Targeting Medicaid enrollment for IL with target 2918179k
Targeting Medicaid enrollment for IN with target 1623361k
Targeting Medicaid enrollment for KS with target 335902k
Targeting Medicaid enro

In [None]:
input_dataset = ExtendedCPS_2024

sim = Microsimulation(dataset=input_dataset)
data = sim.dataset.load_dataset()
data["household_weight"] = {}
original_weights = sim.calculate("household_weight")
original_weights = original_weights.values + np.random.normal(
    1, 0.1, len(original_weights)
)
for year in range(2024, 2025):
    loss_matrix, targets_array = build_loss_matrix(
        input_dataset, year
    )

    bad_mask = loss_matrix.columns.isin(bad_targets)
    keep_mask_bool = ~bad_mask
    keep_idx = np.where(keep_mask_bool)[0]
    loss_matrix_clean = loss_matrix.iloc[:, keep_idx]
    targets_array_clean = targets_array[keep_idx]
    assert loss_matrix_clean.shape[1] == targets_array_clean.size
    assert loss_matrix_clean.shape[1] != loss_matrix.shape[1]

    optimised_weights = reweight(
        original_weights,
        loss_matrix_clean,
        targets_array_clean,
        log_path="baseline_calibration_log.csv",
        epochs=250,  # Reduced epochs for faster processing
    )
    data["household_weight"][year] = optimised_weights

output_path = STORAGE_FOLDER / "baseline" / "enhanced_cps_2024_baseline.h5"
output_path.parent.mkdir(parents=True, exist_ok=True)

# Save to HDF5 file
with h5py.File(output_path, "w") as f:
    for variable, values in data.items():
        for year, value in values.items():
            f.create_dataset(f"{variable}/{year}", data=value)

100%|██████████| 250/250 [01:24<00:00,  2.98it/s, loss=3.37e-5, loss_rel_change=-0.92] 


### Visualization

In [33]:
"""
Creating dataframe to store regularization results
"""

# Initial dataframe setup
reg_results_df = pd.DataFrame({
    'strategy': ['none'],
    'parameter': ['none'],
    'dataset_size': [41310],
    'total_loss': [6.9e-3]
})

def add_result(df, strategy, parameter, dataset_size, total_loss):
    new_rows = pd.DataFrame({
        'strategy': strategy,        
        'parameter': parameter,      
        'dataset_size': dataset_size,
        'total_loss': total_loss
    })
    return pd.concat([reg_results_df, new_rows], ignore_index=True)

# Example usage
#reg_results_df = add_result(reg_results_df, ['L1', 'L2'], ['0.001','0.002'], [35000, 4000], [7.2e-3, 7.2e-3])
reg_results_df

Unnamed: 0,strategy,parameter,dataset_size,total_loss
0,none,none,41310,0.0069


In [34]:
"""
Pulling values from created calibration_log.csv and .h5 files to populate the line plot dataframe

( I need to pull the strategy (folder name), parameter (from file title??), dataset size (from length of .h5 file), and total loss (from sum of loss column in calibration_log_file.csv))
"""

approaches = ["l0_exp", "l1"] 
penalty_weights = [1e-2, 1e-1]

def get_output_path(approach, file_name):
    output_path = STORAGE_FOLDER / approach / file_name
    output_path.parent.mkdir(parents=True, exist_ok=True)
    return output_path

for approach in approaches:
    total_size = []
    total_loss = []
    for penalty_weight in penalty_weights:
        strategy = approach
        parameter = penalty_weight

        # Pull length of .h5 file
        h5_name = f"enhanced_cps_2024_{strategy}_{parameter}_minimised.h5"
        h5_path = get_output_path(strategy, h5_name)
        # see if this works
        dataset_size = len(h5py.File(h5_path, "r")['household_weight/2024'])
        total_size.append(dataset_size)

        # Pull sum of loss column
        cal_log_name = f"calibration_log_{approach}_{penalty_weight}.csv"
        cal_log_path = get_output_path(approach, cal_log_name)
        loss_sum = pd.read_csv(cal_log_path)['loss'].sum()
        total_loss.append(loss_sum)

        reg_results_df = add_result(reg_results_df, strategy, parameter, total_size, total_loss)
        # does this weird recursion work?




'''

fraction = [0.5, 0.6, 0.7, 0.8, 0.9]

for fraction in fraction:
    strategy = "random_sampling_minimization"
    parameter = fraction

    # Pull length of .h5 file
    h5_name = f"{fraction}_enhanced_cps_2024_random_sampling_minimization_minimised.h5"
    h5_path = STORAGE_FOLDER / strategy / h5_name
    dataset_size = len(h5py.File(h5_path, "r")['household_weight/2024'])

    # Pull sum of loss column
    cal_log_name = f"{fraction}_enhanced_cps_2024_random_sampling_minimization_minimised_calibration_log.csv"
    cal_log_path = STORAGE_FOLDER / strategy / cal_log_name
    total_loss = pd.read_csv(cal_log_path)['loss'].sum()

    add_result(df, strategy, parameter, dataset_size, total_loss)

'''
reg_results_df


Unnamed: 0,strategy,parameter,dataset_size,total_loss
0,none,none,41310,0.0069
1,l0_exp,0.01,41310,1263.410322
2,l0_exp,0.1,41310,1263.410322
3,l0_exp,0.1,41310,1263.410322
4,l1,0.01,41310,1263.410322
5,l1,0.1,41310,1263.410322
6,l1,0.1,41310,1263.410322


## Plotting