In [2]:
from policyengine_us_data.utils.minimize import minimize_dataset, random_sampling_minimization, candidate_loss_contribution
from policyengine_us_data.storage import STORAGE_FOLDER
from policyengine_us import Microsimulation
from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024
from policyengine_us_data.utils import build_loss_matrix
import numpy as np
import os
import h5py
import pandas as pd


# Loss-matrix column names that are filtered out before reweighting.
# They are the full cross product of two IRS measures, two filing statuses
# and two AGI bands; the nesting order below fixes the order of the list.
bad_targets = [
    f"nation/irs/{measure}/AGI in {agi_band}/taxable/{filing_status}"
    for measure in ("adjusted gross income/total", "count/count")
    for filing_status in (
        "Head of Household",
        "Married Filing Jointly/Surviving Spouse",
    )
    for agi_band in ("10k-15k", "15k-20k")
]

In [None]:
# Number of households in each dataset, measured as the length of household_weight:

# Original ECPS 2024 dataset size: 41310
# Through "random_sampling_minimization" with 0.5 of the dataset being pruned: 20655
# Through "random_sampling_minimization" with 0.2 of the dataset being pruned: 33408
# After minimization through "candidate_loss_contribution" and a 1.0 max error change: 20655 
# After minimization through "candidate_loss_contribution" and a 0.001 max error change: 24786

# Enhanced_CPS_2024.py Approaches

In [None]:
## ALL TESTS

## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation
# Dataset class handed to Microsimulation and build_loss_matrix in the loop below.
input_dataset = ExtendedCPS_2024

# Values passed to reweight() as penalty_approach; each one is also used as the
# output sub-folder name and as part of the output file names.
approaches = ["l0_sigmoid", "l0_log", "l0_exp", "l1"]
# Values passed to reweight() as penalty_weight; their default string form
# (e.g. "1e-05", "0.0001") is embedded in the output file names.
penalty_weights = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]

def get_output_path(approach, file_name):
    """Build the storage path for one output file of a given approach.

    The path is ``STORAGE_FOLDER / approach / file_name``. As a side effect
    the parent directory of that path is created (including any missing
    ancestors) if it does not exist yet.
    """
    destination = STORAGE_FOLDER.joinpath(approach, file_name)
    destination.parent.mkdir(exist_ok=True, parents=True)
    return destination

results = []

# Grid search over (penalty approach, penalty weight). For every combination
# the household weights are re-calibrated with reweight() and the full dataset
# is written to <STORAGE_FOLDER>/<approach>/ together with a calibration log.
#
# The simulation, the baseline weights and the cleaned loss matrix only depend
# on input_dataset and the year, not on the penalty settings, so they are built
# once here instead of once per grid point.
# NOTE(review): this assumes reweight() does not modify the loss matrix or the
# targets it is given in place - confirm against its implementation.
# NOTE(review): the recorded output of this cell ends with
# "AttributeError: 'numpy.ndarray' object has no attribute 'columns'" right
# after "Re-calibrating final selected households..."; presumably this is
# raised inside reweight() - confirm and fix there.
calibration_years = range(2024, 2025)

sim = Microsimulation(dataset=input_dataset)
base_weights = sim.calculate("household_weight").values

calibration_inputs = {}
for year in calibration_years:
    loss_matrix, targets_array = build_loss_matrix(input_dataset, year)

    # Drop the columns listed in bad_targets, and the matching target values,
    # so that both stay aligned position by position.
    bad_mask = loss_matrix.columns.isin(bad_targets)
    keep_idx = np.where(~bad_mask)[0]
    loss_matrix_clean = loss_matrix.iloc[:, keep_idx]
    targets_array_clean = targets_array[keep_idx]
    assert loss_matrix_clean.shape[1] == targets_array_clean.size

    calibration_inputs[year] = (loss_matrix_clean, targets_array_clean)

for approach in approaches:
    for penalty_weight in penalty_weights:
        # Storing files in correct locations
        cal_log_name = f"calibration_log_{approach}_{penalty_weight}.csv"
        h5_name = f"enhanced_cps_2024_{approach}_{penalty_weight}_minimised.h5"
        cal_log_path = get_output_path(approach, cal_log_name)
        h5_path = get_output_path(approach, h5_name)

        # Fresh copy of the dataset for this grid point; only the household
        # weights are replaced below.
        data = sim.dataset.load_dataset()
        data["household_weight"] = {}

        # Each run starts from the baseline weights plus its own random jitter
        # (normal, mean 1, sd 0.1). The generator is not seeded, so results
        # differ between executions of this cell.
        original_weights = base_weights + np.random.normal(
            1, 0.1, len(base_weights)
        )

        for year, (loss_matrix_clean, targets_array_clean) in calibration_inputs.items():
            optimised_weights = reweight(
                original_weights,
                loss_matrix_clean,
                targets_array_clean,
                log_path=cal_log_path,
                penalty_approach=approach,
                penalty_weight=penalty_weight,
                epochs=250,  # Reduced epochs for faster processing
            )
            data["household_weight"][year] = optimised_weights

        # Save to HDF5 file, one dataset per "<variable>/<year>"
        with h5py.File(h5_path, "w") as f:
            for variable, values in data.items():
                for year, value in values.items():
                    f.create_dataset(f"{variable}/{year}", data=value)

100%|██████████| 10/10 [00:03<00:00,  3.00it/s, loss=9.1e-5, loss_rel_change=-0.809] 
100%|██████████| 10/10 [00:03<00:00,  2.96it/s, loss=0.000181, loss_rel_change=-0.679]
100%|██████████| 10/10 [00:03<00:00,  2.98it/s, loss=0.00108, loss_rel_change=-0.273]
100%|██████████| 10/10 [00:02<00:00,  3.59it/s, loss=0.0101, loss_rel_change=-0.0377]
100%|██████████| 10/10 [00:02<00:00,  3.46it/s, loss=0.1, loss_rel_change=-0.00391]
100%|██████████| 10/10 [00:02<00:00,  3.52it/s, loss=0.000191, loss_rel_change=-0.672]
100%|██████████| 10/10 [00:03<00:00,  2.89it/s, loss=0.00116, loss_rel_change=-0.274]
100%|██████████| 10/10 [00:02<00:00,  3.57it/s, loss=0.00978, loss_rel_change=-0.166]
100%|██████████| 10/10 [00:02<00:00,  3.34it/s, loss=0.0881, loss_rel_change=-0.22]
100%|██████████| 10/10 [00:02<00:00,  3.55it/s, loss=0.866, loss_rel_change=-0.23]
100%|██████████| 10/10 [00:03<00:00,  3.31it/s, loss=9.12e-5, loss_rel_change=-0.812]
100%|██████████| 10/10 [00:03<00:00,  3.26it/s, loss=0.0001

=== CALIBRATION LOG DEBUG ===
File path: /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/enhanced_cps_2024.h5
Epoch: 0
Number of households: 41310
Total weight: 12764381616743.21
Weight range: 0.54 to 1303728.75
Loss matrix shape: (41310, 2813)
Number of targets: 2813
After filtering bad targets:
Loss matrix clean shape: (41310, 2805)
Number of clean targets: 2805
Estimates shape: (2805,)
Estimates sum: 324584770671300.88
First 3 estimates: nation/irs/adjusted gross income/total/AGI in -inf-inf/taxable/All    1.498784e+13
nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/All     1.609638e+10
nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/All     6.707770e+10
dtype: float64
First 3 targets: [1.62972204e+13 1.68634879e+10 6.76819729e+10]
Mean absolute error: 17235490830.73
Mean relative error: 0.0997
=== END DEBUG ===



100%|██████████| 250/250 [01:38<00:00,  2.54it/s, loss=3.62e-5, loss_rel_change=-0.301]
100%|██████████| 250/250 [01:35<00:00,  2.62it/s, loss=3.58e-5, loss_rel_change=-0.294]
100%|██████████| 250/250 [01:33<00:00,  2.68it/s, loss=3.34e-5, loss_rel_change=-0.376]


Weight relative change: 99.95%
Re-calibrating final selected households...


AttributeError: 'numpy.ndarray' object has no attribute 'columns'

# Minimize.py approaches

In [None]:
## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation

# Input file(s) handed to minimize_dataset.
files = [STORAGE_FOLDER / "enhanced_cps_2024.h5"]

# Approach name (also used as the output sub-folder) -> minimization function.
approaches = {
    "random_sampling_minimization": random_sampling_minimization,
    "candidate_loss_contribution": candidate_loss_contribution,
}

# For each approach: the keyword argument that is swept and the values to try.
optional_params = {
    "random_sampling_minimization": {
        # fractions of the dataset to keep
        # (further candidates, currently disabled: 0.7, 0.8, 0.9)
        "target_fractions": [0.5, 0.6],
    },
    "candidate_loss_contribution": {
        # maximum relative change in loss
        # (further candidates, currently disabled: 0.00001, 0.000001, 0.0000001)
        "loss_rel_change_max": [0.001, 0.0001],
    },
}

# Run every approach once per configured parameter value and write the result
# to <STORAGE_FOLDER>/<approach>/<value>_enhanced_cps_2024_minimised.h5.
for approach, minimization_function in approaches.items():
    for params, values in optional_params[approach].items():
        for value in values:
            # The two tunables are forwarded under different keywords:
            # "target_fractions" (random_sampling_minimization) expects a list,
            # "loss_rel_change_max" (candidate_loss_contribution) a single number.
            if params == "target_fractions":
                tuning_kwargs = {"target_fractions": [value]}
            elif params == "loss_rel_change_max":
                tuning_kwargs = {"loss_rel_change_max": value}
            else:
                # Unknown parameter names are skipped without running anything.
                continue

            for file in files:
                # The output name does not include the input file name, so with
                # more than one entry in `files` later inputs overwrite earlier ones.
                output_path = STORAGE_FOLDER / approach / f"{value}_enhanced_cps_2024_minimised.h5"
                output_path.parent.mkdir(parents=True, exist_ok=True)
                minimize_dataset(
                    file,
                    output_path,
                    minimization_function=minimization_function,
                    **tuning_kwargs,
                )

### (Temporary) Cleaning of data (removing weights smaller than epsilon)

In [None]:
## TODO: move this cleaning step into enhanced_cps_2024.py, because household removal doesn't happen there.
# Need to check Ben's PR first.

### Visualization

In [62]:
"""
Creating scoring of loss
Creating dataframe to store regularization results
"""

# Calculate quality categories
def loss_score(calibration_log):
    """Score a calibration log by how many targets are hit closely.

    Each row of ``calibration_log`` is classified by its ``rel_abs_error``:
    below 0.05 earns 100 points, from 0.05 up to (but excluding) 0.20 earns
    75 points, anything else earns nothing. The result is the average number
    of points per row, so higher means a better fit.
    """
    errors = calibration_log["rel_abs_error"]

    is_excellent = errors < 0.05  # < 5% error
    is_good = ~is_excellent & (errors < 0.20)  # 5-20% error

    points = 100 * is_excellent.sum() + 75 * is_good.sum()
    return points / len(calibration_log)



# Results table, seeded with a single baseline row (no minimisation strategy).
baseline_row = {
    'strategy': 'none',
    'parameter': 'none',
    'dataset_size': 41310,
    'total_loss': 6.9e-3,
}
reg_results_df = pd.DataFrame([baseline_row])

def add_result(df, strategy, parameter, dataset_size, total_loss):
    """Return ``df`` with one or more result rows appended.

    Parameters
    ----------
    df : pd.DataFrame
        Results table to extend. It is not modified.
    strategy, parameter, dataset_size, total_loss : scalar or list-like
        Values for the new row(s). Scalars (strings included) describe a
        single row; list-likes of equal length describe several rows at once.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` followed by the new row(s), re-indexed from 0.
    """
    def _as_column(value):
        # is_list_like is False for strings, so 'l1' becomes ['l1'] rather
        # than being split into characters.
        return list(value) if pd.api.types.is_list_like(value) else [value]

    new_rows = pd.DataFrame({
        'strategy': _as_column(strategy),
        'parameter': _as_column(parameter),
        'dataset_size': _as_column(dataset_size),
        'total_loss': _as_column(total_loss),
    })
    # Append to the frame that was passed in. The previous version appended to
    # the global reg_results_df and silently ignored the df argument.
    return pd.concat([df, new_rows], ignore_index=True)

# Example usage
#reg_results_df = add_result(reg_results_df, ['L1', 'L2'], ['0.001','0.002'], [35000, 4000], [7.2e-3, 7.2e-3])
reg_results_df

Unnamed: 0,strategy,parameter,dataset_size,total_loss
0,none,none,41310,0.0069


In [63]:
"""
Pulling values from created calibration_log.csv and .h5 files to populate the line plot dataframe

( I need to pull the strategy (folder name), parameter (from file title??), dataset size (from length of .h5 file), and total loss (from sum of loss column in calibration_log_file.csv))

approaches = ["l0_exp", "l1"] 
penalty_weights = [1e-2, 1e-1]
"""
approaches = ["l0_sigmoid", "l0_log", "l0_exp", "l1"]
penalty_weights = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
og_size = 41310  # Original size of the dataset
og_loss = 6.9e-3  # Original loss from the baseline dataset

# Collect one row per (approach, penalty weight) from the files written by the
# reweighting cell.
# NOTE(review): this cell appends to reg_results_df, so running it twice
# duplicates every row - re-run the cell that creates reg_results_df first.
for approach in approaches:
    strategy = approach
    # Row with parameter 1.0 that carries the baseline size and loss.
    reg_results_df = add_result(reg_results_df, strategy, 1.0, og_size, og_loss)
    for penalty_weight in penalty_weights:
        parameter = penalty_weight

        # Pull length of .h5 file. The file is opened in a with-block so the
        # handle is closed again; it used to stay open on every iteration.
        # NOTE(review): this counts every stored weight; presumably households
        # whose weight was driven to (near) zero should be excluded - confirm.
        h5_name = f"enhanced_cps_2024_{strategy}_{parameter}_minimised.h5"
        h5_path = get_output_path(strategy, h5_name)
        with h5py.File(h5_path, "r") as h5_file:
            dataset_size = len(h5_file['household_weight/2024'])

        # Score the calibration log.
        # NOTE(review): loss_score returns a quality score, while the baseline
        # rows store og_loss in the same 'total_loss' column - the two are not
        # on the same scale; confirm which metric is intended.
        cal_log_name = f"calibration_log_{strategy}_{parameter}.csv"
        cal_log_path = get_output_path(strategy, cal_log_name)
        calibration_log = pd.read_csv(cal_log_path)
        loss_value = loss_score(calibration_log)

        reg_results_df = add_result(reg_results_df, strategy, parameter, dataset_size, loss_value)



'''

fraction = [0.5, 0.6, 0.7, 0.8, 0.9]

for fraction in fraction:
    strategy = "random_sampling_minimization"
    parameter = fraction

    # Pull length of .h5 file
    h5_name = f"{fraction}_enhanced_cps_2024_random_sampling_minimization_minimised.h5"
    h5_path = STORAGE_FOLDER / strategy / h5_name
    dataset_size = len(h5py.File(h5_path, "r")['household_weight/2024'])

    # Pull sum of loss column
    cal_log_name = f"{fraction}_enhanced_cps_2024_random_sampling_minimization_minimised_calibration_log.csv"
    cal_log_path = STORAGE_FOLDER / strategy / cal_log_name
    total_loss = pd.read_csv(cal_log_path)['loss'].sum()

    add_result(df, strategy, parameter, dataset_size, total_loss)

'''
reg_results_df


Unnamed: 0,strategy,parameter,dataset_size,total_loss
0,none,none,41310,0.0069
1,l0_sigmoid,1.0,41310,0.0069
2,l0_sigmoid,0.1,41310,39.2959
3,l0_sigmoid,0.01,41310,39.2959
4,l0_sigmoid,0.001,41310,39.2959
5,l0_sigmoid,0.0001,41310,39.2959
6,l0_sigmoid,0.00001,41310,39.2959
7,l0_log,1.0,41310,0.0069
8,l0_log,0.1,41310,39.2959
9,l0_log,0.01,41310,39.2959


## Plotting

In [None]:
'''
Synthetic dataset
'''

# Grid of strategies and penalty parameters for the synthetic table
strategies = ['l0_sigmoid', 'l0_log', 'l0_exp', 'l1']
parameters = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001]

# End points of the synthetic size and loss ramps
base_size = 41310
min_size = 20000
base_loss = 0.0069
max_loss = 40.0

# Baseline row first, then one row per (strategy, parameter)
rows = [{'strategy': 'none', 'parameter': 'none', 'dataset_size': base_size, 'total_loss': base_loss}]

last_step = len(parameters) - 1
for strategy in strategies:
    for i, param in enumerate(parameters):
        # Position along the ramp: 0.0 at the first parameter, 1.0 at the last.
        progress = i / last_step
        # Size falls linearly from base_size to min_size while the loss rises
        # linearly from base_loss to max_loss.
        size = int(base_size - (base_size - min_size) * progress)
        loss = round(base_loss + (max_loss - base_loss) * progress, 4)
        rows.append(
            dict(strategy=strategy, parameter=param, dataset_size=size, total_loss=loss)
        )

# This replaces any previously collected reg_results_df with the synthetic rows.
reg_results_df = pd.DataFrame(rows)

# Display
print(reg_results_df)