In [5]:
from policyengine_us import Microsimulation
from tqdm import tqdm
import numpy as np
import pandas as pd

# Number of household chunks the full dataset is split into.
CHUNKS = 20


from policyengine_core.reforms import Reform

# Parameter overrides applied to every simulation built in this cell.
reform = Reform.from_dict({
    "gov.contrib.cbo.labor_supply.elasticities": True,
    "gov.simulation.capital_gains_responses.elasticity": -0.7,
})
sim = Microsimulation(reform=reform)

households = sim.calculate("household_id", 2024).values
input_df = sim.to_input_dataframe()

# Contiguous, near-equal slices of the household ids. np.array_split never
# leaves a run of empty trailing slices the way `len // CHUNKS + 1` can
# (e.g. 40 ids over 20 chunks -> chunk size 3 -> six empty chunks); slices
# that are still empty (fewer ids than CHUNKS) are skipped outright.
household_chunks = [
    chunk for chunk in np.array_split(households, CHUNKS) if len(chunk) > 0
]

# (variable name, period) pairs computed per chunk and written back below.
VARIABLES_TO_STORE = [
    ("household_net_income", 2025),
]

# Per-chunk result arrays, gathered in chunk order and joined once after the
# loop instead of re-copying the growing array on every iteration.
chunk_results = {key: [] for key in VARIABLES_TO_STORE}

# NOTE(review): the saved output of this cell shows a RecursionError raised
# while the first chunk was being calculated (stack inside
# `baseline_lsr_measurement`). Nothing below addresses that failure.
for households_in_chunk in tqdm(household_chunks):
    # drop=True keeps the old row labels from being inserted as an extra
    # "index" column in the frame handed to Microsimulation.
    chunk_df = input_df[
        input_df["household_id__2024"].isin(households_in_chunk)
    ].reset_index(drop=True)

    subset_sim = Microsimulation(dataset=chunk_df, reform=reform)

    for variable, time_period in VARIABLES_TO_STORE:
        chunk_results[(variable, time_period)].append(
            subset_sim.calculate(variable, time_period).values
        )

# One array per (variable, period), in the same order the chunks were run.
variable_data = {
    key: np.concatenate(parts) if parts else np.array([])
    for key, parts in chunk_results.items()
}

# Fresh full-size simulation that receives the chunked results as inputs.
sim = Microsimulation(dataset=input_df, reform=reform)

for variable, time_period in VARIABLES_TO_STORE:
    sim.set_input(variable, time_period, variable_data[(variable, time_period)])

sim.calculate("household_net_income", 2025)

  0%|          | 0/20 [01:45<?, ?it/s]


Exception: RecursionError while calculating taxable_ss_magi for period 2025. The full computation stack is:
  - household_net_income 2025, baseline_lsr_measurement
  - household_benefits 2025, baseline_lsr_measurement
  - wic 2025, baseline_lsr_measurement
  - is_wic_eligible 2025, baseline_lsr_measurement
  - meets_wic_categorical_eligibility 2025, baseline_lsr_measurement
  - medicaid 2025, baseline_lsr_measurement
  - is_medicaid_eligible 2025, baseline_lsr_measurement
  - medicaid_category 2025, baseline_lsr_measurement
  - is_infant_for_medicaid 2025, baseline_lsr_measurement
  - is_infant_for_medicaid_fc 2025, baseline_lsr_measurement
  - medicaid_income_level 2025, baseline_lsr_measurement
  - tax_unit_medicaid_income_level 2025, baseline_lsr_measurement
  - medicaid_magi 2025, baseline_lsr_measurement
  - adjusted_gross_income 2025, baseline_lsr_measurement
  - irs_gross_income 2025, baseline_lsr_measurement
  - taxable_unemployment_compensation 2025, baseline_lsr_measurement
  - tax_unit_taxable_unemployment_compensation 2025, baseline_lsr_measurement
  - taxable_uc_agi 2025, baseline_lsr_measurement
  - taxable_social_security 2025, baseline_lsr_measurement
  - tax_unit_taxable_social_security 2025, baseline_lsr_measurement
  - tax_unit_combined_income_for_social_security_taxability 2025, baseline_lsr_measurement
  - taxable_ss_magi 2025, baseline_lsr_measurement

In [None]:
from time import time


def calc_chunks(count_chunks=1):
    """Time a 2025 ``household_net_income`` calculation, optionally chunked.

    With ``count_chunks`` greater than 1, the households of a default
    ``Microsimulation`` are split into that many contiguous groups, each group
    is simulated on its own, and the concatenated results are fed back into
    the full simulation with ``set_input`` before the final calculation.
    With ``count_chunks`` of 1 (or less) the full simulation is calculated
    directly.

    Args:
        count_chunks: Number of household groups to simulate separately.

    Returns:
        Elapsed wall-clock seconds as a float. The measurement includes
        constructing the simulation(s), not only the calculation.
    """
    print(f"CHUNKS: {count_chunks}")
    start = time()
    sim = Microsimulation()

    if count_chunks > 1:
        households = sim.calculate("household_id", 2024).values
        input_df = sim.to_input_dataframe()

        VARIABLES = [
            ("household_net_income", 2025),
        ]

        # Keyed by the full (variable, period) pair so the same variable can
        # be stored for several periods without the arrays being merged.
        chunk_results = {key: [] for key in VARIABLES}

        # np.array_split gives near-equal contiguous slices; the previous
        # `len // count_chunks + 1` size could leave trailing chunks empty.
        household_chunks = [
            chunk
            for chunk in np.array_split(households, count_chunks)
            if len(chunk) > 0
        ]

        for households_in_chunk in tqdm(household_chunks):
            chunk_df = input_df[
                input_df["household_id__2024"].isin(households_in_chunk)
            ]

            subset_sim = Microsimulation(dataset=chunk_df)

            for variable, time_period in VARIABLES:
                chunk_results[(variable, time_period)].append(
                    subset_sim.calculate(variable, time_period).values
                )

        for variable, time_period in VARIABLES:
            parts = chunk_results[(variable, time_period)]
            # Join once, in chunk order, rather than growing an array per loop.
            values = np.concatenate(parts) if parts else np.array([])
            sim.set_input(variable, time_period, values)

    sim.calculate("household_net_income", 2025)

    end = time()
    print(f"Time: {end - start}")
    return end - start

In [None]:
# Chunk counts to benchmark, run in the listed order; each call returns the
# elapsed seconds for that configuration.
chunk_tests = [1, 2, 3, 4, 5, 10]
times = list(map(calc_chunks, chunk_tests))

CHUNKS: 1
Time: 18.444103002548218
CHUNKS: 2


100%|██████████| 2/2 [00:15<00:00,  7.78s/it]


Time: 15.732024908065796
CHUNKS: 3


100%|██████████| 3/3 [00:15<00:00,  5.12s/it]


Time: 15.487107038497925
CHUNKS: 4


100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


Time: 15.63786792755127
CHUNKS: 5


100%|██████████| 5/5 [00:15<00:00,  3.18s/it]


Time: 16.04463005065918
CHUNKS: 10


100%|██████████| 10/10 [00:18<00:00,  1.83s/it]

Time: 18.4305682182312





In [None]:
import pandas as pd

# Benchmark summary: one row per chunk count with its elapsed seconds,
# displayed rounded to one decimal place.
df = pd.DataFrame(dict(Chunks=chunk_tests, Time=times))
df.round(decimals=1)

Unnamed: 0,Chunks,Time
0,1,18.4
1,2,15.7
2,3,15.5
3,4,15.6
4,5,16.0
5,10,18.4
