In [1]:
from policyengine_us import Microsimulation
from tqdm import tqdm
import numpy as np
import pandas as pd

# Number of household slices that are simulated separately.
CHUNKS = 5

sim = Microsimulation()

# Household IDs (2024) of the full simulation; used to partition its input rows.
households = sim.calculate("household_id", 2024).values
# Floor division plus one, so CHUNKS slices always cover every household.
chunk_size = len(households) // CHUNKS + 1
input_df = sim.to_input_dataframe()

# (variable, period) pairs calculated inside every chunk simulation.
VARIABLES = [
    ("household_net_income", 2025),
]

# Per-chunk frames are gathered here and concatenated once after the loop,
# rather than re-copying an ever-growing DataFrame on each iteration.
chunk_frames = []

for i in tqdm(range(CHUNKS)):
    households_in_chunk = households[i * chunk_size : (i + 1) * chunk_size]
    chunk_df = input_df[
        input_df["household_id__2024"].isin(households_in_chunk)
    ]

    subset_sim = Microsimulation(dataset=chunk_df)

    # The returned values are not used here; the calculation is run before the
    # export below (presumably so the exported frame carries the computed
    # columns -- TODO confirm against to_input_dataframe).
    for variable, time_period in VARIABLES:
        subset_sim.calculate(variable, time_period)

    chunk_df = subset_sim.to_input_dataframe()
    chunk_frames.append(chunk_df)

new_input_df = pd.concat(chunk_frames)

# NOTE(review): the recorded run of this cell finished all chunks and then
# raised "ValueError: Inconsistent input: variable employment_income has already
# been set for all months contained in period 2025-01 ...", i.e. in one of the
# two statements below. The exported frame holds both an annual
# employment_income__2025 column and monthly employment_income__2025-MM columns,
# which is presumably the trigger -- confirm before relying on this round trip.
sim = Microsimulation(dataset=new_input_df)

sim.calculate("household_net_income", 2025)

100%|██████████| 5/5 [00:22<00:00,  4.45s/it]


ValueError: Inconsistent input: variable employment_income has already been set for all months contained in period 2025-01, and value [0. 0. 0. ... 0. 0. 0.] provided for 2025-01 doesn't match the total ([0. 0. 0. ... 0. 0. 0.]). This error may also be thrown if you try to call set_input twice for the same variable and period.

In [2]:
# Display the frame assembled from the per-chunk input dataframes.
new_input_df

Unnamed: 0,employment_income_before_lsr__2024,employment_income_before_lsr__2025,self_employment_income_before_lsr__2024,self_employment_income_before_lsr__2025,employment_income__2025,employment_income__2025-01,employment_income__2025-12,employment_income__2025-02,employment_income__2025-03,employment_income__2025-04,...,or_retirement_credit_eligible__2025,or_retirement_credit__2025,or_kicker__2025,or_tax_before_credits_in_prior_year__2025,wa_capital_gains_tax__2025,wa_working_families_tax_credit__2025,eitc_agi_limit__2025,is_tanf_enrolled__2025,is_demographic_tanf_eligible__2025,is_person_demographic_tanf_eligible__2025
0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,
1,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,
2,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,
3,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,
4,41750.621094,43643.113281,0.0,0.0,43643.113281,3636.926025,3636.926025,3636.926025,3636.926025,3636.926025,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30818,83501.242188,87286.226562,0.0,0.0,87286.226562,7273.852051,7273.852051,7273.852051,7273.852051,7273.852051,...,False,0.0,0.0,0.0,0.0,0.0,68675.125,False,True,False
30819,52188.273438,54553.890625,0.0,0.0,54553.890625,4546.157715,4546.157715,4546.157715,4546.157715,4546.157715,...,False,0.0,0.0,0.0,0.0,0.0,68675.125,False,True,False
30820,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,False,0.0,0.0,0.0,0.0,0.0,68675.125,False,True,True
30821,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,False,0.0,0.0,0.0,0.0,0.0,68675.125,False,True,True


In [2]:
from time import time


def calc_chunks(count_chunks=1):
    """Time a 2025 ``household_net_income`` calculation, optionally chunked.

    A full ``Microsimulation`` is always built. When ``count_chunks`` is
    greater than 1, the households are split into ``count_chunks`` slices;
    each slice is simulated on its own from the matching rows of the full
    input dataframe, the per-chunk results are concatenated in chunk order,
    and the combined array is fed back into the full simulation with
    ``set_input`` before the final ``calculate`` call.

    Args:
        count_chunks: Number of household slices. Values of 1 or less skip
            the chunked path entirely.

    Returns:
        Elapsed time in seconds (float), covering simulation construction,
        any chunked work and the final calculation.

    Side effects:
        Prints the chunk count and the elapsed time.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way a wall-clock difference can.
    from time import perf_counter

    print(f"CHUNKS: {count_chunks}")
    start = perf_counter()
    sim = Microsimulation()

    if count_chunks > 1:
        households = sim.calculate("household_id", 2024).values
        # Floor division plus one, so the slices always cover every household.
        chunk_size = len(households) // count_chunks + 1
        input_df = sim.to_input_dataframe()

        # (variable, period) pairs computed per chunk and then injected below.
        VARIABLES = [
            ("household_net_income", 2025),
        ]

        # One accumulator per variable, extended chunk by chunk.
        variable_data = {variable: np.array([]) for variable, _ in VARIABLES}

        for i in tqdm(range(count_chunks)):
            households_in_chunk = households[
                i * chunk_size : (i + 1) * chunk_size
            ]
            chunk_df = input_df[
                input_df["household_id__2024"].isin(households_in_chunk)
            ]

            subset_sim = Microsimulation(dataset=chunk_df)

            for variable, time_period in VARIABLES:
                chunk_values = subset_sim.calculate(
                    variable, time_period
                ).values
                variable_data[variable] = np.concatenate(
                    [variable_data[variable], chunk_values]
                )

        # Hand the stitched-together results to the full simulation.
        for variable, time_period in VARIABLES:
            sim.set_input(variable, time_period, variable_data[variable])

    sim.calculate("household_net_income", 2025)

    elapsed = perf_counter() - start
    print(f"Time: {elapsed}")
    return elapsed

In [3]:
# Chunk counts to benchmark; `times` holds what calc_chunks returns for each.
chunk_tests = [1, 2, 3, 4, 5, 10]
times = list(map(calc_chunks, chunk_tests))

CHUNKS: 1
Time: 18.444103002548218
CHUNKS: 2


100%|██████████| 2/2 [00:15<00:00,  7.78s/it]


Time: 15.732024908065796
CHUNKS: 3


100%|██████████| 3/3 [00:15<00:00,  5.12s/it]


Time: 15.487107038497925
CHUNKS: 4


100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


Time: 15.63786792755127
CHUNKS: 5


100%|██████████| 5/5 [00:15<00:00,  3.18s/it]


Time: 16.04463005065918
CHUNKS: 10


100%|██████████| 10/10 [00:18<00:00,  1.83s/it]

Time: 18.4305682182312





In [8]:
import pandas as pd

# Tabulate the benchmark results and show them rounded to one decimal place.
df = pd.DataFrame(dict(Chunks=chunk_tests, Time=times))
df.round(1)

Unnamed: 0,Chunks,Time
0,1,18.4
1,2,15.7
2,3,15.5
3,4,15.6
4,5,16.0
5,10,18.4
