In [1]:
from policyengine_us import Microsimulation
from tqdm import tqdm
import numpy as np

# Number of household chunks the full simulation is split into.
CHUNKS = 5

# Full simulation: supplies the household ids and the input frame that the
# per-chunk simulations are built from, and receives the stitched results.
sim = Microsimulation()

households = sim.calculate("household_id", 2024).values
input_df = sim.to_input_dataframe()

# (variable, period) pairs that are computed chunk-by-chunk.
VARIABLES = [
    ("household_net_income", 2025),
]

# np.array_split yields near-equal pieces. A fixed slice size of
# `len // CHUNKS + 1` can leave trailing slices empty when CHUNKS is large
# relative to the household count; any empty piece (more chunks than
# households) is dropped so no simulation is built on an empty frame.
household_chunks = [
    chunk for chunk in np.array_split(households, CHUNKS) if len(chunk) > 0
]

# Per-chunk outputs, keyed by the full (variable, period) pair so the same
# variable requested for two periods is not merged into a single array.
chunk_results = {target: [] for target in VARIABLES}

for households_in_chunk in tqdm(household_chunks):
    chunk_df = input_df[
        input_df["household_id__2024"].isin(households_in_chunk)
    ]

    subset_sim = Microsimulation(dataset=chunk_df)

    for variable, time_period in VARIABLES:
        chunk_results[(variable, time_period)].append(
            subset_sim.calculate(variable, time_period).values
        )

# Concatenate once per target. The leading empty float array keeps the dtype
# promotion of the incremental np.concatenate this replaces and keeps the call
# valid when no chunk produced output.
# NOTE(review): stitching chunk outputs back together assumes each chunk
# simulation returns households in the same relative order as the full
# simulation - confirm.
variable_data = {
    target: np.concatenate([np.array([]), *parts])
    for target, parts in chunk_results.items()
}

for (variable, time_period), values in variable_data.items():
    sim.set_input(variable, time_period, values)

# Last expression of the cell: displayed as the cell output.
sim.calculate("household_net_income", 2025)

100%|██████████| 5/5 [00:16<00:00,  3.25s/it]


               value       weight
0       40366.988281   615.080688
1       51688.554688  1595.025757
2       15619.054688   593.639709
3       20304.685547  1527.942261
4        4490.224609  1555.630493
...              ...          ...
56246   38931.617188   406.693298
56247   52792.472656   717.275391
56248    6511.081055   592.954590
56249  127794.601562   391.826752
56250  126043.585938   731.477356

[56251 rows x 2 columns]

In [2]:
from time import time


def calc_chunks(count_chunks=1):
    """Time a 2025 `household_net_income` calculation, optionally chunked.

    A full `Microsimulation` is always built. When `count_chunks` is greater
    than 1, the households are split into that many groups, a separate
    `Microsimulation` is built from each group's rows of the input dataframe,
    and the concatenated per-chunk results are fed back into the full
    simulation with `set_input` before the final `calculate` call.

    Args:
        count_chunks: Number of household chunks. Values of 1 or less skip
            chunking and run the calculation directly on the full simulation.

    Returns:
        Elapsed wall-clock time in seconds (float), covering construction of
        the full simulation through the final calculation.

    Side effects:
        Prints the chunk count and the elapsed time; shows a tqdm progress
        bar when chunking is used.
    """
    print(f"CHUNKS: {count_chunks}")
    start = time()
    sim = Microsimulation()

    if count_chunks > 1:
        households = sim.calculate("household_id", 2024).values
        input_df = sim.to_input_dataframe()

        VARIABLES = [
            ("household_net_income", 2025),
        ]

        # np.array_split yields near-equal pieces. A fixed slice size of
        # `len // count_chunks + 1` can leave trailing slices empty when
        # count_chunks is large relative to the household count; empty
        # pieces (more chunks than households) are dropped so no simulation
        # is built on an empty frame.
        household_chunks = [
            chunk
            for chunk in np.array_split(households, count_chunks)
            if len(chunk) > 0
        ]

        # Keyed by the full (variable, period) pair so the same variable
        # requested for two periods is not merged into a single array.
        chunk_results = {target: [] for target in VARIABLES}

        for households_in_chunk in tqdm(household_chunks):
            chunk_df = input_df[
                input_df["household_id__2024"].isin(households_in_chunk)
            ]

            subset_sim = Microsimulation(dataset=chunk_df)

            for variable, time_period in VARIABLES:
                chunk_results[(variable, time_period)].append(
                    subset_sim.calculate(variable, time_period).values
                )

        # NOTE(review): stitching chunk outputs assumes each chunk simulation
        # returns households in the same relative order as the full
        # simulation - confirm.
        for (variable, time_period), parts in chunk_results.items():
            # The leading empty float array keeps the dtype promotion of the
            # incremental np.concatenate this replaces and keeps the call
            # valid when no chunk produced output.
            sim.set_input(
                variable, time_period, np.concatenate([np.array([]), *parts])
            )

    sim.calculate("household_net_income", 2025)

    elapsed = time() - start
    print(f"Time: {elapsed}")
    return elapsed

In [3]:
# Chunk counts to benchmark; a count of 1 takes calc_chunks' unchunked path.
chunk_tests = [1, 2, 3, 4, 5, 10]

# Elapsed seconds returned by each run, in the same order as chunk_tests.
times = list(map(calc_chunks, chunk_tests))

CHUNKS: 1
Time: 18.444103002548218
CHUNKS: 2


100%|██████████| 2/2 [00:15<00:00,  7.78s/it]


Time: 15.732024908065796
CHUNKS: 3


100%|██████████| 3/3 [00:15<00:00,  5.12s/it]


Time: 15.487107038497925
CHUNKS: 4


100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


Time: 15.63786792755127
CHUNKS: 5


100%|██████████| 5/5 [00:15<00:00,  3.18s/it]


Time: 16.04463005065918
CHUNKS: 10


100%|██████████| 10/10 [00:18<00:00,  1.83s/it]

Time: 18.4305682182312





In [8]:
import pandas as pd

# One row per benchmarked chunk count with its elapsed seconds; the rounded
# frame is the last expression, so it is what the cell displays.
df = pd.DataFrame(dict(Chunks=chunk_tests, Time=times))
df.round(decimals=1)

Unnamed: 0,Chunks,Time
0,1,18.4
1,2,15.7
2,3,15.5
3,4,15.6
4,5,16.0
5,10,18.4
