# In [None]:
import torch
import pandas as pd
import numpy as np
from policyengine_us import Microsimulation
import plotly.express as px

# Build the microsimulation and read the calibration parameters at 2023-01-01.
simulation = Microsimulation()
parameters = simulation.tax_benefit_system.parameters.calibration("2023-01-01")

# Household weights for 2023 as a float32 tensor. This tensor does not
# require gradients; only `weight_adjustment` below does.
household_weights = torch.tensor(
    simulation.calculate("household_weight", 2023).values,
    dtype=torch.float32,
)
# Zero initial adjustment with the same shape as the weights, so the first
# iteration starts from the unadjusted weights.
weight_adjustment = torch.zeros(
    household_weights.shape,
    dtype=torch.float32,
    requires_grad=True,
)

# Per-household metric values (one column per metric), plus the target and the
# normalisation divisor for each metric, all keyed by the same metric name.
values_df = pd.DataFrame()
targets = {}
equivalisation = {}

# We need to normalise the targets. Common regression targets are often 1e1 to 1e3 (this informs the scale of the learning rate).
COUNT_HOUSEHOLDS = household_weights.sum().item()
FINANCIAL_EQUIVALISATION = COUNT_HOUSEHOLDS
POPULATION_EQUIVALISATION = COUNT_HOUSEHOLDS / 1e5

# Financial totals: for each variable, store the household-level values, the
# matching `agi_by_source.projections` parameter as the target, and the
# financial normalisation divisor. All three containers use the variable name
# as key and are filled in the same order.
# TODO: presumably the intended full list is the one below; only the first
# four are included so far — confirm before extending.
# adjusted_gross_income, employment_income, taxable_interest_and_ordinary_dividends, qualified_dividend_income, net_capital_gain, self_employment_income, taxable_pension_income, taxable_social_security, irs_other_income, above_the_line_deductions
FINANCIAL_VARIABLES = (
    "adjusted_gross_income",
    "employment_income",
    "taxable_interest_and_ordinary_dividends",
    "qualified_dividend_income",
)

for financial_variable in FINANCIAL_VARIABLES:
    values_df[financial_variable] = simulation.calculate(
        financial_variable, 2023, map_to="household"
    ).values
    targets[financial_variable] = getattr(
        parameters.agi_by_source.projections, financial_variable
    )
    equivalisation[financial_variable] = FINANCIAL_EQUIVALISATION

# Total population
values_df["population"] = simulation.calculate("people", 2023, map_to="household").values
targets["population"] = parameters.populations.total
equivalisation["population"] = POPULATION_EQUIVALISATION

# Population by 5-year age group.
# The period is passed explicitly so that `age` is computed for the same year
# (2023) as every other variable in this script.
age = simulation.calculate("age", 2023).values
baseline_weights = household_weights.numpy()
# Lower bounds run 0, 5, ..., 85, so the last band is [85, 90); anyone aged 90
# or over falls into no band.
for lower_age_group in range(0, 90, 5):
    upper_age_group = lower_age_group + 5
    band_name = f"population_{lower_age_group}_to_{upper_age_group}"
    in_age_range = (age >= lower_age_group) & (age < upper_age_group)
    count_people_in_range = simulation.map_result(in_age_range, "person", "household")
    values_df[band_name] = count_people_in_range
    # The target is the band's count under the unadjusted weights, not an
    # external parameter.
    targets[band_name] = (baseline_weights * count_people_in_range).sum()
    equivalisation[band_name] = POPULATION_EQUIVALISATION

# Pack the per-metric targets and normalisation divisors into float32 tensors,
# in the insertion order of their dicts.
targets_array = torch.tensor([*targets.values()], dtype=torch.float32)
equivalisation_factors_array = torch.tensor(
    [*equivalisation.values()],
    dtype=torch.float32,
)

loss_values = []

def aggregate(adjusted_weights: torch.Tensor, values: pd.DataFrame) -> torch.Tensor:
    """Return the weighted column totals of ``values``.

    Args:
        adjusted_weights: Tensor holding one weight per row of ``values``.
        values: Frame of numeric columns; it is converted to a float32 tensor.

    Returns:
        A tensor of shape ``(1, n_columns)`` whose entry ``j`` is the sum over
        rows of ``weight * values[:, j]``.
    """
    # Lay the weights out as a single row so the product is (1, n) @ (n, k).
    weight_row = adjusted_weights.reshape(1, -1)
    value_matrix = torch.tensor(values.values, dtype=torch.float32)
    return weight_row @ value_matrix

# Gradient-descent settings.
N_ITERATIONS = 10_000
LOG_EVERY = 50
LEARNING_RATE = 1e-1

# Loop invariants: metric names, raw targets and the normalised targets do not
# change between iterations, so compute them once.
metric_names = list(targets.keys())
metric_targets = list(targets.values())
normalised_targets = targets_array / equivalisation_factors_array

# Checkpoint frames are collected in a list and concatenated once after the
# loop: `DataFrame.append` was removed in pandas 2.0 and re-copied the whole
# log on every call.
training_log_frames = []

for i in range(N_ITERATIONS):
    # ReLU clamps every adjusted weight at zero from below.
    adjusted_weights = torch.relu(household_weights + weight_adjustment)
    result = aggregate(adjusted_weights, values_df) / equivalisation_factors_array
    # Sum of squared errors between normalised aggregates and normalised targets.
    loss = torch.sum((result - normalised_targets) ** 2)
    loss.backward()
    if i % LOG_EVERY == 0:
        # Scalar loss at this checkpoint (not written to the log).
        current_loss = loss.item()
        # Undo the normalisation so logged values are on the same scale as the
        # raw targets; `result` has shape (1, n_metrics), hence the [0].
        current_aggregates = (result * equivalisation_factors_array).detach().numpy()[0]
        training_log_frames.append(
            pd.DataFrame({
                "name": metric_names,
                "epoch": [i] * len(metric_names),
                "value": list(current_aggregates),
                "target": metric_targets,
            })
        )
    # Plain gradient-descent step, done under no_grad so the in-place update
    # of the leaf tensor is not tracked by autograd.
    with torch.no_grad():
        weight_adjustment -= LEARNING_RATE * weight_adjustment.grad
    weight_adjustment.grad.zero_()

# Each checkpoint frame keeps its own 0..n_metrics-1 index, as before.
training_log_df = pd.concat(training_log_frames)
training_log_df.to_csv("training_log.csv.gz", compression="gzip")