In [20]:
import torch
from policyengine_uk import Microsimulation
import pandas as pd
import numpy as np
from tqdm import tqdm

# Fill in missing constituencies with average column values
import pandas as pd
import numpy as np

from policyengine_uk_data.utils.loss import create_target_matrix as create_national_target_matrix

ages = pd.read_csv("targets/age.csv")
incomes = pd.read_csv("targets/income.csv")

ENGLAND_CONSTITUENCY = "E14"
NI_CONSTITUENCY = "N06"
SCOTLAND_CONSTITUENCY = "S14"
WALES_CONSTITUENCY = "W07"

incomes = incomes[np.any([
    incomes["code"].str.contains(country_code) for country_code in [ENGLAND_CONSTITUENCY, NI_CONSTITUENCY, SCOTLAND_CONSTITUENCY, WALES_CONSTITUENCY]
], axis=0)]

full_constituencies = incomes.code
missing_constituencies = pd.Series(list(set(incomes.code) - set(ages.code)))
missing_constituencies = pd.DataFrame({"code": missing_constituencies.values, "name": incomes.set_index("code").loc[missing_constituencies].name.values})
for col in ages.columns[2:]:
    missing_constituencies[col] = ages[col].mean()

ages = pd.concat([ages, missing_constituencies])

sim = Microsimulation()

def create_target_matrix():
    matrix = pd.DataFrame()
    y = pd.DataFrame()

    total_income = sim.calculate("total_income", period=2025).values
    matrix["hmrc/total_income_amount"] = sim.map_result(total_income, "person", "household")
    y["hmrc/total_income_amount"] = incomes["total_income_amount"]

    matrix["hmrc/total_income_count"] = sim.map_result(total_income != 0, "person", "household")
    y["hmrc/total_income_count"] = incomes["total_income_count"]

    age = sim.calculate("age", period=2025)

    for lower_age in range(0, 80, 10):
        upper_age = lower_age + 10
        
        in_age_band = (age >= lower_age) & (age < upper_age)

        age_str = f"{lower_age}_{upper_age}"
        matrix[f"age/{age_str}"] = sim.map_result(in_age_band, "person", "household")

        age_count = ages[[str(age) for age in range(lower_age, upper_age)]].sum(axis=1)

        age_str = f"{lower_age}_{upper_age}"
        y[f"age/{age_str}"] = age_count.values

    return matrix, y

matrix, y = create_target_matrix()

m_national, y_national = create_national_target_matrix("enhanced_frs_2022_23", 2022)

# Weights - 650 x 100180
original_weights = np.log(sim.calculate("household_weight", 2022).values / 650)
weights = torch.tensor(np.ones((650, 100180)) * original_weights, dtype=torch.float32, requires_grad=True)
metrics = torch.tensor(matrix.values, dtype=torch.float32)
weighted_metrics = weights.unsqueeze(-1) * metrics.unsqueeze(0)
totals = weighted_metrics.sum(dim=1)
y = torch.tensor(y.values, dtype=torch.float32)
matrix_national = torch.tensor(m_national.values, dtype=torch.float32)
y_national = torch.tensor(y_national.values, dtype=torch.float32)

def loss(w):
    pred_c = (w.unsqueeze(-1) * metrics.unsqueeze(0)).sum(dim=1)
    mse_c = torch.mean((pred_c / (1 + y) - 1) ** 2)

    pred_n = (w.sum(axis=0) * matrix_national.T).sum(axis=1)
    mse_n = torch.mean((pred_n / (1 + y_national) - 1) ** 2)

    return mse_c + mse_n

optimizer = torch.optim.Adam([weights], lr=0.5)

desc = tqdm(range(100))

for epoch in desc:
    optimizer.zero_grad()
    l = loss(torch.exp(weights))
    desc.set_description(f"Loss: {l.item()}")
    l.backward()
    optimizer.step()

final_weights = torch.exp(weights).detach().numpy()

final_weights.sum(axis=1)

Loss: 0.005292116664350033: 100%|██████████| 100/100 [00:45<00:00,  2.19it/s]


array([36673.11 , 28450.59 , 33809.56 , 35723.895, 38148.88 , 37772.32 ,
       43031.355, 33246.973, 42625.168, 42171.363, 43424.082, 33290.605,
       34128.65 , 33101.945, 32351.42 , 40592.89 , 38468.766, 34494.137,
       38954.09 , 51673.41 , 36150.996, 33272.844, 36657.934, 49753.965,
       31133.875, 51011.074, 38906.37 , 40479.145, 33308.11 , 36028.07 ,
       37444.01 , 36049.72 , 39619.09 , 39609.92 , 46079.664, 34784.676,
       38071.707, 37473.004, 38377.22 , 35551.973, 36523.598, 40878.223,
       29612.818, 33639.87 , 32547.094, 33048.547, 41008.27 , 36963.453,
       38990.15 , 35828.312, 43681.3  , 35461.48 , 39694.117, 38584.58 ,
       36798.758, 38176.15 , 38390.734, 37937.54 , 38299.375, 39924.59 ,
       38016.008, 48653.258, 46662.05 , 44722.004, 35128.035, 40026.043,
       33289.965, 38204.605, 39409.77 , 45339.945, 36643.727, 40771.508,
       54665.848, 38320.844, 33847.926, 37206.766, 34793.04 , 36775.547,
       47159.62 , 31018.125, 37504.79 , 43079.535, 

In [55]:
from policyengine_core.reforms import Reform

emp_ni = Reform.from_dict({
  "gov.contrib.policyengine.employer_ni.employee_incidence": {
    "2025-01-01.2025-12-31": 0.4,
    "2026-01-01.2026-12-31": 0.5,
    "2027-01-01.2027-12-31": 0.6,
    "2028-01-01.2028-12-31": 0.7,
    "2028-01-01.2029-12-31": 0.7
  },
  "gov.hmrc.national_insurance.class_1.rates.employer": {
    "2025-01-01.2100-12-31": 0.15
  },
  "gov.hmrc.national_insurance.class_1.thresholds.secondary_threshold": {
    "2025-01-01.2100-12-31": 96.14
  }
}, country_id="uk")

baseline = Microsimulation()
reformed = Microsimulation(reform=emp_ni)

gain = reformed.calculate("household_net_income", 2025) - baseline.calculate("household_net_income", 2025)

In [62]:
weighted_gain = np.dot(final_weights, gain)
populations = np.dot(final_weights, np.ones((100180)))
mean_gain = weighted_gain / populations

In [63]:
incomes

Unnamed: 0,code,name,total_income_count,total_income_amount
1,E14000554,Berwick-upon-Tweed,37000.0,1.184000e+09
2,E14000569,Bishop Auckland,44000.0,1.315600e+09
3,E14000574,Blaydon,44000.0,1.425600e+09
4,E14000575,Blyth Valley,45000.0,1.305000e+09
5,E14000641,City of Durham,45000.0,1.552500e+09
...,...,...,...,...
657,N06000014,South Antrim,48000.0,1.555200e+09
658,N06000015,South Down,48000.0,1.531200e+09
659,N06000016,Strangford,40000.0,1.272000e+09
660,N06000017,Upper Bann,59000.0,1.728700e+09


In [65]:
df = pd.DataFrame({
    "mean_gain": mean_gain,
    "name": incomes.name.values,
})

In [71]:
import plotly.express as px

px.scatter(df.sort_values("mean_gain"), x="name", y="mean_gain")