In [1]:
from survey_enhance.impute import Imputation

# Load a previously saved Imputation object from "income.pkl" (path is relative
# to the working directory).
# NOTE(review): the .pkl extension suggests pickle under the hood — presumably
# only safe to load from a trusted source; confirm.
income = Imputation.load("income.pkl")

In [5]:
from datasets.frs import SPIEnhancedFRS2019_20
from policyengine_uk import Microsimulation
from datasets.output_dataset import OutputDataset

# Derive an OutputDataset from the SPIEnhancedFRS2019_20 dataset class.
# NOTE(review): 2019 and 2022 look like a start year and an end year — confirm
# against the signature of OutputDataset.from_dataset.
dataset = OutputDataset.from_dataset(
    SPIEnhancedFRS2019_20, 2019, 2022
)

In [6]:
# Instantiate the dataset and inspect the `person_household_id` column of its
# `person` table.
dataset().person.person_household_id

0            1000.0
1            2000.0
2            3000.0
3            3000.0
4            4000.0
            ...    
86623    19210000.0
86624    19210000.0
86625    19210000.0
86626    19210000.0
86627    19210000.0
Name: person_household_id, Length: 86628, dtype: float32

In [32]:
# Re-run of the same `person_household_id` inspection as the earlier cell; the
# displayed output is identical.
# NOTE(review): duplicate cell — candidate for removal.
dataset().person.person_household_id

0            1000.0
1            2000.0
2            3000.0
3            3000.0
4            4000.0
            ...    
86623    19210000.0
86624    19210000.0
86625    19210000.0
86626    19210000.0
86627    19210000.0
Name: person_household_id, Length: 86628, dtype: float32

In [3]:
# Build the simulation in this cell so it runs on a fresh kernel: `sim` was
# otherwise only created in a much later cell, so running the notebook top to
# bottom raised NameError here. The arguments mirror that later cell.
# NOTE(review): confirm SPIEnhancedFRS2019_20 with dataset_year=2019 is the
# intended input for the imputation predictors.
sim = Microsimulation(
    dataset=SPIEnhancedFRS2019_20(),
    dataset_year=2019,
)

# Predictor columns handed to the imputation models, calculated for 2019.
input_df = sim.calculate_dataframe(["age", "gender", "region"], 2019)

In [4]:
import pandas as pd

# Read the tab-separated file put1920uk.tab from ~/ukda/spi_2019_20.
# NOTE(review): the hard-coded home-directory path only resolves on machines
# with this layout; also, the name `df` is reassigned by a later cell that
# reads log.csv, so cells using the columns below must run before that one.
df = pd.read_csv("~/ukda/spi_2019_20/put1920uk.tab", delimiter="\t")

In [5]:
# Ratio of 1_016 to the FACT-weighted total of PAY + EPB + TAXTERM, with that
# total expressed in billions (divided by 1e9).
# NOTE(review): 1_016 presumably is the 1.016e12 aggregate in billions — confirm.
uprating = 1_016 / (
    df[["PAY", "EPB", "TAXTERM"]].sum(axis=1).mul(df.FACT).sum() / 1e9
)

In [6]:
# FACT-weighted sum of PENSRLF, multiplied by 1.16 and expressed in billions
# (divided by 1e9).
# NOTE(review): 1.16 is an unexplained factor — presumably an uprating
# adjustment; confirm its source and promote it to a named constant.
(df.FACT * df.PENSRLF).sum() * 1.16 / 1e9

31.875880377859442

In [7]:
# Inspect the `n_features_in_` attribute of the estimator wrapped by the second
# imputation model (index 1).
# NOTE(review): presumably the scikit-learn convention, i.e. the number of
# input features seen when fitting — confirm.
income.models[1].model.n_features_in_

4

In [8]:
# Aggregate targets passed to income.solve_for_mean_quantiles.
# NOTE(review): presumably one entry per model in income.models, in the same
# order and in pounds — confirm, and label each entry with its variable and
# data source (only the first entry documents where it came from).
TARGETS = [
    1.016e12, # From up-to-date published RTI data
    123.3e9,
    7.25e9,
    78.0e9,
    133.0e9,
    10.3e9,
    30.9e9,
    4.0e9,
    31.9e9,
]

In [9]:
# Solve for the quantiles that go with TARGETS, passing the predictor frame and
# the household weights mapped onto persons (as a raw values array).
# The call logs a "PREDICTED ... (target: ...)" line and an "Iteration ..."
# line for every step of its search.
mean_quantiles = income.solve_for_mean_quantiles(
    TARGETS,
    input_df,
    sim.calculate("household_weight", map_to="person").values,
)

PREDICTED: 1460.1 (target: 1016.0)
Iteration 0: 0.5000 (loss: 197220077408910871363584.0000)
PREDICTED: 606.6 (target: 1016.0)
Iteration 1: 0.2500 (loss: 167613208197332740341760.0000)
PREDICTED: 1003.4 (target: 1016.0)
Iteration 2: 0.3750 (loss: 159901594997903917056.0000)
PREDICTED: 1202.7 (target: 1016.0)
Iteration 3: 0.4375 (loss: 34848014078835020529664.0000)
PREDICTED: 1094.5 (target: 1016.0)
Iteration 4: 0.4062 (loss: 6166413878074136330240.0000)
PREDICTED: 1057.4 (target: 1016.0)
Iteration 5: 0.3906 (loss: 1713683795859130548224.0000)
PREDICTED: 1012.9 (target: 1016.0)
Iteration 6: 0.3828 (loss: 9844276874614679552.0000)
PREDICTED: 1043.1 (target: 1016.0)
Iteration 7: 0.3867 (loss: 735348074049954643968.0000)
PREDICTED: 1032.1 (target: 1016.0)
Iteration 8: 0.3848 (loss: 259231172415278809088.0000)
PREDICTED: 1024.7 (target: 1016.0)
Iteration 9: 0.3838 (loss: 75817871100723150848.0000)
PREDICTED: 349.2 (target: 123.3)
Iteration 0: 0.5000 (loss: 51011938153385940746240.0000)
PRED

In [15]:
# Show each solved quantile rounded to two decimals by formatting it as a
# fixed-point string and parsing the string back into a float.
[float(format(quantile, ".2f")) for quantile in mean_quantiles]

[0.38, 0.24, 0.39, 0.28, 0.45, 0.43, 0.29, 0.52, 0.5]

In [10]:
# Predictions from the first imputation model (index 0) at quantile 0.5.
pred_emp = income.models[0].predict(input_df, 0.5)

# Household weights mapped onto persons, taken as the underlying values array.
weights = sim.calculate("household_weight", map_to="person").values
# Aggregate that loss() compares the weighted prediction total against.
# NOTE(review): this differs from TARGETS[0] (1.016e12) — confirm which figure
# is intended.
TARGET = 950e9



In [11]:
# Weighted total of the quantile-0.5 predictions, in billions (divided by 1e9).
# Printed explicitly: as a bare expression followed by further statements in
# the same cell, the value was computed and then silently discarded.
print((pred_emp * weights).sum() / 1e9)

def get_predictions(mean_quantile):
    """Predict with the first imputation model at the given quantile.

    Reads the notebook globals `income` and `input_df`.

    Args:
        mean_quantile: Quantile forwarded as the second argument of the
            model's `predict` call.

    Returns:
        Whatever `income.models[0].predict` returns for `input_df`.
    """
    first_model = income.models[0]
    predictions = first_model.predict(input_df, mean_quantile)
    return predictions

def loss(pred_values):
    """Squared gap between the weighted prediction total and `TARGET`.

    Reads the notebook globals `weights` and `TARGET`.

    Args:
        pred_values: Predictions to be multiplied element-wise by `weights`.

    Returns:
        (sum(pred_values * weights) - TARGET) squared.
    """
    weighted_total = (pred_values * weights).sum()
    gap = weighted_total - TARGET
    return gap ** 2

In [12]:
# Use a binary search to find the mean quantile that minimises the loss

def binary_search(min_quantile, max_quantile, max_iterations=10, signed_error=None):
    """Bisect for the quantile whose weighted prediction total matches the target.

    The interval is narrowed on the SIGN of the error (total minus target):
    a total above the target moves the upper bound down, a total below it
    moves the lower bound up. The previous version compared each loss with
    the best loss seen so far, which says nothing about which side of the
    target the midpoint lies on, so it could settle on the wrong quantile.

    Assumes the error is non-decreasing in the quantile (higher quantile ->
    higher predicted total) — TODO confirm for the model in use.

    Args:
        min_quantile: Lower bound of the search interval.
        max_quantile: Upper bound of the search interval.
        max_iterations: Number of midpoints to evaluate.
        signed_error: Optional callable mapping a quantile to
            (weighted total - target). When omitted, the notebook globals
            `get_predictions`, `weights` and `TARGET` are used, which makes
            the squared error identical to `loss(get_predictions(q))`.

    Returns:
        The evaluated midpoint with the smallest squared error, or the
        midpoint of the initial interval when `max_iterations` is 0.
    """
    if signed_error is None:
        def signed_error(quantile):
            return (get_predictions(quantile) * weights).sum() - TARGET

    # Fallback result so that max_iterations=0 returns a value instead of
    # hitting an unbound local.
    best_quantile = (min_quantile + max_quantile) / 2
    best_loss = float("inf")
    for i in range(max_iterations):
        mid_quantile = (min_quantile + max_quantile) / 2
        error = signed_error(mid_quantile)
        loss_value = error ** 2
        if loss_value < best_loss:
            best_quantile, best_loss = mid_quantile, loss_value
        print(f"iteration {i}: {mid_quantile} {loss_value} {best_loss}")
        if error > 0:
            # Overshooting the target: the answer lies below the midpoint.
            max_quantile = mid_quantile
        elif error < 0:
            # Undershooting the target: the answer lies above the midpoint.
            min_quantile = mid_quantile
        else:
            # Exact hit; further halving cannot improve on it.
            break
    return best_quantile

# Search the whole quantile range [0, 1] with the default 10 iterations.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt.
binary_search(0, 1)

KeyboardInterrupt: 

In [None]:
from policyengine_uk import Microsimulation
from datasets.frs import SPIEnhancedFRS2019_20

# Build a microsimulation over an instance of the SPIEnhancedFRS2019_20
# dataset, with dataset_year set to 2019.
# NOTE(review): earlier cells already use `sim`; on a fresh top-to-bottom run
# this definition comes too late for them.
sim = Microsimulation(
    dataset=SPIEnhancedFRS2019_20(),
    dataset_year=2019,
)

In [None]:
# Calculate the `income_tax` variable; the result displays as a `value` column
# alongside a `weight` column.
sim.calculate("income_tax")

              value  weight
0       1614.281250  1996.0
1          0.000000   928.0
2      10356.603516  1451.0
3       5484.944336  1451.0
4      11486.338867  1483.0
...             ...     ...
86623   6309.571289     0.0
86624      0.000000     0.0
86625      0.000000     0.0
86626      0.000000     0.0
86627      0.000000     0.0

[86628 rows x 2 columns]

In [None]:
import pandas as pd

# Load log.csv from the working directory.
# NOTE(review): this rebinds `df`, which an earlier cell used for the
# put1920uk.tab data — re-running those earlier cells after this one would hit
# missing columns. A distinct name (e.g. `log_df`) would avoid the clash.
df = pd.read_csv("log.csv")

In [None]:
# Draw 10 random rows whose `name` contains "income_tax". No random_state is
# passed to sample(), so the rows shown differ from run to run.
df.loc[df["name"].str.contains("income_tax")].sample(10)

Unnamed: 0.1,Unnamed: 0,epoch,name,y_true,y_pred,loss,type,full_name
210618,985,34,income_tax_by_income_8,10773580000.0,923781600.0,0.835862,individual,Loss.Programs.IncomeTax
215758,6125,224,income_tax_payers_NORTHERN_IRELAND_ADDITIONAL,4000.0,4021.772,2.7e-05,individual,Loss.Programs.IncomeTax
219459,9826,361,income_tax_payers_SCOTLAND_HIGHER,405000.0,412089.7,0.000306,individual,Loss.Programs.IncomeTax
214260,4627,169,income_tax_by_income_5,22856090000.0,18941110000.0,0.02934,individual,Loss.Programs.IncomeTax
214034,4401,161,income_tax_ENGLAND,180994200000.0,143284500000.0,0.043409,individual,Loss.Programs.IncomeTax
210650,1017,35,income_tax_payers_WALES_BASIC,1260000.0,1380221.0,0.009104,individual,Loss.Programs.IncomeTax
219830,10197,375,income_tax_payers_WALES_BASIC,1260000.0,1391929.0,0.010963,individual,Loss.Programs.IncomeTax
220054,10421,383,income_tax_payers_SCOTLAND_ADDITIONAL,20000.0,19898.7,2.6e-05,individual,Loss.Programs.IncomeTax
214440,4807,176,income_tax_WALES,5574935000.0,5524641000.0,8.1e-05,individual,Loss.Programs.IncomeTax
209974,341,10,income_tax_payers_ENGLAND_ADDITIONAL,407000.0,318774.9,0.04699,individual,Loss.Programs.IncomeTax


In [None]:
import plotly.express as px

# Line chart of the loss, y_pred and y_true columns for the rows whose `name`
# equals "Programs"; no x column is specified.
# NOTE(review): confirm that rows named exactly "Programs" exist in the log
# (the sampled rows elsewhere carry "Programs" inside `full_name` instead).
px.line(df[df.name == "Programs"], y=["loss", "y_pred", "y_true"])