In [1]:
import pandas as pd
from datasets.frs.imputations.income import SPI_TAB_FOLDER, generate_spi_table
from datasets.frs.frs import FRS_2019_20
from survey_enhance.percentile_matching import match_percentiles

# Read the tab-delimited SPI file and hand it straight to the project helper
# that builds the SPI table used as the matching target below.
spi = generate_spi_table(
    pd.read_csv(SPI_TAB_FOLDER / "put1920uk.tab", delimiter="\t")
)

# Load the 2019-20 FRS dataset; individual variables are read from it with
# `frs[name][...]`.
frs = FRS_2019_20().load()

# Income variables whose FRS values are replaced by percentile-matched values
# taken from the same-named SPI column.
percentile_matched_variables = [
    "employment_income",
    "self_employment_income",
    "pension_income",
    "dividend_income",
    "savings_interest_income",
]

# Build the adjusted dataset one variable at a time: listed income variables
# are percentile-matched against the SPI, everything else is copied through
# unchanged.
new_values = {}
for variable in frs.keys():
    new_values[variable] = (
        match_percentiles(frs[variable][...], spi[variable])
        if variable in percentile_matched_variables
        else frs[variable][...]
    )

In [9]:
# Summary statistics for dividend income after percentile matching
# ("dividend_income" is one of the percentile_matched_variables).
new_values["dividend_income"].describe()

count     43314.000000
mean        596.328980
std       14475.300716
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max      509091.498029
dtype: float64

In [11]:
# Summary statistics for the dividend income as stored in the loaded FRS
# dataset, i.e. before percentile matching, as a baseline for comparison with
# new_values["dividend_income"].
pd.Series(frs["dividend_income"]).describe()

count    43314.000000
mean       143.013337
std       1871.871869
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max      99723.035616
dtype: float64

In [19]:
from policyengine_uk import Microsimulation

# Build a microsimulation on a fresh FRS_2019_20 dataset instance (separate
# from the `frs` object loaded earlier), with dataset_year set to 2019.
sim = Microsimulation(dataset=FRS_2019_20(), dataset_year=2019)
# Compute the "person_weight" variable mapped to the person level and keep only
# the underlying values.
# NOTE(review): presumably one weight per FRS person record, in the same order
# as the arrays in new_values — confirm before relying on element-wise products.
person_weight = sim.calc("person_weight", map_to="person").values

In [23]:
# Weighted total of the percentile-matched employment income: multiply each
# value by its person weight, sum, and divide by 1e9 to express the result in
# billions.
# NOTE(review): assumes person_weight and new_values["employment_income"] have
# equal length and aligned ordering — confirm.
(person_weight * new_values["employment_income"]).sum() / 1e9

1111.5268326586258