In [1]:
from policyengine_uk.data import SPI_2020_21

# Instantiate the SPI 2020-21 dataset class and run its generate() step.
# NOTE(review): this runs unconditionally on every execution of the notebook;
# if generation is expensive, consider guarding it so a Restart & Run All
# does not rebuild the dataset each time.
SPI_2020_21().generate()

In [2]:
from policyengine_uk import Microsimulation

# Simulation backed by the SPI 2020-21 dataset (the class itself is passed,
# not an instance). `sim` is reused by the later cells that call
# sim.calculate(...).
sim = Microsimulation(dataset=SPI_2020_21)

In [3]:
# Sum of the simulated "income_tax" variable over all records, divided by 1e9
# so the displayed figure is in billions of the variable's unit.
sim.calculate("income_tax").sum()/1e9

194.97527893151815

In [4]:
import pandas as pd

# Load the tab-separated file at SPI_2020_21.spi_data_file_path into `df`;
# later cells append simulated columns to this frame for comparison.
df = pd.read_csv(SPI_2020_21.spi_data_file_path, sep="\t")

In [5]:
# PolicyEngine variables copied onto the SPI frame so each record's simulated
# values can be inspected next to the reported SPI columns.
# Every name is listed exactly once: "earned_taxable_income" used to appear
# twice, which recomputed the variable and overwrote the same column.
variables_to_add = [
    "income_tax",
    "adjusted_net_income",
    "other_tax_credits",
    "gift_aid",
    "received_allowances",
    "personal_allowance",
    "taxed_income",
    "taxed_dividend_income",
    "taxed_savings_income",
    "earned_taxable_income",
    "pension_contributions",
    "pension_contributions_relief",
    "taxable_pension_income",
    "private_pension_contributions_tax",
    "savings_allowance",
    "dividend_allowance",
    "property_allowance",
    "taxable_property_income",
    "property_income",
]

# Simulated columns get a "zzz_policyengine_" prefix, which groups them at the
# end under the case-insensitive index sort used by the later display cells.
# NOTE(review): assumes sim.calculate returns one value per row of `df`, in
# the same row order as the CSV -- confirm against the dataset builder.
for variable in variables_to_add:
    df[f"zzz_policyengine_{variable}"] = sim.calculate(variable)

In [6]:
import numpy as np

# Per-record gap between simulated income tax and the SPI column
# TOTTAX_DEVO_TXP, plus its magnitude for ranking and bucketing.
df["error"] = df.zzz_policyengine_income_tax - df.TOTTAX_DEVO_TXP
df["abs_error"] = df.error.abs()

# Add the EMPINC column: PAY + EPB - EXPS floored at zero, plus the remaining
# components. Computed column-wise instead of with a row-by-row
# df.apply(..., axis=1), which runs a Python lambda for every record.
# fillna(0) reproduces the original max(0, NaN) == 0 behaviour for the floored
# term; NaN in any of the other components still propagates, as before.
# The result dtype now follows the input columns rather than the upcast
# row dtype that apply produced; the values are the same.
floored_pay = (df["PAY"] + df["EPB"] - df["EXPS"]).clip(lower=0).fillna(0)
df["EMPINC"] = (
    floored_pay
    + df["INCPBEN"]
    + df["OSSBEN"]
    + df["TAXTERM"]
    + df["UBISJA"]
    + df["MOTHINC"]
)

# pandas show all rows (the transposed record views below have one row per
# column of `df`, which exceeds the default display limit).
pd.set_option("display.max_rows", None)

In [7]:
# Five records with the smallest absolute error of at least 100, shown one
# record per column with the fields ordered case-insensitively by name.
(
    df[df["abs_error"] >= 100]
    .sort_values(by="abs_error")
    .head()
    .transpose()
    .sort_index(key=lambda labels: labels.str.lower())
)



Unnamed: 0,662885,762734,635268,287353,778442
abs_error,100.0,100.0,100.0,100.0,100.0
AGERANGE,4,5,5,3,3
BPADUE,0,0,0,0,0
CAPALL,0,0,0,0,0
COVNTS,0,0,0,0,0
DEFICIEN,0,0,0,0,0
DIVIDENDS,0,2000,0,0,15000
DSHIPS,3,3,1,3,2
EIDF,1200,15400,0,140,0
EMPINC,100500,133800,13400,11900,49600


In [8]:
def error_category(error):
    """Return the size-bucket label for an absolute error value.

    Buckets are "<=10" (10 inclusive), "<100", "<1000" and ">=1000".
    Any value that fails every comparison (for example NaN) falls through
    to ">=1000".
    """
    if error <= 10:
        return "<=10"
    # Remaining buckets use strict upper bounds, checked in ascending order.
    for upper_bound, label in ((100, "<100"), (1000, "<1000")):
        if error < upper_bound:
            return label
    return ">=1000"
# Percentage of records falling in each absolute-error bucket.
df["abs_error"].map(error_category).value_counts(normalize=True).mul(100)


#495286

abs_error
<=10      83.907681
<1000      8.406449
<100       4.066396
>=1000     3.619475
Name: proportion, dtype: float64

In [10]:
# Records whose absolute error is exactly 250, lowest simulated earned taxable
# income first; the first five are transposed so each record is a column and
# the fields are ordered case-insensitively by name.
result_df = (
    df[df["abs_error"].eq(250)]
    .sort_values(by="zzz_policyengine_earned_taxable_income")
    .head()
    .transpose()
    .sort_index(key=lambda labels: labels.str.lower())
)

# Display the result
display(result_df)

Unnamed: 0,200975,684906,525967,232363,501707
abs_error,250.0,250.0,250.0,250.0,250.0
AGERANGE,5,5,6,5,2
BPADUE,0,0,0,0,0
CAPALL,0,0,0,0,0
COVNTS,0,0,0,0,0
DEFICIEN,0,0,0,0,0
DIVIDENDS,0,0,0,0,0
DSHIPS,3,3,3,3,2
EIDF,0,0,0,0,0
EMPINC,0,5980,0,0,12500


In [11]:
# Records with TAXINC strictly between 100,000 and 125,000, smallest absolute
# error first; the first five are transposed so each record is a column and
# the fields are ordered case-insensitively by name.
result_df = (
    df[df["TAXINC"].between(100_000, 125_000, inclusive="neither")]
    .sort_values(by="abs_error")
    .head()
    .transpose()
    .sort_index(key=lambda labels: labels.str.lower())
)

# Display the result
display(result_df)

Unnamed: 0,594775,549801,504837,549763,549726
abs_error,0.0,0.0,0.0,0.0,0.0
AGERANGE,3,3,2,4,5
BPADUE,0,0,0,0,0
CAPALL,0,0,0,0,0
COVNTS,0,0,0,0,0
DEFICIEN,0,0,0,0,0
DIVIDENDS,0,0,0,0,70000
DSHIPS,3,3,3,3,2
EIDF,0,0,0,0,0
EMPINC,117000,120000,107000,118800,45550
