In [1]:
from policyengine_uk.data import SPI_2020_21

# Instantiate the SPI 2020-21 dataset class and call its generate() method.
# NOTE(review): presumably this (re)builds the dataset files locally and may be
# slow — confirm against the policyengine_uk documentation.
SPI_2020_21().generate()

In [2]:
from policyengine_uk import Microsimulation

# Microsimulation over the SPI 2020-21 dataset; note the dataset *class* is
# passed here, not an instance.
sim = Microsimulation(dataset=SPI_2020_21)

In [3]:
# Sum of simulated income_tax over the whole dataset, scaled down by 1e9
# (i.e. expressed in billions — presumably GBP and weighted; TODO confirm).
sim.calculate("income_tax").sum()/1e9

194.97527893151815

In [4]:
import pandas as pd

# Load the raw SPI records from the tab-separated file the dataset class
# points at, so simulated values can be compared with them record by record.
df = pd.read_csv(
    SPI_2020_21.spi_data_file_path,
    sep="\t",
)

In [5]:
# Simulated variables to attach to the SPI frame so that each record's
# PolicyEngine values can be inspected next to the raw SPI columns.
# Each name appears exactly once: a repeated entry would only trigger a
# second, redundant sim.calculate() call that overwrites the same column.
variables_to_add = [
    "income_tax",
    "adjusted_net_income",
    "other_tax_credits",
    "gift_aid",
    "received_allowances",
    "personal_allowance",
    "taxed_income",
    "taxed_dividend_income",
    "taxed_savings_income",
    "earned_taxable_income",
    "pension_contributions",
    "pension_contributions_relief",
    "taxable_pension_income",
    "private_pension_contributions_tax",
    "savings_allowance",
    "dividend_allowance",
    "property_allowance",
    "taxable_property_income",
    "property_income",
]

# One new column per variable, named "zzz_policyengine_<variable>".
# NOTE(review): the "zzz_" prefix presumably makes these columns sort last in
# the alphabetically ordered record views — confirm.
for variable in variables_to_add:
    df[f"zzz_policyengine_{variable}"] = sim.calculate(variable)

In [6]:
import numpy as np

# Signed and absolute gap between the simulated income tax and the SPI
# TOTTAX_DEVO_TXP column for every record.
df["error"] = df["zzz_policyengine_income_tax"] - df["TOTTAX_DEVO_TXP"]
df["abs_error"] = df["error"].abs()

# Add the EMPINC column: (PAY + EPB - EXPS) floored at zero, plus the other
# listed components. Computed with whole-column arithmetic instead of a
# row-wise df.apply, which invokes a Python lambda once per record.
# NOTE: a missing value in the floored term now stays NaN, whereas the former
# row-wise max(0, nan) silently evaluated to 0.
df["EMPINC"] = (
    (df["PAY"] + df["EPB"] - df["EXPS"]).clip(lower=0)
    + df["INCPBEN"]
    + df["OSSBEN"]
    + df["TAXTERM"]
    + df["UBISJA"]
    + df["MOTHINC"]
)

# pandas show all rows (needed because records are displayed transposed, with
# one row per column of the frame)
pd.set_option("display.max_rows", None)

In [None]:
# The five smallest discrepancies among records whose absolute error is at
# least 100, shown one column per record with the field names ordered
# case-insensitively.
(
    df.loc[lambda frame: frame["abs_error"].ge(100)]
    .sort_values(by="abs_error")
    .head()
    .T
    .sort_index(key=lambda labels: labels.str.lower())
)



Unnamed: 0,118935,401891,401779,474202,83159
abs_error,100.0,100.0,100.0,100.0,100.0
AGERANGE,6,2,1,5,3
BPADUE,0,0,0,0,0
CAPALL,0,0,41000,0,0
COVNTS,0,0,0,0,0
DEFICIEN,0,0,0,0,0
DIVIDENDS,15,0,0,0,0
DSHIPS,3,3,3,3,3
EIDF,0,0,90,95,0
EMPINC,200,31200,28090,2370,37250


In [None]:
def error_category(error):
    """Return the label of the band that ``error`` falls into.

    Bands, checked in order:
      * ``"<=10"``   -- error <= 10
      * ``"<100"``   -- 10 < error < 100
      * ``"<1000"``  -- 100 <= error < 1000
      * ``">=1000"`` -- everything else

    Any value for which every comparison is false (for example NaN) ends up
    in the final ``">=1000"`` band.
    """
    if error <= 10:
        return "<=10"
    # Remaining bands use strict upper bounds, tested from smallest to largest.
    for upper_bound, label in ((100, "<100"), (1000, "<1000")):
        if error < upper_bound:
            return label
    return ">=1000"
# Percentage of records falling into each absolute-error band.
df["abs_error"].apply(error_category).value_counts(normalize=True).mul(100)


#495286

abs_error
<=10      83.793343
<1000      8.434593
<100       4.100823
>=1000     3.671241
Name: proportion, dtype: float64

In [None]:
# Records where the SPI TI column exceeds the simulated adjusted net income by
# exactly 1000; the five with the smallest absolute tax error, transposed and
# ordered case-insensitively by field name.
# NOTE(review): this is an exact equality test on a difference involving a
# simulated value — confirm floating-point rounding cannot hide matches.
result_df = (
    df.loc[
        lambda frame: (
            frame["TI"] - frame["zzz_policyengine_adjusted_net_income"]
        ).eq(1000)
    ]
    .sort_values(by="abs_error")
    .head()
    .T
    .sort_index(key=lambda labels: labels.str.lower())
)

# Display the result
display(result_df)

abs_error
AGERANGE
BPADUE
CAPALL
COVNTS
DEFICIEN
DIVIDENDS
DSHIPS
EIDF
EMPINC
EPB


In [None]:
# Records whose TAXINC lies strictly between 100,000 and 125,000; the five
# with the smallest absolute tax error, transposed and ordered
# case-insensitively by field name.
result_df = (
    df.loc[
        lambda frame: frame["TAXINC"].gt(100_000) & frame["TAXINC"].lt(125_000)
    ]
    .sort_values(by="abs_error")
    .head()
    .T
    .sort_index(key=lambda labels: labels.str.lower())
)

# Display the result
display(result_df)

Unnamed: 0,594775,676172,447861,676180,447882
abs_error,0.0,0.0,0.0,0.0,0.0
AGERANGE,3,5,5,3,5
BPADUE,0,0,0,0,0
CAPALL,0,0,13500,0,0
COVNTS,0,0,0,0,0
DEFICIEN,0,0,0,0,0
DIVIDENDS,0,400,0,2000,0
DSHIPS,3,1,3,3,3
EIDF,0,0,0,0,0
EMPINC,117000,76800,0,0,110000
