In [1]:
from policyengine_us.data.datasets.cps.raw_cps import RawCPS_2018, RawCPS_2019

In [2]:
# Generate the raw CPS datasets for both years that the next cell loads.
# Previously RawCPS_2019 was only instantiated (a bare expression whose result
# was discarded), so nothing in this notebook generated that dataset before
# `RawCPS_2019().load()` is called.
# NOTE(review): generate() appears to download the ASEC archive on each call
# (see the progress bar in this cell's output) — confirm that re-running it
# for 2019 is acceptable, or guard it if the dataset already exists.
RawCPS_2019().generate()
RawCPS_2018().generate()

Downloaded ASEC: 100%|██████████| 184M/184M [00:09<00:00, 19.5MiB/s]  


['cpspb/asec/prod/data/2019/hhpub19.csv', 'cpspb/asec/prod/data/2019/ffpub19.csv', 'cpspb/asec/prod/data/2019/pppub19.csv', 'cpspb/asec/prod/data/2019/asec_csv_repwgt_2019.csv']


In [3]:
import pandas as pd

# Load both raw CPS years and index each person table by PERIDNUM. The
# PERIDNUM column itself is kept in the frame (drop=False).
cps_current_year_data = RawCPS_2019().load()
cps_previous_year_data = RawCPS_2018().load()
previous_person = cps_previous_year_data.person
current_person = cps_current_year_data.person
cps_previous_year = previous_person.set_index("PERIDNUM", drop=False)
cps_current_year = current_person.set_index("PERIDNUM", drop=False)

# Raw previous-year income columns -> the names they get in the joined frame.
LAST_YEAR_COLUMN_NAMES = {
    "WSAL_VAL": "employment_income_last_year",
    "SEMP_VAL": "self_employment_income_last_year",
}
last_year_columns = list(LAST_YEAR_COLUMN_NAMES.values())

previous_year_data = cps_previous_year[list(LAST_YEAR_COLUMN_NAMES)].rename(
    columns=LAST_YEAR_COLUMN_NAMES
)

# Attach last year's incomes to this year's people by PERIDNUM and keep only
# the two renamed income columns.
joined_data = cps_current_year.join(previous_year_data)[last_year_columns]

# A person has previous-year income only if both income columns matched.
joined_data["previous_year_income_available"] = joined_data.notna().all(
    axis=1
)
# Unmatched people get -1 as the placeholder income value.
joined_data = joined_data.fillna(-1)

cps = pd.DataFrame()

cps["employment_income"] = cps_current_year["WSAL_VAL"].values
cps["self_employment_income"] = cps_current_year["SEMP_VAL"].values

# Values are copied positionally, so joined_data must line up row-for-row
# with cps_current_year. The join keeps cps_current_year on the left; this
# presumably relies on PERIDNUM being unique in the previous-year file —
# TODO confirm.
for column in [*last_year_columns, "previous_year_income_available"]:
    cps[column] = joined_data[column].values

In [1]:
from policyengine_us.data.datasets.cps.cps import CPS_2019

# Instantiate the CPS_2019 dataset and run its generate() step.
cps_2019_dataset = CPS_2019()
cps_2019_dataset.generate()

In [3]:
from policyengine_us import Microsimulation

from survey_enhance import Imputation

sim = Microsimulation(dataset=CPS_2019)

# The first entry is the flag used to select training rows; every entry
# after it is used as a model input.
VARIABLES = [
    "previous_year_income_available",
    "employment_income",
    "self_employment_income",
    "age",
    "is_male",
    "spm_unit_state_fips",
    "dividend_income",
    "interest_income",
    "social_security",
    "capital_gains",
    "is_disabled",
    "is_blind",
    "is_married",
    "tax_unit_children",
    "pension_income",
]

# Columns the imputation model learns to predict.
OUTPUTS = [
    "employment_income_last_year",
    "self_employment_income_last_year",
]

# Model inputs: everything in VARIABLES except the leading filter flag.
predictor_columns = VARIABLES[1:]

df = sim.calculate_dataframe([*VARIABLES, *OUTPUTS], 2023, map_to="person")

# Train only on rows flagged as having previous-year income.
has_previous_year_income = df["previous_year_income_available"]
df_train = df[has_previous_year_income]

X = df_train[predictor_columns]
y = df_train[OUTPUTS]

income_last_year = Imputation()
income_last_year.train(X, y)

In [6]:
from policyengine_us.data.datasets.cps.enhanced_cps.enhanced_cps import (
    CalibratedPUFExtendedCPS,
)

# Read the calibration parameters from the simulation built in the training
# cell, before `sim` is rebound below.
# NOTE(review): parameters are read at "2022-01-01" while every calculation
# in this notebook uses 2023 — confirm the date is intentional.
parameters = sim.tax_benefit_system.parameters
projections = parameters("2022-01-01").calibration.gov.irs.soi

# NOTE(review): this rebinds `sim` to a different dataset. `df` was computed
# from the CPS_2019 simulation, while the weights below come from the
# CalibratedPUFExtendedCPS simulation — confirm both have the same person
# rows in the same order.
sim = Microsimulation(dataset=CalibratedPUFExtendedCPS)

# Use exactly the feature columns the model was trained on (VARIABLES[1:]).
# Previously predict() received df.drop(columns=OUTPUTS), which still
# contained "previous_year_income_available" — a column the model never saw
# in training and that solve_for_mean_quantiles was not given either.
prediction_features = df[VARIABLES[1:]]

quantiles = income_last_year.solve_for_mean_quantiles(
    [
        projections.employment_income,
        projections.self_employment_income,
    ],
    prediction_features,
    sim.calculate("household_weight", 2023, map_to="person").values,
    max_iterations=7,
)
print(f"Mean quantiles: {quantiles}")
y_pred = income_last_year.predict(
    prediction_features, mean_quantile=quantiles
)

Imputing employment_income_last_year...


KeyboardInterrupt: 

In [2]:
from policyengine_us.data.datasets.cps.enhanced_cps.enhanced_cps import (
    EnhancedCPS_2023,
)

# Instantiate the EnhancedCPS_2023 dataset and run its generate() step.
enhanced_cps_2023_dataset = EnhancedCPS_2023()
enhanced_cps_2023_dataset.generate()