# Auto loan interest rate imputation

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import plotly.io as pio
pio.templates.default = "plotly_white"

In [125]:
# Import the SCF_2022 dataset class
from policyengine_us_data.datasets.scf.scf import SCF_2022

# Build the SCF 2022 dataset and read it into memory as `scf_data`.
# NOTE(review): generate() is called unconditionally every time this cell runs;
# the earlier comment claimed data is downloaded/processed only "if needed" --
# confirm whether generate() skips work when the dataset already exists.
print("Loading SCF 2022 dataset...")
scf_dataset = SCF_2022()
scf_dataset.generate()
scf_data = scf_dataset.load()

# Show basic information about the dataset: (rows, columns), then the column count
print(f"\nSCF 2022 Dataset Shape: {scf_data.shape}")
print(f"Dataset columns: {len(scf_data.columns)}")

Loading SCF 2022 dataset...
SCF dataset for 2022 has been generated.

SCF 2022 Dataset Shape: (22975, 358)
Dataset columns: 358


In [78]:
# Preview the first 20 rows of the SCF frame (displayed as the cell's last expression)
scf_data.head(20)

Unnamed: 0,yy1,y1,wgt,hhsex,age,agecl,educ,edcl,married,kids,...,inccat,assetcat,ninccat,ninc2cat,nwpctlecat,incpctlecat,nincpctlecat,incqrtcat,nincqrtcat,year
0,1,11,3027.95612,2,70,5,9,3,2,2,...,2,4,2,1,8,3,3,2,1,2022
1,1,12,3054.900065,2,70,5,9,3,2,2,...,2,5,2,1,8,3,3,2,1,2022
2,1,13,3163.637766,2,70,5,9,3,2,2,...,2,4,2,1,8,3,3,1,1,2022
3,1,14,3166.228463,2,70,5,9,3,2,2,...,2,4,1,1,6,3,2,1,1,2022
4,1,15,3235.624715,2,70,5,9,3,2,2,...,2,4,2,1,8,3,3,1,1,2022
5,2,21,236.634754,1,46,3,12,4,2,0,...,5,5,5,2,8,9,9,4,4,2022
6,2,22,245.848398,1,46,3,12,4,2,0,...,5,5,5,2,8,9,9,4,4,2022
7,2,23,253.103477,1,46,3,12,4,2,0,...,5,5,5,2,8,9,9,4,4,2022
8,2,24,252.908118,1,46,3,12,4,2,0,...,5,5,5,2,8,9,9,4,4,2022
9,2,25,253.811312,1,46,3,12,4,2,0,...,5,5,5,2,8,9,9,4,4,2022


In [79]:
# List every column name in the SCF frame to find candidate predictors and targets
scf_data.columns.to_list()

['yy1',
 'y1',
 'wgt',
 'hhsex',
 'age',
 'agecl',
 'educ',
 'edcl',
 'married',
 'kids',
 'lf',
 'lifecl',
 'famstruct',
 'racecl',
 'racecl4',
 'racecl5',
 'racecl_ex',
 'race',
 'occat1',
 'occat2',
 'indcat',
 'foodhome',
 'foodaway',
 'fooddelv',
 'rent',
 'income',
 'wageinc',
 'bussefarminc',
 'intdivinc',
 'kginc',
 'ssretinc',
 'transfothinc',
 'penacctwd',
 'norminc',
 'wsaved',
 'saved',
 'savres1',
 'savres2',
 'savres3',
 'savres4',
 'savres5',
 'savres6',
 'savres7',
 'savres8',
 'savres9',
 'spendmor',
 'spendless',
 'expenshilo',
 'late',
 'late60',
 'hpayday',
 'bnkruplast5',
 'knowl',
 'yesfinrisk',
 'nofinrisk',
 'crdapp',
 'turndown',
 'feardenial',
 'turnfear',
 'forecllast5',
 'emergborr',
 'emergsav',
 'emergpstp',
 'emergcut',
 'emergwork',
 'hborrff',
 'hborrcc',
 'hborralt',
 'hborrfin',
 'hsavfin',
 'hsavnfin',
 'hpstppay',
 'hpstpln',
 'hpstpoth',
 'hcutfood',
 'hcutent',
 'hcutoth',
 'finlit',
 'bshopnone',
 'bshopgrdl',
 'bshopmodr',
 'ishopnone',
 'ishopg

In [80]:
# Inspect the SCF `racecl5` column, which the preprocessing cell recodes into `cps_race`
scf_data.racecl5

0        1
1        1
2        1
3        1
4        1
        ..
22970    2
22971    2
22972    2
22973    2
22974    2
Name: racecl5, Length: 22975, dtype: int8

In [None]:
# Preprocess SCF to follow CPS variable naming conventions.
# Each statement only adds (or overwrites) a column computed from raw SCF
# columns, so re-running this cell produces the same frame.

# 1. Age - SCF `age` already has the CPS name, so nothing needs to be assigned
#    (the previous `scf_data['age'] = scf_data['age']` was a no-op and was removed).
#    NOTE(review): CPS is said to spread the 80+ bin randomly while SCF may be
#    top-coded; the difference is accepted as-is.

# 2. Sex -> is_female (CPS: A_SEX==2)
#    SCF hhsex: 1=male, 2=female
scf_data['is_female'] = scf_data['hhsex'] == 2

# 3. Race -> cps_race
#    The map below handles FIVE racecl5 classes; going by its labels they are
#    1=White, 2=Black, 3=Hispanic, 4=Asian, 5=Other (TODO confirm against the
#    SCF codebook).
#    The target is the integer coding that the CPS recode cell
#    (`cps_race_mapping`) produces: 1=WHITE, 2=BLACK, 4=ASIAN,
#    5=AMERICAN INDIAN/ALASKA NATIVE, 6=NATIVE HAWAIIAN/PACIFIC ISLANDER, 7=OTHER.
#    NOTE(review): code 3 (Hispanic) is never produced by the CPS recode, so this
#    category exists only on the SCF side -- confirm that is acceptable.
OTHER_RACE_CODE = 7
race_map = {
    1: 1,  # White
    2: 2,  # Black
    3: 3,  # Hispanic
    4: 4,  # Asian
    5: OTHER_RACE_CODE,  # Other
}
# Anything outside the map falls back to "Other" (7). The previous fallback of 6
# disagreed with the map itself, which already uses 7 for "Other".
scf_data['cps_race'] = (
    scf_data['racecl5'].map(race_map).fillna(OTHER_RACE_CODE).astype(int)
)

# 4. Children in household -> own_children_in_household
scf_data['own_children_in_household'] = scf_data['kids'].fillna(0).astype(int)

# 5. Employment & self-employment income
#    CPS: WSAL_VAL -> employment_income; SEMP_VAL -> self_employment_income
scf_data['employment_income'] = scf_data['wageinc'].fillna(0)
scf_data['self_employment_income'] = scf_data['bussefarminc'].fillna(0)

# 6. Farm income - CPS carries it separately; SCF bundles business+farm in
#    bussefarminc.
#    NOTE(review): this makes farm_income an exact copy of
#    self_employment_income on the SCF side, so the two predictors are
#    identical for the donor but not for the CPS receiver. Split them with SCF
#    sub-variables if available, or reconsider using both as predictors.
scf_data['farm_income'] = scf_data['bussefarminc'].fillna(0)

# 7. Rent (missing values treated as zero)
scf_data['rent'] = scf_data['rent'].fillna(0)

# 8. Household weight
#    CPS household_weight = HSUP_WGT/100; SCF uses wgt
scf_data['household_weight'] = scf_data['wgt']

# 9. Auto loan balance: SCF veh_inst -> auto_loan_bal (missing treated as zero).
#    NOTE(review): the previous comment described this as a vehicle installment
#    *payment*, but the column created here is named and used as a loan balance;
#    confirm the meaning of veh_inst in the SCF codebook.
scf_data['auto_loan_bal'] = scf_data['veh_inst'].fillna(0)

In [82]:
# Weight each record's balance by its household weight, then sum across records;
# the displayed value is that total scaled to millions.
weighted_auto_loan_bal = scf_data["auto_loan_bal"].mul(scf_data["household_weight"])
total_auto_loan_bal = weighted_auto_loan_bal.sum()
total_auto_loan_bal / 1e6

968718.9883374859

In [83]:
# Sum of the household weights, scaled to millions (sanity check on the weights)
scf_data["household_weight"].sum() / 1e6

131.30638938346357

In [115]:
# Load the CPS 2022 dataset that will receive the imputed values

from policyengine_us_data.datasets.cps.cps import CPS_2022
import h5py

print("Loading CPS 2022 dataset...")
cps_dataset = CPS_2022()
# load_dataset() is used instead of load(); it yields the name -> array mapping
# that the rest of the notebook indexes by variable name.
cps_data = cps_dataset.load_dataset()

# Container type first, then how many variables it holds
print(f"\nCPS 2022 Dataset type: {type(cps_data)}")
print(f"\nAvailable variables: {len(cps_data)}")

# Peek at the first 10 variables: array type and length differ by entity level
print("\nSample variables:")
sample_names = list(cps_data)[:10]
for var in sample_names:
    values = cps_data[var]
    print(f"- {var}: {type(values)}, shape: {values.shape}")

cps_data.keys()

Loading CPS 2022 dataset...

CPS 2022 Dataset type: <class 'dict'>

Available variables: 102

Sample variables:
- age: <class 'numpy.ndarray'>, shape: (146133,)
- alimony_income: <class 'numpy.ndarray'>, shape: (146133,)
- child_support_expense: <class 'numpy.ndarray'>, shape: (146133,)
- child_support_received: <class 'numpy.ndarray'>, shape: (146133,)
- county_fips: <class 'numpy.ndarray'>, shape: (56839,)
- cps_race: <class 'numpy.ndarray'>, shape: (146133,)
- disability_benefits: <class 'numpy.ndarray'>, shape: (146133,)
- employment_income: <class 'numpy.ndarray'>, shape: (146133,)
- employment_income_last_year: <class 'numpy.ndarray'>, shape: (146133,)
- family_id: <class 'numpy.ndarray'>, shape: (63685,)


dict_keys(['age', 'alimony_income', 'child_support_expense', 'child_support_received', 'county_fips', 'cps_race', 'disability_benefits', 'employment_income', 'employment_income_last_year', 'family_id', 'farm_income', 'free_school_meals_reported', 'has_marketplace_health_coverage', 'health_insurance_premiums_without_medicare_part_b', 'household_id', 'household_weight', 'housing_assistance', 'in_nyc', 'is_blind', 'is_disabled', 'is_female', 'is_full_time_college_student', 'is_hispanic', 'is_household_head', 'is_separated', 'is_widowed', 'keogh_distributions', 'long_term_capital_gains', 'marital_unit_id', 'medicare_part_b_premiums', 'non_qualified_dividend_income', 'other_medical_expenses', 'other_type_retirement_account_distributions', 'over_the_counter_health_expenses', 'own_children_in_household', 'person_family_id', 'person_household_id', 'person_id', 'person_marital_unit_id', 'person_spm_unit_id', 'person_tax_unit_id', 'pre_subsidy_rent', 'previous_year_income_available', 'qualified_

In [116]:
# Recode the raw CPS race codes (1-26) onto the coarser coding shared with the
# SCF donor data: single-race groups keep their own code, every multi-race
# combination collapses to 7 (OTHER).
cps_race_mapping = {
    1: 1,  # White only -> WHITE
    2: 2,  # Black only -> BLACK/AFRICAN-AMERICAN
    3: 5,  # American Indian, Alaskan Native only -> AMERICAN INDIAN/ALASKA NATIVE
    4: 4,  # Asian only -> ASIAN
    5: 6,  # Hawaiian/Pacific Islander only -> NATIVE HAWAIIAN/PACIFIC ISLANDER
    6: 7,  # White-Black -> OTHER
    7: 7,  # White-AI -> OTHER
    8: 7,  # White-Asian -> OTHER
    9: 7,  # White-HP -> OTHER
    10: 7,  # Black-AI -> OTHER
    11: 7,  # Black-Asian -> OTHER
    12: 7,  # Black-HP -> OTHER
    13: 7,  # AI-Asian -> OTHER
    14: 7,  # AI-HP -> OTHER
    15: 7,  # Asian-HP -> OTHER
    16: 7,  # White-Black-AI -> OTHER
    17: 7,  # White-Black-Asian -> OTHER
    18: 7,  # White-Black-HP -> OTHER
    19: 7,  # White-AI-Asian -> OTHER
    20: 7,  # White-AI-HP -> OTHER
    21: 7,  # White-Asian-HP -> OTHER
    22: 7,  # Black-AI-Asian -> OTHER
    23: 7,  # White-Black-AI-Asian -> OTHER
    24: 7,  # White-AI-Asian-HP -> OTHER
    25: 7,  # Other 3 race comb. -> OTHER
    26: 7,  # Other 4 or 5 race comb. -> OTHER
}

# Fail loudly on codes the mapping does not cover. `dict.get` would return None
# for them, which np.vectorize turns into either an object array or an opaque
# TypeError, depending on where the first unmapped value sits.
raw_cps_race = np.asarray(cps_data["cps_race"])
unmapped_codes = np.setdiff1d(raw_cps_race, list(cps_race_mapping))
if unmapped_codes.size:
    raise ValueError(
        f"cps_race contains codes with no recode rule: {unmapped_codes.tolist()}"
    )

# Apply the mapping to recode the race values. `otypes` pins an integer result
# and lets the call succeed on an empty array.
# NOTE: this overwrites cps_data["cps_race"] in place and is NOT idempotent --
# running the cell twice recodes the recoded values (e.g. 3 -> 5 -> 6). Reload
# cps_data before re-running.
cps_data["cps_race"] = np.vectorize(cps_race_mapping.get, otypes=[int])(raw_cps_race)

cps_data["cps_race"]

array([1, 1, 1, ..., 7, 4, 7])

In [117]:
# Build the receiver frame from the CPS variables that have one value per person.
# cps_data mixes arrays of different lengths (e.g. county_fips and family_id
# are shorter than age), so only the arrays matching the person-level length
# can share one DataFrame.
lengths = {k: len(v) for k, v in cps_data.items()}
# Derive the person-level length from `age` instead of hard-coding 146133, so
# the cell keeps working when the CPS extract changes size.
var_len = len(cps_data["age"])
vars_of_interest = [name for name, ln in lengths.items() if ln == var_len]
receiver_data = pd.DataFrame({n: cps_data[n] for n in vars_of_interest})

# Auto loan interest imputation

In [104]:
# Columns the imputation model conditions on, grouped by theme. The SCF
# preprocessing cell creates each of them under these names on the donor side.
_DEMOGRAPHIC_PREDICTORS = [
    "age",
    "is_female",
    "cps_race",
    "own_children_in_household",
]
_INCOME_PREDICTORS = [
    "employment_income",
    "self_employment_income",
    "farm_income",
]
PREDICTORS = _DEMOGRAPHIC_PREDICTORS + _INCOME_PREDICTORS

# Columns the model predicts for the receiver data
IMPUTED_VARIABLES = ["auto_loan_bal"]

In [105]:
# Donor frame: just the modelling columns, copied so later steps do not modify scf_data
model_columns = [*PREDICTORS, *IMPUTED_VARIABLES]
donor_data = scf_data[model_columns].copy()

In [118]:
from microimpute.comparisons.data import preprocess_data

# Donor: always preprocess a fresh copy of the raw SCF columns. Feeding the
# previous `donor_data` back in made this cell non-idempotent: a second run
# normalised already-normalised values, so `normalizing_params` no longer
# described the original scale and the later un-normalisation of the
# imputations silently became wrong.
donor_data, dummy_info, normalizing_params = preprocess_data(
    scf_data[PREDICTORS + IMPUTED_VARIABLES].copy(),
    full_data=True,
    normalizing_features=True,
)

# Receiver: preprocessed without `normalizing_features`.
# NOTE(review): the donor predictors are normalised but this call does not
# request normalisation, so the fitted model may see receiver predictors on a
# different scale from the one it was trained on. Confirm what preprocess_data
# does by default, and if needed apply `normalizing_params` to the receiver
# predictors.
# NOTE: `dummy_info` is rebound here, so the donor's dummy_info is discarded.
receiver_data, dummy_info = preprocess_data(receiver_data, full_data=True)

Found constant columns (std=0): ['roth_ira_contributions', 'social_security_dependents', 'social_security_survivors', 'strike_benefits', 'tax_exempt_401k_distributions', 'tax_exempt_403b_distributions', 'tax_exempt_private_pension_income', 'tax_exempt_sep_distributions', 'traditional_ira_contributions']


In [123]:
from microimpute.models.qrf import QRF
from microimpute.evaluations import cross_validate_model

# Settings for a seeded 5-fold cross-validation of the QRF on the donor data,
# with hyperparameter tuning switched on.
cv_settings = dict(
    data=donor_data,
    model_class=QRF,
    predictors=PREDICTORS,
    imputed_variables=IMPUTED_VARIABLES,
    n_splits=5,
    random_state=42,
    tune_hyperparameters=True,
)
final_results = cross_validate_model(**cv_settings)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  2.0min remaining:  3.0min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  2.5min remaining:  1.7min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.7min finished


In [124]:
from microimpute.visualizations.plotting import model_performance_results

# Wrap the cross-validation results in microimpute's performance-results helper
# and draw it. NOTE(review): presumably this renders the per-quantile loss chart
# for the model labelled "QRF" -- confirm against the microimpute documentation.
fig = model_performance_results(final_results, model_name="QRF")
fig.plot()

In [111]:
# Fit the final QRF on the whole donor frame, with hyperparameter tuning enabled.
# NOTE(review): no random seed is passed here, whereas the cross-validation call
# uses random_state=42 -- confirm whether this fit is reproducible across runs.
qrf_model = QRF()
fitted_model = qrf_model.fit(
    X_train=donor_data,
    predictors=PREDICTORS,
    imputed_variables=IMPUTED_VARIABLES,
    tune_hyperparameters=True,
)

In [119]:
# Predict for the receiver rows; the result maps each quantile to a DataFrame
# of imputed variables.
imputations = fitted_model.predict(X_test=receiver_data)

# Unnormalize the imputations: gather the per-column statistics recorded when
# the donor data was normalised ...
mean = pd.Series({name: stats["mean"] for name, stats in normalizing_params.items()})
std = pd.Series({name: stats["std"] for name, stats in normalizing_params.items()})

# ... and invert the scaling column by column (value * std + mean) for every quantile
unnormalized_imputations = {
    quantile: frame * std[frame.columns] + mean[frame.columns]
    for quantile, frame in imputations.items()
}

# Keep the 0.5-quantile prediction as the imputed value on the receiver frame
for var in IMPUTED_VARIABLES:
    receiver_data[var] = unnormalized_imputations[0.5][var]

In [126]:
def plot_log_transformed_distributions(
    scf_data: pd.DataFrame,
    imputed_data: pd.DataFrame,
    variable: str,
) -> go.Figure:
    """Overlay the signed-log distributions of one variable in SCF and imputed CPS data.

    Values are mapped with ``sign(x) * log10(1 + |x|)``. Unlike a plain
    ``sign(x) * log10(|x|)``, this transform is monotonic everywhere, sends 0
    to 0, and never flips the sign of values with ``|x| < 1``.

    The histograms are plain record counts; no survey weights are applied.

    Args:
        scf_data: DataFrame containing SCF data, with ``variable`` on its
            original (un-normalised) scale.
        imputed_data: DataFrame containing imputed CPS data, with ``variable``
            on the same scale.
        variable: Name of the column to plot (e.g. 'auto_loan_bal').

    Returns:
        Plotly figure with two overlaid histograms plus vertical median and
        mean markers for each dataset. Legend values are converted back from
        the log scale, so "Mean" is the back-transformed mean of the logged
        values, not the arithmetic mean of the raw values.
    """

    def signed_log10(values):
        # Symmetric log: log10(1 + |x|) carrying the sign of x
        values = np.asarray(values, dtype=float)
        return np.sign(values) * np.log10(1.0 + np.abs(values))

    def signed_pow10(log_value):
        # Exact inverse of signed_log10, used for legend labels
        return np.sign(log_value) * (10.0 ** np.abs(log_value) - 1.0)

    # (legend prefix, transformed values, colour, histogram legend name)
    datasets = [
        ("SCF", signed_log10(scf_data[variable]), "blue", f"SCF Log {variable}"),
        (
            "CPS",
            signed_log10(imputed_data[variable]),
            "purple",
            f"CPS Imputed Log {variable}",
        ),
    ]

    fig = go.Figure()

    # Histograms for both datasets
    for _, logged, color, histogram_name in datasets:
        fig.add_trace(
            go.Histogram(
                x=logged,
                nbinsx=60,
                opacity=0.7,
                name=histogram_name,
                marker_color=color,
            )
        )

    # Vertical marker lines: medians (dashed) first, then means (dotted).
    # They are Scatter traces rather than layout shapes so each one gets a
    # legend entry carrying its back-transformed value.
    for stat_label, stat_fn, dash in (
        ("Median", np.median, "dash"),
        ("Mean", np.mean, "dot"),
    ):
        for prefix, logged, color, _ in datasets:
            position = stat_fn(logged)
            fig.add_trace(
                go.Scatter(
                    x=[position, position],
                    y=[0, 100000],
                    mode="lines",
                    line=dict(color=color, width=2, dash=dash),
                    name=f"{prefix} {stat_label}: ${signed_pow10(position):,.0f}",
                )
            )

    # Titles, size and legend placement
    fig.update_layout(
        title=f"Log-Transformed {variable} Distribution Comparison",
        xaxis_title=f"{variable} (signed log10 scale)",
        yaxis_title="Frequency",
        height=600,
        width=1000,
        barmode='overlay',
        bargap=0.1,
        legend=dict(
            x=0.01,
            y=0.99,
            bgcolor="rgba(255, 255, 255, 0.8)",
            bordercolor="rgba(0, 0, 0, 0.3)",
            borderwidth=1,
            orientation="v",
            xanchor="left",
            yanchor="top"
        )
    )

    # Place ticks at round dollar amounts (including an honest $0) and position
    # them with the same transform as the data.
    tick_dollars = [-1e6, -1e4, -1e2, 0, 1e2, 1e4, 1e6, 1e8]
    fig.update_xaxes(
        tickvals=signed_log10(tick_dollars).tolist(),
        ticktext=['$' + format(amount, ',.0f') for amount in tick_dollars]
    )

    return fig

# Plot the SCF values from `scf_data`, not `donor_data`: donor_data was
# normalised by preprocess_data(..., normalizing_features=True), while the
# imputed CPS values were converted back to the original scale. Passing
# donor_data would put the two histograms in different units.
plot_log_transformed_distributions(scf_data, receiver_data, IMPUTED_VARIABLES[0]).show()