# Auto loan interest rate imputation

In [9]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import logging
from tqdm import tqdm
import requests
import io
import zipfile

import plotly.io as pio
pio.templates.default = "plotly_white"

In [12]:
from policyengine_us_data.datasets.scf.scf import SCF_2022

# Load the 2022 SCF through its dataset wrapper and flatten the returned
# mapping into one DataFrame with a column per key.
# NOTE(review): the original comment states that loading downloads and
# processes the data automatically — not verifiable from this notebook.
print("Loading SCF 2022 dataset...")
scf_dataset = SCF_2022()
scf_arrays = scf_dataset.load_dataset()
scf_data = pd.DataFrame({name: scf_arrays[name] for name in scf_arrays.keys()})
scf_data.head()

Loading SCF 2022 dataset...


Unnamed: 0,actbus,age,agecl,annuit,anypen,asset,assetcat,auto_loan_balance,auto_loan_interest,bcall,...,vlease,wageinc,wgt,whynockg,wilsh,wsaved,y1,year,yesfinrisk,yy1
0,0,70,5,0,0,957100.0,4,0,0.0,0,...,0,0.0,3027.95612,0,41800,3,11,2022,0,1
1,0,70,5,0,0,1067300.0,5,0,0.0,0,...,0,0.0,3054.900065,0,41800,3,12,2022,0,1
2,0,70,5,0,0,957200.0,4,0,0.0,0,...,0,0.0,3163.637766,0,41800,3,13,2022,0,1
3,0,70,5,0,0,828600.0,4,0,0.0,0,...,0,0.0,3166.228463,0,41800,3,14,2022,0,1
4,0,70,5,0,0,825600.0,4,0,0.0,0,...,0,0.0,3235.624715,0,41800,3,15,2022,0,1


In [None]:
# Import the CPS 2022 dataset

from policyengine_us_data.datasets.cps.cps import CPS_2022
import h5py

# Initialize and load the CPS dataset
print("Loading CPS 2022 dataset...")
cps_dataset = CPS_2022()
cps_data = cps_dataset.load_dataset()  # Use load_dataset() instead of load()

# Show basic information about the dataset
print(f"\nCPS 2022 Dataset type: {type(cps_data)}")

# Income components that are summed over every member of a household.
INCOME_COLUMNS = ["employment_income", "self_employment_income", "farm_income"]

# --- Step 1: household income totals from person-level records --------------
# Person-level variables are recognised by having the same length as
# person_household_id.
n_persons = cps_data["person_household_id"].shape[0]
agg_data = pd.DataFrame(
    {name: values for name, values in cps_data.items() if len(values) == n_persons}
)

# One row per household with household_<income> totals.
agg = (
    agg_data
    .groupby("person_household_id")[INCOME_COLUMNS]
    .sum()
    .add_prefix("household_")
    .reset_index()
)

# --- Step 2: keep only the household head's person-level record -------------
# Coerce to bool so the array is always applied as a mask; an integer 0/1
# array would otherwise be interpreted as row positions.
mask = np.asarray(cps_data["is_household_head"], dtype=bool)
mask_len = mask.shape[0]

# Arrays whose length differs from the mask are passed through unchanged.
cps_data = {
    var: data[mask] if data.shape[0] == mask_len else data
    for var, data in cps_data.items()
}

cps_race_mapping = {
    1: 1,  # White only -> WHITE
    2: 2,  # Black only -> BLACK/AFRICAN-AMERICAN
    3: 5,  # American Indian, Alaskan Native only -> AMERICAN INDIAN/ALASKA NATIVE
    4: 4,  # Asian only -> ASIAN
    5: 6,  # Hawaiian/Pacific Islander only -> NATIVE HAWAIIAN/PACIFIC ISLANDER
    6: 7,  # White-Black -> OTHER
    7: 7,  # White-AI -> OTHER
    8: 7,  # White-Asian -> OTHER
    9: 7,  # White-HP -> OTHER
    10: 7,  # Black-AI -> OTHER
    11: 7,  # Black-Asian -> OTHER
    12: 7,  # Black-HP -> OTHER
    13: 7,  # AI-Asian -> OTHER
    14: 7,  # AI-HP -> OTHER
    15: 7,  # Asian-HP -> OTHER
    16: 7,  # White-Black-AI -> OTHER
    17: 7,  # White-Black-Asian -> OTHER
    18: 7,  # White-Black-HP -> OTHER
    19: 7,  # White-AI-Asian -> OTHER
    20: 7,  # White-AI-HP -> OTHER
    21: 7,  # White-Asian-HP -> OTHER
    22: 7,  # Black-AI-Asian -> OTHER
    23: 7,  # White-Black-AI-Asian -> OTHER
    24: 7,  # White-AI-Asian-HP -> OTHER
    25: 7,  # Other 3 race comb. -> OTHER
    26: 7,  # Other 4 or 5 race comb. -> OTHER
}

# Apply the mapping to recode the race values.
# Fail early with a readable message: dict.get would return None for an
# unknown code and corrupt the recoded array.
unmapped_codes = set(np.unique(cps_data["cps_race"]).tolist()) - set(cps_race_mapping)
if unmapped_codes:
    raise ValueError(f"Unmapped CPS race codes: {sorted(unmapped_codes)}")
cps_data["cps_race"] = np.vectorize(cps_race_mapping.get, otypes=[int])(
    cps_data["cps_race"]
)

# --- Step 3: build the receiver frame ---------------------------------------
# After the head filter, every array with the same length as household_id is
# kept as a column.
n_households = cps_data["household_id"].shape[0]
head_data = pd.DataFrame(
    {name: values for name, values in cps_data.items() if len(values) == n_households}
)

# Replace the head's own income columns with the household totals, keeping the
# original column names so the predictors line up with the donor data.
receiver_data = (
    head_data
    .drop(columns=INCOME_COLUMNS)
    .merge(
        agg.rename(columns={f"household_{col}": col for col in INCOME_COLUMNS}),
        on="person_household_id",
        how="left",
    )
)

receiver_data.shape

Loading CPS 2022 dataset...

CPS 2022 Dataset type: <class 'dict'>


(56839, 76)

# Auto loan balance imputation

In [None]:
# Model inputs, grouped by kind. These columns are read from scf_data here and
# are expected to exist in the CPS receiver frame as well.
DEMOGRAPHIC_PREDICTORS = [
    "age",
    "is_female",
    "cps_race",
    "own_children_in_household",
]
INCOME_PREDICTORS = [
    "employment_income",
    "self_employment_income",
    "farm_income",
]
PREDICTORS = DEMOGRAPHIC_PREDICTORS + INCOME_PREDICTORS

# Target column(s) to impute.
IMPUTED_VARIABLES = ["auto_loan_balance"]

# Independent copy of just the columns the model needs.
donor_columns = [*PREDICTORS, *IMPUTED_VARIABLES]
donor_data = scf_data[donor_columns].copy()

In [48]:
from microimpute.evaluations import cross_validate_model
from microimpute.models.qrf import QRF

# Cross-validate the QRF model on the donor data. The recorded run of this
# cell took roughly five minutes on 8 workers.
cv_settings = dict(
    n_splits=5,
    random_state=42,
    tune_hyperparameters=True,
)

final_results = cross_validate_model(
    data=donor_data,
    model_class=QRF,
    predictors=PREDICTORS,
    imputed_variables=IMPUTED_VARIABLES,
    **cv_settings,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  3.4min remaining:  5.0min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  4.3min remaining:  2.9min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.8min finished


In [49]:
from microimpute.visualizations.plotting import model_performance_results

# Visualise the cross-validation output held in `final_results` for the QRF model.
# NOTE(review): `fig` is whatever model_performance_results returns; since it is
# asked to .plot(), it is presumably a plotting helper rather than a Plotly
# figure — confirm against the microimpute documentation.
fig = model_performance_results(final_results, model_name="QRF")
fig.plot()

In [32]:
from microimpute.models.qrf import QRF

# Key of the prediction set that is written back to the receiver frame
# (presumably the median quantile — confirm against microimpute).
MEDIAN_QUANTILE = 0.5

# Fit on the SCF donor sample, then predict for every CPS receiver row.
qrf_model = QRF()
fitted_model = qrf_model.fit(
    X_train=donor_data,
    predictors=PREDICTORS,
    imputed_variables=IMPUTED_VARIABLES,
    tune_hyperparameters=True,
)

imputations = fitted_model.predict(X_test=receiver_data)
median_imputations = imputations[MEDIAN_QUANTILE]

# Attach each imputed column to the receiver frame.
for imputed_variable in IMPUTED_VARIABLES:
    receiver_data[imputed_variable] = median_imputations[imputed_variable]

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

In [37]:
def plot_log_transformed_distributions(
    scf_data: pd.DataFrame,
    imputed_data: pd.DataFrame,
    variable: str,
) -> go.Figure:
    """Overlay signed-log10 histograms of one variable from SCF and imputed CPS data.

    Each value is transformed with ``sign(x) * log10(|x|)``, so negative
    amounts fall left of zero. Vertical marker lines show the median (dashed)
    and arithmetic mean (dotted) of the *untransformed* values; the legend
    reports those statistics in dollars and each line is drawn at the
    transformed position of the statistic.

    Args:
        scf_data: DataFrame containing SCF data.
        imputed_data: DataFrame containing imputed CPS data.
        variable: Column to plot; must exist in both frames
            (e.g. 'auto_loan_balance').

    Returns:
        Plotly figure with two overlaid histograms and four marker lines.

    Notes:
        * Non-finite values (NaN, inf) are dropped before plotting.
        * Exact zeros land at axis position 0, the same position as $1.
        * Magnitudes strictly between 0 and 1 have a negative log10 and
          therefore appear on the opposite side of zero.
        * No survey weights are applied to the histograms or statistics.
    """
    n_bins = 60

    def safe_log(x):
        # sign(0) == 0, so zeros map to 0; the 1e-10 floor only keeps
        # log10 from receiving an exact zero.
        magnitude = np.log10(np.maximum(np.abs(x), 1e-10))
        return np.sign(x) * magnitude

    def finite_values(frame):
        # Drop NaN/inf so they cannot poison the median and mean.
        values = np.asarray(frame[variable], dtype=float)
        return values[np.isfinite(values)]

    # (legend prefix, histogram trace name, colour, raw values)
    series = [
        ("SCF", f"SCF Log {variable}", "blue", finite_values(scf_data)),
        ("CPS", f"CPS Imputed Log {variable}", "purple", finite_values(imputed_data)),
    ]

    # Create a single plot with both distributions
    fig = go.Figure()

    # Add histograms for both datasets and estimate the tallest bar so the
    # marker lines scale with the data instead of a fixed height.
    # Plotly treats nbinsx as a maximum, so its bars may be somewhat taller
    # than this estimate; the lines are still visible in that case.
    tallest_bin = 1
    for _, trace_name, color, values in series:
        logged = safe_log(values)
        fig.add_trace(
            go.Histogram(
                x=logged,
                nbinsx=n_bins,
                opacity=0.7,
                name=trace_name,
                marker_color=color,
            )
        )
        if logged.size:
            counts, _ = np.histogram(logged, bins=n_bins)
            tallest_bin = max(tallest_bin, int(counts.max()))

    # Add vertical lines: medians first (dashed), then means (dotted).
    # Statistics come from the raw values so the legend shows real dollar
    # amounts (a $0 median is reported as $0, negative values keep their sign).
    for stat_name, stat_fn, dash in (
        ("Median", np.median, "dash"),
        ("Mean", np.mean, "dot"),
    ):
        for short_name, _, color, values in series:
            stat = float(stat_fn(values)) if values.size else float("nan")
            position = float(safe_log(stat))
            fig.add_trace(
                go.Scatter(
                    x=[position, position],
                    y=[0, tallest_bin],
                    mode="lines",
                    line=dict(color=color, width=2, dash=dash),
                    name=f"{short_name} {stat_name}: ${stat:,.0f}",
                )
            )

    # Update layout with improved titles and labels
    fig.update_layout(
        title=f"Log-Transformed {variable} Distribution Comparison",
        xaxis_title=f"{variable}",
        yaxis_title="Frequency",
        height=600,
        width=1000,
        barmode='overlay',
        bargap=0.1,
        legend=dict(
            x=0.01,
            y=0.99,
            bgcolor="rgba(255, 255, 255, 0.8)",
            bordercolor="rgba(0, 0, 0, 0.3)",
            borderwidth=1,
            orientation="v",
            xanchor="left",
            yanchor="top"
        )
    )

    # Add tick labels showing the actual dollar values; negative axis
    # positions are labelled as negative dollar amounts.
    tick_values = [-6, -4, -2, 0, 2, 4, 6, 8]
    tick_labels = ['$' + format(10**x if x >= 0 else -10**abs(x), ',.0f') for x in tick_values]
    fig.update_xaxes(
        tickvals=tick_values,
        ticktext=tick_labels
    )

    return fig

# Compare the SCF donor distribution with the values imputed onto the CPS.
auto_loan_fig = plot_log_transformed_distributions(
    scf_data=donor_data,
    imputed_data=receiver_data,
    variable="auto_loan_balance",
)
auto_loan_fig.show()