diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..3f044cf --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,51 @@ +name: CI Tests + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.11"] + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + + - name: Set up R + uses: r-lib/actions/setup-r@v2 + with: + r-version: '4.x' + + - name: Install R dependencies + run: | + install.packages(c("StatMatch")) + shell: Rscript {0} + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + python -m pip install .[dev] + + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=88 --statistics + + - name: Test with pytest + run: | + # For now, just run the existing tests.py script + # Later, this should be replaced with proper pytest + python us_imputation_benchmarking/tests.py \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..04dacbd --- /dev/null +++ b/.gitignore @@ -0,0 +1,79 @@ +# OS-generated files +**/.DS_Store + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.pyc +*.pyo +*.pyd +*$py.class + +# Virtual Environments +.venv/ +venv/ +env/ +ENV/ +.env +env.bak/ +venv.bak/ + +# Distribution / Packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Testing / Coverage +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Logs +*.log + +# Jupyter Notebooks +.ipynb_checkpoints/ + +# PyPI Configuration +.pypirc + +# Sphinx Documentation +docs/_build/ + +# Celery +celerybeat-schedule +celerybeat.pid + +# Ignore Data Files +*.csv +*.h5 \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..2a9dabd --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,40 @@ +# US Imputation Benchmarking - Developer Guide + +## Build & Test Commands +```bash +# Install package in development mode +pip install -e . 
+ +# Run all tests +python us_imputation_benchmarking/tests.py + +# Run specific model test (example) +python -c "from us_imputation_benchmarking import tests; tests.test_qrf()" + +# Install development dependencies +pip install black isort mypy pytest +``` + +## Code Style Guidelines + +### Formatting & Organization +- Use 4 spaces for indentation +- Maximum line length: 88 characters (Black default) +- Format code with Black: `black us_imputation_benchmarking/` +- Sort imports with isort: `isort us_imputation_benchmarking/` + +### Naming & Types +- Use snake_case for variables, functions, and modules +- Use CamelCase for classes +- Constants should be UPPERCASE +- Add type hints to all function parameters and return values +- Document functions with ReStructuredText-style docstrings + +### Imports +- Group imports: standard library, third-party, local modules +- Import specific functions/classes rather than entire modules when practical + +### Error Handling +- Use assertions for validation +- Raise appropriate exceptions with informative messages +- Add context to exceptions when re-raising \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 88ab75c..50181cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,28 @@ [project] name = "us-imputation-benchmarking" version = "0.1.0" -description = "Add your description here" +description = "Benchmarking imputation models for US household survey data" readme = "README.md" requires-python = ">=3.11" -dependencies = [] \ No newline at end of file +dependencies = [ + "numpy", + "pandas", + "matplotlib", + "seaborn", + "scikit-learn", + "statsmodels", + "microdf", + "scf", + "policyengine_us_data", + "scipy", + "rpy2", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "flake8", + "black", + "isort", + "mypy", +] \ No newline at end of file diff --git a/us_imputation_benchmarking/__init__.py b/us_imputation_benchmarking/__init__.py new file mode 100644 index 0000000..139597f 
import pandas as pd
from typing import Optional, Union

# Survey years for which the Federal Reserve publishes SCF summary microdata.
VALID_YEARS = [
    1989,
    1992,
    1995,
    1998,
    2001,
    2004,
    2007,
    2010,
    2013,
    2016,
    2019,
]


def scf_url(year: int) -> str:
    """Return the URL of the SCF summary microdata zip file for a year.

    :param year: Year of SCF summary microdata to retrieve.
    :return: URL of the summary microdata zip file for the given year.
    :raises AssertionError: If no SCF release exists for ``year``.
    """
    assert year in VALID_YEARS, f"The SCF is not available for {year}"
    return f"https://www.federalreserve.gov/econres/files/scfp{year}s.zip"


def load_single_scf(year: int, columns: Optional[list]) -> pd.DataFrame:
    """Load SCF summary microdata for a given year and set of columns.

    :param year: Year of SCF summary microdata to retrieve.
    :param columns: List of columns to load; the weight column ``wgt`` is
        always included. If ``None``, all columns are loaded.
    :return: SCF summary microdata for the given year.
    """
    # Always return the weight column alongside the requested ones.
    if columns is not None:
        columns = list(set(columns) | {"wgt"})
    return mdf.read_stata_zip(scf_url(year), columns=columns)


def load(
    years: Union[int, list] = VALID_YEARS,
    columns: Optional[list] = None,
    as_microdataframe: bool = False,
) -> Union[pd.DataFrame, "mdf.MicroDataFrame"]:
    """Load SCF summary microdata for a set of years and columns.

    :param years: Year(s) to load SCF data for; a list or a single year.
        Defaults to all available years, starting with 1989.
    :param columns: List of columns; the weight column ``wgt`` is always
        returned.
    :param as_microdataframe: Whether to return a MicroDataFrame with the
        weight set; defaults to False.
    :return: SCF summary microdata for the requested years.
    """
    # Make columns a list if a single column name is passed.
    if columns is not None:
        columns = mdf.listify(columns)
    # Single year: load directly. Multiple years: load each, tag it with a
    # `year` column, and concatenate.
    if isinstance(years, int):
        res = load_single_scf(years, columns)
    else:
        scfs = []
        for year in years:
            tmp = load_single_scf(year, columns)
            tmp["year"] = year
            scfs.append(tmp)
        res = pd.concat(scfs)
    # Return as a MicroDataFrame or a plain DataFrame.
    if as_microdataframe:
        return mdf.MicroDataFrame(res, weights="wgt")
    return res


def preprocess_data(full_data: bool = False):
    """Load the latest SCF wave, select model variables, and z-score them.

    :param full_data: If True, return the whole standardized dataset;
        otherwise return an 80/20 train/test split.
    :return: ``(data, PREDICTORS, IMPUTED_VARIABLES)`` when ``full_data`` is
        True, else ``(train, test, PREDICTORS, IMPUTED_VARIABLES)``.
    """
    data = load([VALID_YEARS[-1]])

    # Predictors shared with CPS data.
    PREDICTORS = [
        "hhsex",         # sex of head of household
        "age",           # age of respondent
        "married",       # marital status of respondent
        "kids",          # number of children in household
        "educ",          # highest level of education
        "race",          # race of respondent
        "income",        # total annual income of household
        "wageinc",       # income from wages and salaries
        "bussefarminc",  # income from business, self-employment or farm
        "intdivinc",     # income from interest and dividends
        "ssretinc",      # income from social security and retirement accounts
        "lf",            # labor force status
    ]

    # Some property also captured in CPS data (HPROP_VAL).
    IMPUTED_VARIABLES = ["networth"]

    data = data[PREDICTORS + IMPUTED_VARIABLES]
    # NOTE(review): standardization statistics are computed on the full
    # dataset *before* the train/test split, leaking test information into
    # the training data — confirm whether this is acceptable for the
    # benchmark before changing it.
    mean = data.mean(axis=0)
    std = data.std(axis=0)
    data = (data - mean) / std

    if full_data:
        return data, PREDICTORS, IMPUTED_VARIABLES
    X, test_X = train_test_split(
        data, test_size=0.2, train_size=0.8, random_state=42
    )
    return X, test_X, PREDICTORS, IMPUTED_VARIABLES
def get_imputations(
    methods, X, test_X, predictors, imputed_variables, data=None
) -> dict:
    """Run each requested imputation method and collect quantile predictions.

    :param methods: Method names; any of "QRF", "Matching", "OLS", "QuantReg".
    :param X: Training data (used when ``data`` is None).
    :param test_X: Test data to impute into (used when ``data`` is None).
    :param predictors: Predictor column names.
    :param imputed_variables: Column names of the variables to impute.
    :param data: Optional full dataset; when given, 5-fold cross-validation
        is run instead of the single X/test_X split.
    :return: Mapping of method name -> {quantile -> imputations}.
    """
    QUANTILES = [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 0.95]

    # One dispatch table instead of four copy-pasted if-blocks per branch.
    runners = {
        "QRF": qrf.impute_qrf,
        "Matching": matching.impute_matching,
        "OLS": ols.impute_ols,
        "QuantReg": quantreg.impute_quantreg,
    }

    method_imputations = {method: {} for method in methods}

    # BUG FIX: the original tested `if data:`, which raises
    # "The truth value of a DataFrame is ambiguous" for pandas input;
    # compare against None instead.
    if data is not None:
        # Cross-validation option.
        for method in methods:
            for q in QUANTILES:
                method_imputations[method][q] = []
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        for train_idx, test_idx in kf.split(data):
            fold_train, fold_test = data.iloc[train_idx], data.iloc[test_idx]
            for method in methods:
                if method not in runners:
                    continue  # unknown names keep their (empty) entries
                imputations = runners[method](
                    fold_train, fold_test, predictors, imputed_variables, QUANTILES
                )
                for q in QUANTILES:
                    method_imputations[method][q].append(imputations[q])
        # NOTE(review): np.mean collapses each fold's prediction vectors to a
        # single scalar per quantile — confirm this aggregation is intended.
        for method in methods:
            for q in QUANTILES:
                method_imputations[method][q] = np.mean(
                    method_imputations[method][q]
                )
    else:
        # Single train/test split.
        for method in methods:
            if method not in runners:
                continue
            method_imputations[method] = runners[method](
                X, test_X, predictors, imputed_variables, QUANTILES
            )

    return method_imputations
def plot_loss_comparison(loss_comparison_df, quantiles):
    """Bar-plot the average test quantile loss per percentile and method.

    :param loss_comparison_df: Long-format DataFrame with columns
        ``Method``, ``Percentile`` and ``Loss`` (one row per method/quantile).
    :param quantiles: Quantiles as fractions (e.g. 0.05) used for x labels.
    """
    percentiles = [str(int(q * 100)) for q in quantiles]
    plt.figure(figsize=(12, 6))
    ax = sns.barplot(
        data=loss_comparison_df, x="Percentile", y="Loss", hue="Method", dodge=True
    )
    plt.xlabel("Percentiles", fontsize=12)
    plt.ylabel("Average Test Quantile Loss", fontsize=12)
    plt.title(
        "Test Loss Across Quantiles for Different Imputation Methods", fontsize=14
    )
    plt.legend(title="Method")
    # Pin the tick positions before relabelling: calling set_xticklabels
    # alone emits a matplotlib warning and can misalign labels and bars.
    ax.set_xticks(range(len(percentiles)))
    ax.set_xticklabels(percentiles)
    plt.grid(axis="y", linestyle="--", alpha=0.7)
    plt.show()
def quantile_loss(q, y, f):
    """Element-wise pinball (quantile) loss.

    :param q: Quantile to evaluate, e.g. 0.5 for the median.
    :param y: True values (array-like).
    :param f: Fitted or predicted values (array-like).
    :return: Array of per-observation losses.
    """
    e = y - f
    # max(q*e, (q-1)*e) penalizes under-prediction by q and
    # over-prediction by (1-q).
    return np.maximum(q * e, (q - 1) * e)


def compute_quantile_loss(test_y, imputations, q):
    """Return the per-observation quantile loss of ``imputations`` vs ``test_y``."""
    return quantile_loss(q, test_y, imputations)


def compare_quantile_loss(test_y, method_imputations) -> tuple:
    """Tabulate the mean quantile loss for every method and quantile.

    :param test_y: True values for the imputed variable(s).
    :param method_imputations: Mapping of method name ->
        {quantile -> predictions} (predictions expose ``.values``).
    :return: ``(results_df, QUANTILES)`` where ``results_df`` has columns
        ``Method``, ``Percentile`` and ``Loss``.
        (Fixes the original annotation, which claimed a bare DataFrame.)
    """
    QUANTILES = [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 0.95]

    # Accumulate rows in a list and build the frame once: repeated
    # pd.concat in a loop is quadratic.
    rows = []
    for method, imputation in method_imputations.items():
        for quantile in QUANTILES:
            q_loss = compute_quantile_loss(
                test_y.values.flatten(),
                imputation[quantile].values.flatten(),
                quantile,
            )
            rows.append(
                {
                    "Method": method,
                    "Percentile": f"{int(quantile * 100)}th percentile",
                    "Loss": q_loss.mean(),
                }
            )
    results_df = pd.DataFrame(rows, columns=["Method", "Percentile", "Loss"])
    return results_df, QUANTILES
import rpy2.robjects as ro
from rpy2.robjects import numpy2ri

# Enable automatic R <-> Python DataFrame and array conversion.
pandas2ri.activate()
numpy2ri.activate()
utils = importr("utils")
utils.chooseCRANmirror(ind=1)
StatMatch = importr("StatMatch")

log = logging.getLogger(__name__)

# StatMatch argument glossary (see R StatMatch docs):
#   data.rec   -- recipient data frame in the statistical matching.
#   data.don   -- donor data frame.
#   mtc.ids    -- two-column matrix of (recipient row, donor row) ids, as
#                 returned by NND.hotdeck / RANDwNND.hotdeck / rankNND.hotdeck.
#   z.vars     -- variables available only in data.don to "donate" to data.rec
#                 (here: networth).
#   dup.x      -- when TRUE, matching variables from data.don are also donated
#                 (renamed with a "don" suffix); defaults to FALSE.
#   match.vars -- names of the matching variables (needed when dup.x=TRUE).


def nnd_hotdeck_using_rpy2(receiver=None, donor=None, matching_variables=None,
                           z_variables=None, donor_classes=None):
    """Nearest-neighbour-distance hot-deck matching via R's StatMatch.

    :param receiver: Recipient data frame (gets the donated values).
    :param donor: Donor data frame.
    :param matching_variables: Names of the variables to match on.
    :param z_variables: Donor-only variables to donate to the receiver.
    :param donor_classes: Optional donor-class variable name(s).
    :return: Tuple ``(fused_0, fused_1)`` of fused R data frames, without and
        with the match-variable arguments respectively.
    """
    assert receiver is not None and donor is not None
    assert matching_variables is not None

    if isinstance(donor_classes, str):
        assert donor_classes in receiver, "Donor class not present in receiver"
        assert donor_classes in donor, "Donor class not present in donor"

    # BUG FIX: the original caught the exception, printed debug output and
    # fell through, so `out_NND` was undefined and a NameError followed.
    # Log context and re-raise instead.
    try:
        if donor_classes:
            out_NND = StatMatch.NND_hotdeck(
                data_rec=receiver,
                data_don=donor,
                match_vars=pd.Series(matching_variables),
                don_class=pd.Series(donor_classes),
            )
        else:
            out_NND = StatMatch.NND_hotdeck(
                data_rec=receiver,
                data_don=donor,
                match_vars=pd.Series(matching_variables),
            )
    except Exception:
        log.exception(
            "NND_hotdeck failed (receiver rows=%s, donor rows=%s, match_vars=%s)",
            len(receiver), len(donor), matching_variables,
        )
        raise

    # Reshape matched ids into two columns (recipient row, donor row) to
    # address shape errors when passing the hot-deck result back to R.
    mtc_ids_2d = np.array(out_NND[0]).reshape(-1, 2)
    mtc_ids = ro.conversion.py2rpy(mtc_ids_2d)

    # Fused dataset without duplication of the matching variables.
    fused_0 = StatMatch.create_fused(
        data_rec=receiver,
        data_don=donor,
        mtc_ids=mtc_ids,
        z_vars=pd.Series(z_variables),
    )

    # Fused dataset "with duplication" of the matching variables.
    # NOTE(review): dup_x is passed as False here, so both outputs are built
    # essentially the same way — confirm whether dup_x=True was intended.
    fused_1 = StatMatch.create_fused(
        data_rec=receiver,
        data_don=donor,
        mtc_ids=mtc_ids,
        z_vars=pd.Series(z_variables),
        dup_x=False,
        match_vars=pd.Series(matching_variables),
    )

    return fused_0, fused_1


def impute_matching(X, test_X, predictors, imputed_variables, quantiles,
                    matching_hotdeck=nnd_hotdeck_using_rpy2):
    """Impute via hot-deck matching; the same point estimate is reused for
    every requested quantile.

    :param X: Donor (training) data frame.
    :param test_X: Recipient (test) data frame; left unmodified.
    :param predictors: Matching variable names.
    :param imputed_variables: Donor-only variables to donate.
    :param quantiles: Quantiles to key the result dict by.
    :return: Mapping of quantile -> imputed values (identical per quantile).
    """
    # BUG FIX: the original made a copy of test_X and then dropped the
    # imputed variables from the *caller's* frame in place (the copy was
    # never used). Drop from the copy so the caller's data stays intact.
    receiver = test_X.copy()
    receiver.drop(imputed_variables, axis=1, inplace=True, errors="ignore")

    fused0, _fused1 = matching_hotdeck(
        receiver=receiver,
        donor=X,
        matching_variables=predictors,
        z_variables=imputed_variables,
        donor_classes=None,
    )

    fused0_pd = pandas2ri.rpy2py(fused0)

    # Matching yields a single donated value per record, not a conditional
    # distribution, so every quantile maps to the same imputations.
    return {q: fused0_pd[imputed_variables] for q in quantiles}
def ols_quantile(m, X, q):
    """Predict the q-th conditional quantile from a fitted OLS model.

    Assumes normally distributed, homoskedastic residuals: the quantile is
    the mean prediction shifted by ``norm.ppf(q)`` residual standard errors.
    Vectorized over ``q``.

    :param m: Fitted OLS results object exposing ``predict`` and ``scale``.
    :param X: Design matrix to predict for.
    :param q: Quantile in (0, 1).
    :return: Predicted q-th quantile for each row of ``X``.
    """
    mean_pred = m.predict(X)
    se = np.sqrt(m.scale)  # residual standard error
    return mean_pred + norm.ppf(q) * se


def impute_ols(X, test_X, predictors, imputed_variables, quantiles):
    """Impute quantiles with an OLS fit plus a normal-error quantile shift.

    :return: Mapping of quantile -> predictions for ``test_X``.
    """
    Y = X[imputed_variables]
    # Add an intercept column; keep distinct names instead of shadowing the
    # X / test_X parameters as the original did.
    design = sm.add_constant(X[predictors])
    test_design = sm.add_constant(test_X[predictors])
    model = sm.OLS(Y, design).fit()
    return {q: ols_quantile(model, test_design, q) for q in quantiles}


def impute_qrf(X, test_X, predictors, imputed_variables, quantiles):
    """Impute quantiles with a quantile random forest (policyengine QRF).

    :return: Mapping of quantile -> predictions for ``test_X``.
    """
    model = QRF()
    model.fit(X[predictors], X[imputed_variables])
    return {
        q: model.predict(test_X[predictors], mean_quantile=q) for q in quantiles
    }


def impute_quantreg(X, test_X, predictors, imputed_variables, quantiles):
    """Impute quantiles by fitting one quantile regression per quantile.

    :return: Mapping of quantile -> predictions for ``test_X``.
    """
    Y = X[imputed_variables]
    design = sm.add_constant(X[predictors])
    test_design = sm.add_constant(test_X[predictors])
    imputations = {}
    for q in quantiles:
        fit = sm.QuantReg(Y, design).fit(q=q)
        imputations[q] = fit.predict(test_design)
    return imputations
"""Smoke-test driver comparing imputation methods on SCF data.

Run as a script: ``python us_imputation_benchmarking/tests.py``.
"""
from comparisons.data import preprocess_data
from comparisons.imputations import get_imputations
from comparisons.quantile_loss import compare_quantile_loss
from comparisons.plot import plot_loss_comparison


def main():
    """Fit each method, score quantile losses on the test split, and plot."""
    X, X_test, predictors, imputed_variables = preprocess_data(full_data=False)
    Y_test = X_test[imputed_variables]

    # "Matching" is excluded until the StatMatch bridge is working.
    methods = ["QRF", "OLS", "QuantReg"]
    method_imputations = get_imputations(
        methods, X, X_test, predictors, imputed_variables
    )

    loss_comparison_df, quantiles = compare_quantile_loss(
        Y_test, method_imputations
    )
    print(loss_comparison_df)

    plot_loss_comparison(loss_comparison_df, quantiles)


# Guard the entry point so importing this module has no side effects
# (the original ran everything at import time).
if __name__ == "__main__":
    main()