Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
name: CI Tests

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.11"]

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'

      - name: Set up R
        uses: r-lib/actions/setup-r@v2
        with:
          r-version: '4.x'

      - name: Install R dependencies
        run: |
          # An explicit CRAN mirror is required: non-interactive Rscript
          # cannot prompt for one, so install.packages() would otherwise
          # fail with "trying to use CRAN without setting a mirror".
          install.packages(c("StatMatch"), repos = "https://cloud.r-project.org")
        shell: Rscript {0}

      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install .[dev]

      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=88 --statistics

      - name: Test with pytest
        run: |
          # For now, just run the existing tests.py script
          # Later, this should be replaced with proper pytest
          python us_imputation_benchmarking/tests.py
79 changes: 79 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# OS-generated files
**/.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
*.pyc
*.pyo
*.pyd
*$py.class

# Virtual Environments
.venv/
venv/
env/
ENV/
.env
env.bak/
venv.bak/

# Distribution / Packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Testing / Coverage
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Logs
*.log

# Jupyter Notebooks
.ipynb_checkpoints/

# PyPI Configuration
.pypirc

# Sphinx Documentation
docs/_build/

# Celery
celerybeat-schedule
celerybeat.pid

# Ignore Data Files
*.csv
*.h5
40 changes: 40 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# US Imputation Benchmarking - Developer Guide

## Build & Test Commands
```bash
# Install package in development mode
pip install -e .

# Run all tests
python us_imputation_benchmarking/tests.py

# Run specific model test (example)
python -c "from us_imputation_benchmarking import tests; tests.test_qrf()"

# Install development dependencies (the `[dev]` extra in pyproject.toml)
pip install -e ".[dev]"
```

## Code Style Guidelines

### Formatting & Organization
- Use 4 spaces for indentation
- Maximum line length: 88 characters (Black default)
- Format code with Black: `black us_imputation_benchmarking/`
- Sort imports with isort: `isort us_imputation_benchmarking/`

### Naming & Types
- Use snake_case for variables, functions, and modules
- Use CamelCase for classes
- Constants should be UPPERCASE
- Add type hints to all function parameters and return values
- Document functions with ReStructuredText-style docstrings

### Imports
- Group imports: standard library, third-party, local modules
- Import specific functions/classes rather than entire modules when practical

### Error Handling
- Use assertions only for internal invariants; validate inputs by raising exceptions (e.g. `ValueError`), since assertions are stripped under `python -O`
- Raise appropriate exceptions with informative messages
- Add context to exceptions when re-raising
25 changes: 23 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,28 @@
[project]
name = "us-imputation-benchmarking"
version = "0.1.0"
description = "Add your description here"
description = "Benchmarking imputation models for US household survey data"
readme = "README.md"
requires-python = ">=3.11"
dependencies = []
dependencies = [
"numpy",
"pandas",
"matplotlib",
"seaborn",
"scikit-learn",
"statsmodels",
"microdf",
"scf",
"policyengine_us_data",
"scipy",
"rpy2",
]

[project.optional-dependencies]
dev = [
"pytest",
"flake8",
"black",
"isort",
"mypy",
]
2 changes: 2 additions & 0 deletions us_imputation_benchmarking/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@


Empty file.
122 changes: 122 additions & 0 deletions us_imputation_benchmarking/comparisons/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import microdf as mdf
import scf
from sklearn.model_selection import train_test_split
import pandas as pd
from typing import Union


# Years for which the Federal Reserve publishes SCF summary microdata.
VALID_YEARS = [
    1989,
    1992,
    1995,
    1998,
    2001,
    2004,
    2007,
    2010,
    2013,
    2016,
    2019,
]


def scf_url(year: int) -> str:
    """Return the URL of the SCF summary microdata zip file for a year.

    :param year: Year of SCF summary microdata to retrieve.
    :type year: int
    :return: URL of summary microdata zip file for the given year.
    :rtype: str
    :raises ValueError: If the SCF is not available for ``year``.
    """
    # Raise rather than assert: assertions are stripped under ``python -O``,
    # which would silently disable this validation.
    if year not in VALID_YEARS:
        raise ValueError("The SCF is not available for " + str(year))
    return f"https://www.federalreserve.gov/econres/files/scfp{year}s.zip"


def load_single_scf(year: int, columns: list) -> pd.DataFrame:
    """Load SCF summary microdata for a given year and set of columns.

    :param year: Year of SCF summary microdata to retrieve.
    :type year: int
    :param columns: List of columns to load. The weight column ``wgt`` is
        always included. Pass ``None`` to load all columns in the summary
        dataset.
    :type columns: list
    :return: SCF summary microdata for the given year.
    :rtype: pd.DataFrame
    """
    # Always include the survey weight so downstream weighting works.
    if columns is not None:
        columns = list(set(columns) | {"wgt"})
    return mdf.read_stata_zip(scf_url(year), columns=columns)


def load(
    years: list = VALID_YEARS,
    columns: list = None,
    as_microdataframe: bool = False,
) -> Union[pd.DataFrame, mdf.MicroDataFrame]:
    """Load SCF summary microdata for a set of years and columns.

    :param years: Year(s) to load SCF data for. Can be a list or a single
        number. Defaults to all available years, starting with 1989.
    :type years: list
    :param columns: List of columns. The weight column ``wgt`` is always
        returned.
    :type columns: list
    :param as_microdataframe: Whether to return as a MicroDataFrame with
        weight set, defaults to False.
    :type as_microdataframe: bool
    :return: SCF summary microdata for the set of years.
    :rtype: Union[pd.DataFrame, mdf.MicroDataFrame]
    """
    # Normalize a single column name into a list.
    if columns is not None:
        columns = mdf.listify(columns)
    # A single integer year needs no per-year loop or concatenation.
    if isinstance(years, int):
        combined = load_single_scf(years, columns)
    else:
        frames = []
        for yr in years:
            frame = load_single_scf(yr, columns)
            frame["year"] = yr
            frames.append(frame)
        combined = pd.concat(frames)
    # Optionally wrap the result so the survey weight is applied.
    return (
        mdf.MicroDataFrame(combined, weights="wgt")
        if as_microdataframe
        else combined
    )

def preprocess_data(full_data: bool = False) -> tuple:
    """Load and standardize the latest SCF year for imputation benchmarking.

    Loads the most recent SCF year, keeps the predictor and target columns,
    and z-score standardizes every column (including the imputation target).

    :param full_data: If True, return the full standardized dataset;
        otherwise return an 80/20 train/test split, defaults to False.
    :type full_data: bool
    :return: ``(data, PREDICTORS, IMPUTED_VARIABLES)`` when ``full_data``
        is True, else ``(X, test_X, PREDICTORS, IMPUTED_VARIABLES)``.
    :rtype: tuple
    """
    # Use only the most recent available SCF year.
    data = load([VALID_YEARS[-1]])

    # Predictor variables shared with CPS data.
    PREDICTORS = [
        "hhsex",  # sex of head of household
        "age",  # age of respondent
        "married",  # marital status of respondent
        "kids",  # number of children in household
        "educ",  # highest level of education
        "race",  # race of respondent
        "income",  # total annual income of household
        "wageinc",  # income from wages and salaries
        "bussefarminc",  # income from business, self-employment or farm
        "intdivinc",  # income from interest and dividends
        "ssretinc",  # income from social security and retirement accounts
        "lf",  # labor force status
    ]

    # Target variable; some property also captured in CPS data (HPROP_VAL).
    IMPUTED_VARIABLES = ["networth"]

    data = data[PREDICTORS + IMPUTED_VARIABLES]
    # Z-score standardize each column.
    mean = data.mean(axis=0)
    std = data.std(axis=0)
    data = (data - mean) / std

    if full_data:
        return data, PREDICTORS, IMPUTED_VARIABLES
    # Deterministic 80/20 split for reproducible benchmarking.
    X, test_X = train_test_split(
        data, test_size=0.2, train_size=0.8, random_state=42
    )
    return X, test_X, PREDICTORS, IMPUTED_VARIABLES
Loading