Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
name: CI Tests

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.11"]

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'

      - name: Set up R
        uses: r-lib/actions/setup-r@v2
        with:
          r-version: '4.x'

      - name: Install R dependencies
        run: |
          # An explicit CRAN mirror is required: non-interactive Rscript
          # cannot prompt for one, so install.packages() would otherwise
          # fail with "trying to use CRAN without setting a mirror".
          install.packages(c("StatMatch"), repos = "https://cloud.r-project.org")
        shell: Rscript {0}

      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install .[dev]

      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=88 --statistics

      - name: Test with pytest
        run: |
          # For now, just run the existing tests.py script
          # Later, this should be replaced with proper pytest
          python us_imputation_benchmarking/tests.py
79 changes: 79 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# OS-generated files
**/.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
*.pyc
*.pyo
*.pyd
*$py.class

# Virtual Environments
.venv/
venv/
env/
ENV/
.env
env.bak/
venv.bak/

# Distribution / Packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Testing / Coverage
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Logs
*.log

# Jupyter Notebooks
.ipynb_checkpoints/

# PyPI Configuration
.pypirc

# Sphinx Documentation
docs/_build/

# Celery
celerybeat-schedule
celerybeat.pid

# Ignore Data Files
*.csv
*.h5
40 changes: 40 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# US Imputation Benchmarking - Developer Guide

## Build & Test Commands
```bash
# Install package in development mode
pip install -e .

# Run all tests
python us_imputation_benchmarking/tests.py

# Run specific model test (example)
python -c "from us_imputation_benchmarking import tests; tests.test_qrf()"

# Install development dependencies (the `[dev]` extra in pyproject.toml)
pip install -e ".[dev]"
```

## Code Style Guidelines

### Formatting & Organization
- Use 4 spaces for indentation
- Maximum line length: 88 characters (Black default)
- Format code with Black: `black us_imputation_benchmarking/`
- Sort imports with isort: `isort us_imputation_benchmarking/`

### Naming & Types
- Use snake_case for variables, functions, and modules
- Use CamelCase for classes
- Constants should be UPPERCASE
- Add type hints to all function parameters and return values
- Document functions with ReStructuredText-style docstrings

### Imports
- Group imports: standard library, third-party, local modules
- Import specific functions/classes rather than entire modules when practical

### Error Handling
- Use assertions only for internal invariants; validate inputs by raising exceptions (e.g. `ValueError`), since assertions are stripped under `python -O`
- Raise appropriate exceptions with informative messages
- Add context to exceptions when re-raising
25 changes: 23 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,28 @@
[project]
name = "us-imputation-benchmarking"
version = "0.1.0"
description = "Add your description here"
description = "Benchmarking imputation models for US household survey data"
readme = "README.md"
requires-python = ">=3.11"
dependencies = []
dependencies = [
"numpy",
"pandas",
"matplotlib",
"seaborn",
"scikit-learn",
"statsmodels",
"microdf",
"scf",
"policyengine_us_data",
"scipy",
"rpy2",
]

[project.optional-dependencies]
dev = [
"pytest",
"flake8",
"black",
"isort",
"mypy",
]
2 changes: 2 additions & 0 deletions us_imputation_benchmarking/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@


Empty file.
122 changes: 122 additions & 0 deletions us_imputation_benchmarking/comparisons/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import microdf as mdf
import scf
from sklearn.model_selection import train_test_split
import pandas as pd
from typing import Union


# Years for which the Federal Reserve publishes SCF summary microdata.
VALID_YEARS = [
    1989,
    1992,
    1995,
    1998,
    2001,
    2004,
    2007,
    2010,
    2013,
    2016,
    2019,
]


def scf_url(year: int) -> str:
    """Return the URL of the SCF summary microdata zip file for a year.

    :param year: Year of SCF summary microdata to retrieve.
    :type year: int
    :return: URL of summary microdata zip file for the given year.
    :rtype: str
    :raises ValueError: If the SCF is not available for ``year``.
    """
    # Raise rather than assert: assertions are stripped under ``python -O``,
    # which would silently disable this validation.
    if year not in VALID_YEARS:
        raise ValueError("The SCF is not available for " + str(year))
    return f"https://www.federalreserve.gov/econres/files/scfp{year}s.zip"


def load_single_scf(year: int, columns: list) -> pd.DataFrame:
    """Load SCF summary microdata for a given year and set of columns.

    :param year: Year of SCF summary microdata to retrieve.
    :type year: int
    :param columns: List of columns to load. The weight column ``wgt`` is
        always included. Pass ``None`` to load all columns in the summary
        dataset.
    :type columns: list
    :return: SCF summary microdata for the given year.
    :rtype: pd.DataFrame
    """
    # Always include the survey weight so downstream weighting works.
    if columns is not None:
        columns = list(set(columns) | {"wgt"})
    return mdf.read_stata_zip(scf_url(year), columns=columns)


def load(
    years: list = VALID_YEARS,
    columns: list = None,
    as_microdataframe: bool = False,
) -> Union[pd.DataFrame, mdf.MicroDataFrame]:
    """Load SCF summary microdata for a set of years and columns.

    :param years: Year(s) to load SCF data for. Can be a list or a single
        number. Defaults to all available years, starting with 1989.
    :type years: list
    :param columns: List of columns. The weight column ``wgt`` is always
        returned.
    :type columns: list
    :param as_microdataframe: Whether to return as a MicroDataFrame with
        weight set, defaults to False.
    :type as_microdataframe: bool
    :return: SCF summary microdata for the set of years.
    :rtype: Union[pd.DataFrame, mdf.MicroDataFrame]
    """
    # Normalize a single column name into a list.
    if columns is not None:
        columns = mdf.listify(columns)
    # A single integer year needs no per-year loop or concatenation.
    if isinstance(years, int):
        combined = load_single_scf(years, columns)
    else:
        frames = []
        for yr in years:
            frame = load_single_scf(yr, columns)
            frame["year"] = yr
            frames.append(frame)
        combined = pd.concat(frames)
    # Optionally wrap the result so the survey weight is applied.
    return (
        mdf.MicroDataFrame(combined, weights="wgt")
        if as_microdataframe
        else combined
    )

def preprocess_data(full_data: bool = False) -> tuple:
    """Load and standardize the latest SCF year for imputation benchmarking.

    Loads the most recent SCF year, keeps the predictor and target columns,
    and z-score standardizes every column (including the imputation target).

    :param full_data: If True, return the full standardized dataset;
        otherwise return an 80/20 train/test split, defaults to False.
    :type full_data: bool
    :return: ``(data, PREDICTORS, IMPUTED_VARIABLES)`` when ``full_data``
        is True, else ``(X, test_X, PREDICTORS, IMPUTED_VARIABLES)``.
    :rtype: tuple
    """
    # Use only the most recent available SCF year.
    data = load([VALID_YEARS[-1]])

    # Predictor variables shared with CPS data.
    PREDICTORS = [
        "hhsex",  # sex of head of household
        "age",  # age of respondent
        "married",  # marital status of respondent
        "kids",  # number of children in household
        "educ",  # highest level of education
        "race",  # race of respondent
        "income",  # total annual income of household
        "wageinc",  # income from wages and salaries
        "bussefarminc",  # income from business, self-employment or farm
        "intdivinc",  # income from interest and dividends
        "ssretinc",  # income from social security and retirement accounts
        "lf",  # labor force status
    ]

    # Target variable; some property also captured in CPS data (HPROP_VAL).
    IMPUTED_VARIABLES = ["networth"]

    data = data[PREDICTORS + IMPUTED_VARIABLES]
    # Z-score standardize each column.
    mean = data.mean(axis=0)
    std = data.std(axis=0)
    data = (data - mean) / std

    if full_data:
        return data, PREDICTORS, IMPUTED_VARIABLES
    # Deterministic 80/20 split for reproducible benchmarking.
    X, test_X = train_test_split(
        data, test_size=0.2, train_size=0.8, random_state=42
    )
    return X, test_X, PREDICTORS, IMPUTED_VARIABLES
Loading