Skip to content

Commit

Permalink
clarify MEPS education features
Browse files Browse the repository at this point in the history
Signed-off-by: Samuel Hoffman <hoffman.sc@gmail.com>
  • Loading branch information
hoffmansc committed Jul 27, 2022
1 parent 2e93e9c commit 28986da
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 9 deletions.
14 changes: 11 additions & 3 deletions aif360/sklearn/datasets/meps_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ def fetch_meps(panel, *, accept_terms=None, data_home=None, cache=True,
dropcols=None, numeric_only=False, dropna=True):
"""Load the Medical Expenditure Panel Survey (MEPS) dataset.
Note:
For descriptions of the dataset features, see the `data codebook
<https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H181>`_.
Args:
panel ({19, 20, 21}): Panel number (only 19, 20, and 21 are currently
supported).
Expand Down Expand Up @@ -99,7 +103,13 @@ def fetch_meps(panel, *, accept_terms=None, data_home=None, cache=True,
'DIABDX', 'JTPAIN', 'ARTHDX', 'ARTHTYPE', 'ASTHDX', 'ADHDADDX',
'PREGNT', 'WLKLIM', 'ACTLIM', 'SOCLIM', 'COGLIM', 'DFHEAR42',
'DFSEE42', 'ADSMOK42', 'PHQ242', 'EMPST', 'POVCAT', 'INSCOV',
'EDUCYR', 'HIDEG'] # TODO: why are these included here but not in usecols?
# NOTE: education tracking seems to have changed between panels: 'EDUYRDG'
# was used for panel 19, while 'EDUCYR' and 'HIDEG' were used for panels
# 20 & 21. Users may add these columns to usecols manually.
'EDUCYR', 'HIDEG']
if panel == 19:
cat_cols += ['EDUYRDG']

for col in cat_cols:
df[col] = df[col].astype('category')
thresh = 0 if col in ['REGION', 'MARRY', 'ASTHDX'] else -1
Expand All @@ -116,8 +126,6 @@ def fetch_meps(panel, *, accept_terms=None, data_home=None, cache=True,
df['UTILIZATION'] = pd.cut(util, [min(util)-1, 10, max(util)+1], right=False,
labels=['< 10 Visits', '>= 10 Visits'])#['low', 'high'])

# TODO: let standardize_dataset handle dropna (see above todo re: extra cols)
df = df.dropna()
return standardize_dataset(df, prot_attr='RACE', target='UTILIZATION',
sample_weight='PERWT', usecols=usecols,
dropcols=dropcols, numeric_only=numeric_only,
Expand Down
20 changes: 14 additions & 6 deletions tests/sklearn/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,20 +241,28 @@ def test_cache_meps(panel):
[(19, MEPSDataset19), (20, MEPSDataset20), (21, MEPSDataset21)])
def test_meps_matches_old(panel, cls):
    """Tests MEPS datasets match original versions.

    Fetches ``panel`` with the feature columns listed in ``usecols`` plus the
    education columns in ``educols``, then checks the result against an
    instance of the original dataset class ``cls``: the one-hot encoded
    features (with the education columns dropped) must equal ``cls().features``
    and the factorized target must equal ``cls().labels``.
    """
    usecols = ['REGION', 'AGE', 'SEX', 'RACE', 'MARRY', 'FTSTU',
               'ACTDTY', 'HONRDC', 'RTHLTH', 'MNHLTH', 'HIBPDX',
               'CHDDX', 'ANGIDX', 'MIDX', 'OHRTDX', 'STRKDX', 'EMPHDX',
               'CHBRON', 'CHOLDX', 'CANCERDX', 'DIABDX', 'JTPAIN',
               'ARTHDX', 'ARTHTYPE', 'ASTHDX', 'ADHDADDX', 'PREGNT',
               'WLKLIM', 'ACTLIM', 'SOCLIM', 'COGLIM', 'DFHEAR42',
               'DFSEE42', 'ADSMOK42', 'PCS42', 'MCS42', 'K6SUM42',
               'PHQ242', 'EMPST', 'POVCAT', 'INSCOV']
    # Requested in addition to `usecols`; dropped again before the feature
    # comparison below.
    educols = ['EDUCYR', 'HIDEG']
    # Single fetch: an earlier default-argument fetch was immediately
    # overwritten by this one and only cost an extra dataset load.
    meps = fetch_meps(panel, accept_terms=True, usecols=usecols + educols)
    assert len(meps) == 3
    # Replace RACE with sorted integer codes so get_dummies leaves it as a
    # single column.
    meps.X.RACE = meps.X.RACE.factorize(sort=True)[0]
    MEPS = cls()
    # Only the frame without the education columns is compared; comparing the
    # full frame (which still contains `educols`) contradicts this check.
    features = pd.get_dummies(meps.X.drop(columns=educols))
    assert_array_equal(features, MEPS.features)
    assert_array_equal(meps.y.factorize(sort=True)[0], MEPS.labels.ravel())

@pytest.mark.parametrize("panel", [19, 20, 21])
def test_fetch_meps(panel):
    """Tests MEPS datasets shapes with various options.

    Compares the row count of the default fetch against a ``dropna=False``
    fetch, and checks the number of feature columns when ``numeric_only=True``.
    """
    meps = fetch_meps(panel, accept_terms=True)
    # Pass accept_terms here too, matching every other fetch_meps call in
    # this file.
    meps_dropna = fetch_meps(panel, accept_terms=True, dropna=False)
    # fetch_meps returns a 3-item tuple (see `len(meps) == 3` elsewhere in this
    # file), so row counts are read from the feature frame `.X`.
    # NOTE(review): with dropna=False one would expect *more* rows than the
    # default (dropna=True), not fewer -- confirm the intended direction of
    # this comparison against standardize_dataset's behavior.
    assert meps_dropna.X.shape[0] < meps.X.shape[0]
    meps_numeric = fetch_meps(panel, accept_terms=True, numeric_only=True)
    assert meps_numeric.X.shape[1] == 5

Expand Down

0 comments on commit 28986da

Please sign in to comment.