Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions .github/workflows/pull_request.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,12 @@ jobs:
POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }}
- name: Build datasets
run: make data
env:
TEST_LITE: true
- name: Run tests
run: pytest
- name: Test documentation builds
run: make documentation
run: make documentation
- name: Upload ECPS 2024
uses: actions/upload-artifact@v4
with:
name: enhanced_cps_2024.h5
path: policyengine_us_data/storage/enhanced_cps_2024.h5
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ documentation:

# Runs the CPS dataset scripts one after another: cps.py, then
# extended_cps.py, then enhanced_cps.py.
data:
python policyengine_us_data/datasets/cps/cps.py
python policyengine_us_data/datasets/cps/extended_cps.py
python policyengine_us_data/datasets/cps/enhanced_cps.py

clean:
Expand Down
4 changes: 4 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- bump: minor
changes:
changed:
- Enhanced CPS now uses a 3-year pooled CPS.
1 change: 1 addition & 0 deletions policyengine_us_data/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
CPS_2022,
CPS_2023,
CPS_2024,
Pooled_3_Year_CPS_2024,
CensusCPS_2018,
CensusCPS_2019,
CensusCPS_2020,
Expand Down
91 changes: 80 additions & 11 deletions policyengine_us_data/datasets/cps/cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,8 @@ def generate(self):
# Extrapolate from CPS 2023

cps_2022 = CPS_2023(require=True)
uprating = create_policyengine_uprating_factors_table()
arrays = cps_2022.load_dataset()
for variable in uprating.index.unique():
if variable in arrays:
current_index = uprating[uprating.index == variable][
self.time_period
].values[0]
start_index = uprating[uprating.index == variable][
2023
].values[0]
growth = current_index / start_index
arrays[variable] = arrays[variable] * growth
arrays = uprate_cps_data(arrays, 2023, self.time_period)

self.save_dataset(arrays)
return
Expand All @@ -65,6 +55,22 @@ def generate(self):
cps.close()


def uprate_cps_data(data, from_period, to_period):
    """Rescale uprated variables in ``data`` from one period to another.

    Every variable that appears both in the index of the table returned by
    ``create_policyengine_uprating_factors_table()`` and as a key of ``data``
    is multiplied by the ratio of its ``to_period`` factor to its
    ``from_period`` factor. Other variables are left untouched.

    Args:
        data: Mapping of variable name to values. It is updated in place.
        from_period: Column of the factors table for the period the values
            currently represent.
        to_period: Column of the factors table for the target period.

    Returns:
        The same ``data`` object that was passed in, after rescaling.
    """
    factors = create_policyengine_uprating_factors_table()
    for name in factors.index.unique():
        if name not in data:
            continue
        # If several rows share the variable name, only the first is used.
        rows = factors[factors.index == name]
        target_factor = rows[to_period].values[0]
        base_factor = rows[from_period].values[0]
        data[name] = data[name] * (target_factor / base_factor)

    return data


def add_rent(cps: h5py.File, person: DataFrame, household: DataFrame):
is_renting = household.H_TENURE == 2
AVERAGE_RENT = 1_300 * 12
Expand Down Expand Up @@ -570,5 +576,68 @@ class CPS_2024(CPS):
url = "release://policyengine/policyengine-us-data/release/cps_2024.h5"


class PooledCPS(Dataset):
    """Dataset formed by stacking several CPS datasets into one.

    Subclasses set ``input_datasets`` (the dataset classes to pool) and
    ``time_period`` (the period every input is uprated to before pooling).
    """

    data_format = Dataset.ARRAYS
    input_datasets: list
    time_period: int

    def generate(self):
        """Load, uprate and concatenate the input datasets, then save.

        Each input dataset is loaded, uprated from its own ``time_period``
        to ``self.time_period`` with ``uprate_cps_data`` and appended
        variable by variable. Variables whose name contains ``"_id"`` are
        shifted so values from different inputs cannot coincide. Finally
        ``household_weight`` is divided by the number of input datasets.
        """
        pooled = {}

        for dataset_class in self.input_datasets:
            arrays = uprate_cps_data(
                dataset_class(require=True).load_dataset(),
                dataset_class.time_period,
                self.time_period,
            )
            # NOTE(review): a variable missing from an earlier input is
            # started at its first appearance, so its pooled array would be
            # shorter than the others - presumably all inputs share the same
            # variables; confirm.
            for variable in arrays:
                values = arrays[variable]
                if variable not in pooled:
                    pooled[variable] = values
                    continue
                if "_id" in variable:
                    # Shift by max + 1 (not just max): otherwise an ID of 0
                    # in this input would map onto the largest ID already
                    # pooled.
                    values = values + pooled[variable].max() + 1
                pooled[variable] = np.concatenate([pooled[variable], values])

        # Each input carries its own full set of household weights, so the
        # pooled weights are divided by the number of inputs.
        pooled["household_weight"] = pooled["household_weight"] / len(
            self.input_datasets
        )

        self.save_dataset(pooled)


class Pooled_3_Year_CPS_2024(PooledCPS):
    """Pooled CPS for 2024 built from the 2021, 2022 and 2023 CPS datasets."""

    # Identification.
    name = "pooled_3_year_cps_2024"
    label = "CPS 2024 (3-year pooled)"

    # Period the pooled inputs are uprated to, and the inputs themselves.
    time_period = 2024
    input_datasets = [CPS_2021, CPS_2022, CPS_2023]

    # Local storage path and release URL.
    file_path = STORAGE_FOLDER / "pooled_3_year_cps_2024.h5"
    url = "release://PolicyEngine/policyengine-us-data/release/pooled_3_year_cps_2024.h5"


if __name__ == "__main__":
    # Generate each yearly CPS first, then the pooled dataset, in this order.
    for dataset_class in (
        CPS_2021,
        CPS_2022,
        CPS_2023,
        CPS_2024,
        Pooled_3_Year_CPS_2024,
    ):
        dataset_class().generate()
2 changes: 1 addition & 1 deletion policyengine_us_data/datasets/cps/enhanced_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,6 @@ class EnhancedCPS(Dataset):
input_dataset: Type[Dataset]
start_year: int
end_year: int
url = "release://policyengine/policyengine-us-data/release/enhanced_cps_2024.h5"

def generate(self):
from policyengine_us import Microsimulation
Expand Down Expand Up @@ -175,6 +174,7 @@ class EnhancedCPS_2024(EnhancedCPS):
name = "enhanced_cps_2024"
label = "Enhanced CPS 2024"
file_path = STORAGE_FOLDER / "enhanced_cps_2024.h5"
url = "release://policyengine/policyengine-us-data/release/enhanced_cps_2024.h5"


if __name__ == "__main__":
Expand Down
8 changes: 6 additions & 2 deletions policyengine_us_data/datasets/cps/extended_cps.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from policyengine_core.data import Dataset
from policyengine_us_data.storage import STORAGE_FOLDER
from typing import Type
from .cps import *
from ..puf import *
from policyengine_us_data.datasets.cps.cps import *
from policyengine_us_data.datasets.puf import *
import pandas as pd
import os

Expand Down Expand Up @@ -151,3 +151,7 @@ class ExtendedCPS_2024(ExtendedCPS):
label = "Extended CPS (2024)"
file_path = STORAGE_FOLDER / "extended_cps_2024.h5"
time_period = 2024


if __name__ == "__main__":
    # Running this module as a script calls generate() on ExtendedCPS_2024.
    extended_cps = ExtendedCPS_2024()
    extended_cps.generate()