Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ingest CHR Preventable Hospitalization #3323

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 72 additions & 7 deletions python/datasources/chr.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,44 @@
from datasources.data_source import DataSource
from ingestion import gcs_to_bq_util, standardized_columns as std_col
from ingestion import dataset_utils, merge_utils, gcs_to_bq_util, standardized_columns as std_col
from ingestion.constants import COUNTY_LEVEL, CURRENT

# Directory key handed to the CSV loader in `write_to_bq`.
CHR = 'chr'

# Header names exactly as they are spelled in the source CSV.
source_state_fips = 'State FIPS Code'
source_county_fips = '5-digit FIPS Code'
source_time_period = 'Release Year'

# Non-topic columns; read from the CSV and passed to the melt helper together with `melt_map`.
source_cols = [
    source_time_period,
    source_state_fips,
    source_county_fips,
]

prev_hosp_per_100k_col = std_col.generate_column_name(
    std_col.PREVENTABLE_HOSP_PREFIX,
    std_col.PER_100K_SUFFIX,
)

# Trailing part of each source topic header -> race category id.
source_race_to_id_map = {
    'raw value': std_col.Race.ALL.value,
    '(AIAN)': std_col.Race.AIAN_NH.value,
    '(Asian/Pacific Islander)': std_col.Race.API_NH.value,
    '(Black)': std_col.Race.BLACK_NH.value,
    '(Hispanic)': std_col.Race.HISP.value,
    '(White)': std_col.Race.WHITE_NH.value,
}

# Output metric column -> {source topic header: race category id}.
melt_map = {
    prev_hosp_per_100k_col: {
        f'Preventable Hospital Stays {source_race}': het_race_id
        for source_race, het_race_id in source_race_to_id_map.items()
    },
}

# Every source topic header referenced by the melt map, in insertion order.
source_topic_cols = list(melt_map[prev_hosp_per_100k_col])
# NOTE: cols for numerator and denominator are all NULL

# dtypes handed to the CSV loader: topic columns as floats, the rest as strings.
source_dtypes = {
    **dict.fromkeys(source_topic_cols, 'float64'),
    **dict.fromkeys(source_cols, 'str'),
}


# Table type -> numerical columns requested for that table.
TIME_MAP = {
    CURRENT: [
        prev_hosp_per_100k_col,
        std_col.POPULATION_PCT_COL,
        std_col.POPULATION_COL,
    ],
}


class CHRData(DataSource):
Expand All @@ -16,18 +55,44 @@ def upload_to_gcs(self, gcs_bucket, **attrs):

def write_to_bq(self, dataset, gcs_bucket, **attrs):
    """Load the CHR analytic CSV, reshape it, and write county-level tables to BigQuery.

    Args:
        dataset: BigQuery dataset name, forwarded to `gcs_to_bq_util.add_df_to_bq`.
        gcs_bucket: Not used by this method; kept for interface compatibility.
        **attrs: Must provide "demographic" (read through `self.get_attr`).
            A value equal to `std_col.RACE_COL` is replaced by
            `std_col.RACE_OR_HISPANIC_COL`.

    One table named "{demographic}_{COUNTY_LEVEL}_{table_type}" is written
    for each table type (currently only `CURRENT`).
    """
    demographic = self.get_attr(attrs, "demographic")
    if demographic == std_col.RACE_COL:
        demographic = std_col.RACE_OR_HISPANIC_COL

    use_cols = source_cols + source_topic_cols

    # Exactly one directory argument: passing both the literal 'chr' and the
    # CHR constant would shift the filename into the wrong positional slot.
    df = gcs_to_bq_util.load_csv_as_df_from_data_dir(
        CHR,
        "analytic_data2024.csv",
        usecols=use_cols,
        dtype=source_dtypes,
        skiprows=[1],  # skip weird sub header row
    )

    # drop national and state-level rows
    df = df[~df[source_county_fips].str.endswith('000')]

    df = dataset_utils.melt_to_het_style_df(df, std_col.RACE_CATEGORY_ID_COL, source_cols, melt_map)

    df = df.rename(
        columns={
            source_county_fips: std_col.COUNTY_FIPS_COL,
            source_state_fips: std_col.STATE_FIPS_COL,
            source_time_period: std_col.TIME_PERIOD_COL,
        }
    )

    df = merge_utils.merge_state_ids(df)
    df = merge_utils.merge_county_names(df)
    df = merge_utils.merge_yearly_pop_numbers(df, std_col.RACE_COL, COUNTY_LEVEL)
    # Return value is ignored here, as in the original call.
    # NOTE(review): presumably mutates `df` in place — confirm against std_col.
    std_col.add_race_columns_from_category_id(df)

    for table_type in [CURRENT]:
        table_name = f"{demographic}_{COUNTY_LEVEL}_{table_type}"
        time_cols = TIME_MAP[table_type]

        # Hand the helper its own copy so the shared frame is untouched
        # between table types.
        df_for_bq, col_types = dataset_utils.generate_time_df_with_cols_and_types(
            df.copy(), time_cols, table_type, demographic
        )

        gcs_to_bq_util.add_df_to_bq(df_for_bq, dataset, table_name, column_types=col_types)
9 changes: 7 additions & 2 deletions python/ingestion/dataset_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,9 +577,14 @@ def generate_time_df_with_cols_and_types(
DataFrame.
"""
df = df.copy()
mandatory_cols = [std_col.TIME_PERIOD_COL, std_col.STATE_NAME_COL, std_col.STATE_FIPS_COL]
str_cols_to_keep = [std_col.TIME_PERIOD_COL, std_col.STATE_NAME_COL, std_col.STATE_FIPS_COL, dem_col]

all_cols = mandatory_cols + [dem_col] + numerical_cols_to_keep
if std_col.COUNTY_NAME_COL in df.columns:
str_cols_to_keep.append(std_col.COUNTY_NAME_COL)
if std_col.COUNTY_FIPS_COL in df.columns:
str_cols_to_keep.append(std_col.COUNTY_FIPS_COL)

all_cols = str_cols_to_keep + numerical_cols_to_keep
df = df[all_cols]

if table_type == CURRENT:
Expand Down
3,197 changes: 0 additions & 3,197 deletions python/tests/data/chr/analytic_data2024.csv

This file was deleted.

Loading