Skip to content

Commit

Permalink
Ingest CHR Preventable Hospitalization (#3323)
Browse files Browse the repository at this point in the history
  • Loading branch information
benhammondmusic committed May 22, 2024
1 parent 47ef6de commit ebad99d
Show file tree
Hide file tree
Showing 7 changed files with 742 additions and 3,217 deletions.
79 changes: 72 additions & 7 deletions python/datasources/chr.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,44 @@
from datasources.data_source import DataSource
from ingestion import gcs_to_bq_util, standardized_columns as std_col
from ingestion import dataset_utils, merge_utils, gcs_to_bq_util, standardized_columns as std_col
from ingestion.constants import COUNTY_LEVEL, CURRENT

CHR = 'chr'

# Column headers used to select and identify rows in the CHR source CSV.
source_state_fips = 'State FIPS Code'
source_county_fips = '5-digit FIPS Code'
source_time_period = 'Release Year'

# Non-topic columns that are read from the source and kept as id columns when melting.
source_cols = [source_time_period, source_state_fips, source_county_fips]

prev_hosp_per_100k_col = std_col.generate_column_name(std_col.PREVENTABLE_HOSP_PREFIX, std_col.PER_100K_SUFFIX)

# Trailing part of each source topic column header -> HET race category id.
source_race_to_id_map = {
    'raw value': std_col.Race.ALL.value,
    '(AIAN)': std_col.Race.AIAN_NH.value,
    '(Asian/Pacific Islander)': std_col.Race.API_NH.value,
    '(Black)': std_col.Race.BLACK_NH.value,
    '(Hispanic)': std_col.Race.HISP.value,
    '(White)': std_col.Race.WHITE_NH.value,
}

# For each HET metric column: {source column header: race category id}.
melt_map = {
    prev_hosp_per_100k_col: {
        f'Preventable Hospital Stays {header_suffix}': race_id
        for header_suffix, race_id in source_race_to_id_map.items()
    },
}

# NOTE: the source columns for numerator and denominator are all NULL,
# so only the rate columns are listed as topic columns.
source_topic_cols = list(melt_map[prev_hosp_per_100k_col])

# dtypes handed to the CSV loader: topic columns as floats, id columns as strings.
source_dtypes = {
    **dict.fromkeys(source_topic_cols, 'float64'),
    **dict.fromkeys(source_cols, 'str'),
}


# Numerical columns kept for each generated table type.
TIME_MAP = {
    CURRENT: [
        prev_hosp_per_100k_col,
        std_col.POPULATION_PCT_COL,
        std_col.POPULATION_COL,
    ],
}


class CHRData(DataSource):
Expand All @@ -16,18 +55,44 @@ def upload_to_gcs(self, gcs_bucket, **attrs):

def write_to_bq(self, dataset, gcs_bucket, **attrs):
    """Build the county-level CHR table(s) and write them to BigQuery.

    Reads "analytic_data2024.csv" from the `chr` data directory, keeps only
    the county rows, melts the race-specific preventable hospitalization
    columns into one row per race category, merges in state ids, county
    names and population numbers, and uploads one table per table type.

    Args:
        dataset: passed through to `gcs_to_bq_util.add_df_to_bq` as the
            destination dataset.
        gcs_bucket: not used by this method.
        **attrs: must provide "demographic" (read via `self.get_attr`);
            `std_col.RACE_COL` is mapped to `std_col.RACE_OR_HISPANIC_COL`.
    """
    demographic = self.get_attr(attrs, "demographic")
    if demographic == std_col.RACE_COL:
        demographic = std_col.RACE_OR_HISPANIC_COL

    use_cols = source_cols + source_topic_cols

    df = gcs_to_bq_util.load_csv_as_df_from_data_dir(
        CHR,
        "analytic_data2024.csv",
        usecols=use_cols,
        dtype=source_dtypes,
        skiprows=[1],  # the line right after the header is a sub-header row, not data
    )

    # drop national and state-level rows (their 5-digit FIPS ends in '000')
    df = df[~df[source_county_fips].str.endswith('000')]

    df = dataset_utils.melt_to_het_style_df(df, std_col.RACE_CATEGORY_ID_COL, source_cols, melt_map)

    df = df.rename(
        columns={
            source_county_fips: std_col.COUNTY_FIPS_COL,
            source_state_fips: std_col.STATE_FIPS_COL,
            source_time_period: std_col.TIME_PERIOD_COL,
        }
    )

    df = merge_utils.merge_state_ids(df)
    df = merge_utils.merge_county_names(df)
    df = merge_utils.merge_yearly_pop_numbers(df, std_col.RACE_COL, COUNTY_LEVEL)
    # called for its effect on `df`; the return value is not used
    std_col.add_race_columns_from_category_id(df)

    for table_type in [CURRENT]:
        df = df.copy()
        table_name = f"{demographic}_{COUNTY_LEVEL}_{table_type}"
        time_cols = TIME_MAP[table_type]

        df_for_bq, col_types = dataset_utils.generate_time_df_with_cols_and_types(
            df, time_cols, table_type, demographic
        )

        gcs_to_bq_util.add_df_to_bq(df_for_bq, dataset, table_name, column_types=col_types)
9 changes: 7 additions & 2 deletions python/ingestion/dataset_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,9 +577,14 @@ def generate_time_df_with_cols_and_types(
DataFrame.
"""
df = df.copy()
mandatory_cols = [std_col.TIME_PERIOD_COL, std_col.STATE_NAME_COL, std_col.STATE_FIPS_COL]
str_cols_to_keep = [std_col.TIME_PERIOD_COL, std_col.STATE_NAME_COL, std_col.STATE_FIPS_COL, dem_col]

all_cols = mandatory_cols + [dem_col] + numerical_cols_to_keep
if std_col.COUNTY_NAME_COL in df.columns:
str_cols_to_keep.append(std_col.COUNTY_NAME_COL)
if std_col.COUNTY_FIPS_COL in df.columns:
str_cols_to_keep.append(std_col.COUNTY_FIPS_COL)

all_cols = str_cols_to_keep + numerical_cols_to_keep
df = df[all_cols]

if table_type == CURRENT:
Expand Down
3,197 changes: 0 additions & 3,197 deletions python/tests/data/chr/analytic_data2024.csv

This file was deleted.

0 comments on commit ebad99d

Please sign in to comment.