# define_sdoh_database

Recreate and load the SDOH_Surveys table using data from ETL notebook: dataAHRQCountySDOH.ipynb.

In [1]:
import pandas as pd
import os
from sqlalchemy import create_engine, text

In [2]:
# Set parms for ETL notebook.
parm_AHRQCountySDOH_years = ['2017', '2018', '2019', '2020']
parm_AHRQCountySDOH_surveys = ["ACS", "AHA", "AMFAR", "CAF", "CCBP", "CDCSVI", "CEN", "CRDC", "EPAA", "FARA", "FEA", "HHC", "HIFLD", "HRSA", "MHSVI", "MP", "NCHS", "NEPHTN", "NHC", "NOAAS", "POS", "SAHIE", "SAIPE", "SEDA"]
parm_AHRQCountySDOH_questions = ["CDCW_INJURY_DTH_RATE", "CDCW_TRANSPORT_DTH_RATE", "CDCW_SELFHARM_DTH_RATE", "CDCW_ASSAULT_DTH_RATE", "CHR_TOT_MENTAL_PROV", "CHR_MENTAL_PROV_RATE", "CHR_SEGREG_BLACK", "CHR_PCT_ALCOHOL_DRIV_DEATH", "CHR_PCT_EXCESS_DRINK", "CHR_PCT_FOOD", "CHR_SEGREG_BLACK", "CHR_SEGREG_NON_WHITE"]

DB_URI = os.environ.get("DB_URI")
if not DB_URI:
    raise EnvironmentError(
        "DB_URI is not set."
    )
DB_URI

'postgresql://myuser:mypassword@db:5432/mydb'

## Run ETL notebook

In [3]:
%run dataAHRQCountySDOH.ipynb

Loaded year 2017 (3232 rows).
Loaded year 2018 (3232 rows).
Loaded year 2019 (3232 rows).
Loaded year 2020 (3229 rows).


  from .autonotebook import tqdm as notebook_tqdm
  all_sheets = pd.read_excel(file_path, sheet_name=None, engine="openpyxl")
`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


## Collect DataFrame

In [4]:
df = out_AHRQCountySDOH()

if not isinstance(df, pd.DataFrame):
    raise TypeError(f"out_AHRQCountySDOH returned {type(df)}, expected pandas.DataFrame")

# print("Rows:", len(df), "Columns:", list(df.columns))
# df.head(3)

## Method to normalize column names

In [5]:
def normalize_colname(name: str) -> str:
    """
    1) lower-case
    2) replace any non-alphanumeric character with underscore
    3) collapse multiple underscores
    4) strip leading/trailing underscores
    5) if starts with digit -> prefix with 'col_'
    6) special-case a 'year' column -> rename to 'year_col' to avoid reserved-word collisions
    """
    if name is None:
        return name
    s = str(name).lower()
    # replace non-alphanumeric with underscore
    s = re.sub(r'[^a-z0-9]', '_', s)
    # collapse multiple underscores
    s = re.sub(r'_+', '_', s)
    s = s.strip('_')
    # if empty after cleaning
    if not s:
        s = 'col'
    # prefix if starts with digit
    if re.match(r'^[0-9]', s):
        s = 'col_' + s
    # avoid a bare "year" column name that can sometimes be problematic
    if s == 'year':
        s = 'sdoh_year'
    return s

## Normalize Column Names

In [6]:
# apply normalization
new_cols = [normalize_colname(c) for c in df.columns]
# # detect collisions (two different original names producing same normalized name)
# from collections import defaultdict
# mapping = defaultdict(list)
# for orig, new in zip(df.columns, new_cols):
#     mapping[new].append(orig)

# collisions = {k: v for k, v in mapping.items() if len(v) > 1}
# if collisions:
#     # break collisions by appending numeric suffixes in stable order
#     resolved = {}
#     for new, origs in collisions.items():
#         for i, orig in enumerate(origs, start=1):
#             resolved_name = f"{new}_{i}"
#             # replace the first occurrence in new_cols for this original name
#             idx = list(df.columns).index(orig)
#             new_cols[idx] = resolved_name

# assign cleaned column names
df.columns = new_cols

Normalized column names (first 50): ['state', 'county', 'sdoh_year', 'acs_tot_pop_wt', 'acs_tot_pop_us_above1', 'acs_tot_pop_above5', 'acs_tot_pop_above15', 'acs_tot_pop_above16', 'acs_tot_pop_16_19', 'acs_tot_pop_above25', 'acs_tot_civil_pop_above18', 'acs_tot_civil_vet_pop_above25', 'acs_tot_own_child_below17', 'acs_tot_worker_nwfh', 'acs_tot_worker_hh', 'acs_tot_civilian_labor', 'acs_tot_civil_employ_pop', 'acs_tot_pop_pov', 'acs_tot_civil_noninst_pop_pov', 'acs_tot_civil_pop_pov', 'acs_tot_grandchildren_gp', 'acs_tot_hu', 'acs_tot_hh', 'acs_avg_hh_size', 'acs_tot_civil_noninst_pop', 'acs_tot_civil_vet_pop', 'acs_pct_child_disab', 'acs_pct_disable', 'acs_pct_nonvet_disable_18_64', 'acs_pct_vet_disable_18_64', 'acs_pct_male', 'acs_pct_female', 'acs_pct_ctz_us_born', 'acs_pct_ctz_nonus_born', 'acs_pct_foreign_born', 'acs_pct_non_citizen', 'acs_pct_ctz_naturalized', 'acs_pct_nonctn_1990', 'acs_pct_nonctn_1999', 'acs_pct_nonctn_2000', 'acs_pct_nonctn_2010', 'acs_pct_api_lang', 'acs_pct_

## Delete duplicate columns

In [7]:
# Detect duplicate columns and remove them
seen = set()
duplicates = []

for col in df.columns:
    if col in seen:
        duplicates.append(col)
    else:
        seen.add(col)

# Drop duplicates
df = df.loc[:, ~df.columns.duplicated()]

# Print removed columns
if duplicates:
    print("Removed duplicate columns:", duplicates)
else:
    print("No duplicate columns found.")

Removed duplicate columns: ['chr_segreg_black']


## Drop & Recreate `SDOH_Surveys`, then Load

In [8]:
engine = create_engine(DB_URI)
TABLE_NAME = "sdoh_surveys"

with engine.begin() as conn:
    conn.execute(text(f'DROP TABLE IF EXISTS "{TABLE_NAME}" CASCADE;'))
    print(f'Dropped table "{TABLE_NAME}" (if existed).')

# Recreate schema and load rows
df.to_sql(TABLE_NAME, engine, if_exists="replace", index=False)
print(f'Loaded {len(df)} rows into "{TABLE_NAME}".')

Dropped table "sdoh_surveys" (if existed).
Loaded 12925 rows into "sdoh_surveys".
