In [1]:
import pandas as pd

# Load the metadata file
metadata_path = "data/LLCP2023_metadata.csv"
metadata = pd.read_csv(metadata_path)

# View the first few rows of the metadata
print(metadata.head())

  SAS Variable Name            Label           Section Name Value Value Label
0            _STATE  State FIPS Code  Record Identification     1     Alabama
1            _STATE  State FIPS Code  Record Identification     2      Alaska
2            _STATE  State FIPS Code  Record Identification     4     Arizona
3            _STATE  State FIPS Code  Record Identification     5    Arkansas
4            _STATE  State FIPS Code  Record Identification     6  California


In [2]:
metadata.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1850 entries, 0 to 1849
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   SAS Variable Name  1850 non-null   object
 1   Label              1850 non-null   object
 2   Section Name       1850 non-null   object
 3   Value              1849 non-null   object
 4   Value Label        1838 non-null   object
dtypes: object(5)
memory usage: 72.4+ KB


In [3]:
# Get unique section names to explore the themes
section_names = metadata['Section Name'].dropna().unique()

# Print all section names to choose relevant ones
for name in sorted(section_names):
    print(name)

Adverse Childhood Experiences
Alcohol Consumption
Arthritis
Aspirin for CVD Prevention
Breast and Cervical Cancer Screening
COVID Vaccination
Calculated Race Variables
Calculated Variables
Cancer Survivorship: Course of Treatment
Cancer Survivorship: Pain Management
Cancer Survivorship: Type of Cancer
Caregiver
Cell Phone Introduction
Childhood Asthma Prevalence
Child Demographic Variables
Child Weighting Variables
Cholesterol Awareness
Chronic Health Conditions
Cognitive Decline
Colorectal Cancer Screening
Demographics
Diabetes
Disability
Excess Sun Exposure
Exercise (Physical Activity)
Falls
Firearm Safety
HIV/AIDS
HPV Vaccination
Healthy Days
Health Care Access
Health Status
Heart Attack and Stroke
Hypertension Awareness
Immunization
Indoor Tanning
Land Line Introduction
Long-term COVID Effects
Lung Cancer Screening
Marijuana Use
Other Tobacco Use
Place of Flu Vaccination
Pre-Diabetes
Prostate Cancer Screening
Questionnaire Language
Questionnaire Version
Random Child Selection
React

In [4]:
import re

# Clean section names more aggressively using regex
def clean_section_name(name):
    if pd.isnull(name):
        return name
    name = name.lower()
    name = re.sub(r'\(.*?\)', '', name)         # remove things in parentheses
    name = re.sub(r'[^a-z0-9 ]+', '', name)     # remove punctuation except space
    name = re.sub(r'\s+', '_', name.strip())    # replace whitespace with underscores
    return name

metadata['Section Name Clean'] = metadata['Section Name'].apply(clean_section_name)

# Show unique cleaned names
section_names = metadata['Section Name Clean'].dropna().unique()
for name in sorted(section_names):
    print(name)


adversechildhoodexperiences
alcoholconsumption
arthritis
aspirinforcvdprevention
breastandcervicalcancerscreening
calculatedracevariables
calculatedvariables
cancersurvivorshipcourseoftreatment
cancersurvivorshippainmanagement
cancersurvivorshiptypeofcancer
caregiver
cellphoneintroduction
childdemographicvariables
childhoodasthmaprevalence
childweightingvariables
cholesterolawareness
chronichealthconditions
cognitivedecline
colorectalcancerscreening
covidvaccination
demographics
diabetes
disability
excesssunexposure
exercise
falls
firearmsafety
healthcareaccess
healthstatus
healthydays
heartattackandstroke
hivaids
hpvvaccination
hypertensionawareness
immunization
indoortanning
landlineintroduction
longtermcovideffects
lungcancerscreening
marijuanause
othertobaccouse
placeoffluvaccination
prediabetes
prostatecancerscreening
questionnairelanguage
questionnaireversion
randomchildselection
reactionstorace
recordidentification
respondentsex
seatbeltuseanddrinkinganddriving
sexualorientation

In [5]:
# Define secttions of interest
sections_interest = [
    'adversechildhoodexperiences',
    'alcoholconsumption',
    'chronichealthconditions',
    'demographics',
    'disability',
    'healthcareaccess',
    'healthydays', # This section contains the result variable
    'marijuanause',
    'othertobaccouse',
    'respondentsex',
    'sexualorientation',
    'socialdeterminants',
    'tobaccouse',
    'urbanrural', 
    'exercise'
]

# Filter metadata to include only those sections
df_subset = metadata[metadata['Section Name Clean'].isin(sections_interest)]

In [6]:
df_subset2 = df_subset[['SAS Variable Name', 'Section Name Clean', 'Label']].drop_duplicates()

In [7]:
# Display the entire DataFrame of confounder variables
df_subset2 = df_subset2.reset_index(drop=True)
pd.set_option('display.max_rows', None)

display(df_subset2)

Unnamed: 0,SAS Variable Name,Section Name Clean,Label
0,SEXVAR,respondentsex,Sex of Respondent
1,PHYSHLTH,healthydays,Number of Days Physical Health Not Good
2,MENTHLTH,healthydays,Number of Days Mental Health Not Good
3,POORHLTH,healthydays,Poor Physical or Mental Health
4,PRIMINS1,healthcareaccess,What is Current Primary Source of Health Insur...
5,PERSDOC3,healthcareaccess,Have Personal Health Care Provider?
6,MEDCOST1,healthcareaccess,Could Not Afford To See Doctor
7,CHECKUP1,healthcareaccess,Length of time since last routine checkup
8,EXERANY2,exercise,Exercise in Past 30 Days
9,EXRACT12,exercise,Type of Physical Activity


In [8]:
variables = [
    "SEXVAR",
    "PHYSHLTH",
    "MENTHLTH",
    "POORHLTH",
    "PRIMINS1",
    "PERSDOC3",
    "MEDCOST1",
    "CHECKUP1",
    "EXERANY2",
    "EXRACT12",
    "EXEROFT1",
    "EXERHMM1",
    "EXRACT22",
    "EXEROFT2",
    "EXERHMM2",
    "STRENGTH",
    "ADDEPEV3",
    "MARITAL",
    "EDUCA",
    "RENTHOM1",
    "VETERAN3",
    "EMPLOY1",
    "CHILDREN",
    "INCOME3",
    "PREGNANT",
    "WEIGHT2",
    "HEIGHT3",
    "DEAF",
    "BLIND",
    "DECIDE",
    "DIFFWALK",
    "DIFFDRES",
    "DIFFALON",
    "SMOKE100",
    "SMOKDAY2",
    "ECIGNOW2",
    "ALCDAY4",
    "AVEDRNK3",
    "DRNK3GE5",
    "MAXDRNKS",
    "SOMALE",
    "SOFEMALE",
    "MARIJAN1",
    "ACEDEPRS",
    "ACEDRINK",
    "ACEDRUGS",
    "ACEPRISN",
    "ACEDIVRC",
    "ACEPUNCH",
    "ACEHURT1",
    "ACESWEAR",
    "ACETOUCH",
    "ACETTHEM",
    "ACEHVSEX",
    "ACEADSAF",
    "ACEADNED",
    "LSATISFY",
    "EMTSUPRT",
    "SDLONELY",
    "SDHEMPLY",
    "FOODSTMP",
    "SDHFOOD1",
    "SDHBILLS",
    "SDHUTILS",
    "SDHTRNSP",
    "SDHSTRE1",
    "_METSTAT",
    "_URBSTAT"
]

df_subset2 = metadata[metadata['SAS Variable Name'].isin(variables)]

df_subset2 = df_subset2.reset_index(drop=True)
df_subset2.shape

(421, 6)

In [9]:
# Join with dataframe to inspect data types and values
data = pd.read_csv("data/LLCP2023.csv")

# Subset columns to match df_subset2
data = data[data.columns.intersection(df_subset2['SAS Variable Name'])]

data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 433323 entries, 0 to 433322
Data columns (total 68 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   SEXVAR    433323 non-null  float64
 1   PHYSHLTH  433320 non-null  float64
 2   MENTHLTH  433320 non-null  float64
 3   POORHLTH  252170 non-null  float64
 4   PRIMINS1  433318 non-null  float64
 5   PERSDOC3  433320 non-null  float64
 6   MEDCOST1  433321 non-null  float64
 7   CHECKUP1  433321 non-null  float64
 8   EXERANY2  433321 non-null  float64
 9   EXRACT12  325227 non-null  float64
 10  EXEROFT1  323041 non-null  float64
 11  EXERHMM1  323039 non-null  float64
 12  EXRACT22  323040 non-null  float64
 13  EXEROFT2  228757 non-null  float64
 14  EXERHMM2  228757 non-null  float64
 15  STRENGTH  433319 non-null  float64
 16  ADDEPEV3  433320 non-null  float64
 17  MARITAL   433316 non-null  float64
 18  EDUCA     433314 non-null  float64
 19  RENTHOM1  433315 non-null  float64
 20  VETE

In [10]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Adverse Childhood Experiences (ACE) variables
# Define your mapping
ace_value_mapping = {
    'ACEDEPRS': {1: 1, 2: 0, 7: None, 9: None, 'BLANK': None, None: None},
    'ACEDRINK': {1: 1, 2: 0, 7: None, 9: None, 'BLANK': None, None: None},
    'ACEDRUGS': {1: 1, 2: 0, 7: None, 9: None, 'BLANK': None, None: None},
    'ACEPRISN': {1: 1, 2: 0, 7: None, 9: None, 'BLANK': None, None: None},
    'ACEDIVRC': {1: 1, 2: 0, 7: None, 8: None, 9: None, 'BLANK': None, None: None},
    'ACEPUNCH': {1: 0, 2: 1, 3: 1, 7: None, 9: None, 'BLANK': None, None: None},
    'ACEHURT1': {1: 0, 2: 1, 3: 1, 7: None, 9: None, 'BLANK': None, None: None},
    'ACESWEAR': {1: 0, 2: 1, 3: 1, 7: None, 9: None, 'BLANK': None, None: None},
    'ACETOUCH': {1: 0, 2: 1, 3: 1, 7: None, 9: None, 'BLANK': None, None: None},
    'ACETTHEM': {1: 0, 2: 1, 3: 1, 7: None, 9: None, 'BLANK': None, None: None},
    'ACEHVSEX': {1: 0, 2: 1, 3: 1, 7: None, 9: None, 'BLANK': None, None: None},
    'ACEADSAF': {1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 7: None, 9: None, 'BLANK': None, None: None},
    'ACEADNED': {1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 7: None, 9: None, 'BLANK': None, None: None}
}

# List of ACE columns
ace_columns = list(ace_value_mapping.keys())

# Apply mapping to each column
for col in ace_columns:
    data[col] = data[col].map(ace_value_mapping[col])

# Convert None to np.nan explicitly if needed
data[ace_columns] = data[ace_columns].applymap(lambda x: np.nan if x is None else x)

# Calculate total ACE score (ignoring missing values)
data['ACE_TOTAL'] = data[ace_columns].sum(axis=1, skipna=True)

data['ACE_TOTAL'].describe()

  data[ace_columns] = data[ace_columns].applymap(lambda x: np.nan if x is None else x)


count    433323.000000
mean          0.471549
std           1.440661
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          13.000000
Name: ACE_TOTAL, dtype: float64

In [11]:
scaler = MinMaxScaler((0, 1))
data['ACE_NORMALIZED'] = scaler.fit_transform(data[['ACE_TOTAL']])

data['ACE_NORMALIZED'].describe()

count    433323.000000
mean          0.036273
std           0.110820
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: ACE_NORMALIZED, dtype: float64

In [12]:
# Map alcohol severity based on the provided rules
def map_alcohol_severity_weighted(row):
    # Frequency: ALCDAY4
    alcfreq = row['ALCDAY4']
    if 101 <= alcfreq <= 199:
        freq_score = alcfreq - 100  # days/week (1–99)
    elif 201 <= alcfreq <= 299:
        freq_score = (alcfreq - 200) / 4.3  # days/month → days/week
    else:
        freq_score = np.nan

    # Normalize to 0–1 (max possible is 30 drinking days)
    norm_freq = freq_score / 30 if pd.notna(freq_score) else np.nan

    # Drinks per day: AVEDRNK3
    avg_drinks = row['AVEDRNK3']
    norm_drinks = avg_drinks / 15 if 1 <= avg_drinks <= 76 else np.nan  # cap at 15 drinks

    # Binge days: DRNK3GE5
    binge = row['DRNK3GE5']
    norm_binge = binge / 30 if 1 <= binge <= 76 else np.nan

    # Max drinks in a session: MAXDRNKS
    maxdrinks = row['MAXDRNKS']
    norm_max = maxdrinks / 15 if 1 <= maxdrinks <= 76 else np.nan  # cap at 15 drinks

    # Combine with equal weights (25% each)
    components = [norm_freq, norm_drinks, norm_binge, norm_max]
    valid_components = [x for x in components if pd.notna(x)]

    if valid_components:
        weighted_score = np.mean(valid_components)  # Each component has equal weight
    else:
        weighted_score = np.nan

    return weighted_score


# Apply function row-wise
data['ALCOHOL_SEVERITY'] = data.apply(map_alcohol_severity_weighted, axis=1)

scaler = MinMaxScaler((0, 1))
data['ALCOHOL_SEVERITY_NORMALIZED'] = scaler.fit_transform(data[['ALCOHOL_SEVERITY']])

data['ALCOHOL_SEVERITY_NORMALIZED'].describe()

count    212620.000000
mean          0.040173
std           0.034376
min           0.000000
25%           0.019920
50%           0.033253
75%           0.049317
max           1.000000
Name: ALCOHOL_SEVERITY_NORMALIZED, dtype: float64

In [13]:
# Map Gender variable to labels
sexvar_mapping = {
    1: 'Male',
    2: 'Female'
}

data['SEXVAR'] = data['SEXVAR'].map(sexvar_mapping)
data['SEXVAR'].describe()


count     433323
unique         2
top       Female
freq      229541
Name: SEXVAR, dtype: object

In [14]:
# --- Define mapping dictionaries ---
marital_mapping = {
    1: "Married", 2: "Divorced", 3: "Widowed", 4: "Separated",
    5: "Never married", 6: "A member of an unmarried couple",
    9: None, "": None
}

educa_mapping = {
    1: "Never attended school or only kindergarten",
    2: "Grades 1 through 8 (Elementary)",
    3: "Grades 9 through 11 (Some high school)",
    4: "Grade 12 or GED (High school graduate)",
    5: "College 1 to 3 years (Some college/tech school)",
    6: "College 4+ years (College graduate)",
    9: None, "": None
}

renthom1_mapping = {
    1: "Own", 2: "Rent", 3: "Other arrangement",
    7: None, 9: None, "": None
}

veteran3_mapping = {
    1: "Yes", 2: "No", 7: None, 9: None, "": None
}

employ1_mapping = {
    1: "Employed for wages", 2: "Self-employed", 3: "Out of work ≥1 year",
    4: "Out of work <1 year", 5: "Homemaker", 6: "Student",
    7: "Retired", 8: "Unable to work", 9: None, "": None
}

children_mapping = {**{i: str(i) for i in range(1, 88)}, 88: "", 99: None, "": None}

income3_mapping = {
    1: "<$10k", 2: "$10k–15k", 3: "$15k–20k", 4: "$20k–25k",
    5: "$25k–35k", 6: "$35k–50k", 7: "$50k–75k", 8: "$75k–100k",
    9: "$100k–150k", 10: "$150k–200k", 11: "$200k+", 77: None,
    99: None, "": None
}
    
pregnant_mapping = {
    1: "Yes", 2: "No", 7: None, 9: None, "": None
}

weight2_mapping = {
    7777: None, 9999: None, "": None
    # all other values can be used as-is (in lbs)
}

height3_mapping = {
    7777: None, 9999: None, "": None
    # all other values can be used as-is (in inches or cm)
}

# --- Apply mappings to DataFrame columns ---
data['MARITAL'] = data['MARITAL'].map(marital_mapping)
data['EDUCA'] = data['EDUCA'].map(educa_mapping)
data['RENTHOM1'] = data['RENTHOM1'].map(renthom1_mapping)
data['VETERAN3'] = data['VETERAN3'].map(veteran3_mapping)
data['EMPLOY1'] = data['EMPLOY1'].map(employ1_mapping)
data['CHILDREN'] = data['CHILDREN'].map(children_mapping)
data['INCOME3'] = data['INCOME3'].map(income3_mapping)
data['PREGNANT'] = data['PREGNANT'].map(pregnant_mapping)
data['WEIGHT2'] = data['WEIGHT2'].map(weight2_mapping)
data['HEIGHT3'] = data['HEIGHT3'].map(height3_mapping)

In [15]:
# --- Define shared disability mapping ---
disability_mapping = {
    1: "Yes",
    2: "No",
    7: None,
    9: None,
    "": None
}

# --- List of disability-related columns ---
disability_vars = ['DEAF', 'BLIND', 'DECIDE', 'DIFFWALK', 'DIFFDRES', 'DIFFALON']

# --- Map to label columns ---
for col in disability_vars:
    data[f'{col}_label'] = data[col].map(disability_mapping)

# --- Create binary indicator columns: 1 if "Yes", 0 otherwise ---
for col in disability_vars:
    data[f'{col}_bin'] = data[col].apply(lambda x: 1 if x == 1 else 0)

# --- Create unnormalized disability index ---
data['DISABILITY_INDEX'] = data[[f'{col}_bin' for col in disability_vars]].sum(axis=1)

# --- Normalize the index to 0–1 scale (equal weightage to all 6 variables) ---
data['DISABILITY_INDEX_NORMALIZED'] = data['DISABILITY_INDEX'] / len(disability_vars)


In [16]:
metstat_mapping = {
    1: "Metropolitan",
    2: "Nonmetropolitan",
    "": None
}

urbstat_mapping = {
    1: "Urban",
    2: "Rural",
    "": None
}

data['METRO_AREA'] = data['_METSTAT'].map(metstat_mapping)
data['URBAN_RURAL_AREA'] = data['_URBSTAT'].map(urbstat_mapping)


In [17]:
# Chronic Health Conditions (CHC) variables
addepev3_mapping = {
    1: "Yes",
    2: "No",
    7: None,
    9: None,
    np.nan: None    
}

data['ADDEPEV3'] = data['ADDEPEV3'].map(addepev3_mapping)

In [18]:
# --- Assume your dataframe is already loaded as 'data' ---

# SMOKING INDEX component scores
data['SMOKE100'] = data['SMOKE100'].map({1: 1, 2: 0, 7: 0, 9: 0})
data['SMOKDAY2'] = data['SMOKDAY2'].map({1: 1, 2: 0.5, 3: 0, 7: 0, 9: 0})
data['ECIGNOW2'] = data['ECIGNOW2'].map({2: 1, 3: 0.5, 4: 0, 1: 0, 7: 0, 9: 0})

# Compute raw smoking index
data['SMOKING_INDEX'] = data[['SMOKE100', 'SMOKDAY2', 'ECIGNOW2']].mean(axis=1)

# Normalize smoking index to [0,1]
smoking_min = data['SMOKING_INDEX'].min()
smoking_max = data['SMOKING_INDEX'].max()
data['SMOKING_INDEX_NORMALIZED'] = (data['SMOKING_INDEX'] - smoking_min) / (smoking_max - smoking_min)

# MARIJUANA INDEX (raw value is number of days, capped between 0 and 30)
data['MARIJAN1'] = np.where(
    data['MARIJAN1'].isin(range(1, 31)),
    data['MARIJAN1'],
    0
)

# Normalize marijuana index to [0,1]
data['MARIJUANA_INDEX_NORMALIZED'] = data['MARIJAN1'] / 30


In [19]:
def map_orientation(somale, sofemale):
    val = somale if pd.notna(somale) else sofemale

    if val == 1:
        return 1  # Gay or Lesbian
    elif val == 3:
        return 0.5  # Bisexual
    elif val in [2, 4, 7, 9] or pd.isna(val):
        return 0  # Straight or Other/Refused/Missing
    else:
        return 0  # Fallback for anything unexpected

# Apply to dataframe
data['SEXUAL_ORIENTATION_SCORE'] = data.apply(
    lambda row: map_orientation(row['SOMALE'], row['SOFEMALE']), axis=1
)

In [20]:
# --- 1. Mappings ---
satisfaction_map = {
    '1': 0, '2': 0.33, '3': 0.67, '4': 1,
    '7': np.nan, '9': np.nan, '': np.nan, np.nan: np.nan
}

emotional_support_map = {
    '1': 0, '2': 0.25, '3': 0.5, '4': 0.75, '5': 1,
    '7': np.nan, '9': np.nan, '': np.nan, np.nan: np.nan
}

lonely_map = {
    '1': 1, '2': 0.75, '3': 0.5, '4': 0.25, '5': 0,
    '7': np.nan, '9': np.nan, '': np.nan, np.nan: np.nan
}

employment_loss_map = {
    '1': 1, '2': 0,
    '7': np.nan, '9': np.nan, '': np.nan, np.nan: np.nan
}

food_stamps_map = {
    '1': 1, '2': 0,
    '7': np.nan, '9': np.nan, '': np.nan, np.nan: np.nan
}

food_insecurity_map = {
    '1': 1, '2': 0.75, '3': 0.5, '4': 0.25, '5': 0,
    '7': np.nan, '9': np.nan, '': np.nan, np.nan: np.nan
}

bills_map = {
    '1': 1, '2': 0,
    '7': np.nan, '9': np.nan, '': np.nan, np.nan: np.nan
}

utils_map = bills_map  # same as bills_map

transport_map = bills_map  # same as bills_map

stress_map = food_insecurity_map  # same as food insecurity

mapping_dict = {
    'LSATISFY': satisfaction_map,
    'EMTSUPRT': emotional_support_map,
    'SDLONELY': lonely_map,
    'SDHEMPLY': employment_loss_map,
    'FOODSTMP': food_stamps_map,
    'SDHFOOD1': food_insecurity_map,
    'SDHBILLS': bills_map,
    'SDHUTILS': utils_map,
    'SDHTRNSP': transport_map,
    'SDHSTRE1': stress_map
}

indexes = {
    'financial_insecurity': ['SDHBILLS', 'SDHUTILS', 'SDHTRNSP'],
    'food_insecurity': ['FOODSTMP', 'SDHFOOD1'],
    'emotional_distress': ['LSATISFY', 'EMTSUPRT', 'SDLONELY', 'SDHSTRE1'],
    'facing_unemployment': ['SDHEMPLY']
}

weights = {
    'financial_insecurity': {'SDHBILLS': 0.4, 'SDHUTILS': 0.3, 'SDHTRNSP': 0.3},
    'food_insecurity': {'FOODSTMP': 0.5, 'SDHFOOD1': 0.5},
    'emotional_distress': {'LSATISFY': 0.25, 'EMTSUPRT': 0.25, 'SDLONELY': 0.25, 'SDHSTRE1': 0.25},
    'facing_unemployment': {'SDHEMPLY': 1.0}
}

# --- 2. Helper functions ---

def map_column(series, mapping):
    return series.astype(str).map(mapping).astype(float)

def weighted_mean(row, cols, wts):
    vals = row[cols]
    mask = ~vals.isna()
    if mask.sum() == 0:
        return np.nan
    valid_vals = vals[mask]
    valid_cols = [col.replace('_mapped', '') for col in cols]
    valid_weights = np.array([wts.get(col, 0) for col in valid_cols])
    valid_weights = valid_weights[mask.values]
    if valid_weights.sum() == 0:
        return np.nan
    return np.dot(valid_vals, valid_weights) / valid_weights.sum()

# --- 3. Calculate and add final indexes only ---

for idx_name, cols in indexes.items():
    mapped_cols = []
    for col in cols:
        if col in data.columns:
            mapped_col = data[col].astype(str).map(mapping_dict[col]).astype(float)
            mapped_cols.append(mapped_col)
        else:
            # Create a Series of NaNs if column missing
            mapped_cols.append(pd.Series([np.nan]*len(data), index=data.index))
    mapped_df = pd.concat(mapped_cols, axis=1)
    mapped_df.columns = [c + '_mapped' for c in cols]
    
    # Calculate weighted mean ignoring NaNs
    data[idx_name.upper() + '_INDEX'] = mapped_df.apply(
        lambda row: weighted_mean(row, mapped_df.columns.tolist(), weights[idx_name]),
        axis=1
    )
    
    # Normalize index to 0-1 scale
    min_val = data[idx_name.upper() + '_INDEX'].min()
    max_val = data[idx_name.upper() + '_INDEX'].max()
    if max_val - min_val == 0:
        data[idx_name.upper() + '_INDEX_NORMALIZED'] = 0
    else:
        data[idx_name.upper() + '_INDEX_NORMALIZED'] = (
            data[idx_name.upper() + '_INDEX'] - min_val) / (max_val - min_val)

In [21]:
# Mapping dictionaries for each variable (mapping to numeric scores)
# You can adjust numeric scores to reflect meaningful order/impact
primins1_map = {
    1: 0.8,   # Employer/union plan
    2: 0.7,   # Private nongovernmental plan
    3: 0.6,   # Medicare
    4: 0.5,   # Medigap
    5: 0.4,   # Medicaid
    6: 0.4,   # CHIP
    7: 0.6,   # Military health care
    8: 0.5,   # Indian Health Service
    9: 0.5,   # State sponsored plan
    10: 0.4,  # Other government
    88: 0.0,  # No coverage
    77: np.nan, # Don't know
    99: np.nan, # Refused
    np.nan: np.nan
}

persdoc3_map = {
    1: 1.0,   # Yes, only one
    2: 1.0,   # More than one
    3: 0.0,   # No personal doctor
    7: np.nan, # Don't know
    9: np.nan, # Refused
    np.nan: np.nan
}

medcost1_map = {
    1: 0.0,   # Could not afford to see doctor
    2: 1.0,   # Could afford
    7: np.nan, # Don't know
    9: np.nan, # Refused
    np.nan: np.nan
}

checkup1_map = {
    1: 1.0,   # Within past year
    2: 0.8,   # Within past 2 years
    3: 0.5,   # Within past 5 years
    4: 0.2,   # 5 or more years ago
    7: np.nan, # Don't know
    8: 0.0,   # Never
    9: np.nan, # Refused
    np.nan: np.nan
}

# Map columns to numeric scores
data['PRIMINS1'] = data['PRIMINS1'].map(primins1_map)
data['PERSDOC3'] = data['PERSDOC3'].map(persdoc3_map)
data['MEDCOST1'] = data['MEDCOST1'].map(medcost1_map)
data['CHECKUP1'] = data['CHECKUP1'].map(checkup1_map)

# Weights for each variable
weights = {
    'PRIMINS1': 0.25,
    'PERSDOC3': 0.25,
    'MEDCOST1': 0.25,
    'CHECKUP1': 0.25
}

mapped_cols = list(weights.keys())

# Compute weighted index (ignore nan by using weighted mean of available variables)
def weighted_index(row):
    vals = row[mapped_cols]
    wts = np.array([weights[col] for col in mapped_cols])
    mask = ~vals.isna()
    if mask.sum() == 0:
        return np.nan
    return np.dot(vals[mask], wts[mask]) / wts[mask].sum()

data['HEALTHCAREACCESS_INDEX_WEIGHTED'] = data.apply(weighted_index, axis=1)

# Normalize index to 0-1 scale (min-max normalization ignoring NaNs)
min_val = data['HEALTHCAREACCESS_INDEX_WEIGHTED'].min()
max_val = data['HEALTHCAREACCESS_INDEX_WEIGHTED'].max()

data['HEALTHCAREACCESS_INDEX_NORMALIZED'] = (data['HEALTHCAREACCESS_INDEX_WEIGHTED'] - min_val) / (max_val - min_val)

In [22]:
def map_menthlth(val):
    try:
        val_str = str(val).strip()
        if val_str in ['77', '88', '99', '', 'nan', 'NaN', 'None']:
            return np.nan
        num_days = int(val_str)
        if 1 <= num_days <= 30:
            return num_days / 30  # normalize 1-30 to 0.033 to 1.0
        else:
            return np.nan
    except:
        return np.nan

# Apply to your DataFrame column 'MENTHLTH'
data['MENTHLTH_MAPPED'] = data['MENTHLTH'].apply(map_menthlth)

In [23]:
# Frequency mapping function (times/week)
def map_frequency(val, never_code=None):
    try:
        val = int(val)
        if 101 <= val <= 199:
            return val - 100
        elif 201 <= val <= 299:
            return (val - 200) / 4.345
        elif never_code is not None and val == never_code:
            return 0
        else:
            return np.nan
    except:
        return np.nan

# Map strength exercise with 888 as 'Never'
data['STRENGTH_WEEKLY'] = data['STRENGTH'].apply(lambda x: map_frequency(x, never_code=888))

# Map aerobic exercise frequencies
data['EXEROFT1_WEEKLY'] = data['EXEROFT1'].apply(map_frequency)
data['EXEROFT2_WEEKLY'] = data['EXEROFT2'].apply(map_frequency)

# Activity type mapping
activity_map = {
    1: 'walking',
    2: 'running_jogging',
    3: 'gardening',
    4: 'bicycling',
    5: 'aerobics_class',
    6: 'calisthenics',
    7: 'elliptical',
    8: 'household_activities',
    9: 'weight_lifting',
    10: 'yoga_pilates_tai_chi',
    11: 'other',
    77: np.nan,
    88: 'no_other_activity',
    99: np.nan,
}

data['EXRACT12_ACTIVITY'] = data['EXRACT12'].map(activity_map).astype('category')
data['EXRACT22_ACTIVITY'] = data['EXRACT22'].map(activity_map).astype('category')

# Aerobic activity indicator
aerobic_codes = {1, 2, 4, 5, 7}
data['EXRACT12_AEROBIC'] = data['EXRACT12'].apply(lambda x: int(x) in aerobic_codes if pd.notna(x) else False)
data['EXRACT22_AEROBIC'] = data['EXRACT22'].apply(lambda x: int(x) in aerobic_codes if pd.notna(x) else False)

# Minutes/hours mapping
def map_time_minutes(val):
    try:
        val = int(val)
        if 1 <= val <= 759:
            return val
        elif 800 <= val <= 959:
            return (val - 800) * 60
        else:
            return np.nan
    except:
        return np.nan

data['EXERHMM1_MIN'] = data['EXERHMM1'].apply(map_time_minutes)
data['EXERHMM2_MIN'] = data['EXERHMM2'].apply(map_time_minutes)

In [27]:
# Drop the individual ACE columns if not needed anymore
drop_columns = [
    'ACEDEPRS',
    'ACEDRINK',
    'ACEDRUGS',
    'ACEPRISN',
    'ACEDIVRC',
    'ACEPUNCH',
    'ACEHURT1',
    'ACESWEAR',
    'ACETOUCH',
    'ACETTHEM',
    'ACEHVSEX',
    'ACEADSAF',
    'ACEADNED',
    'ALCDAY4', 'AVEDRNK3', 'DRNK3GE5', 'MAXDRNKS', # Alcohol consumption variables
    'DEAF', 'BLIND', 'DECIDE', 'DIFFWALK', 'DIFFDRES', 'DIFFALON', # Disability variables
    '_METSTAT', '_URBSTAT', # Urban/rural status
    'SMOKE100', 'SMOKDAY2', 'ECIGNOW2', # Smoking index components
    'MARIJAN1', # Marijuana days
    'SOMALE', 'SOFEMALE', # Sexual orientation components
    'LSATISFY',
    'EMTSUPRT',
    'SDLONELY',
    'SDHEMPLY',
    'FOODSTMP',
    'SDHFOOD1',
    'SDHBILLS',
    'SDHUTILS',
    'SDHTRNSP',
    'SDHSTRE1', 
    'PRIMINS1',
    'PERSDOC3',
    'MEDCOST1',
    'CHECKUP1',
    'PHYSHLTH',
    'MENTHLTH',
    'POORHLTH',
    "EXERANY2",
    "EXRACT12",
    "EXEROFT1",
    "EXERHMM1",
    "EXRACT22",
    "EXEROFT2",
    "EXERHMM2",
    "STRENGTH"
]

data = data.drop(columns=drop_columns)

In [30]:
columns_to_drop = [
    "DEAF_label", "BLIND_label", "DECIDE_label", "DIFFWALK_label",
    "DIFFDRES_label", "DIFFALON_label",
    "DEAF_bin", "BLIND_bin", "DECIDE_bin", "DIFFWALK_bin",
    "DIFFDRES_bin", "DIFFALON_bin"
]

data.drop(columns=columns_to_drop, inplace=True)

In [31]:
data.head(5)

Unnamed: 0,SEXVAR,ADDEPEV3,MARITAL,EDUCA,RENTHOM1,VETERAN3,EMPLOY1,CHILDREN,INCOME3,PREGNANT,...,MENTHLTH_MAPPED,STRENGTH_WEEKLY,EXEROFT1_WEEKLY,EXEROFT2_WEEKLY,EXRACT12_ACTIVITY,EXRACT22_ACTIVITY,EXRACT12_AEROBIC,EXRACT22_AEROBIC,EXERHMM1_MIN,EXERHMM2_MIN
0,Female,No,Married,College 1 to 3 years (Some college/tech school),Own,No,Retired,,,,...,,0.0,,,,,False,False,,
1,Female,Yes,Divorced,College 1 to 3 years (Some college/tech school),Own,No,Retired,,,,...,,0.0,6.0,,walking,no_other_activity,True,False,30.0,
2,Female,No,Widowed,Grade 12 or GED (High school graduate),Rent,No,Retired,,$10k–15k,,...,,1.150748,1.150748,,walking,no_other_activity,True,False,15.0,
3,Female,Yes,Married,College 1 to 3 years (Some college/tech school),Own,No,Retired,,,,...,,0.0,3.0,,walking,no_other_activity,True,False,30.0,
4,Female,Yes,Widowed,College 1 to 3 years (Some college/tech school),Own,No,Unable to work,,$50k–75k,,...,,0.0,2.0,7.0,walking,household_activities,True,False,45.0,100.0


In [32]:
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 433323 entries, 0 to 433322
Data columns (total 44 columns):
 #   Column                                 Non-Null Count   Dtype   
---  ------                                 --------------   -----   
 0   SEXVAR                                 433323 non-null  object  
 1   ADDEPEV3                               430736 non-null  object  
 2   MARITAL                                429034 non-null  object  
 3   EDUCA                                  430998 non-null  object  
 4   RENTHOM1                               429233 non-null  object  
 5   VETERAN3                               430757 non-null  object  
 6   EMPLOY1                                425642 non-null  object  
 7   CHILDREN                               424237 non-null  object  
 8   INCOME3                                346700 non-null  object  
 9   PREGNANT                               75759 non-null   object  
 10  WEIGHT2                                0 non

In [33]:
data.to_csv("data/LLCP2023_processed.csv", index=False)