# Data Exploration

Quick overview of the dataset before detailed analysis.

In [7]:
import pandas as pd
import sqlite3

# Load data from database
# The whole 'raw_dataset' table is read into memory as `df`.
conn = sqlite3.connect('../databases/nhanes_1st.db')
try:
    df = pd.read_sql_query('SELECT * FROM raw_dataset', conn)
finally:
    # Close the connection even if the query fails (e.g. missing table),
    # so a failed run does not leave an open handle on the database file.
    conn.close()

print(f"Dataset loaded: {df.shape[0]:,} rows × {df.shape[1]} columns")

Dataset loaded: 56,893 rows × 29 columns


In [8]:
# NOTE: Column renaming should be moved to ETL pipeline or data cleaning later
#
# Renaming conventions:
# - lowercase with underscores (snake_case)
# - append abbreviated units where relevant (e.g., _cm, _bpm, _g_dl)
# - prefer full words over abbreviations where possible
#
# NOTE(review): a few target names depart from these conventions; they are
# kept as-is here because other code may already rely on them:
# - unit suffixes such as _fL, _U_L and _mmol_L contain uppercase letters
# - the *_Label columns look like 0/1 flags in the head() output, yet some
#   targets carry a unit suffix (_mg_dl) -- confirm the intended naming
# - ACR_Log / ALT_Log: the source names suggest log-transformed values, but
#   the new names read like raw units (mg/g, U/L) -- TODO confirm

# Source column -> readable column, organised by thematic group.
# Group order and within-group order define the order of `column_mapping`.
_column_groups = {
    'demographics': {
        'RIDAGEYR': 'age',
        'RIAGENDR': 'gender',
        'RIDRETH3': 'ethnicity',
        'INDFMPIR': 'income_ratio',
    },
    'physical_measurements': {
        'BMXBMI': 'body_mass_index',
        'BMXHT': 'height_cm',
        'Pulse': 'heart_rate_bpm',
    },
    'blood_cell_counts': {
        'LBXWBCSI': 'white_blood_cells_count',
        'LBXPLTSI': 'platelets_count',
        'LBXHGB': 'hemoglobin_g_dl',
        'LBXMCVSI': 'mean_corpuscular_volume_fL',
    },
    'kidney_markers': {
        'LBXSCR': 'creatinine_mg_dl',
        'LBXSUA': 'uric_acid_mg_dl',
    },
    'liver_markers': {
        'LBXSASSI': 'liver_ast_U_L',
        'LBXSTB': 'bilirubin_mg_dl',
        'LBXSGTSI': 'liver_ggt_U_L',
    },
    'electrolytes': {
        'LBXSNASI': 'sodium_mmol_L',
        'LBXSKSI': 'potassium_mmol_L',
    },
    'lipids': {
        'LBXTC': 'cholesterol_mg_dl',
    },
    'lifestyle': {
        'Alcohol_Drinks_Per_Week': 'alcohol_drinks_per_week',
        'SMQ040': 'smoking_status',
    },
    'target_cardiovascular': {
        'Cardiovascular_target': 'has_cardiovascular_disease',
    },
    # Metabolic syndrome target (5 components)
    'target_metabolic_syndrome': {
        'Waist_Label': 'high_waist_circumference',
        'Triglycerides_Label': 'high_triglycerides_mg_dl',
        'HDL_Label': 'low_hdl_mg_dl',
        'BP_Label': 'high_blood_pressure',
        'Glucose_Label': 'high_glucose_mg_dl',
    },
    'target_kidney_health': {
        'ACR_Log': 'kidney_acr_mg_g',
    },
    'target_liver_health': {
        'ALT_Log': 'liver_alt_U_L',
    },
}

# Flatten the groups into the single lookup consumed by DataFrame.rename.
column_mapping = {
    raw_name: clean_name
    for group in _column_groups.values()
    for raw_name, clean_name in group.items()
}

# DataFrame.rename ignores mapping keys that are not present in the frame, so
# record which source names are missing before renaming. That happens when the
# 'raw_dataset' table was already overwritten with renamed columns by an
# earlier run of the save cell.
unmatched_sources = [raw for raw in column_mapping if raw not in df.columns]

# Rebind instead of inplace=True: the cell yields a new frame from its input
# rather than mutating the object produced by the load cell.
df = df.rename(columns=column_mapping)
print("✓ Columns renamed")
if unmatched_sources:
    # Only printed when something did not match, so a run against the
    # original raw table produces the same output as before.
    print(
        f"  note: {len(unmatched_sources)} of {len(column_mapping)} source "
        "columns were not found (table may already contain renamed columns)"
    )

✓ Columns renamed


In [9]:
# Overview of the renamed frame in three parts: schema, summary statistics,
# and the first rows. Each part is introduced by an 80-character banner.
print("="*80)
print("DATASET INFO")
print("="*80)
# info() writes column names, non-null counts and dtypes to stdout.
df.info()

print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)
# Rich display instead of print(): a plain-text dump of a wide describe()
# table gets wrapped and elided. Transposed so each dataset column becomes a
# row and every column's statistics stay visible.
display(df.describe().T)

print("\n" + "="*80)
print("FIRST ROWS")
print("="*80)
display(df.head())

DATASET INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56893 entries, 0 to 56892
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age                         56893 non-null  float64
 1   gender                      56893 non-null  float64
 2   ethnicity                   56893 non-null  float64
 3   income_ratio                49583 non-null  float64
 4   body_mass_index             47424 non-null  float64
 5   height_cm                   47508 non-null  float64
 6   heart_rate_bpm              37178 non-null  float64
 7   white_blood_cells_count     43938 non-null  float64
 8   platelets_count             43938 non-null  float64
 9   hemoglobin_g_dl             43938 non-null  float64
 10  mean_corpuscular_volume_fL  43938 non-null  float64
 11  creatinine_mg_dl            34512 non-null  float64
 12  liver_ast_U_L               34432 non-null  float64
 13  bilirubin_mg_dl   

Unnamed: 0,age,gender,ethnicity,income_ratio,body_mass_index,height_cm,heart_rate_bpm,white_blood_cells_count,platelets_count,hemoglobin_g_dl,...,alcohol_drinks_per_week,smoking_status,has_cardiovascular_disease,high_waist_circumference,high_triglycerides_mg_dl,low_hdl_mg_dl,high_blood_pressure,high_glucose_mg_dl,kidney_acr_mg_g,liver_alt_U_L
0,69.0,1.0,4.0,0.84,26.7,171.3,86.0,4.7,204.0,15.2,...,0.019231,3.0,1.0,0.0,,0.0,0.0,,2.487041,2.772589
1,54.0,1.0,3.0,1.78,28.6,176.8,74.0,12.6,314.0,11.9,...,28.0,2.0,0.0,1.0,,0.0,1.0,,5.726848,3.367296
2,72.0,1.0,3.0,4.51,28.9,175.3,68.0,7.2,237.0,17.2,...,,3.0,0.0,1.0,0.0,0.0,1.0,1.0,2.445037,2.772589
3,9.0,1.0,3.0,2.52,17.1,137.3,64.0,7.8,240.0,12.9,...,,,,0.0,,0.0,0.0,,3.093432,
4,73.0,2.0,3.0,5.0,19.7,162.4,92.0,6.6,300.0,14.5,...,,,0.0,,0.0,0.0,1.0,1.0,5.161749,3.332205


In [10]:
# Save renamed columns back to database
#
# NOTE(review): if_exists='replace' overwrites the same 'raw_dataset' table in
# the same database file that the load cell reads, so after this cell runs the
# table no longer contains the original column names and the notebook's next
# run starts from already-renamed data. Consider writing to a separate table;
# the target is left unchanged here because other code may read 'raw_dataset'.
conn = sqlite3.connect('../databases/nhanes_1st.db')
try:
    # index=False: the RangeIndex is not written as an extra column.
    df.to_sql('raw_dataset', conn, if_exists='replace', index=False)
finally:
    # Close the connection even if the write fails.
    conn.close()

print("✓ Renamed dataset saved to database")

✓ Renamed dataset saved to database
