# Creating tables and storing data

## Step 1: Import and Set Up

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
# 1. Setup environment and imports
import sys

# Make the helper modules importable: register the resolved 'scripts' folder
# on sys.path, skipping the append when that entry is already present
scripts_path = Path("../scripts").resolve()
scripts_dir = str(scripts_path)
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

from db_utils import create_connection_from_script, save_to_sqlite, close_connection, run_query, create_nhanes_tables

# Project paths: processed-CSV folder, SQLite database file, and final-output folder
from config import (
    PROCESSED_DATA_DIR,
    DATABASE_PATH, 
    FINAL_DATA_DIR    
)


All required directories are ready.


## Step 2: Define Your Required Files and Paths

In [2]:

# Table name -> processed CSV file name, in the order the files are loaded
datasets_to_load = dict([
    ("demographics", "demo_l_processed.csv"),
    ("diet", "dr1tot_l_processed.csv"),
    ("physical_activity", "paq_l_processed.csv"),
    ("sleep", "slq_l_processed.csv"),
    ("health_insurance", "hiq_l_processed.csv"),
    ("bmi", "bmx_l_processed.csv"),
    ("bp", "bpxo_l_processed.csv"),
    ("total_cholestrol", "tchol_l_processed.csv"),
    ("glucose", "glu_l_processed.csv"),
    ("diabetes", "diq_l_processed.csv"),
    ("cardio_vascular", "mcq_l_processed.csv"),
])


## Step 3: Load Each CSV into a DataFrame

In [3]:
# One DataFrame per table, keyed by table name
dataframes = {}

for table_name, filename in datasets_to_load.items():
    csv_path = Path(PROCESSED_DATA_DIR) / filename

    # participant_id is forced to string dtype while reading the CSV
    table_df = pd.read_csv(csv_path, dtype={"participant_id": str})

    # Normalise IDs such as "12345.0" to "12345"; missing IDs stay NaN
    table_df['participant_id'] = table_df['participant_id'].map(
        lambda raw_id: np.nan if pd.isnull(raw_id) else str(int(float(raw_id)))
    )

    dataframes[table_name] = table_df
    print(f"Loaded '{table_name}' with shape {table_df.shape}")
  

Loaded 'demographics' with shape (6064, 12)
Loaded 'diet' with shape (5879, 13)
Loaded 'physical_activity' with shape (5894, 9)
Loaded 'sleep' with shape (8371, 5)
Loaded 'health_insurance' with shape (11871, 2)
Loaded 'bmi' with shape (8471, 3)
Loaded 'bp' with shape (7518, 4)
Loaded 'total_cholestrol' with shape (6890, 3)
Loaded 'glucose' with shape (3360, 8)
Loaded 'diabetes' with shape (11744, 5)
Loaded 'cardio_vascular' with shape (11744, 6)


###  Step 4: Connect to SQLite Database

In [4]:
# Open the database at DATABASE_PATH; `conn` is reused by the later cells
# and released in the last cell with close_connection(conn)
conn = create_connection_from_script(DATABASE_PATH)

Connected to database at: C:\Users\sahil\Documents\CodeYou-Project\DA_Projects\health_track\database\nhanes_2021_2023.db


###  Step 5: Create Your Tables

In [5]:
# Create the NHANES tables on the open connection.
# create_nhanes_tables is already imported in the notebook's first cell,
# so it is not re-imported here.
create_nhanes_tables(conn)

Created table 'demographics'
Created table 'health_insurance'
Created table 'sleep'
Created table 'physical_activity'
Created table 'diet'
Created table 'bmi'
Created table 'bp'
Created table 'total_cholestrol'
Created table 'glucose'
Created table 'diabetes'
Created table 'cardio_vascular'
All required NHANES tables created successfully.


### Check point 1: Check tables in the database

In [6]:
# Ask the sqlite_master catalog for the name of every table
tables_df = run_query(conn, "SELECT name FROM sqlite_master WHERE type='table';")

# Heading line followed by the result frame
print("Tables in the database:", tables_df, sep="\n")

Tables in the database:
                 name
0        demographics
1               sleep
2                diet
3   physical_activity
4    health_insurance
5                 bmi
6                  bp
7    total_cholestrol
8             glucose
9            diabetes
10    cardio_vascular


### Check point 2: Get the structure of each table

In [7]:
# Read the table names from the sqlite_master catalog
tables_df = run_query(conn, "SELECT name FROM sqlite_master WHERE type='table';")
table_names = list(tables_df['name'])

# Print the PRAGMA table_info result for every table in turn
for table in table_names:
    print(f"\nSchema for table: '{table}'")
    print(run_query(conn, f"PRAGMA table_info({table});"))


Schema for table: 'demographics'
    cid                     name     type  notnull dflt_value  pk
0     0           participant_id     TEXT        0       None   1
1     1                   gender     TEXT        0       None   0
2     2                      age  INTEGER        0       None   0
3     3           race_ethnicity     TEXT        0       None   0
4     4          education_level     TEXT        0       None   0
5     5     poverty_income_ratio     REAL        0       None   0
6     6  interview_sample_weight     REAL        0       None   0
7     7       exam_sample_weight     REAL        0       None   0
8     8                   strata     REAL        0       None   0
9     9                      psu     REAL        0       None   0
10   10             pir_category     TEXT        0       None   0

Schema for table: 'sleep'
   cid            name  type  notnull dflt_value  pk
0    0  participant_id  TEXT        0       None   1
1    1    sleep_avg_hr  REAL        0    

### Step 6: Save DataFrames to SQLite Tables

In [8]:

# Save data: write each loaded DataFrame into the table of the same name.
# The tuple fixes the write order; every call passes recreate=True.
tables_to_save = (
    "demographics",
    "sleep",
    "diet",
    "physical_activity",
    "health_insurance",
    "bmi",
    "bp",
    "total_cholestrol",
    "glucose",
    "diabetes",
    "cardio_vascular",
)

for table_name in tables_to_save:
    save_to_sqlite(df=dataframes[table_name], conn=conn, table_name=table_name, recreate=True)




Recreated table 'demographics'
Dropping columns not in table 'demographics': ['interview_sample_weight_missing']
Inserted 6064 rows into 'demographics'
Recreated table 'sleep'
Dropping columns not in table 'sleep': ['sleep_weekday_hr', 'sleep_weekend_hr']
Inserted 8371 rows into 'sleep'
Recreated table 'diet'
Dropping columns not in table 'diet': ['dietary_recall_complete', 'energy_kcal', 'energy_kcal_missing_flag', 'fiber_g', 'fiber_g_missing_flag', 'sat_fat_g', 'sat_fat_g_missing_flag', 'sugar_g', 'sugar_g_missing_flag']
Inserted 5879 rows into 'diet'
Recreated table 'physical_activity'
Dropping columns not in table 'physical_activity': ['duration_min', 'freq', 'freq_per_week', 'freq_unit', 'sedentary_min_per_day']
Inserted 5894 rows into 'physical_activity'
Recreated table 'health_insurance'
Inserted 11871 rows into 'health_insurance'
Recreated table 'bmi'
Inserted 8471 rows into 'bmi'
Recreated table 'bp'
Inserted 7518 rows into 'bp'
Recreated table 'total_cholestrol'
Inserted 6890

### Step 7: Check point

In [9]:
# Spot-check the load by fetching the first five demographics rows.
# run_query is already imported in the notebook's first cell,
# so it is not re-imported here.
sample = run_query(conn, "SELECT * FROM demographics LIMIT 5;")
print(sample)


  participant_id  gender  age      race_ethnicity            education_level  \
0         130378    Male   43  Non-Hispanic Asian  College graduate or above   
1         130379    Male   66  Non-Hispanic White  College graduate or above   
2         130380  Female   44      Other Hispanic            High school/GED   
3         130386    Male   34    Mexican American     Some college/AA degree   
4         130387  Female   68  Non-Hispanic White  College graduate or above   

   poverty_income_ratio  interview_sample_weight  exam_sample_weight  strata  \
0                  5.00             50055.450807        54374.463898   173.0   
1                  5.00             29087.450605        34084.721548   173.0   
2                  1.41             80062.674301        81196.277992   174.0   
3                  1.33             30995.282610        39988.452940   179.0   
4                  1.32             19896.970559        20776.254850   181.0   

   psu pir_category  
0  2.0    Very H

### Step 8: Prepare the data for Objective 1.1 analysis using a SQL query

In [10]:
# Build the Objective 1.1 dataset in SQL: start from demographics and LEFT JOIN
# health_insurance, sleep, physical_activity and diet on participant_id, so a
# demographics row is kept even when a joined table has no matching row.
join_query = """
SELECT
    d.participant_id,
    d.age,
    d.gender,
    d.race_ethnicity,
    d.education_level,
    d.poverty_income_ratio,
    d.pir_category,
    d.interview_sample_weight,    
    i.has_health_insurance,
    s.sleep_avg_hr,
    s.sleep_category,
    p.activity_level,
    p.total_weekly_min,
    di.diet_score,
    di.diet_category
FROM demographics d
LEFT JOIN health_insurance i ON d.participant_id = i.participant_id
LEFT JOIN sleep s ON d.participant_id = s.participant_id
LEFT JOIN physical_activity p ON d.participant_id = p.participant_id
LEFT JOIN diet di ON d.participant_id = di.participant_id
"""

# Run the join; the result is reused below for the CSV export and the full merge
ls_sei_df = run_query(conn, join_query) # ls_sei - lifestyle and socio-economic indicator

# Preview the first rows of the joined result
ls_sei_df.head()

Unnamed: 0,participant_id,age,gender,race_ethnicity,education_level,poverty_income_ratio,pir_category,interview_sample_weight,has_health_insurance,sleep_avg_hr,sleep_category,activity_level,total_weekly_min,diet_score,diet_category
0,130378,43,Male,Non-Hispanic Asian,College graduate or above,5.0,Very High,50055.450807,Yes,9.357143,Long Sleep,Low active,135.0,1.0,Unhealthy
1,130379,66,Male,Non-Hispanic White,College graduate or above,5.0,Very High,29087.450605,Yes,9.0,Normal Sleep,Moderately active,180.0,3.0,Healthy
2,130380,44,Female,Other Hispanic,High school/GED,1.41,Mid,80062.674301,Yes,8.285714,Normal Sleep,Low active,20.0,1.0,Unhealthy
3,130386,34,Male,Mexican American,Some college/AA degree,1.33,Mid,30995.28261,Yes,7.642857,Normal Sleep,Low active,30.0,3.0,Healthy
4,130387,68,Female,Non-Hispanic White,College graduate or above,1.32,Mid,19896.970559,Yes,3.571429,Short Sleep,,,1.0,Unhealthy


### Step 9: Check and Save the lifestyle and socio-economic Merged Data

In [11]:
# Report the dimensions of the joined lifestyle / socio-economic frame
print(ls_sei_df.shape)

# Build the destination once so the write and the existence check share the
# same path. Path(...) matches how PROCESSED_DATA_DIR is handled when loading
# and also accepts a plain-string FINAL_DATA_DIR.
output_path = Path(FINAL_DATA_DIR) / "lifestyle_socio_economic.csv"
ls_sei_df.to_csv(output_path, index=False)

# Confirm the file is present on disk after the write
if output_path.exists():
    print(f"File saved successfully at: {output_path}")
else:
    print(f"File was not saved at: {output_path}")

(6064, 15)
File saved successfully at: C:\Users\sahil\Documents\CodeYou-Project\DA_Projects\health_track\data\final\lifestyle_socio_economic.csv


### Step 9: Prepare the full merged dataset for further analysis of the objectives

In [12]:
# Tables still to be attached to the lifestyle / socio-economic frame
remaining_tables = [
    "bmi",
    "bp",
    "total_cholestrol",
    "glucose",
    "diabetes",
    "cardio_vascular"
]
nhanes_df = ls_sei_df.copy()

# A left merge keeps every left-hand row, so the row count can only change
# when a participant_id matches several rows in the table being merged.
# Remember the starting size to detect that after every merge.
expected_rows = len(nhanes_df)

for table in remaining_tables:
    df = pd.read_sql_query(f"SELECT * FROM {table}", conn)

    if 'participant_id' not in df.columns:
        raise KeyError(f"'participant_id' not found in table '{table}'")

    nhanes_df = pd.merge(nhanes_df, df, on="participant_id", how="left")

    # Fail loudly instead of silently carrying duplicated participants forward
    if len(nhanes_df) != expected_rows:
        raise ValueError(
            f"Merging table '{table}' changed the row count from "
            f"{expected_rows} to {len(nhanes_df)}; check for duplicate participant_id values"
        )

# Merge exam_sample_weight from demographics table
query = "SELECT participant_id, exam_sample_weight FROM demographics"
demo_df = run_query(conn, query)

# Safety check
if 'participant_id' not in demo_df.columns:
    raise KeyError("'participant_id' not found in 'demographics' table")

# Merge with the main NHANES dataframe
nhanes_df = pd.merge(nhanes_df, demo_df, on="participant_id", how="left")

# Same row-count invariant for the final merge
if len(nhanes_df) != expected_rows:
    raise ValueError(
        f"Merging table 'demographics' changed the row count from "
        f"{expected_rows} to {len(nhanes_df)}; check for duplicate participant_id values"
    )

nhanes_df.head()


Unnamed: 0,participant_id,age,gender,race_ethnicity,education_level,poverty_income_ratio,pir_category,interview_sample_weight,has_health_insurance,sleep_avg_hr,...,diabetes_dx,diabetes_meds,diabetes_meds_cat,diabetes_status,congestive_heart_failure,coronary_heart_disease,angina,heart_attack,any_cvd,exam_sample_weight
0,130378,43,Male,Non-Hispanic Asian,College graduate or above,5.0,Very High,50055.450807,Yes,9.357143,...,0.0,,Unknown,0.0,0.0,0.0,0.0,0.0,0,54374.463898
1,130379,66,Male,Non-Hispanic White,College graduate or above,5.0,Very High,29087.450605,Yes,9.0,...,0.0,,Unknown,0.0,0.0,0.0,0.0,0.0,0,34084.721548
2,130380,44,Female,Other Hispanic,High school/GED,1.41,Mid,80062.674301,Yes,8.285714,...,1.0,1.0,Taking meds,1.0,0.0,0.0,0.0,0.0,0,81196.277992
3,130386,34,Male,Mexican American,Some college/AA degree,1.33,Mid,30995.28261,Yes,7.642857,...,0.0,,Unknown,0.0,0.0,0.0,0.0,0.0,0,39988.45294
4,130387,68,Female,Non-Hispanic White,College graduate or above,1.32,Mid,19896.970559,Yes,3.571429,...,0.0,,Unknown,0.0,0.0,0.0,0.0,0.0,0,20776.25485


### Step 10: Check and Save the full NHANES Data

In [13]:
# Report the dimensions of the fully merged NHANES frame
print(nhanes_df.shape)

# Build the destination once so the write and the existence check share the
# same path. Path(...) matches how PROCESSED_DATA_DIR is handled when loading
# and also accepts a plain-string FINAL_DATA_DIR.
output_path = Path(FINAL_DATA_DIR) / "final_merged_nhanes_dataset.csv"
nhanes_df.to_csv(output_path, index=False)

# Confirm the file is present on disk after the write
if output_path.exists():
    print(f"File saved successfully at: {output_path}")
else:
    print(f"File was not saved at: {output_path}")

(6064, 38)
File saved successfully at: C:\Users\sahil\Documents\CodeYou-Project\DA_Projects\health_track\data\final\final_merged_nhanes_dataset.csv


### Step 11: Close the database connection

In [14]:
# Release the connection that was opened with create_connection_from_script
close_connection(conn)

Database connection closed.
