# Loading and Analysing NHANES Data

In [187]:
import pandas as pd
import os
from functools import reduce

## Demographic Data

In [4]:
# Read the DEMO_J demographics file with pandas' SAS reader and preview the first rows
demo_xpt_path = '../Dataset/Demographic_XPT/DEMO_J.XPT'
demo_df = pd.read_sas(demo_xpt_path)
demo_df.head()

Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGM,...,DMDHREDZ,DMDHRMAZ,DMDHSEDZ,WTINT2YR,WTMEC2YR,SDMVPSU,SDMVSTRA,INDHHIN2,INDFMIN2,INDFMPIR
0,93703.0,10.0,2.0,2.0,2.0,,5.0,6.0,2.0,27.0,...,3.0,1.0,3.0,9246.491865,8539.731348,2.0,145.0,15.0,15.0,5.0
1,93704.0,10.0,2.0,1.0,2.0,,3.0,3.0,1.0,33.0,...,3.0,1.0,2.0,37338.768343,42566.61475,1.0,143.0,15.0,15.0,5.0
2,93705.0,10.0,2.0,2.0,66.0,,4.0,4.0,2.0,,...,1.0,2.0,,8614.571172,8338.419786,2.0,145.0,3.0,3.0,0.82
3,93706.0,10.0,2.0,1.0,18.0,,5.0,6.0,2.0,222.0,...,3.0,1.0,2.0,8548.632619,8723.439814,2.0,134.0,,,
4,93707.0,10.0,2.0,1.0,13.0,,5.0,7.0,2.0,158.0,...,2.0,1.0,3.0,6769.344567,7064.60973,1.0,138.0,10.0,10.0,1.88


<a href="https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.htm#Codebook" target="_blank" rel="noopener">Demographic Variables Reference</a>

In [5]:
# We only require the Age and Gender from demographic features
# SEQN     : Respondent sequence number
# RIAGENDR : Gender (coded 1 = Male, 2 = Female in the DEMO_J codebook)
# RIDAGEYR : Age in years

demo_columns = ['SEQN', 'RIAGENDR', 'RIDAGEYR']

# .copy() makes the subset an independent frame, so later column assignments
# on demo_df do not trigger pandas' SettingWithCopyWarning.
demo_df = demo_df[demo_columns].copy()
demo_df.head()

Unnamed: 0,SEQN,RIAGENDR,RIDAGEYR
0,93703.0,2.0,2.0
1,93704.0,1.0,2.0
2,93705.0,2.0,66.0
3,93706.0,1.0,18.0
4,93707.0,1.0,13.0


In [6]:
# (rows, columns) of demo_df
demo_df.shape

(9254, 3)

In [7]:
# Summary statistics for every column of demo_df
# NOTE(review): the minimum of RIDAGEYR displays as 5.397605e-79 rather than 0 -
# presumably a float-decoding artifact of reading the XPT file; confirm and
# clean such values before analysis.
demo_df.describe()

Unnamed: 0,SEQN,RIAGENDR,RIDAGEYR
count,9254.0,9254.0,9254.0
mean,98329.5,1.507564,34.33423
std,2671.544029,0.49997,25.50028
min,93703.0,1.0,5.397605e-79
25%,96016.25,1.0,11.0
50%,98329.5,2.0,31.0
75%,100642.75,2.0,58.0
max,102956.0,2.0,80.0


In [8]:
# Number of missing values in each column of demo_df
demo_df.isnull().sum()

SEQN        0
RIAGENDR    0
RIDAGEYR    0
dtype: int64

## Laboratory Data

<a href="https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Laboratory&CycleBeginYear=2017" target="_blank" rel="noopener">Laboratory Variables Reference</a>

In [42]:
# Reading the laboratory data features
lab_files_directory = "../Dataset/Lab_XPT"

# Eliminating unwanted features (like urine test data)
unw = ['ALB_CR_J.XPT','UTAS_J.XPT','UAS_J.XPT','UCM_J.XPT','UCOT_J.XPT','FASTQX_J.XPT',
       'FR_J.XPT','SSFR_J.XPT','UIO_J.XPT','UHG_J.XPT','UM_J.XPT','UNI_J.XPT','OPD_J.XPT',
       'PERNT_J.XPT','PFAS_J.XPT','SSPFAS_J.XPT','PHTHTE_J.XPT','UCFLOW_J.XPT','UCPREG_J.XPT',
       'UVOC_J.XPT','SSUVOC_J.XPT','VOCWB_J.XPT','CMV_J.XPT','ETHOX_J.XPT','FOLATE_J.XPT','INS_J.XPT',
       'FOLFMS_J.XPT','GLU_J.XPT','IHGEM_J.XPT','HEPC_J.XPT','TRIGLY_J.XPT','HEPE_J.XPT', 'HIV_J.XPT']

# Keep only .XPT files that are not excluded, so the printed list is exactly
# the set of files that gets loaded below (stray non-XPT entries in the
# directory are dropped before printing rather than silently skipped later).
lab_files = [
    file for file in sorted(os.listdir(lab_files_directory))
    if file.endswith(".XPT") and file not in unw
]
print("Selected Features: ",lab_files)

# Loading the files: one DataFrame per file, keyed by the file name without
# its extension.
lab_dfs = {
    os.path.splitext(file)[0]: pd.read_sas(os.path.join(lab_files_directory, file))
    for file in lab_files
}

# Each DataFrame is accessed by its file stem,
# e.g. lab_dfs['BIOPRO_J'] is the DataFrame read from 'BIOPRO_J.XPT'

Selected Features:  ['BIOPRO_J.XPT', 'CBC_J.XPT', 'COT_J.XPT', 'CRCO_J.XPT', 'FERTIN_J.XPT', 'FETIB_J.XPT', 'GHB_J.XPT', 'HDL_J.XPT', 'HEPA_J.XPT', 'HEPBD_J.XPT', 'HEPB_S_J.XPT', 'HSCRP_J.XPT', 'PBCD_J.XPT', 'TCHOL_J.XPT', 'TFR_J.XPT', 'VIC_J.XPT', 'VID_J.XPT', 'VITAEC_J.XPT']


### Loading and Selecting Features from each Data File

In [43]:
# Selecting useful attributes from each df

# 1) Standard Biochemistry Profile - BIOPRO_J
# Columns to keep; each entry is annotated with its analyte and unit.
f1 = [
    "SEQN",      # Respondent sequence number
    "LBXSATSI",  # Alanine Aminotransferase (ALT) (U/L)
    "LBXSAL",    # Albumin, refrigerated serum (g/dL)
    "LBXSAPSI",  # Alkaline Phosphatase (ALP) (IU/L)
    "LBXSASSI",  # Aspartate Aminotransferase (AST) (U/L)
    "LBXSC3SI",  # Bicarbonate (mmol/L)
    "LBXSBU",    # Blood Urea Nitrogen (BUN) (mg/dL)
    "LBXSCLSI",  # Chloride (mmol/L)
    "LBXSCK",    # Creatine Phosphokinase (CPK) (IU/L)
    "LBXSCR",    # Creatinine, refrigerated serum (mg/dL)
    "LBXSGB",    # Globulin (g/dL)
    "LBXSGTSI",  # Gamma Glutamyl Transferase (GGT) (IU/L)
    "LBXSIR",    # Iron, refrigerated serum (ug/dL)
    "LBXSLDSI",  # Lactate Dehydrogenase (LDH) (IU/L)
    "LBXSOSSI",  # Osmolality (mmol/Kg)
    "LBXSPH",    # Phosphorus (mg/dL)
    "LBXSKSI",   # Potassium (mmol/L)
    "LBXSNASI",  # Sodium (mmol/L)
    "LBXSTB",    # Total Bilirubin (mg/dL)
    "LBXSCA",    # Total Calcium (mg/dL)
    "LBXSCH",    # Cholesterol, refrigerated serum (mg/dL)
    "LBXSTP",    # Total Protein (g/dL)
    "LBXSTR",    # Triglycerides, refrigerated serum (mg/dL)
    "LBXSUA",    # Uric acid (mg/dL)
]

# Replace the stored frame with its column subset and preview it
biopro_selected = lab_dfs['BIOPRO_J'][f1]
lab_dfs['BIOPRO_J'] = biopro_selected
biopro_selected.head()

Unnamed: 0,SEQN,LBXSATSI,LBXSAL,LBXSAPSI,LBXSASSI,LBXSC3SI,LBXSBU,LBXSCLSI,LBXSCK,LBXSCR,...,LBXSOSSI,LBXSPH,LBXSKSI,LBXSNASI,LBXSTB,LBXSCA,LBXSCH,LBXSTP,LBXSTR,LBXSUA
0,93705.0,16.0,4.4,74.0,20.0,31.0,11.0,100.0,166.0,0.92,...,280.0,4.0,4.0,141.0,0.6,9.2,157.0,7.3,95.0,5.8
1,93706.0,10.0,4.4,79.0,14.0,28.0,12.0,104.0,114.0,0.81,...,286.0,4.0,4.4,144.0,0.7,9.6,149.0,7.1,92.0,8.0
2,93707.0,13.0,5.2,238.0,24.0,22.0,17.0,97.0,342.0,0.64,...,276.0,4.3,3.3,137.0,0.7,10.1,199.0,8.0,110.0,5.5
3,93708.0,19.0,3.9,66.0,21.0,27.0,16.0,104.0,347.0,0.58,...,289.0,3.3,4.4,144.0,0.5,9.5,210.0,7.1,72.0,4.5
4,93709.0,15.0,3.7,86.0,17.0,24.0,20.0,100.0,63.0,1.32,...,284.0,3.5,4.1,141.0,0.3,9.9,180.0,7.0,132.0,6.2


In [44]:
# Summary statistics for every column of the BIOPRO_J frame
lab_dfs['BIOPRO_J'].describe()

Unnamed: 0,SEQN,LBXSATSI,LBXSAL,LBXSAPSI,LBXSASSI,LBXSC3SI,LBXSBU,LBXSCLSI,LBXSCK,LBXSCR,...,LBXSOSSI,LBXSPH,LBXSKSI,LBXSNASI,LBXSTB,LBXSCA,LBXSCH,LBXSTP,LBXSTR,LBXSUA
count,6401.0,5902.0,5905.0,5903.0,5882.0,5901.0,5901.0,5904.0,5899.0,5903.0,...,5901.0,5901.0,5899.0,5904.0,5903.0,5901.0,5903.0,5901.0,5901.0,5901.0
mean,98293.35432,21.420197,4.078645,90.616127,21.760456,25.541434,14.615489,101.03269,174.86913,0.875231,...,280.821217,3.664735,4.093965,140.324018,0.460495,9.319793,183.241741,7.165819,137.437553,5.402406
std,2685.012056,16.949439,0.345033,52.392677,12.952607,2.481768,5.984281,2.786815,404.328956,0.446905,...,5.826916,0.594482,0.364047,2.752762,0.280342,0.372979,41.287669,0.435356,109.130574,1.480927
min,93705.0,2.0,2.1,16.0,6.0,16.0,2.0,84.0,16.0,0.25,...,246.0,1.9,2.8,121.0,0.1,6.4,77.0,5.3,25.0,0.8
25%,95962.0,12.0,3.9,64.0,16.0,24.0,11.0,99.0,77.0,0.68,...,277.0,3.3,3.9,138.0,0.3,9.1,153.0,6.9,77.0,4.3
50%,98284.0,17.0,4.1,79.0,19.0,26.0,14.0,101.0,115.0,0.82,...,281.0,3.6,4.1,140.0,0.4,9.3,180.0,7.2,111.0,5.3
75%,100623.0,25.0,4.3,98.0,23.0,27.0,17.0,103.0,181.0,0.98,...,284.0,4.0,4.3,142.0,0.6,9.6,208.0,7.4,162.0,6.3
max,102956.0,420.0,5.4,638.0,272.0,38.0,79.0,117.0,16959.0,12.74,...,314.0,9.6,6.6,151.0,3.7,11.7,438.0,10.0,2923.0,15.1


In [45]:
# 2) Complete Blood Count with 5-Part Differential - CBC_J
# No column list is applied in this cell: the frame is previewed as loaded.

# LBXWBCSI - White blood cell count (1000 cells/uL)
# LBXLYPCT - Lymphocyte percent (%)
# LBXMOPCT - Monocyte percent (%)
# LBXNEPCT - Segmented neutrophils percent (%)
# LBXEOPCT - Eosinophils percent (%)
# LBXBAPCT - Basophils percent (%)
# LBDLYMNO - Lymphocyte number (1000 cells/uL)
# LBDMONO - Monocyte number (1000 cells/uL)
# LBDNENO - Segmented neutrophils num (1000 cell/uL)
# LBDEONO - Eosinophils number (1000 cells/uL)
# LBDBANO - Basophils number (1000 cells/uL)
# LBXRBCSI - Red blood cell count (million cells/uL)
# LBXHGB - Hemoglobin (g/dL)
# LBXHCT - Hematocrit (%)
# LBXMCVSI - Mean cell volume (fL)
# LBXMCHSI - Mean cell hemoglobin (pg)
# LBXMC - Mean Cell Hgb Conc. (g/dL)
# LBXRDW - Red cell distribution width (%)
# LBXPLTSI - Platelet count (1000 cells/uL)
# LBXMPSI - Mean platelet volume (fL)
# LBXNRBC - Nucleated red blood cells

# NOTE(review): the COT_J section also keeps a column named LBXHCT
# (Hydroxycotinine); if these frames are merged on SEQN the two columns will
# collide - confirm how the merge handles this.
lab_dfs['CBC_J'].head()

Unnamed: 0,SEQN,LBXWBCSI,LBXLYPCT,LBXMOPCT,LBXNEPCT,LBXEOPCT,LBXBAPCT,LBDLYMNO,LBDMONO,LBDNENO,...,LBXRBCSI,LBXHGB,LBXHCT,LBXMCVSI,LBXMCHSI,LBXMC,LBXRDW,LBXPLTSI,LBXMPSI,LBXNRBC
0,93703.0,,,,,,,,,,...,,,,,,,,,,
1,93704.0,7.4,47.8,8.0,42.6,1.0,0.7,3.5,0.6,3.2,...,4.25,13.1,37.0,87.0,30.8,35.4,12.8,239.0,8.6,0.1
2,93705.0,8.6,40.0,7.4,48.8,2.9,1.0,3.4,0.6,4.2,...,5.48,11.9,36.7,67.0,21.7,32.4,15.6,309.0,7.9,5.397605e-79
3,93706.0,6.1,24.6,9.1,61.4,4.3,0.8,1.5,0.6,3.7,...,5.24,16.3,47.0,89.7,31.1,34.7,12.2,233.0,6.6,5.397605e-79
4,93707.0,11.2,37.1,6.2,54.7,1.6,0.5,4.2,0.7,6.1,...,5.02,14.5,42.1,83.9,28.9,34.4,13.6,348.0,8.5,0.2


In [46]:
# 3) Cotinine and Hydroxycotinine - COT_J
# NOTE(review): LBXHCT is also the Hematocrit column name in CBC_J; confirm
# the two are disambiguated if the frames are merged.
f3 = [
    'SEQN',
    'LBXCOT',  # Cotinine, Serum (ng/mL)
    'LBXHCT',  # Hydroxycotinine, Serum (ng/mL)
]

# Replace the stored frame with its column subset and preview it
cot_selected = lab_dfs['COT_J'][f3]
lab_dfs['COT_J'] = cot_selected
cot_selected.head()

Unnamed: 0,SEQN,LBXCOT,LBXHCT
0,93705.0,0.028,0.02
1,93706.0,0.138,0.024
2,93707.0,0.555,0.07
3,93708.0,0.011,0.011
4,93709.0,54.3,0.628


In [47]:
# 4) Chromium & Cobalt (CRCO_J)
f4 = [
    "SEQN",      # Respondent sequence number
    "LBDBCRSI",  # Chromium (nmol/L)
    "LBDBCOSI",  # Cobalt (nmol/L)
]

# Replace the stored frame with its column subset and preview it
crco_selected = lab_dfs['CRCO_J'][f4]
lab_dfs['CRCO_J'] = crco_selected
crco_selected.head()

Unnamed: 0,SEQN,LBDBCRSI,LBDBCOSI
0,93705.0,5.58,3.39
1,93708.0,5.58,2.72
2,93709.0,13.46,5.77
3,93711.0,5.58,4.41
4,93713.0,5.58,2.04


In [48]:
# 5) Ferritin (FERTIN_J)
f5 = [
    "SEQN",    # Respondent sequence number
    "LBXFER",  # Ferritin (ng/mL)
]

# Replace the stored frame with its column subset and preview it
fertin_selected = lab_dfs['FERTIN_J'][f5]
lab_dfs['FERTIN_J'] = fertin_selected
fertin_selected.head()

Unnamed: 0,SEQN,LBXFER
0,93703.0,
1,93704.0,36.6
2,93705.0,28.7
3,93706.0,284.0
4,93707.0,49.3


In [49]:
# 6) Iron Status - Serum (FETIB_J)

# SEQN - Respondent sequence number
# LBDIRNSI - Iron frozen, Serum (umol/L)
# LBDUIBSI - UIBC, Serum (umol/L)
# LBDTIBSI - Tot Iron Binding Capacity TIBC (umol/L)
# LBDPCT - Transferrin Saturation (%)

# Named f6 to match the section number; reusing f5 here would overwrite the
# Ferritin (FERTIN_J) column list from section 5.
f6 = ["SEQN", "LBDIRNSI", "LBDUIBSI", "LBDTIBSI", "LBDPCT"]

lab_dfs['FETIB_J'] = lab_dfs['FETIB_J'][f6]
lab_dfs['FETIB_J'].head()

Unnamed: 0,SEQN,LBDIRNSI,LBDUIBSI,LBDTIBSI,LBDPCT
0,93705.0,16.5,41.73,58.21,28.0
1,93706.0,29.4,27.76,57.13,51.0
2,93707.0,16.3,60.89,77.19,21.0
3,93708.0,16.1,34.92,51.04,32.0
4,93709.0,11.3,42.8,54.09,21.0


In [50]:
# 7) Glycohemoglobin (GHB_J)
# No column list is applied in this cell: the frame is previewed as loaded.

# SEQN - Respondent sequence number
# LBXGH - Glycohemoglobin (%)

lab_dfs['GHB_J'].head()

Unnamed: 0,SEQN,LBXGH
0,93705.0,6.2
1,93706.0,5.2
2,93707.0,5.6
3,93708.0,6.2
4,93709.0,6.3


In [51]:
# 8) Cholesterol - High - Density Lipoprotein (HDL) (HDL_J)
f8 = [
    "SEQN",      # Respondent Sequence Number
    "LBDHDDSI",  # Direct HDL-Cholesterol (mmol/L)
]

# Replace the stored frame with its column subset and preview it
hdl_selected = lab_dfs['HDL_J'][f8]
lab_dfs['HDL_J'] = hdl_selected
hdl_selected.head()

Unnamed: 0,SEQN,LBDHDDSI
0,93705.0,1.55
1,93706.0,1.22
2,93707.0,1.76
3,93708.0,2.28
4,93709.0,1.68


In [52]:
# 9) High-Sensitivity C-Reactive Protein (HSCRP_J)

# SEQN - Respondent Sequence Number
# LBXHSCRP - HS C-Reactive Protein (mg/L)

# Named f9 so the list number matches this section (section 7 is Glycohemoglobin).
f9 = ["SEQN", "LBXHSCRP"]

lab_dfs['HSCRP_J'] = lab_dfs['HSCRP_J'][f9]
lab_dfs['HSCRP_J'].head()

Unnamed: 0,SEQN,LBXHSCRP
0,93703.0,
1,93704.0,0.29
2,93705.0,2.72
3,93706.0,0.74
4,93707.0,0.32


In [53]:
# 10) Lead, Cadmium, Total Mercury, Selenium, & Manganese - Blood (PBCD_J)
f10 = [
    "SEQN",      # Respondent sequence number
    "LBDBPBSI",  # Blood lead (umol/L)
    "LBDBCDSI",  # Blood cadmium (nmol/L)
    "LBDTHGSI",  # Blood mercury, total (nmol/L)
    "LBDBSESI",  # Blood selenium (umol/L)
    "LBDBMNSI",  # Blood manganese (nmol/L)
]

# Replace the stored frame with its column subset and preview it
pbcd_selected = lab_dfs['PBCD_J'][f10]
lab_dfs['PBCD_J'] = pbcd_selected
pbcd_selected.head()

Unnamed: 0,SEQN,LBDBPBSI,LBDBCDSI,LBDTHGSI,LBDBSESI,LBDBMNSI
0,93703.0,,,,,
1,93704.0,,0.62,2.35,1.81,171.28
2,93705.0,0.14,2.14,5.34,2.36,155.99
3,93706.0,0.04,1.87,53.09,2.53,256.1
4,93707.0,0.02,1.25,1.0,2.28,229.35


In [54]:
# 11) Cholesterol - Total (TCHOL_J)
f11 = [
    "SEQN",     # Respondent Sequence Number
    "LBDTCSI",  # Total Cholesterol (mmol/L)
]

# Replace the stored frame with its column subset and preview it
tchol_selected = lab_dfs['TCHOL_J'][f11]
lab_dfs['TCHOL_J'] = tchol_selected
tchol_selected.head()

Unnamed: 0,SEQN,LBDTCSI
0,93705.0,4.06
1,93706.0,3.83
2,93707.0,4.89
3,93708.0,5.4
4,93709.0,4.55


In [55]:
# 12) Transferrin Receptor (TFR_J)
f12 = [
    "SEQN",      # Respondent sequence number
    "LBDTFRSI",  # Transferrin receptor (nmol/L)
]

# Replace the stored frame with its column subset and preview it
tfr_selected = lab_dfs['TFR_J'][f12]
lab_dfs['TFR_J'] = tfr_selected
tfr_selected.head()

Unnamed: 0,SEQN,LBDTFRSI
0,93703.0,
1,93704.0,30.6
2,93705.0,39.8
3,93706.0,34.0
4,93707.0,34.0


In [56]:
# 13) Vitamin C (VIC_J)
f13 = [
    "SEQN",      # Respondent sequence number
    "LBDVICSI",  # Vitamin C (umol/L)
]

# Replace the stored frame with its column subset and preview it
vic_selected = lab_dfs['VIC_J'][f13]
lab_dfs['VIC_J'] = vic_selected
vic_selected.head()

Unnamed: 0,SEQN,LBDVICSI
0,93705.0,73.8
1,93706.0,63.6
2,93707.0,27.4
3,93708.0,86.3
4,93709.0,24.2


In [57]:
# 14) Vitamin D (VID_J)
f14 = [
    "SEQN",      # Respondent sequence number
    "LBXVIDMS",  # 25OHD2+25OHD3 (nmol/L)
]

# Replace the stored frame with its column subset and preview it
vid_selected = lab_dfs['VID_J'][f14]
lab_dfs['VID_J'] = vid_selected
vid_selected.head()

Unnamed: 0,SEQN,LBXVIDMS
0,93703.0,
1,93704.0,74.4
2,93705.0,89.9
3,93706.0,53.8
4,93707.0,58.2


In [58]:
# 15) Vitamin A, Vitamin E & Carotenoids (VITAEC_J)
f15 = [
    "SEQN",      # Respondent sequence number
    "LBDALCSI",  # alpha-carotene (umol/L)
    "LBDARYSI",  # alpha-cryptoxanthin (umol/L)
    "LBDBECSI",  # trans-beta-carotene (umol/L)
    "LBDCBCSI",  # cis-beta-carotene (umol/L)
    "LBDCRYSI",  # beta-cryptoxanthin (umol/L)
    "LBDGTCSI",  # gamma-tocopherol (umol/L)
    "LBDLUZSI",  # Lutein and zeaxanthin (umol/L)
    "LBDLYCSI",  # trans-lycopene (umol/L)
    "LBDRPLSI",  # Retinyl palmitate (umol/L)
    "LBDRSTSI",  # Retinyl stearate (umol/L)
    "LBDLCCSI",  # Total Lycopene (umol/L)
    "LBDVIASI",  # Retinol (umol/L)
    "LBDVIESI",  # alpha-tocopherol (umol/L)
]

# Replace the stored frame with its column subset and preview it
vitaec_selected = lab_dfs['VITAEC_J'][f15]
lab_dfs['VITAEC_J'] = vitaec_selected
vitaec_selected.head()

Unnamed: 0,SEQN,LBDALCSI,LBDARYSI,LBDBECSI,LBDCBCSI,LBDCRYSI,LBDGTCSI,LBDLUZSI,LBDLYCSI,LBDRPLSI,LBDRSTSI,LBDLCCSI,LBDVIASI,LBDVIESI
0,93705.0,0.138,0.071,0.494,0.026,0.154,2.786,0.482,0.186,,,0.386,2.185,26.006
1,93706.0,0.16,0.031,0.408,0.016,0.11,5.356,0.336,0.32,0.031,0.017,0.583,2.042,16.765
2,93707.0,0.009,0.044,0.099,0.009,0.06,3.987,0.2,0.281,0.051,0.017,0.499,1.222,22.756
3,93708.0,0.192,0.127,2.627,0.138,0.246,2.546,1.512,0.138,0.031,0.017,0.33,2.367,
4,93709.0,0.009,0.017,0.113,,0.027,2.522,0.214,,0.057,0.017,,1.739,28.561


In [59]:
# 16) Hepatitis A (HEPA_J)
# No column list is applied in this cell: the frame is previewed as loaded.

# SEQN - Respondent sequence number
# LBXHA - Hepatitis A antibody

lab_dfs['HEPA_J'].head()

Unnamed: 0,SEQN,LBXHA
0,93703.0,
1,93704.0,1.0
2,93705.0,1.0
3,93706.0,2.0
4,93707.0,2.0


In [60]:
# 17) Hepatitis B: Core antibody (HEPBD_J)
f17 = [
    "SEQN",    # Respondent sequence number
    "LBXHBC",  # Hepatitis B core antibody
]

# Replace the stored frame with its column subset and preview it
hepbd_selected = lab_dfs['HEPBD_J'][f17]
lab_dfs['HEPBD_J'] = hepbd_selected
hepbd_selected.head()

Unnamed: 0,SEQN,LBXHBC
0,93705.0,1.0
1,93706.0,2.0
2,93707.0,2.0
3,93708.0,1.0
4,93709.0,2.0


In [61]:
# 18) Hepatitis B: Surface Antibody (HEPB_S_J)
# No column list is applied in this cell: the frame is previewed as loaded.

# SEQN - Respondent sequence number
# LBXHBS - Hepatitis B Surface Antibody

lab_dfs['HEPB_S_J'].head()

Unnamed: 0,SEQN,LBXHBS
0,93703.0,
1,93704.0,1.0
2,93705.0,1.0
3,93706.0,2.0
4,93707.0,2.0


## Questionnaire Data

<a href="https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Questionnaire&Cycle=2017-2018" target="_blank" rel="noopener">Questionnaire Data Variables Reference</a>

In [269]:
# Reading the questionnaire data
q_directory = "../Dataset/Questionnaire_XPT"

# Selecting required data files
req = ['DIQ_J.XPT','HEQ_J.XPT','KIQ_U_J.XPT','MCQ_J.XPT']

# Sorted directory listing, restricted to the required files
q_files = [name for name in sorted(os.listdir(q_directory)) if name in req]
print("Selected Files: ",q_files)

# One DataFrame per file, stored under the file name without its extension
q_dfs = {}
for xpt_name in q_files:
    if not xpt_name.endswith(".XPT"):
        continue
    stem = xpt_name.split('.')[0]
    q_dfs[stem] = pd.read_sas(os.path.join(q_directory, xpt_name))

# Each DataFrame is now reachable through its file name, e.g. q_dfs['DIQ_J']

Selected Files:  ['DIQ_J.XPT', 'HEQ_J.XPT', 'KIQ_U_J.XPT', 'MCQ_J.XPT']


In [270]:
# Diabetes (DIQ_J)

# SEQN - Respondent sequence number
# DIQ010 - Doctor told you have diabetes

f19 = ["SEQN", "DIQ010"]

# Index with this cell's own list. f1 holds the BIOPRO_J biochemistry columns,
# so using it here raises KeyError when the notebook is run top to bottom.
q_dfs['DIQ_J'] = q_dfs['DIQ_J'][f19]
q_dfs['DIQ_J'].head()

Unnamed: 0,SEQN,DIQ010
0,93703.0,2.0
1,93704.0,2.0
2,93705.0,2.0
3,93706.0,2.0
4,93707.0,2.0


In [271]:
# Hepatitis (HEQ_J)

# SEQN - Respondent sequence number
# HEQ030 - Ever told you have Hepatitis C?

f20 = ["SEQN", "HEQ030"]

# Index with the list defined in this cell rather than a name left over from
# an earlier kernel session, so the cell works on a fresh Restart & Run All.
q_dfs['HEQ_J'] = q_dfs['HEQ_J'][f20]
q_dfs['HEQ_J'].head()

Unnamed: 0,SEQN,HEQ030
0,93705.0,2.0
1,93706.0,2.0
2,93707.0,2.0
3,93708.0,2.0
4,93709.0,2.0


In [272]:
# Kidney Conditions - Urology (KIQ_U_J)

# SEQN - Respondent sequence number
# KIQ022 - Ever told you had weak/failing kidneys?

f21 = ["SEQN", "KIQ022"]

# Index with this cell's own list. f3 holds the COT_J cotinine columns,
# so using it here raises KeyError when the notebook is run top to bottom.
q_dfs['KIQ_U_J'] = q_dfs['KIQ_U_J'][f21]
q_dfs['KIQ_U_J'].head()

Unnamed: 0,SEQN,KIQ022
0,93705.0,2.0
1,93708.0,2.0
2,93709.0,2.0
3,93711.0,2.0
4,93713.0,2.0


In [281]:
# Medical Conditions (MCQ_J)

# SEQN - Respondent sequence number
# MCQ160B - Ever told had congestive heart failure
# MCQ160C - Ever told you had coronary heart disease
# MCQ160F - Ever told you had a stroke
# MCQ160M - Ever told you had thyroid problem
# MCQ160L - Ever told you had any liver condition
# MCQ510A - Liver condition: Fatty liver
# MCQ510B - Liver condition: Liver fibrosis
# MCQ510C - Liver condition: Liver cirrhosis
# MCQ510D - Liver condition: Viral hepatitis
# MCQ510E - Liver condition: Autoimmune hepatitis
# MCQ203 - Ever been told you have jaundice?
# MCQ220 - Ever told you had cancer or malignancy
# MCQ230A - 1st cancer - what kind was it?

# Named f22 to continue the questionnaire numbering (f19-f21); f4 already
# names the Chromium & Cobalt (CRCO_J) column list and must not be overwritten.
f22 = ["SEQN", "MCQ160B", "MCQ160C", "MCQ160F", "MCQ160M", "MCQ160L", "MCQ510A", "MCQ510B",
       "MCQ510C", "MCQ510D", "MCQ510E", "MCQ203", "MCQ220", "MCQ230A"]

q_dfs['MCQ_J'] = q_dfs['MCQ_J'][f22]
q_dfs['MCQ_J'].head()

Unnamed: 0,SEQN,MCQ160B,MCQ160C,MCQ160F,MCQ160M,MCQ160L,MCQ510A,MCQ510B,MCQ510C,MCQ510D,MCQ510E,MCQ203,MCQ220,MCQ230A
0,93703.0,,,,,,,,,,,,,
1,93704.0,,,,,,,,,,,,,
2,93705.0,2.0,2.0,2.0,2.0,2.0,,,,,,2.0,2.0,
3,93706.0,,,,,,,,,,,2.0,,
4,93707.0,,,,,,,,,,,2.0,,
