### Import packages

In [1]:
import pandas as pd
from os import listdir
from os.path import isfile, join
from functools import reduce

### List the data files in each directory

In [2]:
from pathlib import Path

# Folder that holds one sub-directory of data files per panel.
DATA_PATH = Path.cwd()/'NHANES_Data/'


def get_filenames(dir_name, base=None):
    """Return the names of the regular files directly inside ``base/dir_name``.

    Parameters
    ----------
    dir_name : str
        Name of the sub-directory to list.
    base : path-like, optional
        Folder that contains ``dir_name``. Defaults to ``DATA_PATH``.

    Returns
    -------
    list of str
        File names without any directory part. Sub-directories are skipped.
        The list is sorted because ``Path.iterdir`` yields entries in
        arbitrary order, which would make the load order differ between runs
        and machines.

    Raises
    ------
    FileNotFoundError
        If ``base/dir_name`` does not exist.
    """
    root = DATA_PATH if base is None else Path(base)
    return sorted(path.name for path in (root/dir_name).iterdir() if path.is_file())

In [3]:
# One list of file names per panel; each variable is named after the
# sub-directory it was read from.
(
    albcr,
    bio,
    cbc,
    chol,
    cholhdl,
    demo,
    glu,
    hdl,
    trildl,
) = (
    get_filenames(panel)
    for panel in ("albcr", "bio", "cbc", "chol", "cholhdl", "demo", "glu", "hdl", "trildl")
)

#### Give string values to gender

In [4]:
def gender(x):
    """Translate a numeric gender code into a one-letter label.

    Parameters
    ----------
    x : number or numeric string
        Code to translate. It is converted with ``int`` first, so ``1`` and
        ``1.0`` are treated the same.

    Returns
    -------
    str or None
        ``"M"`` for code 1, ``"F"`` for code 2. ``None`` for any other code
        and for values that have no integer form (NaN, ``None``, infinity).
    """
    try:
        code = int(x)
    except (TypeError, ValueError, OverflowError):
        # A missing value should yield a missing label rather than abort the
        # whole column conversion.
        return None
    return {1: "M", 2: "F"}.get(code)

### Extracting demographic data

In [5]:
# Build the demographic table (SEQN, GENDER, AGE) from every file listed in `demo`.
# Frames are collected first and concatenated once: growing a DataFrame with
# pd.concat inside the loop copies all previous rows on every iteration.
demo_frames = []
for file_name in demo:
    cycle = pd.read_sas(DATA_PATH/"demo/"/file_name)
    # Keep only the needed columns and rename them by name, not by position.
    cycle = cycle[["SEQN","RIAGENDR","RIDAGEYR"]].rename(
        columns={"RIAGENDR":"GENDER","RIDAGEYR":"AGE"})
    cycle = cycle.astype({"AGE":int,"SEQN":int})
    # Replace the numeric gender code with its "M"/"F" label.
    cycle["GENDER"] = cycle["GENDER"].apply(gender)
    demo_frames.append(cycle)

# pd.concat rejects an empty list, so fall back to an empty frame as before.
demo_data = pd.concat(demo_frames,axis=0) if demo_frames else pd.DataFrame()

  df[x] = v


### Extracting Albumin and Creatinine 

In [6]:
# Build the albumin / creatinine table from every file listed in `albcr`.
# Frames are collected first and concatenated once instead of re-copying the
# accumulated table on every iteration.
albcr_frames = []
for file_name in albcr:
    cycle = pd.read_sas(DATA_PATH/"albcr/"/file_name)
    # The albumin column has one of two names depending on the file.
    albumin = next((col for col in ("URXUMS","URXUMASI") if col in cycle), None)
    if albumin is None:
        # Name the offending file instead of failing later on a stale or
        # undefined column name.
        raise KeyError(f"{file_name}: neither 'URXUMS' nor 'URXUMASI' is present")
    cycle = cycle[["SEQN",albumin,"URXUCR"]].rename(
        columns={albumin:"Albumin_mgl","URXUCR":"Creatinine_mgdl"})
    cycle = cycle.astype({"SEQN":int})
    albcr_frames.append(cycle)

# pd.concat rejects an empty list, so fall back to an empty frame as before.
albcr_data = pd.concat(albcr_frames,axis=0) if albcr_frames else pd.DataFrame()

### Extracting data from standard biochemistry profile

In [7]:
# Build the biochemistry table from every file listed in `bio`.
# Frames are collected first and concatenated once instead of re-copying the
# accumulated table on every iteration.
bio_frames = []
for file_name in bio:
    cycle = pd.read_sas(DATA_PATH/"bio/"/file_name)
    # The alkaline phosphatase column has one of two names depending on the file.
    alkPho = next((col for col in ("LBXSAPSI","LBDSAPSI") if col in cycle), None)
    if alkPho is None:
        # Name the offending file instead of failing later on a stale or
        # undefined column name.
        raise KeyError(f"{file_name}: neither 'LBXSAPSI' nor 'LBDSAPSI' is present")
    cycle = cycle[["SEQN","LBXSKSI","LBXSNASI","LBDSTBSI","LBDSCASI","LBXSTP","LBDSBUSI",alkPho]]
    # Rename by name so a change in the selection order cannot mislabel a column.
    cycle = cycle.rename(columns={
        "LBXSKSI":"Potassium_mmolL",
        "LBXSNASI":"Sodium_mmolL",
        "LBDSTBSI":"Bilirubin_umolL",
        "LBDSCASI":"Calcium_mmolL",
        "LBXSTP":"Protein_gdL",
        "LBDSBUSI":"Urea_mmolL",
        alkPho:"AlkPhos_UL",
    })
    cycle = cycle.astype({"SEQN":int})
    bio_frames.append(cycle)

# pd.concat rejects an empty list, so fall back to an empty frame as before.
bio_data = pd.concat(bio_frames,axis=0) if bio_frames else pd.DataFrame()

### Extracting Glucose

In [8]:
# Build the glucose table from every file listed in `glu`.
# Frames are collected first and concatenated once instead of re-copying the
# accumulated table on every iteration.
glu_frames = []
for file_name in glu:
    cycle = pd.read_sas(DATA_PATH/"glu/"/file_name)
    # The glucose column has one of two names depending on the file.
    glucose = next((col for col in ("LBDGLUSI","LBXGLUSI") if col in cycle), None)
    if glucose is None:
        # Previously the file name was printed and the loop carried on with a
        # stale or undefined column name; stop here with the file name instead.
        raise KeyError(f"{file_name}: neither 'LBDGLUSI' nor 'LBXGLUSI' is present")
    cycle = cycle[["SEQN",glucose]].rename(columns={glucose:"Glucose_mmolL"})
    cycle = cycle.astype({"SEQN":int})
    glu_frames.append(cycle)

# pd.concat rejects an empty list, so fall back to an empty frame as before.
glu_data = pd.concat(glu_frames,axis=0) if glu_frames else pd.DataFrame()

### Extracting LDL and Triglyceride

In [9]:
# Build the LDL / triglyceride table from every file listed in `trildl`.
# Frames are collected first and concatenated once instead of re-copying the
# accumulated table on every iteration.
trildl_frames = []
for file_name in trildl:
    cycle = pd.read_sas(DATA_PATH/"trildl/"/file_name)
    cycle = cycle[["SEQN","LBDLDLSI","LBDTRSI"]].rename(
        columns={"LBDLDLSI":"LDL_mmolL","LBDTRSI":"Triglyceride_mmolL"})
    cycle = cycle.astype({"SEQN":int})
    trildl_frames.append(cycle)

# pd.concat rejects an empty list, so fall back to an empty frame as before.
trildl_data = pd.concat(trildl_frames,axis=0) if trildl_frames else pd.DataFrame()

### Extracting HDL

In [10]:
# Build the HDL table from every file listed in `hdl`.
# Frames are collected first and concatenated once instead of re-copying the
# accumulated table on every iteration.
hdl_frames = []
for file_name in hdl:
    cycle = pd.read_sas(DATA_PATH/"hdl/"/file_name)
    cycle = cycle[["SEQN","LBDHDDSI"]].rename(columns={"LBDHDDSI":"HDL_mmolL"})
    cycle = cycle.astype({"SEQN":int})
    hdl_frames.append(cycle)

# pd.concat rejects an empty list, so fall back to an empty frame as before.
hdl_data = pd.concat(hdl_frames,axis=0) if hdl_frames else pd.DataFrame()

### Extracting Total Cholesterol

In [11]:
# Build the total cholesterol table from every file listed in `chol`.
# Frames are collected first and concatenated once instead of re-copying the
# accumulated table on every iteration.
chol_frames = []
for file_name in chol:
    cycle = pd.read_sas(DATA_PATH/"chol/"/file_name)
    cycle = cycle[["SEQN","LBDTCSI"]].rename(columns={"LBDTCSI":"Cholesterol_mmolL"})
    cycle = cycle.astype({"SEQN":int})
    chol_frames.append(cycle)

# pd.concat rejects an empty list, so fall back to an empty frame as before.
chol_data = pd.concat(chol_frames,axis=0) if chol_frames else pd.DataFrame()

#### Merge HDL and Cholesterol

In [12]:
# Outer join on SEQN: keep every row that has an HDL value, a cholesterol
# value, or both.
chol_hdl = hdl_data.merge(chol_data, how="outer", on="SEQN")

### Extracting HDL and Cholesterol from the combined `cholhdl` files

In [13]:
# Build the HDL / total cholesterol table from every file listed in `cholhdl`.
# Frames are collected first and concatenated once instead of re-copying the
# accumulated table on every iteration.
cholhdl_frames = []
for file_name in cholhdl:
    cycle = pd.read_sas(DATA_PATH/"cholhdl/"/file_name)
    # The HDL column has one of two names depending on the file. The name is
    # kept in `hdl_column` so that the global `hdl` (the list of HDL file
    # names) is not overwritten with a string.
    hdl_column = next((col for col in ("LBDHDDSI","LBDHDLSI") if col in cycle), None)
    if hdl_column is None:
        # Name the offending file instead of failing later on a stale or
        # undefined column name.
        raise KeyError(f"{file_name}: neither 'LBDHDDSI' nor 'LBDHDLSI' is present")
    cycle = cycle[["SEQN",hdl_column,"LBDTCSI"]].rename(
        columns={hdl_column:"HDL_mmolL","LBDTCSI":"Cholesterol_mmolL"})
    cycle = cycle.astype({"SEQN":int})
    cholhdl_frames.append(cycle)

# pd.concat rejects an empty list, so fall back to an empty frame as before.
cholhdl_data = pd.concat(cholhdl_frames,axis=0) if cholhdl_frames else pd.DataFrame()

#### Concatenate both HDL/cholesterol tables

In [14]:
# Stack the table read from the combined files on top of the merged
# HDL / cholesterol table. This cell reassigns its own input, so running it a
# second time would append `chol_hdl` again; dropping fully identical rows
# keeps the result the same however often the cell is executed.
cholhdl_data = pd.concat([cholhdl_data,chol_hdl],axis=0).drop_duplicates()

### Extracting Complete blood count

In [15]:
# Build the complete blood count table from every file listed in `cbc`.
# Frames are collected first and concatenated once instead of re-copying the
# accumulated table on every iteration.
cbc_frames = []
for file_name in cbc:
    cycle = pd.read_sas(DATA_PATH/"cbc/"/file_name)
    cycle = cycle[["SEQN","LBXRBCSI","LBXHGB","LBXHCT","LBXMCVSI","LBXMC","LBXPLTSI"]]
    # Rename by name so a change in the selection order cannot mislabel a column.
    cycle = cycle.rename(columns={
        "LBXRBCSI":"RBC_MuL",
        "LBXHGB":"Hemoglobin_gdl",
        "LBXHCT":"Hematocrit_%",
        "LBXMCVSI":"MCV_fL",
        "LBXMC":"MCHC_gdl",
        "LBXPLTSI":"Platelet_TuL",
    })
    cycle = cycle.astype({"SEQN":int})
    cbc_frames.append(cycle)

# pd.concat rejects an empty list, so fall back to an empty frame as before.
cbc_data = pd.concat(cbc_frames,axis=0) if cbc_frames else pd.DataFrame()

### Merge Dataframes

In [16]:
dfs = [demo_data,albcr_data,glu_data,bio_data,trildl_data,cholhdl_data,cbc_data]

# Fold the tables together from left to right with an outer join on SEQN,
# starting from the first table in the list.
data = dfs[0]
for table in dfs[1:]:
    data = pd.merge(data, table, on=['SEQN'], how='outer')

### Drop NaN values

In [17]:
# Keep only the rows in which every column has a value.
data = data.dropna(axis=0, how="any")

### Keep rows with age 20 and over

In [18]:
# Keep the rows whose AGE is at least 20.
data = data.loc[data["AGE"].ge(20)]

### Reorder Columns

In [19]:
# Column order of the exported table: identifier and gender first, the
# measurements next, age last.
column_order = [
    "SEQN",
    "GENDER",
    "Albumin_mgl",
    "Glucose_mmolL",
    "Urea_mmolL",
    "Cholesterol_mmolL",
    "Protein_gdL",
    "Sodium_mmolL",
    "Creatinine_mgdl",
    "Hemoglobin_gdl",
    "Bilirubin_umolL",
    "Triglyceride_mmolL",
    "HDL_mmolL",
    "LDL_mmolL",
    "Calcium_mmolL",
    "Potassium_mmolL",
    "AlkPhos_UL",
    "Hematocrit_%",
    "MCHC_gdl",
    "MCV_fL",
    "Platelet_TuL",
    "RBC_MuL",
    "AGE",
]
data = data[column_order]

### Export to CSV

In [20]:
# Write the table next to the data folder, with a header row and without the
# DataFrame index.
output_path = DATA_PATH.parent/'Linux_data_processing.csv'
data.to_csv(output_path, header=True, index=False)