### Import packages

In [1]:
import pandas as pd
from os import listdir
from os.path import isfile, join
from functools import reduce


### Import all files from directories

In [2]:
#Changed here
from pathlib import Path
NHANES_PATH = Path.cwd() / 'NHANES_Data'
f = lambda dirname: [f.name for f in (NHANES_PATH/dirname).iterdir() if isfile(join("albcr", f))]

albcr = f('albcr')
bio = f('bio') 
cbc = f('cbc') 
chol = f('chol') 
cholhdl = f('cholhdl') 
demo = f('demo') 
glu = f('glu') 
hdl = f('hdl') 
trildl = f('trildl') 

#### Give string values to gender

In [3]:
def gender(x):
    x = int(x)
    if x == 1:
        return "M"
    elif x == 2:
        return "F"

### Extracting demographical data

In [4]:
demo_data = pd.DataFrame()
for i in demo:
    temp = pd.read_sas(NHANES_PATH/"demo/"/i)
    temp = temp[["SEQN","RIAGENDR","RIDAGEYR"]]
    temp.columns = ["SEQN","GENDER","AGE"]
    temp = temp.astype({"AGE":int,"SEQN":int})
    temp["GENDER"] = temp["GENDER"].apply(gender)
    demo_data = pd.concat([demo_data,temp],axis=0)
    del temp

  df[x] = v


### Extracting Albumin and Creatinine 

In [5]:
albcr_data = pd.DataFrame()
for i in albcr:
    temp = pd.read_sas(NHANES_PATH/"albcr/"/i)
    if "URXUMS" in temp:
        albumin = "URXUMS"
    elif "URXUMASI" in temp:
        albumin = "URXUMASI"
    temp = temp[["SEQN",albumin,"URXUCR"]]
    temp.columns = ["SEQN","Albumin_mgl","Creatinine_mgdl"]
    temp = temp.astype({"SEQN":int})
    albcr_data = pd.concat([albcr_data,temp],axis=0)
    del temp

### Extracting data from standard biochemistry profile

In [6]:
bio_data = pd.DataFrame()
for i in bio:
    temp = pd.read_sas(NHANES_PATH/"bio/"/i)
    if "LBXSAPSI" in temp:
        alkPho = "LBXSAPSI"
    elif "LBDSAPSI" in temp:
        alkPho = "LBDSAPSI"
    temp = temp[["SEQN","LBXSKSI","LBXSNASI","LBDSTBSI","LBDSCASI","LBXSTP","LBDSBUSI",alkPho]]
    temp.columns = ["SEQN","Potassium_mmolL","Sodium_mmolL","Bilirubin_umolL","Calcium_mmolL","Protein_gdL","Urea_mmolL",
                   "AlkPhos_UL"]
    temp = temp.astype({"SEQN":int})
    bio_data = pd.concat([bio_data,temp],axis=0)
    del temp

### Extracting Glucose

In [7]:
glu_data = pd.DataFrame()
for i in glu:
    temp = pd.read_sas(NHANES_PATH/"glu/"/i)
    if "LBDGLUSI" in temp:
        glucose = "LBDGLUSI"
    elif "LBXGLUSI" in temp:
        glucose = "LBXGLUSI"
    else:
        print(i)
    temp = temp[["SEQN",glucose]]
    temp.columns = ["SEQN","Glucose_mmolL"]
    temp = temp.astype({"SEQN":int})
    glu_data = pd.concat([glu_data,temp],axis=0)
    del temp

### Extracting LDL and Triglyceride

In [8]:
trildl_data = pd.DataFrame()
for i in trildl:
    temp = pd.read_sas(NHANES_PATH/"trildl/"/i)
    temp = temp[["SEQN","LBDLDLSI","LBDTRSI"]]
    temp.columns = ["SEQN","LDL_mmolL","Triglyceride_mmolL"]
    temp = temp.astype({"SEQN":int})
    trildl_data = pd.concat([trildl_data,temp],axis=0)
    del temp

### Extracting HDL

In [9]:
hdl_data = pd.DataFrame()
for i in hdl:
    temp = pd.read_sas(NHANES_PATH/"hdl/"/i)
    temp = temp[["SEQN","LBDHDDSI"]]
    temp.columns = ["SEQN","HDL_mmolL"]
    temp = temp.astype({"SEQN":int})
    hdl_data = pd.concat([hdl_data,temp],axis=0)
    del temp

### Extracting Total Cholesterol

In [10]:
chol_data = pd.DataFrame()
for i in chol:
    temp = pd.read_sas(NHANES_PATH/"chol/"/i)
    temp = temp[["SEQN","LBDTCSI"]]
    temp.columns = ["SEQN","Cholesterol_mmolL"]
    temp = temp.astype({"SEQN":int})
    chol_data = pd.concat([chol_data,temp],axis=0)
    del temp

#### Merge HDL and Cholesterol

In [11]:
chol_hdl = pd.merge(hdl_data,chol_data,on=["SEQN"],how="outer")

### Extracting HDL and Cholesterol

In [12]:
x=0
cholhdl_data = pd.DataFrame()
for i in cholhdl:
    temp = pd.read_sas(NHANES_PATH/"cholhdl/"/i)
    if "LBDHDDSI" in temp:
        hdl = "LBDHDDSI"
    elif "LBDHDLSI" in temp:
        hdl = "LBDHDLSI"
    temp = temp[["SEQN",hdl,"LBDTCSI"]]
    temp.columns = ["SEQN","HDL_mmolL","Cholesterol_mmolL"]
    temp = temp.astype({"SEQN":int})
    cholhdl_data = pd.concat([cholhdl_data,temp],axis=0)
    del temp

#### Concat both

In [13]:
cholhdl_data = pd.concat([cholhdl_data,chol_hdl],axis=0)

### Extracting Complete blood count

In [14]:
x=0
cbc_data = pd.DataFrame()
for i in cbc:
    temp = pd.read_sas(NHANES_PATH/"cbc/"/i)
    temp = temp[["SEQN","LBXRBCSI","LBXHGB","LBXHCT","LBXMCVSI","LBXMC","LBXPLTSI"]]
    temp.columns = ["SEQN","RBC_MuL","Hemoglobin_gdl","Hematocrit_%","MCV_fL","MCHC_gdl","Platelet_TuL"]
    temp = temp.astype({"SEQN":int})
    cbc_data = pd.concat([cbc_data,temp],axis=0)
    del temp

### Merge Dataframes

In [15]:
dfs = [demo_data,albcr_data,glu_data,bio_data,trildl_data,cholhdl_data,cbc_data]
data = reduce(lambda  left,right: pd.merge(left,right,on=['SEQN'],
                                            how='outer'), dfs)

### Drop NaN values

In [16]:
data = data.dropna()

### For age 20+

In [17]:
data = data[data["AGE"]>=20]

### Export to CSV

In [18]:
#A way to export file while being OS agnostic
data.to_csv(NHANES_PATH.parent/'data_after_running_notebook.csv', index=False, header=False)

In [19]:
#This a data file that is in your respository and it has 38k rows
pd.read_csv(NHANES_PATH.parent/'data.csv')

Unnamed: 0,SEQN,GENDER,AGE,Albumin_mgl,Creatinine_mgdl,Glucose_mmolL,Potassium_mmolL,Sodium_mmolL,Bilirubin_umolL,Calcium_mmolL,...,LDL_mmolL,Triglyceride_mmolL,HDL_mmolL,Cholesterol_mmolL,RBC_MuL,Hemoglobin_gdl,Hematocrit_%,MCV_fL,MCHC_gdl,Platelet_TuL
0,2,M,77,9.1,145.0,4.646,4.06,144.1,12.00,2.325,...,3.520,1.450,1.39,5.56,4.73,14.1,41.8,88.5,33.6,214.0
1,5,M,49,6.1,172.0,5.550,4.63,137.5,8.60,2.375,...,4.340,3.920,1.08,7.21,5.13,14.5,43.6,84.9,33.3,209.0
2,7,F,59,6.7,128.0,4.756,4.25,143.2,6.80,2.450,...,3.280,0.700,2.73,6.34,4.60,13.4,40.2,87.4,33.3,244.0
3,10,M,43,11.1,279.0,4.989,4.28,140.9,6.80,2.350,...,2.070,0.510,1.31,3.62,5.00,15.4,46.2,92.3,33.5,167.0
4,12,M,37,34.3,99.0,4.606,3.81,141.3,6.80,2.200,...,2.300,1.650,0.98,4.03,5.76,16.0,48.1,83.5,33.3,357.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38781,51618,M,48,5.2,81.0,5.551,4.30,144.0,13.68,2.450,...,3.207,1.163,1.22,4.97,5.01,14.7,43.9,87.6,33.4,310.0
38782,51620,F,50,39.5,161.0,5.440,4.30,139.0,11.97,2.200,...,4.086,2.518,1.06,6.31,4.39,14.3,40.3,91.9,35.4,161.0
38783,51620,F,50,39.5,161.0,5.440,4.30,139.0,11.97,2.200,...,4.086,2.518,1.06,6.31,4.39,14.3,40.3,91.9,35.4,161.0
38784,51623,M,72,26.3,123.0,6.717,4.20,141.0,30.78,2.375,...,1.577,0.632,1.09,2.95,5.66,16.4,48.9,86.3,33.5,241.0


In [20]:
#And here is the data after running your notebook
data

Unnamed: 0,SEQN,GENDER,AGE,Albumin_mgl,Creatinine_mgdl,Glucose_mmolL,Potassium_mmolL,Sodium_mmolL,Bilirubin_umolL,Calcium_mmolL,...,LDL_mmolL,Triglyceride_mmolL,HDL_mmolL,Cholesterol_mmolL,RBC_MuL,Hemoglobin_gdl,Hematocrit_%,MCV_fL,MCHC_gdl,Platelet_TuL
2,9968,F,84,7.4,119.0,5.356,3.7,135.0,10.26,2.350,...,1.450,1.470,1.32,3.44,3.97,11.8,34.8,87.8,33.7,305.0
3,9969,F,51,0.4,13.0,5.039,3.9,140.0,11.97,2.400,...,3.900,1.040,1.73,6.10,4.88,15.2,44.3,90.6,34.3,239.0
6,9972,M,44,3.6,99.0,5.356,4.7,140.0,17.10,2.400,...,3.150,1.920,0.96,4.99,5.38,15.7,45.7,85.0,34.3,318.0
7,9973,F,63,2.7,20.0,4.650,4.4,136.0,8.55,2.325,...,2.690,2.750,1.37,5.33,4.70,13.9,40.2,85.4,34.5,322.0
10,9976,M,36,5.5,99.0,5.628,4.1,137.0,17.10,2.375,...,2.690,0.530,2.40,5.33,5.02,17.5,49.9,99.1,35.0,249.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101299,41458,M,46,9.5,242.0,6.051,4.2,139.0,8.55,2.325,...,3.543,2.055,1.06,5.53,5.33,15.6,45.4,85.2,34.3,288.0
101300,41459,F,26,1.9,91.0,4.607,3.6,135.0,8.55,2.250,...,2.069,1.547,2.53,5.30,4.58,13.3,38.5,84.4,34.5,152.0
101303,41462,M,56,2.1,63.0,5.384,4.1,139.0,8.55,2.325,...,3.517,0.768,1.81,5.69,4.34,13.8,40.7,93.7,33.7,280.0
101305,41464,F,60,3.2,71.0,6.384,3.9,141.0,8.55,2.375,...,2.845,1.197,1.42,4.81,4.20,12.5,38.6,91.9,32.5,451.0


In [21]:
#The dataframes are different and it is better to have a reproducible dataframe so that we can edit it in any way we want.

#Notice that in Jupyter the environment/namespace stays the same the whole time unless you restart your kernel.
#So while it is sometimes useful before submitting something to your repository it is better to make sure that if you restart your kernel
#and run all cells then you get the same output. Notice that in my notebook the first cells output index is [1] and all of the cells
#are executed in order. This means that the program/notebok run from the clean environment, meaning with no prior bindings except for the
#ones "defined in your kernel".