### Import packages

In [1]:
import pandas as pd
from os import listdir
from os.path import isfile, join
from functools import reduce

### Import all files from directories

In [2]:
def _files_in(folder):
    """Return the names of the regular files located directly inside ``folder``."""
    return [entry for entry in listdir(folder) if isfile(join(folder, entry))]


# One list of file names per data directory; sub-directories are skipped.
albcr = _files_in("albcr")
bio = _files_in("bio")
cbc = _files_in("cbc")
chol = _files_in("chol")
cholhdl = _files_in("cholhdl")
demo = _files_in("demo")
glu = _files_in("glu")
hdl = _files_in("hdl")
trildl = _files_in("trildl")

#### Map numeric gender codes to string labels

In [3]:
def gender(x):
    """Translate a numeric gender code into a one-letter label.

    Parameters
    ----------
    x : number or numeric string
        Gender code. It is converted with ``int`` before being compared.

    Returns
    -------
    str or None
        ``"M"`` for code 1, ``"F"`` for code 2, and ``None`` for a missing
        value or any other code.
    """
    # int() raises on NaN/None, which would abort a whole ``.apply(gender)``;
    # report a missing code as "no label" instead.
    if pd.isna(x):
        return None
    return {1: "M", 2: "F"}.get(int(x))

### Extracting demographic data

In [4]:
# Read every file listed in `demo`, keep SEQN / gender / age, and combine the
# pieces with a single concat (concatenating inside the loop re-copies the
# growing frame on every pass and starts from an empty DataFrame).
demo_frames = []
for fname in demo:
    frame = (
        pd.read_sas(join("demo", fname))[["SEQN", "RIAGENDR", "RIDAGEYR"]]
        .rename(columns={"RIAGENDR": "GENDER", "RIDAGEYR": "AGE"})
        .astype({"AGE": int, "SEQN": int})
    )
    # Replace the numeric gender code with its string label.
    frame["GENDER"] = frame["GENDER"].apply(gender)
    demo_frames.append(frame)

# pd.concat rejects an empty list, so fall back to an empty frame when the
# directory held no files.
demo_data = pd.concat(demo_frames, axis=0) if demo_frames else pd.DataFrame()

### Extracting Albumin and Creatinine 

In [5]:
# Read every file listed in `albcr` and keep SEQN, albumin and creatinine.
albcr_frames = []
for fname in albcr:
    frame = pd.read_sas(join("albcr", fname))
    # The albumin column has one of two names depending on the file; URXUMS
    # takes precedence when both are present.
    albumin_col = next(
        (col for col in ("URXUMS", "URXUMASI") if col in frame.columns), None
    )
    if albumin_col is None:
        # Fail with the file name rather than reusing the column picked for
        # a previous file (or hitting an undefined name on the first file).
        raise KeyError(
            "No albumin column (URXUMS or URXUMASI) in albcr file {!r}".format(fname)
        )
    frame = (
        frame[["SEQN", albumin_col, "URXUCR"]]
        .rename(columns={albumin_col: "Albumin_mgl", "URXUCR": "Creatinine_mgdl"})
        .astype({"SEQN": int})
    )
    albcr_frames.append(frame)

# Single concat at the end; an empty directory yields an empty frame.
albcr_data = pd.concat(albcr_frames, axis=0) if albcr_frames else pd.DataFrame()

### Extracting data from standard biochemistry profile

In [6]:
# Read every file listed in `bio` and keep the selected biochemistry columns.
bio_frames = []
for fname in bio:
    frame = pd.read_sas(join("bio", fname))
    # The alkaline phosphatase column has one of two names depending on the
    # file; LBXSAPSI takes precedence when both are present.
    alk_phos_col = next(
        (col for col in ("LBXSAPSI", "LBDSAPSI") if col in frame.columns), None
    )
    if alk_phos_col is None:
        # Fail with the file name rather than reusing the column picked for
        # a previous file (or hitting an undefined name on the first file).
        raise KeyError(
            "No alkaline phosphatase column (LBXSAPSI or LBDSAPSI) in bio file {!r}".format(fname)
        )
    frame = (
        frame[["SEQN", "LBXSKSI", "LBXSNASI", "LBDSTBSI", "LBDSCASI", "LBXSTP",
               "LBDSBUSI", alk_phos_col]]
        .rename(columns={
            "LBXSKSI": "Potassium_mmolL",
            "LBXSNASI": "Sodium_mmolL",
            "LBDSTBSI": "Bilirubin_umolL",
            "LBDSCASI": "Calcium_mmolL",
            "LBXSTP": "Protein_gdL",
            "LBDSBUSI": "Urea_mmolL",
            alk_phos_col: "AlkPhos_UL",
        })
        .astype({"SEQN": int})
    )
    bio_frames.append(frame)

# Single concat at the end; an empty directory yields an empty frame.
bio_data = pd.concat(bio_frames, axis=0) if bio_frames else pd.DataFrame()

### Extracting Glucose

In [7]:
# Read every file listed in `glu` and keep SEQN and glucose.
glu_frames = []
for fname in glu:
    frame = pd.read_sas(join("glu", fname))
    # The glucose column has one of two names depending on the file;
    # LBDGLUSI takes precedence when both are present.
    glucose_col = next(
        (col for col in ("LBDGLUSI", "LBXGLUSI") if col in frame.columns), None
    )
    if glucose_col is None:
        # Previously the file name was printed and the code then carried on
        # with a stale or undefined column name; stop here and name the file.
        raise KeyError(
            "No glucose column (LBDGLUSI or LBXGLUSI) in glu file {!r}".format(fname)
        )
    frame = (
        frame[["SEQN", glucose_col]]
        .rename(columns={glucose_col: "Glucose_mmolL"})
        .astype({"SEQN": int})
    )
    glu_frames.append(frame)

# Single concat at the end; an empty directory yields an empty frame.
glu_data = pd.concat(glu_frames, axis=0) if glu_frames else pd.DataFrame()

### Extracting LDL and Triglyceride

In [8]:
# Read every file listed in `trildl`, keep SEQN / LDL / triglyceride, and
# combine the pieces with one concat instead of re-copying a growing frame
# inside the loop.
trildl_frames = [
    pd.read_sas(join("trildl", fname))[["SEQN", "LBDLDLSI", "LBDTRSI"]]
    .rename(columns={"LBDLDLSI": "LDL_mmolL", "LBDTRSI": "Triglyceride_mmolL"})
    .astype({"SEQN": int})
    for fname in trildl
]
# pd.concat rejects an empty list, so fall back to an empty frame.
trildl_data = pd.concat(trildl_frames, axis=0) if trildl_frames else pd.DataFrame()
del trildl_frames

### Extracting HDL

In [9]:
# Read every file listed in `hdl`, keep SEQN and HDL, and combine the pieces
# with one concat instead of re-copying a growing frame inside the loop.
hdl_frames = [
    pd.read_sas(join("hdl", fname))[["SEQN", "LBDHDDSI"]]
    .rename(columns={"LBDHDDSI": "HDL_mmolL"})
    .astype({"SEQN": int})
    for fname in hdl
]
# pd.concat rejects an empty list, so fall back to an empty frame.
hdl_data = pd.concat(hdl_frames, axis=0) if hdl_frames else pd.DataFrame()
del hdl_frames

### Extracting Total Cholesterol

In [10]:
# Read every file listed in `chol`, keep SEQN and total cholesterol, and
# combine the pieces with one concat instead of re-copying a growing frame
# inside the loop.
chol_frames = [
    pd.read_sas(join("chol", fname))[["SEQN", "LBDTCSI"]]
    .rename(columns={"LBDTCSI": "Cholesterol_mmolL"})
    .astype({"SEQN": int})
    for fname in chol
]
# pd.concat rejects an empty list, so fall back to an empty frame.
chol_data = pd.concat(chol_frames, axis=0) if chol_frames else pd.DataFrame()
del chol_frames

#### Merge HDL and Cholesterol

In [11]:
# Outer-join the HDL and total-cholesterol tables on SEQN so that a row is kept
# even when it appears in only one of the two.
chol_hdl = hdl_data.merge(chol_data, how="outer", on=["SEQN"])

### Extracting HDL and Cholesterol from the combined files

In [12]:
# Read every file listed in `cholhdl` and keep SEQN, HDL and total cholesterol.
cholhdl_frames = []
for fname in cholhdl:
    frame = pd.read_sas(join("cholhdl", fname))
    # The HDL column has one of two names depending on the file; LBDHDDSI
    # takes precedence when both are present.  The chosen name is kept in
    # `hdl_col` so the module-level `hdl` file list is not overwritten.
    hdl_col = next(
        (col for col in ("LBDHDDSI", "LBDHDLSI") if col in frame.columns), None
    )
    if hdl_col is None:
        # Fail with the file name rather than indexing with a stale value.
        raise KeyError(
            "No HDL column (LBDHDDSI or LBDHDLSI) in cholhdl file {!r}".format(fname)
        )
    frame = (
        frame[["SEQN", hdl_col, "LBDTCSI"]]
        .rename(columns={hdl_col: "HDL_mmolL", "LBDTCSI": "Cholesterol_mmolL"})
        .astype({"SEQN": int})
    )
    cholhdl_frames.append(frame)

# Single concat at the end; an empty directory yields an empty frame.
cholhdl_data = pd.concat(cholhdl_frames, axis=0) if cholhdl_frames else pd.DataFrame()

#### Concatenate both HDL/Cholesterol tables

In [13]:
# Stack the table merged from the separate HDL and cholesterol files underneath
# the rows read from the combined files.  This rebinds cholhdl_data, so running
# the cell a second time appends chol_hdl again.
cholhdl_data = pd.concat((cholhdl_data, chol_hdl), axis=0)

### Extracting Complete blood count

In [14]:
# Read every file listed in `cbc`, keep the selected blood-count columns, and
# combine the pieces with one concat instead of re-copying a growing frame
# inside the loop.
cbc_frames = [
    pd.read_sas(join("cbc", fname))[
        ["SEQN", "LBXRBCSI", "LBXHGB", "LBXHCT", "LBXMCVSI", "LBXMC", "LBXPLTSI"]
    ]
    .rename(columns={
        "LBXRBCSI": "RBC_MuL",
        "LBXHGB": "Hemoglobin_gdl",
        "LBXHCT": "Hematocrit_%",
        "LBXMCVSI": "MCV_fL",
        "LBXMC": "MCHC_gdl",
        "LBXPLTSI": "Platelet_TuL",
    })
    .astype({"SEQN": int})
    for fname in cbc
]
# pd.concat rejects an empty list, so fall back to an empty frame.
cbc_data = pd.concat(cbc_frames, axis=0) if cbc_frames else pd.DataFrame()
del cbc_frames

### Merge Dataframes

In [15]:
def _outer_join_on_seqn(left, right):
    """Outer-join two tables on their shared SEQN column."""
    return pd.merge(left, right, on=['SEQN'], how='outer')


# Fold the per-topic tables together from left to right, starting with the
# demographics table.
dfs = [demo_data, albcr_data, glu_data, bio_data, trildl_data, cholhdl_data, cbc_data]
data = reduce(_outer_join_on_seqn, dfs)

### Drop NaN values

In [16]:
# Keep only the rows in which every column has a value.
data = data.dropna(axis=0, how="any")

### Keep only ages 20 and above

In [17]:
# Boolean mask: True where AGE is at least 20.
data = data[data["AGE"].ge(20)]

### Reorder Columns

In [20]:
# Final column layout: SEQN and GENDER first, the measurements next, AGE last.
ordered_columns = [
    "SEQN", "GENDER",
    "Albumin_mgl", "Glucose_mmolL", "Urea_mmolL", "Cholesterol_mmolL",
    "Protein_gdL", "Sodium_mmolL", "Creatinine_mgdl", "Hemoglobin_gdl",
    "Bilirubin_umolL", "Triglyceride_mmolL", "HDL_mmolL", "LDL_mmolL",
    "Calcium_mmolL", "Potassium_mmolL", "AlkPhos_UL", "Hematocrit_%",
    "MCHC_gdl", "MCV_fL", "Platelet_TuL", "RBC_MuL",
    "AGE",
]
data = data[ordered_columns]

### Export to CSV

In [22]:
# Write the table as CSV with a header row and without the index column.
# NOTE(review): the destination is an absolute path on a local D: drive.
data.to_csv(
    path_or_buf=r"D:\A_SJSU\Python\Program\Jupyter_notebook\ML\aging clock\data\data.csv",
    header=True,
    index=False,
)

### Data for Deep Learning

In [None]:
# Encode gender as 0 (M) / 1 (F) and remove the SEQN column.
# The mapping also passes 0 and 1 through unchanged, and the drop tolerates an
# already-removed SEQN, so running this cell again leaves the result as is
# instead of turning GENDER into NaN and raising KeyError.  `assign` builds a
# new frame rather than writing into a frame obtained by slicing.
gender_codes = {"M": 0, "F": 1, 0: 0, 1: 1}
data = data.assign(GENDER=data["GENDER"].map(gender_codes)).drop(
    ["SEQN"], axis=1, errors="ignore"
)

In [None]:
# Write the model-ready table as CSV with a header row and without the index column.
# NOTE(review): the destination is an absolute path on a local D: drive.
data.to_csv(
    path_or_buf=r"D:\A_SJSU\Python\Program\Jupyter_notebook\ML\aging clock\data\data_for_DL.csv",
    header=True,
    index=False,
)