## Download the data files

In [5]:
import os
import urllib
import pandas as pd
from sklearn.model_selection import train_test_split

In [6]:
def download_nhanes_data(data_dir, url_template='http://wwwn.cdc.gov/Nchs/Nhanes/{0}/{1}.XPT'):
    """Download the NHANES XPT files used by this notebook into ``data_dir``.

    One file is fetched per (table, survey cycle) pair and stored as
    ``<data_dir>/<cycle>/<TABLE>_<suffix>.XPT``.  Files that already exist
    locally are left untouched, so the function can be re-run cheaply.

    Args:
        data_dir: Root directory for the downloaded files; created if missing.
        url_template: ``str.format`` template receiving the cycle directory
            name as ``{0}`` and the file stem as ``{1}``.  The default is the
            URL this notebook has always used; pass another template to use a
            mirror or a different scheme.

    Raises:
        urllib.error.URLError: If a file cannot be retrieved (propagated from
            ``urllib.request.urlretrieve``).
        OSError: If directories or files cannot be created.
    """
    # ``import urllib`` alone does not load the ``urllib.request`` submodule,
    # so import it explicitly here instead of relying on another package
    # having imported it as a side effect.
    import urllib.request

    # (cycle directory, file-name suffix) pairs.
    cycles = [
        ('2005-2006', 'D'),
        ('2007-2008', 'E'),
        ('2009-2010', 'F'),
        ('2011-2012', 'G'),
        ('2013-2014', 'H'),
    ]
    tables = ['DEMO', 'BPX', 'BMX', 'SMQ', 'DIQ', 'CDQ', 'TCHOL']
    # Table-major order: every cycle of DEMO first, then every cycle of BPX, ...
    file_list = [
        (year, '{0}_{1}'.format(table, suffix))
        for table in tables
        for (year, suffix) in cycles
    ]

    os.makedirs(data_dir, exist_ok=True)

    for (year, data_file) in file_list:
        sub_dir = os.path.join(data_dir, year)
        os.makedirs(sub_dir, exist_ok=True)

        file_name = os.path.join(sub_dir, data_file + '.XPT')
        if os.path.exists(file_name):
            continue

        url = url_template.format(year, data_file)
        # Download under a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated file that a later run
        # would mistake for a complete download.
        partial_name = file_name + '.part'
        try:
            urllib.request.urlretrieve(url, partial_name)
            os.replace(partial_name, file_name)
        finally:
            if os.path.exists(partial_name):
                os.remove(partial_name)

## Merge the data and export it to CSV files

In [7]:
# Survey cycles as (directory name, file-name suffix) pairs, oldest first.
# The order matters: it fixes the row order fed to the seeded train/test split.
_NHANES_CYCLES = (
    ('2005-2006', 'D'),
    ('2007-2008', 'E'),
    ('2009-2010', 'F'),
    ('2011-2012', 'G'),
    ('2013-2014', 'H'),
)

# Columns kept from the demographics table (the base of every merge).
_DEMO_COLS = ['SEQN', 'RIDAGEYR', 'RIAGENDR']

# Tables merged onto the demographics rows, in merge order, with the columns
# kept from each one.  The order fixes the column order of the output CSVs.
_FEATURE_TABLES = (
    ('BPX', ['SEQN', 'BPXSY1', 'BPXDI1']),
    ('BMX', ['SEQN', 'BMXBMI']),
    ('SMQ', ['SEQN', 'SMQ020']),
    ('DIQ', ['SEQN', 'DIQ010']),
    ('TCHOL', ['SEQN', 'LBXTC']),
)

# Columns kept from the CDQ questionnaire; CDQ010 supplies the label.
_CDQ_COLS = ['SEQN', 'CDQ010']


def _read_nhanes_table(data_dir, year, table, suffix):
    """Read ``<data_dir>/<year>/<table>_<suffix>.XPT`` into a DataFrame."""
    file_name = '{0}_{1}.XPT'.format(table, suffix)
    return pd.read_sas(os.path.join(data_dir, year, file_name))


def _build_cycle_features(data_dir, year, suffix, min_age):
    """Inner-join one cycle's feature tables onto its age-filtered demographics."""
    demo = _read_nhanes_table(data_dir, year, 'DEMO', suffix)
    features = demo.loc[demo.RIDAGEYR >= min_age, _DEMO_COLS]
    for table, cols in _FEATURE_TABLES:
        table_df = _read_nhanes_table(data_dir, year, table, suffix)
        features = features.merge(table_df[cols], on='SEQN')
    return features


def _select_by_cdq_answer(cycle_frames, answer, status):
    """Stack, across cycles, the rows whose CDQ010 equals ``answer``; tag them with ``status``."""
    selected = pd.concat([
        features.merge(cdq.loc[cdq.CDQ010 == answer, _CDQ_COLS], on='SEQN')
        for features, cdq in cycle_frames
    ])
    selected.loc[:, 'status'] = status
    return selected


def merge_nhanes_data(data_dir, min_age=40, test_size=0.2, random_state=289):
    """Merge the downloaded NHANES tables and write train/test CSV files.

    For each survey cycle, respondents with ``RIDAGEYR >= min_age`` are
    inner-joined on ``SEQN`` with the BPX, BMX, SMQ, DIQ and TCHOL tables.
    Rows whose ``CDQ010`` answer is 1 get ``status = 1`` and rows whose answer
    is 2 get ``status = 0``; rows with any other answer are dropped.  The
    ``CDQ010`` column itself is removed before the data is split and saved as
    ``data_train_cvd.csv`` and ``data_test_cvd.csv`` inside ``data_dir``.

    NOTE(review): the original comments call the two groups "Diagnosed
    Cardiovascular" and "No Cardiovascular"; confirm against the NHANES
    codebook that CDQ010 is the intended outcome variable.

    Args:
        data_dir: Directory holding the per-cycle sub-directories of XPT
            files; the CSV files are written here as well.
        min_age: Minimum ``RIDAGEYR`` (inclusive) for a respondent to be kept.
        test_size: Passed to ``train_test_split``.
        random_state: Seed passed to ``train_test_split``.

    Raises:
        FileNotFoundError: If an expected XPT file is missing.
    """
    # Load every cycle once: its merged feature frame plus its CDQ table.
    cycle_frames = []
    for year, suffix in _NHANES_CYCLES:
        features = _build_cycle_features(data_dir, year, suffix, min_age)
        cdq = _read_nhanes_table(data_dir, year, 'CDQ', suffix)
        cycle_frames.append((features, cdq))

    # CDQ010 == 1 -> positive class, CDQ010 == 2 -> negative class.
    diag_total = _select_by_cdq_answer(cycle_frames, answer=1, status=1)
    undiag_total = _select_by_cdq_answer(cycle_frames, answer=2, status=0)

    # Positives first, then negatives, with a fresh index; the label source
    # column is dropped so it cannot be used as a feature.
    df = pd.concat([diag_total, undiag_total], ignore_index=True)
    df = df.drop(['CDQ010'], axis=1)
    df_train, df_test = train_test_split(
        df, test_size=test_size, random_state=random_state)

    # Every float is written with one decimal place.
    fname_train = os.path.join(data_dir, 'data_train_cvd.csv')
    fname_test = os.path.join(data_dir, 'data_test_cvd.csv')
    df_train.to_csv(fname_train, index=False, float_format='%.1f')
    df_test.to_csv(fname_test, index=False, float_format='%.1f')

In [8]:
if __name__ == '__main__':
    # Run the pipeline end to end: download the raw XPT files into
    # ./data, then merge them into the train/test CSV files.
    data_dir = 'data'
    download_nhanes_data(data_dir=data_dir)
    merge_nhanes_data(data_dir=data_dir)