In [24]:
import os
import glob
import pandas as pd

cwd = os.getcwd()
data_folder = os.path.join(cwd, "Data")
xpt_files = glob.glob(os.path.join(data_folder, "*.xpt"))

xpt_files

[]

In [None]:
dfs = []
for file in xpt_files:
    df = pd.read_sas(file, format='xport')
    if 'SEQN' in df.columns:
        # Drop duplicate SEQN rows, keep first occurrence
        df = df.drop_duplicates(subset='SEQN')

        # Set SEQN as index
        df = df.set_index('SEQN')
        dfs.append(df)

# Concatenate along columns matching SEQN index
merged_df = pd.concat(dfs, axis=1, join='outer')

# Optional reset index for SEQN as column
merged_df = merged_df.reset_index()

print(merged_df.shape)

In [19]:
merged_df.columns

Index(['SEQN', 'WTPH2YR', 'LBXAGP', 'BAXMSTAT', 'BAXRXNC', 'BAXRXND',
       'BAX5STAT', 'BAQ110', 'BAQ121', 'BAQ125',
       ...
       'URXPREG', 'WTPH2YR', 'LBXVIDMS', 'LBDVIDLC', 'LBXVD2MS', 'LBDVD2LC',
       'LBXVD3MS', 'LBDVD3LC', 'LBXVE3MS', 'LBDVE3LC'],
      dtype='object', length=773)

In [11]:
# Removing highly specific columns, HPV type, specific tooth missing, sample weights, metadata
drop_cols = ['ORXGH', 'ORXGL', 'ORXH06', 'ORXH11', 'ORXH16', 'ORXH18', 'ORXH26', 'ORXH31', 'ORXH33', 'ORXH35', 'ORXH39', 'ORXH40', 'ORXH42', 'ORXH45', 'ORXH51', 'ORXH52', 'ORXH53', 
             'ORXH54', 'ORXH55', 'ORXH56', 'ORXH58', 'ORXH59', 'ORXH61', 'ORXH62', 'ORXH64', 'ORXH66', 'ORXH67', 'ORXH68', 'ORXH69', 'ORXH70', 'ORXH71', 'ORXH72', 'ORXH73', 
             'ORXH81', 'ORXH82', 'ORXH83', 'ORXH84', 'ORXHPC', 'ORXHPI','OHX01TC', 'OHX02TC', 'OHX03TC', 'OHX04TC', 'OHX05TC', 'OHX06TC', 'OHX07TC', 'OHX08TC', 'OHX09TC', 
             'OHX10TC', 'OHX11TC', 'OHX12TC', 'OHX13TC', 'OHX14TC', 'OHX15TC', 'OHX16TC', 'OHX17TC', 'OHX18TC', 'OHX19TC', 'OHX20TC', 'OHX21TC', 'OHX22TC', 'OHX23TC', 'OHX24TC', 
             'OHX25TC', 'OHX26TC', 'OHX27TC', 'OHX28TC', 'OHX29TC', 'OHX30TC', 'OHX31TC', 'OHX32TC', 'OHX02CTC', 'OHX03CTC', 'OHX04CTC', 'OHX05CTC', 'OHX06CTC', 'OHX07CTC', 
             'OHX08CTC', 'OHX09CTC', 'OHX10CTC', 'OHX11CTC', 'OHX12CTC', 'OHX13CTC', 'OHX14CTC', 'OHX15CTC', 'OHX18CTC', 'OHX19CTC', 'OHX20CTC', 'OHX21CTC', 'OHX22CTC', 
             'OHX23CTC', 'OHX24CTC', 'OHX25CTC', 'OHX26CTC', 'OHX27CTC', 'OHX28CTC', 'OHX29CTC', 'OHX30CTC', 'OHX31CTC','DMDHRAGE','WTDR2D','WTINT2YR','WTMEC2YR',
             'PHAFSTMN.x','SEQN','RIDSTATR'
            ]
merged_df.drop(drop_cols, axis=1, inplace=True, errors='ignore')


In [6]:
merged_df["SystolicBP"]=(merged_df["BPXOSY1"]+merged_df["BPXOSY2"]+merged_df["BPXOSY3"])/3
merged_df["DiastolicBP"]=(merged_df["BPXODI1"]+merged_df["BPXODI2"]+merged_df["BPXODI3"])/3
merged_df.drop(["BPXDI1","BPXDI2","BPXDI3","BPXSY1","BPXSY2","BPXSY3","MGXH1T1","MGXH1T2","MGXH1T3","MGXH2T1","MGXH2T2","MGXH2T3"],axis=1,inplace=True)
merged_df

KeyError: 'BPXSY1'

In [18]:
# Remove columns with more than 70% missing values
threshold = 0.7
merged_df = merged_df.loc[:, merged_df.isnull().mean() <= threshold]

In [19]:
merged_df.shape

(11933, 580)