# Cleaned Data

In [1]:
import numpy as np
import pandas as pd
import os
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load datasets
demog = pd.read_sas('P_DEMO.xpt', format='xport')
med_hx = pd.read_sas('P_MCQ.xpt', format='xport')
biochem_panel = pd.read_sas('P_BIOPRO.xpt', format='xport')
HDL = pd.read_sas('P_HDL.xpt', format='xport')
tchol = pd.read_sas('P_TCHOL.xpt', format='xport')
trig = pd.read_sas('P_TRIGLY.xpt', format='xport')
smoking = pd.read_sas('P_SMQ.xpt', format='xport')
medications = pd.read_sas('P_RXQ_RX.xpt', format='xport')
body_meas = pd.read_sas('P_BMX.xpt', format='xport')
blood_pressure = pd.read_sas('P_BPXO.xpt', format='xport')
aspirin = pd.read_sas('P_RXQASA.xpt', format='xport')
cotinine = pd.read_sas('P_COT.xpt', format='xport')

# Process demographics
demog['Gender'] = demog['RIAGENDR'].map({1: 'Male', 2: 'Female'})
demog['Race_Eth'] = demog['RIDRETH3'].map({
    1: 'Mexican American', 2: 'Other Hispanic', 3: 'Non-Hispanic White',
    4: 'Non-Hispanic Black', 6: 'Non-Hispanic Asian', 7: 'Other Race'
})
education_map = {
    1: 'Less than 9th grade', 2: '9-11th grade', 3: 'High school graduate/GED or equivalent',
    4: 'Some college or AA degree', 5: 'College graduate or above'
}
demog['Education'] = demog['DMDEDUC2'].map(education_map)
demog['Age'] = demog['RIDAGEYR']
combined = demog[['SEQN', 'Gender', 'Education', 'Race_Eth', 'Age']]

# Add biochemistry data
biochem_panel['Creatinine'] = biochem_panel['LBXSCR']
biochem_panel['Glucose'] = biochem_panel['LBXSGL']
combined = combined.merge(biochem_panel[['SEQN', 'Creatinine', 'Glucose']], on='SEQN', how='left')

# Add cholesterol data
HDL['HDL'] = HDL['LBDHDD']
tchol['Tchol'] = tchol['LBXTC']
trig['Trig'] = trig['LBXTR']
trig['LDL'] = trig['LBDLDL']
combined = combined.merge(HDL[['SEQN', 'HDL']], on='SEQN', how='left')
combined = combined.merge(tchol[['SEQN', 'Tchol']], on='SEQN', how='left')
combined = combined.merge(trig[['SEQN', 'Trig', 'LDL']], on='SEQN', how='left')

# Add smoking data
smoking = smoking[~smoking['SMQ020'].isin([7, 9])]
smoking['Smoker'] = smoking['SMQ020'].replace(2, 0)
combined = combined.merge(smoking[['SEQN', 'Smoker']], on='SEQN', how='left')

# Process medications
medications['RXDDRGID'] = medications['RXDDRGID'].apply(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
medications['RXDDRGID'] = medications['RXDDRGID'].str.replace("b'", "").str.replace("'", "")
cholest_med_ids = ['d04851', 'd00348', 'd07637', 'd07110', 'd04787', 'd00280', 'd03183', 'd05348']
medications['Cholest_med'] = medications['RXDDRGID'].apply(lambda x: 1 if x in cholest_med_ids else 0)
combined = combined.merge(medications[['SEQN', 'Cholest_med']], on='SEQN', how='left')

# Add BMI and blood pressure
body_meas['BMI'] = body_meas['BMXBMI']
combined = combined.merge(body_meas[['SEQN', 'BMI']], on='SEQN', how='left')

blood_pressure['Systolic_blood_pressure'] = blood_pressure[['BPXOSY1', 'BPXOSY2', 'BPXOSY3']].mean(axis=1)
blood_pressure['Diastolic_blood_pressure'] = blood_pressure[['BPXODI1', 'BPXODI2', 'BPXODI3']].mean(axis=1)
combined = combined.merge(blood_pressure[['SEQN', 'Systolic_blood_pressure', 'Diastolic_blood_pressure']], on='SEQN', how='left')

# Add coronary disease target
med_hx = med_hx[~med_hx['MCQ160C'].isin([7, 9])]
med_hx['Coronary_dz'] = med_hx['MCQ160C'].replace(2, 0)
combined = combined.merge(med_hx[['SEQN', 'Coronary_dz']], on='SEQN', how='left')

# Add aspirin use
aspirin = aspirin[~aspirin[['RXQ510', 'RXQ515', 'RXQ520']].isin([7, 9]).any(axis=1)]
aspirin['RXQ510'] = aspirin['RXQ510'].replace(2, 0)
aspirin['RXQ515'] = aspirin['RXQ515'].replace([2, 4], 0)
aspirin['RXQ515'] = aspirin['RXQ515'].replace(3, 1)
aspirin['RXQ520'] = aspirin['RXQ520'].replace(2, 0)

aspirin['Asprin'] = 0
aspirin.loc[(aspirin['RXQ510'] == 1) & (aspirin['RXQ515'] == 1), 'Asprin'] = 1
aspirin.loc[(aspirin['RXQ510'] == 0) & (aspirin['RXQ520'] == 1), 'Asprin'] = 1

combined = combined.merge(aspirin[['SEQN', 'Asprin']], on='SEQN', how='left')

# Add cotinine
cotinine['Cotidine'] = cotinine['LBXCOT'] * cotinine['LBDCOTLC']
cotinine['Hydroxtcot'] = cotinine['LBXHCOT'] * cotinine['LBDHCOLC']
combined = combined.merge(cotinine[['SEQN', 'Cotidine', 'Hydroxtcot']], on='SEQN', how='left')

# Final cleaning
combined = combined.drop_duplicates(subset=['SEQN'])
combined = combined.dropna()

# Debugging output
print(combined.head())
print(combined.shape)

modeling, holdout = train_test_split(combined, test_size = .2, random_state = 42)
modeling.to_csv('modeling_data.csv', index=False)
holdout.to_csv('holdout_data.csv', index=False)

        SEQN  Gender                               Education  \
9   109271.0    Male                            9-11th grade   
16  109274.0    Male               Some college or AA degree   
34  109282.0    Male               College graduate or above   
56  109290.0  Female               College graduate or above   
61  109292.0    Male  High school graduate/GED or equivalent   

              Race_Eth   Age  Creatinine  Glucose   HDL  Tchol   Trig    LDL  \
9   Non-Hispanic White  49.0        0.78     95.0  33.0  147.0   84.0   97.0   
16          Other Race  68.0        0.74    153.0  29.0  105.0  133.0   49.0   
34  Non-Hispanic White  76.0        0.88     92.0  43.0  233.0  132.0  164.0   
56  Non-Hispanic Black  68.0        0.69     95.0  40.0  165.0  102.0  105.0   
61      Other Hispanic  58.0        0.95    175.0  52.0  172.0   90.0  102.0   

    Smoker  Cholest_med   BMI  Systolic_blood_pressure  \
9      1.0            0  29.7               107.000000   
16     0.0        

In [2]:
print(np.mean(combined['Coronary_dz']))

0.06221294363256785
