# Missing Data Exploration

## Step 1: Load dataset and examine

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
# Load the cleaned dataset (comma-separated, Latin-1 encoded) into `df`.
df = pd.read_csv(
    '../data/data_clean.csv',
    encoding='latin-1',
    sep=',',
)
# For a quick look at the frame, inspect df.head(), df.columns and df.shape.

In [None]:
# Display per-column summary statistics for the loaded frame.
df.describe()

Remove PAQ706, since all of its values are missing. Examine the missingness of the variables in each column. Remove columns CDQ001, CDQ010, DIQ070, DBD100 and highLDL, since each has more than 1000 missing values.

In [2]:
# Columns removed for missingness: PAQ706 has no observed values, and the
# other five each have more than 1000 missing values.
df = df.drop(columns=[
    'PAQ706',
    'CDQ001', 'CDQ010', 'DIQ070', 'DBD100', 'highLDL',
])

In [None]:
# Number of missing values in each remaining column.
df.isnull().sum(axis=0)

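Missingness of diet information: if DR1TKCAL is missing, are all other diet variables missing as well? Yes — among the rows where DR1TKCAL is present, none of the other diet variables is missing. These are not, however, the same rows that are missing DIQ170 data.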

In [3]:
# Restrict to rows with a recorded DR1TKCAL value, then count what is still
# missing in every column of that subset.
df_temp = df.loc[df['DR1TKCAL'].notna()]
df_temp.isnull().sum()

SEQN          0
RIDAGEYR      0
RIAGENDR      0
RIDRETH1      0
RIDRETH3      0
DMDCITZN     10
DMDEDUC2      3
MIALANG     206
DMDHHSIZ      0
INDHHIN2    217
INDFMIN2    207
INDFMPIR    361
BMXLEG      170
BMXARML     123
BMXARMC     120
BMXWAIST    165
BMDAVSAD    315
BPXPLS      106
BPXPULS     105
BPXSY1      364
BPXDI1      364
DIQ010        0
DIQ160      788
DIQ170      630
DIQ172      630
DIQ180      630
DIQ050        1
DBQ095Z       0
DRQSPREP      0
DR1STY        0
DRQSDIET      0
DR1TKCAL      0
DR1TPROT      0
DR1TCARB      0
DR1TSUGR      0
DR1TFIBE      0
DR1TTFAT      0
DR1TSFAT      0
DR1TMFAT      0
DR1TPFAT      0
DR1TCHOL      0
DR1TSODI      0
DR1TALCO      0
DR1_320Z      0
LBDHDD      182
HIQ011        0
PAQ635        0
PAQ650        0
PAQ665        0
PAD680       22
PAQ710        1
LBXTC       182
bmi          37
dtype: int64

Examine whether the missingness of diet information is related to other columns. Is it missing completely at random (MCAR)?

In [None]:
# Flag the rows whose diet record is missing, using DR1TKCAL as the marker:
# True where DR1TKCAL is NaN, False everywhere else (boolean column).
df["diet_missing"] = df["DR1TKCAL"].isna()

Is missing diet information related to health insurance status?

In [None]:
# Rows with missing diet data, counted per HIQ011 category.
df.loc[df["diet_missing"]].groupby("HIQ011")["diet_missing"].count()


In [None]:
# Rows with diet data present, counted per HIQ011 category.
df.loc[~df["diet_missing"]].groupby("HIQ011")["diet_missing"].count()

In [None]:
# 2x2 contingency table: diet-data missingness (rows) by HIQ011 category (columns).
# The counts are hard-coded; presumably row 0 is "diet missing" and row 1 is
# "diet present", taken from the two groupby cells above. Re-check them if
# the data changes.
diet_HI_table = [
    [567, 154],
    [3976, 1066],
]

# Chi-squared test of independence. chi2_contingency is already imported in
# the notebook's first cell, so it is not re-imported here.
chi2, p, dof, ex = chi2_contingency(diet_HI_table)

# p > 0.05, therefore we assume missingness is independent of HIQ011.
print(p)


Is missing diet information related to gender?

In [None]:
# Rows with diet data present, counted per RIAGENDR category.
# (Remove the `~` to count the rows with missing diet data instead.)
df.loc[~df["diet_missing"]].groupby("RIAGENDR")["diet_missing"].count()


In [None]:

# 2x2 contingency table: diet-data missingness (rows) by RIAGENDR category (columns).
# The counts are hard-coded; presumably row 0 is "diet missing" and row 1 is
# "diet present". Re-check them against the groupby output if the data changes.
diet_gen_table = [
    [344, 378],
    [2414, 2633],
]

# Chi-squared test of independence. chi2_contingency is already imported in
# the notebook's first cell, so it is not re-imported here.
chi2, p, dof, ex = chi2_contingency(diet_gen_table)

# p > 0.05, therefore we assume missingness is independent of RIAGENDR.
print(p)

## FIXME: want to test missingness related to LBDHDD (the cells below test PAQ650 instead)

In [None]:
# Rows with diet data present, counted per PAQ650 category.
# (Remove the `~` to count the rows with missing diet data instead.)
df.loc[~df["diet_missing"]].groupby("PAQ650")["diet_missing"].count()

In [None]:
# 2x2 contingency table: diet-data missingness (rows) by PAQ650 category (columns).
# The counts are hard-coded. NOTE: judging by the row totals, the row order here
# is presumably reversed relative to the HIQ011 and RIAGENDR tables (row 0 =
# "diet present", row 1 = "diet missing"). The test result does not depend on
# row order.
diet_paq_table = [
    [1148, 3899],
    [114, 608],
]

# Chi-squared test of independence. chi2_contingency is already imported in
# the notebook's first cell, so it is not re-imported here.
chi2, p, dof, ex = chi2_contingency(diet_paq_table)

# p < 0.05, therefore we assume missingness is not independent of PAQ650.
print(p)


## Step 2: Create patterns of missingness

In [44]:
# Variables used to build the dataset for the missingness simulation.
sim_columns = [
    'SEQN', 'RIDAGEYR', 'RIAGENDR', 'RIDRETH1', 'RIDRETH3',
    'DMDCITZN', 'DMDEDUC2', 'BMXLEG', 'BPXPULS',
    'DIQ010', 'DIQ050', 'HIQ011', 'PAQ635',
    'PAQ650', 'PAQ665', 'PAD680', 'PAQ710', 'DR1TKCAL',
]
df_nonmiss = df[sim_columns]

# NumPy copy of the selection, with missing entries represented as NaN.
np_nonmiss = df_nonmiss.to_numpy(copy=True, na_value=np.nan)

# Complete-case filter: drop every row that contains at least one NaN.
row_has_nan = np.isnan(np_nonmiss).any(axis=1)
np_nonmiss = np_nonmiss[~row_has_nan]

# Column 0 is SEQN: keep it aside to identify the retained rows, then keep
# only the 17 remaining variables in the simulation array.
seqn = np_nonmiss[:, 0]
np_nonmiss = np_nonmiss[:, 1:18]
print(np_nonmiss.shape)

(4756, 17)


In [10]:
# Candidate proportions of missing values, one per tenth from 0.1 to 0.6.
# They are built by float multiplication, so they are not exactly rounded
# (the third entry is 0.30000000000000004).
pm = list(map(lambda tenths: tenths * 0.1, range(1, 7)))

# Proportion of missingness used by the simulation cells below.
# NOTE: the chi-squared cells earlier in the notebook also assign `p`; if any
# of them is re-run, run this cell again before the simulation cells.
p = 0.4

In [34]:
#MCAR

# Missing completely at random: every cell of the array has the same chance of
# being blanked, independent of any observed or unobserved value.

# Work on a copy. Plain assignment would only alias np_nonmiss, so the NaNs
# written below would also corrupt the complete-case array that the other
# simulation cells start from.
MCAR_data = np_nonmiss.copy()

# Draw flat cell positions WITHOUT replacement so that exactly int(size * p)
# distinct cells are blanked. Sampling with replacement repeats positions and
# leaves fewer cells missing than the requested proportion.
n_cells = MCAR_data.size
MCAR_missing_indices = np.random.choice(
    n_cells,
    size=int(n_cells * p),
    replace=False)
MCAR_data.flat[MCAR_missing_indices] = np.nan



[ 512   45  939  442  269  272  660  553  598  445  763  609  561   11
  829  952 1041  764 1016  715  896  150  125  598  177   24  846  898
  121   33   90  248   13  530 1008   82  586  719  504  799  273  555
  316  186  943  323  295  634  507  362  736 1086  322  142  236 1039
 1108  797   24  959  878  969  167  665  194  679  925  279   99  290
  470  502  355  231  771  240  606 1034 1040  273 1061  190  129  849
  599  869 1108  426  934  626  612  663    5  417  481  316  195  694
  614  709   76   24  222 1099  269  257  385  306  503 1004  214    6
  919  125  326  330  828  730  150   92  698  864  797  911 1084  551
  556  491 1025  117  828   35  234  895  766  541  419  921   24  593
 1005  913 1015 1057  545  624   84  513  553 1060]
nan


In [None]:
#MAR

# When data are MAR, the fact that the data are missing is systematically related to the observed
# but not the unobserved data. Here DR1TKCAL (col 16) is blanked at a rate that depends on the
# observed PAQ650 value (col 12).

# Copy, so that np_nonmiss itself stays complete (plain assignment only aliases it).
MAR_data = np_nonmiss.copy()

#determine row indices for each PAQ650 value
PAQ1_ind = np.where(np_nonmiss[:, 12] == 1) #12 col is PAQ650
PAQ2_ind = np.where(np_nonmiss[:, 12] == 2)

#randomly select row indices for missing values
# Sample from the index arrays themselves (not from range(len(...))), so the
# chosen rows really belong to the intended PAQ650 group. Sample without
# replacement so the requested number of distinct rows is blanked.
MAR_missing_paq1 = np.random.choice(
    PAQ1_ind[0],
    size=int(round(len(PAQ1_ind[0]) * p / 3)),
    replace=False)
MAR_missing_paq2 = np.random.choice(
    PAQ2_ind[0],
    size=int(round(len(PAQ2_ind[0]) * 2 * p / 3)), #more PAQ=2 are missing
    replace=False)

# np.append / np.concatenate return a new array, so the result must be assigned;
# otherwise the PAQ650 == 2 selection is silently dropped.
MAR_missing_diet_ind = np.concatenate([MAR_missing_paq1, MAR_missing_paq2])

#remove those values from dataset
MAR_data[MAR_missing_diet_ind, 16] = np.nan #16 col is DR1TKCAL


In [63]:
#MNAR

# When data are MNAR, the fact that the data are missing is systematically related to an unobserved data
# eg. DR1TKCAL may be related to bmi (unobserved in our sample subset)

# Copy, so that np_nonmiss itself stays complete (plain assignment only aliases it).
MNAR_data = np_nonmiss.copy()

#find bmi values
bmi = df.loc[df['SEQN'].isin(seqn)]['bmi'] #limit to subset in seqn, select col bmi
bmi = bmi.to_numpy(copy=True, na_value=np.nan)

# Positions in `bmi` are used below as row indices into MNAR_data, so both must
# describe the same rows in the same order (assumes SEQN is unique in df).
assert len(bmi) == len(np_nonmiss), "bmi rows do not line up with np_nonmiss rows"

# Rows whose bmi is NaN or exactly 25 fall into neither group and are never blanked.
#high bmi
hibmi = (bmi > 25)
#healthy bmi
hebmi = (bmi < 25)

#determine row indices with high or healthy bmi values
hibmi_ind = np.where(hibmi)[0]
hebmi_ind = np.where(hebmi)[0]

#randomly select row indices for missing values
# Sample from the index arrays themselves (not from range(len(...))), so the
# chosen rows really belong to the intended bmi group. Sample without
# replacement so the requested number of distinct rows is blanked.
MNAR_missing_hibmi = np.random.choice(
    hibmi_ind,
    size=int(round(len(hibmi_ind) * 2 * p / 3)), #oversample hi bmi
    replace=False)
MNAR_missing_hebmi = np.random.choice(
    hebmi_ind,
    size=int(round(len(hebmi_ind) * p / 3)),
    replace=False)

# np.append / np.concatenate return a new array, so the result must be assigned;
# otherwise the healthy-bmi selection is silently dropped.
MNAR_missing_diet_ind = np.concatenate([MNAR_missing_hibmi, MNAR_missing_hebmi])

#remove those values from dataset
MNAR_data[MNAR_missing_diet_ind, 16] = np.nan #16 col is DR1TKCAL


[[6.900e+01 1.000e+00 4.000e+00 ... 6.000e+02 2.000e+00       nan]
 [5.400e+01 1.000e+00 3.000e+00 ... 5.400e+02 4.000e+00       nan]
 [7.200e+01 1.000e+00 3.000e+00 ... 3.000e+02 4.000e+00       nan]
 ...
 [8.000e+01 1.000e+00 3.000e+00 ... 3.600e+02 2.000e+00 2.432e+03]
 [2.600e+01 1.000e+00 2.000e+00 ... 6.000e+02 2.000e+00 4.687e+03]
 [4.200e+01 2.000e+00 4.000e+00 ... 7.200e+02 5.000e+00 2.475e+03]]


## Step 3: Examine patterns of missingness on SVD imputation accuracy