In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [2]:
from sklearn.impute import SimpleImputer 
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer 
from sklearn.impute import KNNImputer 

In [3]:
df = pd.read_csv('../data/data_clean.csv', sep=',', encoding='latin-1')
df = df.drop(columns=['PAQ706'])
df = df.drop(columns = ['CDQ001', 'CDQ010', 'DIQ070', 'DBD100', 'highLDL'])

#variable choice to create dataset for missignness simulation
df_nonmiss = df[['SEQN','RIDAGEYR', 'RIAGENDR', 'RIDRETH1', 'RIDRETH3',
                 'DMDCITZN', 'DMDEDUC2', 'BMXLEG', 'BPXPULS',
                 'DIQ010', 'DIQ050', 'HIQ011', 'PAQ635', 
                 'PAQ650', 'PAQ665','PAD680', 'PAQ710', 'DR1TKCAL']]

cont_cols = ['RIDAGEYR', 'BMXLEG', 'DR1TKCAL', 'PAD680']

#stash col mean, std in order to determine missingness in MAR and MNAR patterns
paq650mean = df_nonmiss['PAQ650'].mean()
paq650std = df_nonmiss['PAQ650'].std()

# Normalize continuous columns to standard Normal 
for cont_col in cont_cols: 
    df_nonmiss[cont_col] = (df_nonmiss[cont_col] - df_nonmiss[cont_col].mean()) / df_nonmiss[cont_col].std() 


np_nonmiss = df_nonmiss.to_numpy(copy=True, na_value=np.nan) # Convert dataframe to numpy array 
np_nonmiss = np_nonmiss[~np.isnan(np_nonmiss).any(axis=1), :] # delete rows with any missing values

seqn = np_nonmiss[:, 0] #record seqn for rows in df_nonmiss
np_nonmiss = np_nonmiss[:, 1:18] #remove seqn
#print(np.shape(np_nonmiss))
print(np_nonmiss)

[[ 1.13241831  1.          4.         ...  0.90678608  2.
  -0.52543534]
 [ 0.27835981  1.          3.         ...  0.60402997  4.
   2.84293864]
 [ 1.30323001  1.          3.         ... -0.60699445  4.
  -0.36223144]
 ...
 [ 1.75872788  1.          3.         ... -0.30423834  2.
   0.3031383 ]
 [-1.31588273  1.          2.         ...  0.90678608  2.
   2.48079981]
 [-0.40488699  2.          4.         ...  1.51229829  5.
   0.34466356]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nonmiss[cont_col] = (df_nonmiss[cont_col] - df_nonmiss[cont_col].mean()) / df_nonmiss[cont_col].std()


In [4]:
#proportion missing
p = 0.1 

#MCAR
MCAR_data = np.copy(np_nonmiss) #init data
MCAR_missing_indices = np.random.choice( # randomly select values from length ie flat array 
    len(np_nonmiss.flatten()), 
    size=int(len(np_nonmiss.flatten())*p)) 

MCAR_data.flat[MCAR_missing_indices] = np.nan 

In [15]:
MNAR_data = np.copy(np_nonmiss) #init data

#find bmi values
bmi = df.loc[df['SEQN'].isin(seqn)]['bmi'] #limit to subset in seqn, select col bmi
bmi = bmi.to_numpy(copy=True, na_value=np.nan)

#high bmi
hibmi = (bmi > 25) #def of obesity
#health bmi
hebmi = (bmi < 25)

#determine indicies with hi or low bmi values
hibmi_ind = np.where(hibmi)[0]
hebmi_ind = np.where(hebmi)[0]

#randomly select indices for missing values
MNAR_missing_diet_ind = np.random.choice(len(hibmi_ind),
                                        size = round(len(hibmi_ind)*2*p/3)) #oversample hi bmi
np.append(MNAR_missing_diet_ind, 
          np.random.choice(len(hebmi_ind),
                           size = round(len(hebmi_ind)*p/3))) 


#remove those values from dataset
for i in MNAR_missing_diet_ind: 
    MNAR_data[i, 16] = np.nan #16 col is DR1TKCAL

#print(MNAR_data[:,16].tolist())


In [6]:
#MAR

# When data are MAR, the fact that the data are missing is systematically related to the observed
# but not the unobserved data, eg. related to age 

MAR_data = np.copy(np_nonmiss) #init data

#determine indicies with PAQ values
PAQ1_ind = np.where(np_nonmiss[:, 12] == 1) #12 col is PAQ650
PAQ2_ind = np.where(np_nonmiss[:, 12] == 2)

#randomly select indices for missing values
MAR_missing_diet_ind = np.random.choice(len(PAQ1_ind[0]),
                                        size = round(len(PAQ1_ind[0])*p/3))
np.append(MAR_missing_diet_ind, 
          np.random.choice(len(PAQ2_ind[0]),
                           size = round(len(PAQ2_ind[0])*2*p/3))) #more PAQ=2 are missing

#remove those values from dataset
for i in MAR_missing_diet_ind: 
    MAR_data[i, 16] = np.nan #16 col is DR1TKCAL


In [7]:
def run(data, missing_indices, method): 
    if method in ['mean', 'most_frequent', 'median']: 
        imp = SimpleImputer(missing_values=np.nan, strategy=method)
        imputed_data = imp.fit_transform(data)
        
    if method == "multivariate_feature_imputation": 
        imp = IterativeImputer(max_iter=5, random_state=0) 
        imputed_data = imp.fit_transform(data) 

    if method == "knn": 
        imputer = KNNImputer(n_neighbors=2, weights="uniform") 
        imputed_data = imputer.fit_transform(data)
    
    # print(imputed_data[:3,:3])         
    print("MSE = ", mean_squared_error(
        np_nonmiss.flat[missing_indices], 
        imputed_data.flat[missing_indices])) 

    

In [8]:
data_dict = {
    "MCAR_data":[MCAR_data, MCAR_missing_indices], 
    "MAR_data":[MAR_data, MAR_missing_diet_ind], 
    "MNAR_data":[MNAR_data, MNAR_missing_diet_ind], 
}

In [9]:
for key in data_dict.keys(): 
    for method in ['mean', 'median', 'most_frequent', 'multivariate_feature_imputation', 'knn']: 
        print("Data: ", key, '\t method:', method)
        run(data_dict[key][0], data_dict[key][1], method)
        print() 
    print("=============================================")

Data:  MCAR_data 	 method: mean
MSE =  0.8143861420942045

Data:  MCAR_data 	 method: median
MSE =  0.9021684450674861

Data:  MCAR_data 	 method: most_frequent
MSE =  1.0989800205788496

Data:  MCAR_data 	 method: multivariate_feature_imputation




MSE =  0.5418165122022942

Data:  MCAR_data 	 method: knn
MSE =  0.8495703776718395

Data:  MAR_data 	 method: mean
MSE =  0.0

Data:  MAR_data 	 method: median
MSE =  0.0

Data:  MAR_data 	 method: most_frequent
MSE =  0.0

Data:  MAR_data 	 method: multivariate_feature_imputation
MSE =  0.0

Data:  MAR_data 	 method: knn
MSE =  0.0

Data:  MNAR_data 	 method: mean
MSE =  0.00542730207698105

Data:  MNAR_data 	 method: median
MSE =  0.0037867525372867993

Data:  MNAR_data 	 method: most_frequent
MSE =  0.0022568687248694197

Data:  MNAR_data 	 method: multivariate_feature_imputation
MSE =  0.002222576298046997

Data:  MNAR_data 	 method: knn
MSE =  0.00017895805941809065

