## Data Imputation 

## Stage 1: 
- preparing data 
- data imputation for 4 columns in the matrix 

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the cleaned survey table and discard the two columns that are not
# used in the rest of this notebook.
df = (
    pd.read_csv('../data/data_clean.csv', sep=',', encoding='latin-1')
    .drop(columns=['SEQN', 'PAQ706'])
)
# Quick sanity check (disabled):
# print(df.head(), df.columns, df.shape)


In [39]:
# Keep only the rows where BPXSY1 is present.
has_bpxsy1 = df['BPXSY1'].notna()
df = df.loc[has_bpxsy1]

In [40]:
# BPXSY1 is the column we regress on, so it is removed from the matrix
# that will be imputed.
df = df.drop(labels='BPXSY1', axis=1)

In [41]:
# Optional restriction to 4 columns (disabled):
# df = df[["BPXPLS", "BMXARMC", "bmi", "highbp" ]]

# Materialise the dataframe as an independent numpy array; missing entries become np.nan.
np_array = df.to_numpy(na_value=np.nan, copy=True)

To test our code, we first remove known values ourselves and then compare the imputed output against the true values.

## Stage 1A: 
- 1) Keep only complete rows (delete every row that has any missing value)
- 2) Drop values at random from this complete data
- 3) Apply low-rank model estimation
- 4) Evaluate the low-rank estimate against the true values

In [42]:
## nonmissing_data is our ground truth.
## MCAR stands for "missing completely at random"; MCAR_data is a masked copy
## of the ground truth used to evaluate the low rank model algorithm.

# Keep only the rows that contain no missing value at all.
nonmissing_data = np_array[~np.isnan(np_array).any(axis=1), :]

# Pick 10% of the cells (flat indices) to hide. A dedicated, seeded generator
# makes the mask reproducible without touching numpy's global random state,
# and replace=False guarantees the indices are distinct, so exactly
# size // 10 cells are masked.
MCAR_SEED = 0
mcar_rng = np.random.default_rng(MCAR_SEED)
MCAR_missing_indices = mcar_rng.choice(
    nonmissing_data.size,
    size=nonmissing_data.size // 10,
    replace=False)

# Work on a copy: assigning through an alias would also write the NaNs into
# nonmissing_data and destroy the ground truth needed for evaluation.
MCAR_data = nonmissing_data.copy()
MCAR_data.flat[MCAR_missing_indices] = np.nan
print(MCAR_data)

[[ 58.           1.           3.         ... 179.                  nan
   33.55912298]
 [ 57.           2.           2.         ... 235.           0.
   28.97823892]
 [ 47.                  nan   3.         ... 230.           0.
           nan]
 ...
 [         nan   2.           4.         ... 141.                  nan
   48.41515831]
 [ 57.           2.           4.         ... 241.           1.
   42.86189205]
 [ 43.           2.           4.         ... 169.           0.
   50.0250699 ]]


## Low Rank Estimation 

In [105]:
## FIX: remove the override below once MCAR_data is no longer being used
# np_array = MCAR_data

## Flat (row-major) positions of every observed, i.e. non-NaN, entry
observed_mask_flat = ~np.isnan(np_array).ravel()
nonmissing_indices = np.flatnonzero(observed_mask_flat)

## Initial matrix: -1 placeholder everywhere, then copy the observed values in
rating_matrix_ini = np.full(np_array.shape, -1.0)
rating_matrix_ini.flat[nonmissing_indices] = np_array.ravel()[nonmissing_indices]

# # TEST CODE START
# missing_indices = np.argwhere(np.isnan(np_array.flatten())).reshape(-1) 
# rating_matrix_ini.flat[missing_indices] = np_array.flat[missing_indices] 
# # TEST CODE END

In [34]:
## Check the appropriate rank to use for low rank model estimation,
## using the singular values of the data matrix.

# Only the singular values are inspected, so skip computing U and V^T
# (compute_uv=False); this avoids building the full square U matrix.
s = np.linalg.svd(rating_matrix_ini, compute_uv=False)
print(s)

[3.24470743e+05 8.55944427e+04 4.38254164e+04 2.10138892e+04
 1.41212796e+04 6.66776215e+03 5.09166685e+03 2.59218189e+03
 2.14330145e+03 1.99077694e+03 1.66213797e+03 1.54437376e+03
 1.19428899e+03 1.15317824e+03 8.39913670e+02 7.77087992e+02
 6.35420188e+02 6.21324486e+02 5.06482115e+02 4.64553669e+02
 4.26015033e+02 3.76748114e+02 3.16651728e+02 2.85359489e+02
 1.56742000e+02 1.47892998e+02 1.35060664e+02 1.31938322e+02
 1.22652903e+02 1.21844528e+02 1.00577686e+02 8.77100899e+01
 8.56284323e+01 7.93505094e+01 7.85512558e+01 7.24743930e+01
 6.37256555e+01 6.17804271e+01 5.75657841e+01 5.64669769e+01
 4.09189733e+01 4.02428844e+01 3.88830412e+01 3.78342540e+01
 3.24233069e+01 3.21801145e+01 2.96079139e+01 2.79941066e+01
 2.58129366e+01 2.36473595e+01 2.34407964e+01 2.28918533e+01
 1.95007530e+01 1.50569305e+01 1.40442709e+01 1.01069105e+01]


In [103]:
# Rank passed to fit_low_rank_model when the model is executed.
# NOTE(review): presumably picked from the singular-value spectrum printed above — confirm.
RANK = 14

Mean Imputation

In [104]:
## Starting point for the missing cells: the mean of their column
nan_mask = np.isnan(np_array)
col_mean = np.nanmean(np_array, axis=0)  # per-column mean, NaNs ignored
inds = np.nonzero(nan_mask)              # (row indices, column indices) of the missing cells

# The imputed values are written into rating_matrix_ini, not into np_array,
# so np_array keeps its NaNs.
rating_matrix_ini[inds] = col_mean[inds[1]]
print(rating_matrix_ini.shape)

(5111, 56)


Experimental Imputation

In [107]:
# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer, SimpleImputer, KNNImputer

# imputer = IterativeImputer(random_state=36, max_iter=50)
# imputed = imputer.fit(rating_matrix_ini)
# rating_matrix_ini = np.round(imputed.transform(rating_matrix_ini), 4)

# # imputer = KNNImputer(n_neighbors=2, weights="uniform")
# # rating_matrix_ini = imputer.fit_transform(rating_matrix_ini)

# # imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# # rating_matrix_ini = imputer.fit_transform(rating_matrix_ini)


In [108]:
# print(rating_matrix_ini)
# print(rating_matrix_ini.shape)

[[ 6.90000e+01  1.00000e+00  4.00000e+00 ...  1.67000e+02  1.22600e-01
   2.66838e+01]
 [ 5.40000e+01  1.00000e+00  3.00000e+00 ...  1.70000e+02 -2.47000e-02
   2.86325e+01]
 [ 7.20000e+01  1.00000e+00  3.00000e+00 ...  1.26000e+02  0.00000e+00
   2.89293e+01]
 ...
 [ 8.00000e+01  1.00000e+00  3.00000e+00 ...  1.57000e+02  8.70000e-03
   2.48902e+01]
 [ 2.60000e+01  1.00000e+00  2.00000e+00 ...  1.90000e+02  0.00000e+00
   2.44686e+01]
 [ 4.20000e+01  2.00000e+00  4.00000e+00 ...  1.52000e+02 -3.40000e-02
   3.40150e+01]]
(5111, 56)


Function to achieve low rank approximation

In [97]:
def fit_low_rank_model(rank, 
                       rating_matrix_ini, 
                       train_ind, 
                       train_data, 
                       n_iter, 
                       convergence_thresh, 
                       verbose, 
                       data1=None, 
                       missing_indices=None): 
    """Fit a low rank model by iterated truncated SVD.

    Each iteration writes the observed values back into the current
    estimate, takes its SVD and keeps only the leading `rank` components.

    Keyword arguments:
    rank -- number of singular components kept in the estimate
    rating_matrix_ini -- 2-D matrix providing the starting values of the
        cells that are not listed in train_ind (it is not modified)
    train_ind -- flat (row-major) indices of the observed cells
    train_data -- observed values, in the same order as train_ind
    n_iter -- the max number of iterations
    convergence_thresh -- stop as soon as the RMSE on the observed cells
        is less than or equal to this value
    verbose -- if true, print the error(s) at every iteration
    data1 -- optional matrix of true values, used only to report the error
        on the cells listed in missing_indices
    missing_indices -- flat indices of the cells compared against data1;
        required when data1 is given

    Returns:
    (low_rank_estimate, fitting_error) -- the estimate (same shape as
    rating_matrix_ini) and the RMSE on the observed cells after the last
    iteration that ran (NaN when n_iter is 0).

    Raises:
    ValueError -- if data1 is given without missing_indices
    """
    # `data1 == None` is an element-wise comparison for arrays and cannot be
    # used as a condition, so test identity instead.
    has_truth = data1 is not None
    if has_truth and missing_indices is None:
        raise ValueError("missing_indices is required when data1 is provided")
    if has_truth:
        data1 = np.asarray(data1)

    train_data = np.asarray(train_data, dtype=float)

    # Initialization: start from a private copy of the imputed matrix and
    # put the observed values in place.
    low_rank_estimate = np.array(rating_matrix_ini, dtype=float)
    low_rank_estimate.flat[train_ind] = train_data

    # Defined up front so the return statement is valid even if the loop
    # body never executes.
    fitting_error = np.nan

    for ind in range(n_iter):
        # Updates: restore the observed values, then project onto the
        # leading `rank` singular components.
        low_rank_estimate.flat[train_ind] = train_data
        # The reduced SVD is enough: only the first `rank` columns of u are used.
        u, s, vt = np.linalg.svd(low_rank_estimate, full_matrices=False)
        low_rank_estimate = np.matmul(u[:, :rank] * s[:rank], vt[:rank, :])

        # Compute error on the observed cells
        fitting_error = np.sqrt(((train_data - low_rank_estimate.flat[train_ind])**2).mean())
        if verbose:
            print("Iteration " + str(ind) + " Error: " + str(fitting_error))
            if has_truth:
                # true fitting error, compared to true values
                true_fitting_error = np.sqrt(
                    ((data1.flat[missing_indices] - low_rank_estimate.flat[missing_indices])**2).mean())
                print("Iteration " + str(ind) + " True Error: " + str(true_fitting_error)) 
                print() 

        # Stopping criterion
        if (fitting_error <= convergence_thresh):
            print('converged, breaking')
            break
    return low_rank_estimate, fitting_error



Execute Low Rank Approximation Function

In [109]:
# Settings for this run
rank = RANK
n_iter = 20
convergence_thresh = 1e-4
verbose = True

# Observed values, in the same order as nonmissing_indices
train_data = np_array.ravel()[nonmissing_indices]

# Only the estimate is kept; the final fitting error is discarded.
estimate, _ = fit_low_rank_model(
    rank=rank,
    rating_matrix_ini=rating_matrix_ini,
    train_ind=nonmissing_indices,
    train_data=train_data,
    n_iter=n_iter,
    convergence_thresh=convergence_thresh,
    verbose=verbose,
)

# TEST CODE START
# Loop to find a good rank 
# err_list = []
# n_iter = 2
# for i in range(1,20):
#     rank = i 
#     _, err = fit_low_rank_model(rank, 
#                                 rating_matrix_ini, 
#                                 nonmissing_indices, 
#                                 train_data, 
#                                 n_iter, 
#                                 convergence_thresh, 
#                                 verbose) 
#     err_list.append(err)

# plt.plot([i for i in range(1,20)], err_list)
# plt.show()
# TEST CODE END



Iteration 0 Error: 1.3378585556452987
Iteration 1 Error: 1.3323778394235657
Iteration 2 Error: 1.3303304924597297
Iteration 3 Error: 1.3290337855397316
Iteration 4 Error: 1.3279673024000405
Iteration 5 Error: 1.3270001476998807
Iteration 6 Error: 1.3260968158183424
Iteration 7 Error: 1.3252499516588006
Iteration 8 Error: 1.3244603342092043
Iteration 9 Error: 1.323729753773189
Iteration 10 Error: 1.323058552202019
Iteration 11 Error: 1.322445152970103
Iteration 12 Error: 1.321886404763047
Iteration 13 Error: 1.3213781540613017
Iteration 14 Error: 1.3209157777286389
Iteration 15 Error: 1.3204945821358975
Iteration 16 Error: 1.3201100620650705
Iteration 17 Error: 1.3197580463016059
Iteration 18 Error: 1.3194347633383594
Iteration 19 Error: 1.3191368559715937


In [110]:
# Persist the low rank estimate in numpy's binary .npy format.
np.save('lowrank_estimate.npy', estimate)