## Data Imputation 

## Stage 1: 
- preparing data 
- data imputation for 4 columns in the matrix 

In [1]:
import pandas as pd
import numpy as np

# Load the cleaned dataset and discard the PAQ706 column in one step.
df = pd.read_csv('../data/data_clean.csv', encoding='latin-1').drop(columns=['PAQ706'])

# While experimenting, the frame can be narrowed to a 4-column subset, e.g.
# df[["BPXPLS", "BMXARMC", "bmi", "highbp"]].

# Independent NumPy copy of the frame; missing entries are represented as NaN.
np_array = df.to_numpy(na_value=np.nan, copy=True)

To test our code, we first delete known values ourselves, impute them, and compare the imputed output against the true values.

## Stage 1A: 
- 1) Considering data with only non missing values (delete rows with any missing values)
- 2) Dropping values at random from this data 
- 3) Applying Low Rank Model estimation 
- 4) Evaluating the low rank estimate using true values 

In [2]:
## nonmissing_data is our ground truth (only the rows of np_array without any NaN).
## MCAR stands for Missing Completely At Random; MCAR_data is a masked copy of the
## ground truth used to evaluate the low rank model algorithm.

# Seeded generator so the same entries are masked on every Restart & Run All.
rng = np.random.default_rng(0)

nonmissing_data = np_array[~np.isnan(np_array).any(axis=1), :]  # drop rows with any missing value

# Pick 10% of the flat positions WITHOUT replacement, so exactly that many
# distinct entries are masked (sampling with replacement yields duplicates).
MCAR_missing_indices = rng.choice(
    nonmissing_data.size,
    size=nonmissing_data.size // 10,
    replace=False)

# Work on a copy: a plain assignment would alias nonmissing_data and the NaNs
# written below would overwrite the ground truth as well.
MCAR_data = nonmissing_data.copy()
MCAR_data.flat[MCAR_missing_indices] = np.nan
print(MCAR_data)

[[7.36880000e+04 5.80000000e+01 1.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 3.35591230e+01]
 [           nan 5.70000000e+01 2.00000000e+00 ... 0.00000000e+00
             nan 2.89782389e+01]
 [7.44210000e+04 4.70000000e+01 1.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 3.35804853e+01]
 ...
 [8.28570000e+04            nan 2.00000000e+00 ... 0.00000000e+00
  1.00000000e+00            nan]
 [8.31000000e+04 5.70000000e+01            nan ... 1.00000000e+00
  1.00000000e+00 4.28618920e+01]
 [8.33760000e+04 4.30000000e+01 2.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 5.00250699e+01]]


## Low Rank Estimation 

In [3]:
## FIX: delete this line when no longer working with MCAR_data
# np_array = MCAR_data

## Flat (row-major) positions of the observed, i.e. non-NaN, entries
observed_mask = ~np.isnan(np_array)
nonmissing_indices = np.flatnonzero(observed_mask)

## Initial matrix: observed values are kept, missing entries become zero
rating_matrix_ini = np.where(observed_mask, np_array, 0.0)

In [4]:
## Check which rank is appropriate for the low rank model estimation,
## using the singular values of the zero-imputed data matrix.
## Only the singular values are needed, so U and V^T are not computed
## (this also avoids assigning to `_`, IPython's last-output variable).

s = np.linalg.svd(rating_matrix_ini, compute_uv=False)
print(s)


[5.98647588e+06 1.80680717e+05 8.69177642e+04 4.59628057e+04
 4.38329545e+04 1.45712498e+04 5.30025010e+03 5.02203905e+03
 2.50712961e+03 2.22409090e+03 2.06990511e+03 1.71983149e+03
 1.42890799e+03 1.31522513e+03 1.20736989e+03 8.25271011e+02
 6.64592924e+02 6.51872088e+02 5.28047958e+02 4.86550581e+02
 4.73797660e+02 3.90615986e+02 3.29531058e+02 3.07265875e+02
 1.68065828e+02 1.46440192e+02 1.32163852e+02 1.24109379e+02
 1.15063105e+02 1.11460499e+02 1.04225679e+02 8.39265952e+01
 7.92520488e+01 7.73810572e+01 7.06542126e+01 6.65199521e+01
 6.41531250e+01 5.63958088e+01 5.52050926e+01 5.17119836e+01
 3.92930754e+01 3.77481293e+01 3.50814867e+01 3.38413187e+01
 3.21135450e+01 3.14121557e+01 2.92378566e+01 2.73760266e+01
 2.66489676e+01 2.50650814e+01 2.34772835e+01 2.19209149e+01
 1.81732904e+01 1.61398359e+01 1.52313684e+01 1.39468281e+01
 1.36175005e+01]


In [5]:
# Rank passed to fit_low_rank_model, i.e. the number of singular components kept.
# NOTE(review): presumably chosen from the singular-value spectrum printed in the
# previous cell — confirm the rationale for 6.
RANK = 6 

In [6]:
## Replace every NaN in np_array (in place) with the mean of its column
## NOTE(review): this mutates np_array, so nonmissing_indices has to be computed
## before this cell runs; afterwards np_array no longer contains any NaN.
nan_mask = np.isnan(np_array)
col_mean = np.nanmean(np_array, axis=0)
# Column index of each NaN, in the same row-major order used by the masked assignment
nan_columns = np.nonzero(nan_mask)[1]
np_array[nan_mask] = col_mean[nan_columns]

In [7]:
def fit_low_rank_model(rank, 
                       rating_matrix_ini, 
                       train_ind, 
                       train_data, 
                       n_iter, 
                       convergence_thresh, 
                       verbose, 
                       data1=None, 
                       missing_indices=None): 
    """Fit a low rank model by iterative truncated-SVD imputation.

    Each iteration re-imposes the observed values on the current estimate,
    takes its SVD and keeps only the leading `rank` singular components.
    Returns the low rank estimate, an array with the same shape as
    `rating_matrix_ini`.

    Keyword arguments:
    rank -- number of singular components to keep
    rating_matrix_ini -- imputed initialization (2-D array without NaN)
    train_ind -- flat indices of the observed (training) entries
    train_data -- observed values, in the same order as train_ind
    n_iter -- the max number of iterations
    convergence_thresh -- stop once the RMSE on the observed entries is
        at or below this value
    verbose -- if true, print the error(s) after every iteration
    data1 -- optional array of true values used only for reporting
    missing_indices -- flat indices into data1 at which the "true" error is
        measured; required when data1 is given

    Raises:
    ValueError -- if data1 is given without missing_indices
    """
    # `data1 == None` on an ndarray is element-wise and cannot be used as a
    # condition, so the optional argument is tested by identity instead.
    track_truth = data1 is not None
    if track_truth:
        if missing_indices is None:
            raise ValueError("missing_indices is required when data1 is given")
        truth = np.asarray(data1)

    # Start from a float copy of the imputed matrix and impose the observed
    # values on it. Unobserved entries keep their imputed value, and a genuine
    # zero in the training data is never mistaken for a missing entry.
    low_rank_estimate = np.array(rating_matrix_ini, dtype=float)
    low_rank_estimate.flat[train_ind] = train_data

    for ind in range(n_iter):
        # Re-impose the observed values, then project onto rank-`rank` matrices
        low_rank_estimate.flat[train_ind] = train_data
        # Thin SVD: the leading components are the same as in the full SVD,
        # but no (n_rows x n_rows) U matrix is built.
        u, s, vt = np.linalg.svd(low_rank_estimate, full_matrices=False)
        low_rank_estimate = (u[:, :rank] * s[:rank]) @ vt[:rank, :]

        # RMSE between the observed values and the estimate at those positions
        fitting_error = np.sqrt(((train_data - low_rank_estimate.flat[train_ind])**2).mean())
        if verbose:
            print("Iteration " + str(ind) + " Error: " + str(fitting_error))
            if track_truth:
                # RMSE against the true values at the held-out positions
                true_fitting_error = np.sqrt(
                    ((truth.flat[missing_indices] - low_rank_estimate.flat[missing_indices])**2).mean())
                print("Iteration " + str(ind) + " True Error: " + str(true_fitting_error)) 
                print() 

        # Stopping criterion
        if fitting_error <= convergence_thresh:
            print('converged, breaking')
            break
    return low_rank_estimate

# Settings for this run
n_iter = 5
convergence_thresh = 1e-4
verbose = True
rank = RANK

# Observed values, in the same order as nonmissing_indices
train_data = np_array.flat[nonmissing_indices]

# MCAR evaluation variant (disabled): additionally pass nonmissing_data and
# MCAR_missing_indices as data1 / missing_indices.

estimate = fit_low_rank_model(
    rank=rank,
    rating_matrix_ini=rating_matrix_ini,
    train_ind=nonmissing_indices,
    train_data=train_data,
    n_iter=n_iter,
    convergence_thresh=convergence_thresh,
    verbose=verbose,
)

print("Estimate = \n", estimate)



Iteration 0 Error: 14.384228277859698
Iteration 1 Error: 14.017164601190816
Iteration 2 Error: 13.999889889374879
Iteration 3 Error: 13.998348512863066
Iteration 4 Error: 13.998091314358561
Estimate = 
 [[7.35570586e+04 4.65270619e+01 1.46395085e+00 ... 7.73784437e-02
  3.37930066e-01 2.73494845e+01]
 [7.35580265e+04 4.34549132e+01 8.19239161e-01 ... 1.61152461e-01
  3.89402207e-01 2.94339214e+01]
 [7.35589334e+04 4.47387762e+01 1.44717036e+00 ... 7.15284216e-02
  3.26311827e-01 2.69826023e+01]
 ...
 [8.37259764e+04 5.79025162e+01 1.78042712e+00 ... 8.26397809e-02
  4.41933040e-01 3.04539486e+01]
 [8.37268946e+04 4.68751249e+01 1.32429624e+00 ... 9.72961932e-02
  3.51847349e-01 3.09164917e+01]
 [8.37288680e+04 5.09006404e+01 1.56693695e+00 ... 7.40962849e-02
  3.69658811e-01 3.07177731e+01]]


In [8]:
# Show the fitted low rank estimate as this cell's output.
estimate 

array([[7.35570586e+04, 4.65270619e+01, 1.46395085e+00, ...,
        7.73784437e-02, 3.37930066e-01, 2.73494845e+01],
       [7.35580265e+04, 4.34549132e+01, 8.19239161e-01, ...,
        1.61152461e-01, 3.89402207e-01, 2.94339214e+01],
       [7.35589334e+04, 4.47387762e+01, 1.44717036e+00, ...,
        7.15284216e-02, 3.26311827e-01, 2.69826023e+01],
       ...,
       [8.37259764e+04, 5.79025162e+01, 1.78042712e+00, ...,
        8.26397809e-02, 4.41933040e-01, 3.04539486e+01],
       [8.37268946e+04, 4.68751249e+01, 1.32429624e+00, ...,
        9.72961932e-02, 3.51847349e-01, 3.09164917e+01],
       [8.37288680e+04, 5.09006404e+01, 1.56693695e+00, ...,
        7.40962849e-02, 3.69658811e-01, 3.07177731e+01]])

In [10]:
# Persist the low rank estimate in NumPy's binary .npy format.
np.save('lowrank_estimate.npy', estimate)