## Data Imputation 

## Stage 1: 
- preparing data 
- data imputation for 4 columns in the matrix 

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('../data/data_clean.csv', sep=',', encoding='latin-1')
# print(df.head(), df.columns, df.shape)

df = df[["BPXPLS", "BMXARMC", "bmi", "highbp" ]] # Reading 4 columns for now 
np_array = df.to_numpy(copy=True, na_value=np.nan) # Convert dataframe to numpy array 

To test our code, we will first delete values ourselves and compare output

## Stage 1A: 
- 1) Considering data with only non missing values (delete rows with any missing values)
- 2) Dropping values at random from this data 
- 3) Applying Low Rank Model estimation 
- 4) Evaluating the low rank estimate using true values 

In [2]:
## nonmissing_data is our ground truth, 
## MCAR stands for Missing completely at random; MCAR_data is to evaluate low rank model algorithm 

nonmissing_data = np_array[~np.isnan(np_array).any(axis=1), :] # delete rows with any missing values
MCAR_missing_indices = np.random.choice( # randomly select values from length ie flat array 
    len(nonmissing_data.flatten()), 
    size=int(len(nonmissing_data.flatten())/10)) 

MCAR_data = nonmissing_data
MCAR_data.flat[MCAR_missing_indices] = np.nan
print(MCAR_data) 

[[86.         35.3        26.68376063         nan]
 [74.         34.7        28.6324502   1.        ]
 [68.         33.5        28.92930024  1.        ]
 ...
 [54.         31.         24.8902376          nan]
 [60.         29.9        24.46863363  0.        ]
 [80.         37.         34.01503875  1.        ]]


## Low Rank Estimation 

In [3]:
## FIX: delete this line when no longer working with MCAR_data 
np_array = MCAR_data 

## Get non-missing indices 
nonmissing_indices = np.argwhere(~np.isnan(np_array.flatten())).reshape(-1) 

## Create initial matrix (impute nans with zeroes) 
rating_matrix_ini = np.zeros(np_array.shape)
rating_matrix_ini.flat[nonmissing_indices] = np_array.flat[nonmissing_indices] 

In [7]:
## Check approprate rank that can be used for low rank model estimation 
## Using singular values in the svd of the data matrix 

_, s, _ = np.linalg.svd(rating_matrix_ini) 
print(s)
RANK = 1

[5586.98556461 1086.21896908  665.22551182   32.75907023]


In [8]:
## Impute missing values with column mean to begin with 
col_mean = np.nanmean(np_array, axis=0)
inds = np.where(np.isnan(np_array))
np_array[inds] = np.take(col_mean, inds[1]) 

In [10]:
def fit_low_rank_model(rank, 
                       rating_matrix_ini, 
                       train_ind, 
                       train_data, 
                       n_iter, 
                       convergence_thresh, 
                       verbose, 
                       data1, 
                       missing_indices): 
    
    """Fit the low rank model. 
    Return the estimation of the low rank model - (n_movies * n_users) matrix

    Keyword arguments:
    rank -- the rank of low rank model
    rating_matrix_ini -- imputed initialization
    train_ind -- index of training data
    train_data -- ratings of training set
    n_iter -- the max number of iterations
    convergence_thresh -- the threshold of convergence to 0
    """
    
    previous_fitting_error = 100
    # Initialization
    low_rank_estimate = np.zeros(rating_matrix_ini.shape)
    # fill input data
    low_rank_estimate.flat[train_ind] = train_data
    # get the indexes of missing data
    missing_inds = np.where(low_rank_estimate.flat == 0)
    # fill missing data with imputed values
    low_rank_estimate.flat[missing_inds] = rating_matrix_ini.flat[missing_inds]

    
    for ind in range(n_iter):
        # Updates
        low_rank_estimate.flat[train_ind] = train_data
        u, s, v = np.linalg.svd(low_rank_estimate)
        s_matrix = s  * np.eye(len(s))
        low_rank_estimate = np.matmul(np.matmul(u[:,0:rank], s_matrix[0:rank,0:rank]), v[0:rank,:])
        # Compute error
        fitting_error = np.sqrt(((train_data - low_rank_estimate.flat[train_ind])**2).mean())
        #true fitting error, compared to true values
        true_fitting_error = np.sqrt(((data1.flat[missing_indices] - low_rank_estimate.flat[missing_indices])**2).mean())
        if verbose:
            print("Iteration " + str(ind) + " Error: " + str(fitting_error))
            print("Iteration " + str(ind) + " True Error: " + str(true_fitting_error)) 
            print() 
        
        # Stopping criterion
        if (fitting_error <= convergence_thresh):
            print('converged, breaking')
            break
    return low_rank_estimate

n_iter = 5 
convergence_thresh = 1e-4
verbose = True
rank = RANK 
train_data = np_array.flat[nonmissing_indices] 
estimate =fit_low_rank_model(rank,rating_matrix_ini,nonmissing_indices,
        train_data,n_iter,convergence_thresh,verbose, nonmissing_data, MCAR_missing_indices)
print("Estimate = \n", estimate)



Iteration 0 Error: 7.226457640721344
Iteration 0 True Error: 27.950516777801386

Iteration 1 Error: 5.841031803144989
Iteration 1 True Error: 21.6498529983792

Iteration 2 Error: 5.270315582574567
Iteration 2 True Error: 17.410515522344703

Iteration 3 Error: 4.9495701875919895
Iteration 3 True Error: 14.487777574744618

Iteration 4 Error: 4.762444252868567
Iteration 4 True Error: 12.4811579337921

Iteration 5 Error: 4.651678923650088
Iteration 5 True Error: 11.110689896306265

Iteration 6 Error: 4.584939475800316
Iteration 6 True Error: 10.176741537517527

Iteration 7 Error: 4.543851354861755
Iteration 7 True Error: 9.538754653617001

Iteration 8 Error: 4.517947961532613
Iteration 8 True Error: 9.099849927513096

Iteration 9 Error: 4.501212040342588
Iteration 9 True Error: 8.794632567178635

Iteration 10 Error: 4.490134082281851
Iteration 10 True Error: 8.57960464043844

Iteration 11 Error: 4.482630527727341
Iteration 11 True Error: 8.426016740940312

Iteration 12 Error: 4.47743939749