In [4]:
import torch
import numpy as np
import importlib
import matplotlib.pyplot as plt
from fancyimpute import SoftImpute, KNN

Using TensorFlow backend.


In [5]:
tensorTimeSeries = torch.load("time_series.pt")
tensorMasks = torch.load("masks.pt")
tensorDiffs = torch.load("diffs.pt")

In [6]:
array = np.asmatrix(
        [[None, 200 , None],
         [   1, 100 ,    5],
         [   5, 100 ,    1],
         [None, None,    1],
         [   1, 100 , None],
         [   2, 100 ,    2],
         [   1, None,    2],
         [None, 100 , None]])
    
print(array)

[[None 200 None]
 [1 100 5]
 [5 100 1]
 [None None 1]
 [1 100 None]
 [2 100 2]
 [1 None 2]
 [None 100 None]]


In [7]:
# Playing around with soft impute

X_filled_softimpute = SoftImpute().fit_transform(array)

print(X_filled_softimpute)

[SoftImpute] Max Singular Value of X_init = 300.026861
[SoftImpute] Iter 1: observed MAE=1.776682 rank=1
[SoftImpute] Iter 2: observed MAE=1.719992 rank=1
[SoftImpute] Iter 3: observed MAE=1.696957 rank=1
[SoftImpute] Iter 4: observed MAE=1.688916 rank=1
[SoftImpute] Iter 5: observed MAE=1.704980 rank=1
[SoftImpute] Iter 6: observed MAE=1.717302 rank=1
[SoftImpute] Iter 7: observed MAE=1.724652 rank=1
[SoftImpute] Iter 8: observed MAE=1.729046 rank=1
[SoftImpute] Iter 9: observed MAE=1.731669 rank=1
[SoftImpute] Iter 10: observed MAE=1.733222 rank=1
[SoftImpute] Iter 11: observed MAE=1.734125 rank=1
[SoftImpute] Iter 12: observed MAE=1.734631 rank=1
[SoftImpute] Iter 13: observed MAE=1.734892 rank=1
[SoftImpute] Iter 14: observed MAE=1.735002 rank=1
[SoftImpute] Iter 15: observed MAE=1.735018 rank=1
[SoftImpute] Iter 16: observed MAE=1.734977 rank=1
[SoftImpute] Iter 17: observed MAE=1.734900 rank=1
[SoftImpute] Iter 18: observed MAE=1.734801 rank=1
[SoftImpute] Iter 19: observed MAE=1

In [8]:
tensorTimeSeries = torch.load("time_series.pt")
tensorMasks = torch.load("masks.pt")
tensorDiffs = torch.load("diffs.pt")

# turn into numpy arrays

npTimeSeries = tensorTimeSeries.numpy()
npMasks = tensorMasks.numpy()
npDiffs = tensorDiffs.numpy()

print(len(npTimeSeries)) # number of patients: 6261
print(len(npTimeSeries[0])) # columnlength: 192
print(len(npTimeSeries[0][0])) # rowlength: 59

print("Row Length: ", len(npTimeSeries[0][0]))
print("Column Length: ", len(npTimeSeries[0][:,0]))
# print("time series", npTimeSeries, "\n")
# print("masks", npMasks, "\n")
# print("diffs", npDiffs, "\n")

oneTimeSeries = npTimeSeries[1, ... , 58] # patient 1, variable 58
print(oneTimeSeries)
print(len(oneTimeSeries))


6261
192
59
Row Length:  59
Column Length:  192
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.18847929 0.29227915
 0.         0.         0.         0.30463463 0.         0.32856718
 0.         0.30603084 0.         0.2897962  0.         0.3117777
 0.         0.2979544  0.         0.30615848 0.         0.2502098
 0.         0.30017987 0.         0.2937583  0.         0.3050539
 0.         0.2943076  0.         0.31639075 0.         0.30030572
 0.         0.31802812

In [9]:
def calculateGlobalMean(variableNum, allTimeSeries, allDiffs):
    meanForPatientArray=[]
    
    #CHANGE TO 6261 TO ITERATE OVER ALL PATIENTS
    numOfPatients=1000
    
    for i in range(numOfPatients):
        timeSeriesForPatientAndVariable = np.asarray(allTimeSeries[i, ... , variableNum])
        diffForPatientAndVariable = np.asarray(allDiffs[i, ..., variableNum])
        if (diffForPatientAndVariable[191]>=3.9791667):
            #print('patient ', i, ' does not have any observations for variable ', variableNum)
            pass
        else:
            meanForPatient = np.mean(timeSeriesForPatientAndVariable, dtype ='float64')
            meanForPatientArray.append(meanForPatient)
    
    if(len(meanForPatientArray)<(.05*6261)):     #if there are too few observations for this variable
        globalMean = 0.0                        
        #impute with zero because healthy ppl are not tested for this variable
        print('imputed with zeros')
        #print('length for meanForPatientsArray: ', len(meanForPatientArray))
    else:
        globalMean = np.mean(meanForPatientArray, dtype ='float64')
    
    return globalMean

In [10]:
# this works for ONE VARIABLE ONLY (one column, 192 time steps)

def removeOutliers(originalTS):
    
    TS = np.array(originalTS)
    cleanTS = []
    for i in range(len(TS[0])): #len(TS[0]) = 3
        
        varColumn = TS[:,i]
        
        mean = np.mean(varColumn)
        sd = np.std(varColumn)
        boundSet = (mean - 2 * sd, mean + 2 * sd)
        resultList = []
        for y in varColumn:
            if y >= boundSet[0] and y <= boundSet[1]:
                resultList.append(y)
            else:
                resultList.append(0)
        
        cleanTS.append(resultList)
    
    newTS = np.transpose(cleanTS)
    
    return newTS

In [34]:
def softImpute(timeSeries, masks, diffs):
    
    sizeArr = list(timeSeries.size())
    # numPatients = sizeArr[0]
    numTimeSteps = sizeArr[1]
    numVariables = sizeArr[2]
    
    # COMMENT THIS OUT IF YOU WANT TO RUN METHOD ON ALL PATIENTS
    numPatients = 2
    
    timeSeriesTensor = torch.zeros(size=(numPatients, numTimeSteps, numVariables-1))
    maskTensor = torch.zeros(size=(numPatients, numTimeSteps, numVariables-1))
    diffTensor = torch.zeros(size=(numPatients, numTimeSteps, numVariables-1))
    
    # this is the entire time series, 6261 x 192 x 59 
    numpyTimeSeries = timeSeries.numpy()
    numpyMasks = masks.numpy()
    numpyDiffs = diffs.numpy()
    
    for i in range(numPatients):
        
        oneTimeSeries = numpyTimeSeries[i] # 192 x 59  FOR ONE PATIENT ONLY
        oneMask = numpyMasks[i]
        oneDiffs = numpyDiffs[i]
        
        
        """
        INSERT GLOBAL MEAN CODE HERE
        inputting column with global mean of variable if entire column is missing with 0's

        """

        # take the outliers out of the time series of ONE PATIENT, 192 x 59

        roTS = removeOutliers(oneTimeSeries)

        # create new masking tensor for ONE PATIENT
        for row in range(len(roTS)):
            for col in range(len(roTS[0])):
                if oneTimeSeries[row][col] - roTS[row][col] >= 1 and oneMask[row][col] == 0:
                    oneMask[row][col] = 1

        # this is where we modify the diffs
        # if mask = 0 (observed), diff = 0

        for col in range(len(oneMask[0])): # 3
            for row in range(len(oneMask)): # 7
                if(row==0):
                    oneDiffs[row][col] = 0
                elif oneMask[row][col] == 0:
                    oneDiffs[row][col] = 0
                else:
                    oneDiffs[row][col] = oneDiffs[row-1][col] + 1/48 
                    
        # If there's a 0 in the time series, impute with "None"
        for row in range(len(roTS)):
            for col in range(len(roTS[0])):
                if roTS[row][col] == 0:
                    roTS[row][col] = None
                
        # this is where we do soft impute on the transformed time series, 
        # softImpute_TimeSeries is 192 x 59
        softImpute_TimeSeries = SoftImpute().fit_transform(roTS)
        
        # then we go through all 59 columns of time series and discount the 19th column
        
        for j in range (numVariables): # 59
            
            if (j == 19):
                pass
               
            oneColumnTimeSeries = softImpute_TimeSeries[:,j] # This is ONE 192-LENGTH column for patient i, column j
            oneColumnMask = numpyMasks[i, ..., j]
            oneColumnDiffs = numpyDiffs[i, ..., j]
            
            if (j<19):
                timeSeriesTensor[i, ..., j] = (torch.from_numpy((oneColumnTimeSeries))) # stores into one giant tensor, 6261 x 192 x 59
                maskTensor[i, ..., j] = (torch.from_numpy(np.asarray(oneColumnMask)))
                diffTensor[i, ..., j] = (torch.from_numpy(np.asarray(oneColumnDiffs)))
                
            else:
                timeSeriesTensor[i, ..., j-1] = (torch.from_numpy((oneColumnTimeSeries))) # stores into one giant tensor, 6261 x 192 x 59
                maskTensor[i, ..., j-1] = (torch.from_numpy(np.asarray(oneColumnMask)))
                diffTensor[i, ..., j-1] = (torch.from_numpy(np.asarray(oneColumnDiffs)))
                
    return timeSeriesTensor, maskTensor, diffTensor


In [35]:
out1, out2, out3 = softImpute(tensorTimeSeries, tensorMasks, tensorDiffs)

[SoftImpute] Max Singular Value of X_init = 9.919182
[SoftImpute] Iter 1: observed MAE=0.019924 rank=1
[SoftImpute] Iter 2: observed MAE=0.019924 rank=1
[SoftImpute] Iter 3: observed MAE=0.019924 rank=1
[SoftImpute] Stopped after iteration 3 for lambda=0.198384
[SoftImpute] Max Singular Value of X_init = 7.462622
[SoftImpute] Iter 1: observed MAE=0.008566 rank=10
[SoftImpute] Iter 2: observed MAE=0.008365 rank=10
[SoftImpute] Iter 3: observed MAE=0.008214 rank=8
[SoftImpute] Iter 4: observed MAE=0.008035 rank=7
[SoftImpute] Iter 5: observed MAE=0.007909 rank=7
[SoftImpute] Iter 6: observed MAE=0.007853 rank=6
[SoftImpute] Iter 7: observed MAE=0.007784 rank=6
[SoftImpute] Iter 8: observed MAE=0.007767 rank=6
[SoftImpute] Iter 9: observed MAE=0.007758 rank=6
[SoftImpute] Iter 10: observed MAE=0.007760 rank=6
[SoftImpute] Iter 11: observed MAE=0.007774 rank=6
[SoftImpute] Iter 12: observed MAE=0.007788 rank=6
[SoftImpute] Iter 13: observed MAE=0.007803 rank=6
[SoftImpute] Iter 14: observe

In [19]:
for i in range(2):
    
    oneTimeSeries = npTimeSeries[i]
    
    X_filled_softimpute = SoftImpute().fit_transform(oneTimeSeries)

    print(X_filled_softimpute)



[SoftImpute] Max Singular Value of X_init = 9.919305
[SoftImpute] Iter 1: observed MAE=0.000260 rank=3
[SoftImpute] Iter 2: observed MAE=0.000260 rank=3
[SoftImpute] Iter 3: observed MAE=0.000260 rank=3
[SoftImpute] Iter 4: observed MAE=0.000260 rank=3
[SoftImpute] Iter 5: observed MAE=0.000260 rank=3
[SoftImpute] Iter 6: observed MAE=0.000260 rank=3
[SoftImpute] Iter 7: observed MAE=0.000260 rank=3
[SoftImpute] Iter 8: observed MAE=0.000260 rank=3
[SoftImpute] Iter 9: observed MAE=0.000260 rank=3
[SoftImpute] Iter 10: observed MAE=0.000260 rank=3
[SoftImpute] Iter 11: observed MAE=0.000260 rank=3
[SoftImpute] Iter 12: observed MAE=0.000260 rank=3
[SoftImpute] Iter 13: observed MAE=0.000260 rank=3
[SoftImpute] Iter 14: observed MAE=0.000260 rank=3
[SoftImpute] Iter 15: observed MAE=0.000260 rank=3
[SoftImpute] Iter 16: observed MAE=0.000260 rank=3
[SoftImpute] Iter 17: observed MAE=0.000260 rank=3
[SoftImpute] Iter 18: observed MAE=0.000260 rank=3
[SoftImpute] Iter 19: observed MAE=0.0

[SoftImpute] Iter 89: observed MAE=0.001428 rank=13
[SoftImpute] Iter 90: observed MAE=0.001428 rank=13
[SoftImpute] Iter 91: observed MAE=0.001428 rank=13
[SoftImpute] Iter 92: observed MAE=0.001428 rank=13
[SoftImpute] Iter 93: observed MAE=0.001428 rank=13
[SoftImpute] Iter 94: observed MAE=0.001428 rank=13
[SoftImpute] Iter 95: observed MAE=0.001428 rank=13
[SoftImpute] Iter 96: observed MAE=0.001428 rank=13
[SoftImpute] Iter 97: observed MAE=0.001428 rank=13
[SoftImpute] Iter 98: observed MAE=0.001428 rank=13
[SoftImpute] Iter 99: observed MAE=0.001428 rank=13
[SoftImpute] Iter 100: observed MAE=0.001428 rank=13
[SoftImpute] Stopped after iteration 100 for lambda=0.155270
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.4145498  0.07772809 0.2590936 ]
 [0.         0.  