In [1]:
import torch
import numpy as np
import importlib

In [2]:
timeseries = torch.load("time_series.pt")
masks = torch.load("masks.pt")

# turn into numpy arrays

npTimeSeries = timeseries.numpy()
npMasks = masks.numpy()

print(len(npTimeSeries)) # number of patients: 6261
print(len(npTimeSeries[0])) # columnlength: 192
print(len(npTimeSeries[0][0])) # rowlength: 52

print("Row Length: ", len(npTimeSeries[0][0]))
print("Column Length: ", len(npTimeSeries[0][:,0]))
print("time series", npTimeSeries)
print()
print("masks", npMasks)



6261
192
59
Row Length:  59
Column Length:  192
time series [[[0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]
  ...
  [0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]]

 [[0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]
  ...
  [0.         0.         0.         ... 0.4145498  0.07772809 0.2590936 ]
  [0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.44404888 0.08325917 0.27753055]]

 [[0.         0.         0.         

In [6]:
# Ladies and gents, this is LOCF (Last observation carried forward), SIMPLE EXAMPLE ... 

TS = np.array([[0, 0, 5],
    [3, 4, 0],
     [0, 0, 0],
    [7, 2, 0],
    [0, 0, 2],
    [2, 1, 0],
    [0, 0, 0]])

masks = np.array([[1, 1, 0],
        [0, 0, 1],
        [1, 1, 1],
        [0, 0, 1],
        [1, 1, 0],
        [0, 0, 1],
        [1, 1, 1]])

# removeOutliers works

def removeOutliers(originalTS):
    
    TS = np.array(originalTS)
    cleanTS = []
    for i in range(len(TS[0])): #len(TS[0]) = 3
        
        varColumn = TS[:,i]
        
        mean = np.mean(varColumn)
        sd = np.std(varColumn)
        boundSet = (mean - 2 * sd, mean + 2 * sd)
        resultList = []
        for y in varColumn:
            if y >= boundSet[0] and y <= boundSet[1]:
                resultList.append(y)
            else:
                resultList.append(0)
        
        cleanTS.append(resultList)
    
    newTS = np.transpose(cleanTS)
    
    return newTS

def simpleLOCF(TSarray, maskArray):
    
    # first, remove the outliers from the original time series array
    roTS = removeOutliers(TSarray)
    
    # next, create the "new masking tensor":
    # if there's an outlier in the TS data and it was masked with a "0", 
    # turn the mask position into a "1" to indicate absence
    
    for row in range(len(roTS)):
        for col in range(len(roTS[0])):
            if TSarray[row][col] - roTS[row][col] >= 1 and maskArray[row][col] == 0:
                maskArray[row][col] = 1
            
    # Now, we actually do LOCF
    # It works :)
    
    rowLength = len(roTS[0]) # len: 3
    columnLength = len(roTS) # len: 7
    
    for col in range(rowLength): # 3
        for row in range(columnLength): # 7

            # if the variable is the first row and is missing, keep looking one row ahead until
            # you have found first non-missing value
            if (row==0) and maskArray[row][col] == 1:
                subIndex = 1
                while subIndex < columnLength:
                    if maskArray[subIndex][col] == 0:
                        roTS[row][col] = roTS[subIndex][col]
                        break
                    subIndex +=1

            # else, if the missing variable is anywhere else in the list
            elif(maskArray[row][col] == 1):
                roTS[row][col] = roTS[row-1][col]
            
    return roTS, maskArray
       
print(simpleLOCF(TS, masks))


(array([[3, 2, 2],
       [3, 2, 2],
       [3, 2, 2],
       [3, 2, 2],
       [3, 2, 2],
       [2, 1, 2],
       [2, 1, 2]]), array([[1, 1, 1],
       [0, 1, 1],
       [1, 1, 1],
       [1, 0, 1],
       [1, 1, 0],
       [0, 0, 1],
       [1, 1, 1]]))


In [9]:
# ONLY FOR THE FIRST TWO PATIENTS



for i in range(2): # first two patients
    print("Patient ", i, "'s' time series and masks: ", simpleLOCF(npTimeSeries[i], npMasks[i]))
    


Patient  0 's' time series and masks:  (array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]]), array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], dtype=float32))
Patient  1 's' time series and masks:  (array([[0.        , 0.        , 0.        , ..., 0.30156687, 0.05654379,
        0.18847929],
       [0.        , 0.        , 0.        , ..., 0.30156687, 0.05654379,
        0.18847929],
       [0.        , 0.        , 0.        , ..., 0.30156687, 0.05654379,
        0.18847929],
       ...,
       [0.        , 0.        , 0.        , ..., 0.4145498 , 0.07772809,
        0.25909361],
       [0.        , 0.        , 0

In [13]:
\\
"""
This is how to install any library onto jupyter notebook, ignore this

in order to import library onto jupyter notebook
$ which jupyter
/YOURPATH/bin/jupyter
$ /YOURPATH/bin/pip install scipy

"""



'\nin order to import library onto jupyter notebook\n$ which jupyter\n/YOURPATH/bin/jupyter\n$ /YOURPATH/bin/pip install scipy\n\n'

In [20]:
# experimenting with impyute library

import impyute as impy

n = 5
arr = np.random.uniform(high = 6, size =(n,n))
for _ in range(3):
    arr[np.random.randint(n), np.random.randint(n)] = np.nan

print(arr)
print()

print("After mean imputation: \n", impy.mean(arr))
print()

print("After last observation carried forward: \n", impy.locf(arr))


[[0.15864484 0.93719454 0.66266944        nan 3.98385244]
 [3.79529651 2.97665631 3.05637488        nan 0.50749763]
 [1.67499387 4.8133987  5.52483045 5.69448824 0.0825605 ]
 [3.02572561 1.26020213 5.14272867 3.52962749        nan]
 [3.60938975 5.18467779 5.47147672 1.4702489  3.95397446]]

After mean imputation: 
 [[0.15864484 0.93719454 0.66266944 3.56478821 3.98385244]
 [3.79529651 2.97665631 3.05637488 3.56478821 0.50749763]
 [1.67499387 4.8133987  5.52483045 5.69448824 0.0825605 ]
 [3.02572561 1.26020213 5.14272867 3.52962749 2.13197126]
 [3.60938975 5.18467779 5.47147672 1.4702489  3.95397446]]

After last observation carried forward: 
 [[0.15864484 3.79529651 1.67499387 3.02572561 3.60938975]
 [0.93719454 2.97665631 4.8133987  1.26020213 5.18467779]
 [0.66266944 3.05637488 5.52483045 5.14272867 5.47147672]
 [0.66266944 3.05637488 5.69448824 3.52962749 1.4702489 ]
 [3.98385244 0.50749763 0.0825605  3.52962749 3.95397446]]
