## Sandbox notebook


In [None]:
import os
import numpy as np
import time
import shutil
import pandas as pd

from lib.script_01_00 import generate_initial_variables
from lib.script_01_01 import generate_raw_confounds
from lib.script_01_02 import generate_nonlin_confounds

from src.nets import nets_load_match, nets_inverse_normal, nets_normalise, nets_demean, nets_deconfound
from src.duplicate import duplicate_categorical, duplicate_demedian_norm_by_site
from src.preproc import datenum, days_in_year

from src.memmap.MemoryMappedDF import MemoryMappedDF

In [None]:
data_dir = '/well/win/projects/ukbiobank/fbp/confounds/data/72k_data/'

# Output directory (will eventually be equal to data_dir)
out_dir = '/well/nichols/users/inf852/confounds/data/'

In [None]:
# NOTE: `time` is also imported at the top of the notebook; the import is
# repeated here so that this cell can be run on its own.
import time

# Time the construction of the initial variables from data_dir/out_dir
t1 = time.time()
IDPs, nonIDPs, misc = generate_initial_variables(data_dir, out_dir)
t2 = time.time()

# Elapsed wall-clock time for generate_initial_variables
print(t2-t1)
# Get the subject IDs
sub_ids = IDPs.index

# Generate raw confounds
t1 = time.time()
confounds = generate_raw_confounds(data_dir, sub_ids)
t2 = time.time()

# Elapsed wall-clock time for generate_raw_confounds
print(t2-t1)

In [None]:
import pickle
import uuid

# ------------------------------------------------------------------------------
# Function to read a MemoryMappedDF instance from a file
# ------------------------------------------------------------------------------
def read_memmap(filename):
    """
    Rebuild a MemoryMappedDF from a pickled description stored on disk.

    The pickled object is loaded, given a fresh uuid4 hash, and for every
    entry of its `memory_maps` a new `.dat` file (named with the new hash, in
    the object's `directory`) is created and filled with the contents of the
    previous memory map file. The loaded state is then pushed into a freshly
    constructed, empty MemoryMappedDF, whose `memory_maps` entries are finally
    reopened as read-only `np.memmap` objects.

    Parameters
    ----------
    filename : path of the pickle file to read.

    Returns
    -------
    The newly constructed MemoryMappedDF instance.

    Notes
    -----
    - SECURITY: `pickle.load` executes arbitrary code from the file; only use
      this on files you created yourself.
    - NOTE(review): `self_copy` is accessed with attribute syntax
      (`self_copy.hash`, `self_copy.memory_maps`, ...) but is later passed to
      `dict.update`, which expects a mapping or an iterable of pairs. Unless
      the pickled object supports both, one of the two usages will raise;
      confirm what type is actually stored in the file.
    - NOTE(review): if the update below replaces `memmap_df.memory_maps`, the
      copies created in the loop are dropped and the final step reopens
      whatever `self_copy.memory_maps` refers to, presumably the original
      files. Verify this is intended.
    """
    
    # Read in self_copy and create a new instance
    with open(filename, 'rb') as f:
        self_copy = pickle.load(f)
        
    # Create a new hash
    self_copy.hash = str(uuid.uuid4())
        
    # Debug output: show what kind of object was unpickled
    print(type(self_copy))
    
    # Create empty memmap
    memmap_df = MemoryMappedDF(pd.DataFrame())
    
    # We now make a new copy of the memory map files behind the scenes so we 
    # don't delete the original files on close
    for dtype, memmap_file in self_copy.memory_maps.items():
        
        # Create new memory map filename using new hash
        # (note: this rebinds the `filename` parameter)
        filename = os.path.join(self_copy.directory, f"{self_copy.hash}_{dtype}.dat")
        
        # Create new memory map
        memmap_df.memory_maps[dtype]= np.memmap(filename, 
                                                dtype=self_copy.data_types[dtype], 
                                                mode='w+', shape=self_copy.shape)
        
        # Open old memory map (copy-on-write mode, so the old file is never
        # modified). NOTE(review): the dictionary key is used as the dtype
        # here, whereas the new map above uses self_copy.data_types[dtype];
        # confirm these always agree.
        prev_memmap = np.memmap(memmap_file, dtype=dtype, mode='c', shape=self_copy.shape)
        
        # Copy the existing memory map file over 
        memmap_df.memory_maps[dtype][:] = prev_memmap[:]
        
        # Delete the previous memory map
        del prev_memmap
        
        
    # Debug output: the memory_maps entry of the unpickled object
    print(self_copy.memory_maps)
    
    # Update with new info
    memmap_df.__dict__.update(self_copy)
    
    
    # Convert self.memory_maps back to a dictionary of memmaps
    # (each value is treated as a filename and opened read-only)
    memmap_df.memory_maps = {
        dtype: np.memmap(filename, dtype=memmap_df.data_types[dtype], 
                         mode='r', shape=memmap_df.shape)
        for dtype, filename in memmap_df.memory_maps.items()
    }
    
    # Return the new instance
    return(memmap_df)

confounds_tmp = read_memmap(os.path.join(os.getcwd(),'saved_memmaps','confounds.npz'))

In [None]:


confounds_tmp = read_memmap(os.path.join(os.getcwd(),'saved_memmaps','IDPs.npz'))

In [None]:
confounds_tmp.memory_maps['float32'].filename

In [None]:
confounds_tmp.memory_maps['float32'].shape

In [None]:
# Path of the pickled IDPs description inside ./saved_memmaps
filename = os.path.join(os.getcwd(),'saved_memmaps','IDPs.npz')

# Read in self_copy and create a new instance
# SECURITY: pickle.load executes arbitrary code from the file; only use it on
# files you created yourself.
with open(filename, 'rb') as f:
    self_copy = pickle.load(f)

# Create empty memmap
memmap_df = MemoryMappedDF(pd.DataFrame())

# Update with new info
# NOTE(review): assigning to __dict__ requires self_copy to be a dict, while a
# later cell reads `self_copy.memory_maps` with attribute access -- confirm
# which form the pickled object actually has.
memmap_df.__dict__ = self_copy

In [None]:
self_copy.memory_maps

In [None]:
for i in range(len(nan_patterns_y)):
    
    print(y.shape[0]-sum(tmp[i]['pattern']))

In [None]:
# Blocked deconfounding experiment: nan patterns are processed in batches
# ("blocks"); for each batch the confound matrices are stacked into a 3D array
# so that a single batched SVD call handles every pattern in the batch.

# sys.getsizeof is used below; `sys` is not imported in the visible cells of
# this notebook, so import it here to keep the cell self-contained.
import sys

# Index of the first nan pattern of the current block
i = 0

# Number of blocks processed so far
j = 0

# Memory budget used to decide how many patterns go into one block
MAXMEM = 2**34

t1 = time.time()

# Loop through all unique nan patterns in y, one block at a time
# NOTE(review): the loop is bounded by len(nan_patterns_y) but the patterns
# are read from `tmp`; confirm both hold the same entries in the same order.
while i < len(nan_patterns_y):

    j = j+1

    t1_tmp = time.time()

    # Number of rows kept by the first pattern of this block; every matrix in
    # the block is stored with this many rows.
    # NOTE(review): this presumably relies on the patterns being ordered so
    # that later patterns in the block never keep more rows -- verify.
    num_non_zero = y.shape[0] - sum(tmp[i]['pattern'])

    # Get the memory required for one matrix
    mem_vol = sys.getsizeof(np.zeros((num_non_zero, conf2.shape[1]),dtype='uint64'))

    # Work out the number of blocks of this size that we can
    # perform computation for concurrently. Always take at least one pattern:
    # a blocksize of zero would leave i unchanged and the loop would never end.
    blksize = max(1, int(np.floor(MAXMEM/(8*mem_vol))))

    # Do not run past the end of the list of patterns
    blksize = min(blksize, len(nan_patterns_y)-i)

    # Create numpy array of zeros to store conf for this block
    conf_block = np.zeros((blksize, num_non_zero, conf2.shape[1]))

    # Create numpy array of zeros to store y for this block
    # NOTE(review): the last dimension is hard-coded to 1 (flagged by the
    # original author as "MARKER 1 needs to be replaced"); a pattern with more
    # than one column will not fit.
    y_block = np.zeros((blksize, num_non_zero, 1))

    print(i, conf_block.shape)

    # Add the confounds to the block
    for k in range(blksize):

        # Get the pattern (True where the value is present)
        non_nan = ~np.array(tmp[i+k]['pattern'],dtype=bool)

        # Number of non na values
        n_non_na = int(np.sum(1*non_nan))

        # Check if we have at least 5 non-nan values; patterns that fail the
        # check are left as all-zero entries in the block
        if n_non_na > 5:

            # Columns of y that share this nan pattern
            cols = tmp[i+k]['columns']

            # Get the y's we're interested in
            y_current = y[cols]

            # Get the dimensions of y
            y1 = y_current.shape[1]
            x1 = conf2.shape[1]

            # Subset y and conf to the appropriate rows
            y_block[k,:n_non_na,:y1] = y_current[non_nan]
            conf_block[k,:n_non_na,:x1] = conf2[non_nan]

    # Batched thin SVD of every confound matrix in the block
    U, D, Vt = np.linalg.svd(conf_block, full_matrices=False)

    # Zero the left-singular vectors that do not contribute to the rank
    for k in range(blksize):

        # Get the rank of the matrix
        rank = np.sum(D[k,:] > 1e-10)

        # Rank reduce U
        U[k, :, rank:] = 0

    # Compute U'Y
    UtY = U.transpose((0,2,1)) @ y_block

    # Get deconfounding variable predicted values to regress out
    deconf_pred = U @ UtY

    # Move on to the first pattern after this block
    i = i + blksize

    # Time taken for this block
    print(time.time()-t1_tmp)

# Number of blocks processed and final pattern index, then total time
print(j,i)
print(time.time()-t1)

In [None]:
y.shape[0] - sum(tmp[i]['pattern'])

In [None]:

# Get the pattern
non_nan = ~np.array(nan_patterns_y[i]['pattern'],dtype=bool)

# Number of non na values
n_non_na = int(np.sum(1*non_nan))
n_non_na

In [None]:
# Random 521 x 509 example array
# NOTE(review): this array is not used below; the dask array is built from
# conf2 instead. Confirm which of the two was meant to be wrapped.
existing_array = np.random.rand(521, 509)

# Wrap conf2 with Dask, specifying the desired chunk size (100x100 in this case).
conf2_dask = da.from_array(conf2, chunks=(100, 100))


In [None]:
from src.nantools.all_non_nan_inds import all_non_nan_inds
from src.nantools.create_nan_patterns import create_nan_patterns

# Get the indices for non-nan rows in conf
conf_non_nan_inds = all_non_nan_inds(conf)

# Reduce conf and y down, ignoring the nan rows for conf
conf = conf[conf_non_nan_inds]
y = y[conf_non_nan_inds]

# We now need to get the nan-patterns for y
nan_patterns_y = create_nan_patterns(y)

In [None]:
y = y.values
conf = conf.values

In [None]:
import dask.array as da

y_da = da.from_array(y, chunks=(1000, None))
conf_da = da.from_array(conf, chunks=(1000, None))

In [None]:
y_da

In [None]:
#MARKER
# Dask version of the deconfounding loop: for every nan pattern the confounds
# are restricted to the rows present for that pattern, decomposed with a dask
# SVD, and the projection of y onto the retained left-singular vectors is
# written into pred_y_da. Everything here only builds a lazy dask graph; the
# actual computation is triggered by a later .compute() call.
t1 = time.time()

# Create array to store result
pred_y_da = da.zeros(y_da.shape, chunks=(1000, None))
    
# Loop through all unique nan patterns in y
# NOTE(review): `i` is used in arithmetic (i+1) and as a key of
# nan_patterns_y, so iterating presumably yields integer keys -- confirm.
for i in nan_patterns_y:

    # Progress report, with a linear extrapolation of the total time
    print('Deconfounding: ', i+1, '/', len(nan_patterns_y))
    print('Time elapsed: ', time.time()-t1)
    print('Predicted time: ', len(nan_patterns_y)*(time.time()-t1)/(i+1))

    # Get the pattern (True where the value is present)
    non_nan = ~np.array(nan_patterns_y[i]['pattern'],dtype=bool)

    # Check if we have at least 5 non-nan values
    if np.sum(1*non_nan) > 5:
        
        # Get the y's we're interested in
        # NOTE(review): this selects column number i of y, i.e. the pattern
        # index is used as a column index; the 'columns' entry of the pattern
        # is not consulted here. Verify this is what was intended.
        y_current_da = y_da[:,i]
        
        # Subset y and conf to the appropriate rows
        y_current_da = y_current_da[non_nan]
        conf_current_da = conf_da[non_nan]
        
        # Increase the precision on conf_current (just in case overflow
        # becomes a risk)
        conf_current_da = conf_current_da.astype(np.float64)
        
        # Decompose the row-reduced confounds
        U_da, S_da, Vt_da = da.linalg.svd(conf_current_da)
        
        # Rank reduce U: columns whose singular value is not above 1e-10 are
        # multiplied by zero, the others are left unchanged
        U_da = U_da*(S_da > 1e-10)
        
        # Get deconfounding variable predicted values to regress out
        pred_y_current_da = U_da @ (U_da.T @ y_current_da)
        
        # Fill in pred_y_da
        pred_y_da[non_nan,i] = pred_y_current_da
    


# Time at which graph construction finished
t2 = time.time()

In [None]:
t1 = time.time()
pred_y_da.compute()
t2 = time.time()

In [None]:
print(t2-t1)

In [None]:
s = np.array([1,1,3,2,1,0,0,0,0])

tmp = 1*(s > 1e-10)

In [None]:
x = np.random.randn(30, s.shape[0])

In [None]:
x*(s > 1e-10)

In [None]:


# Benchmark of three ways of computing the same projection of My onto the
# column space of conf2: plain NumPy, one numba_predict call, and the average
# of 20 numba_predict calls.
t1 = time.time()
# -----------------------------------------------------------------------------
# Step 1: Outside loop
# -----------------------------------------------------------------------------
# Thin SVD of the confounds
U, D, Vt = np.linalg.svd(conf2, full_matrices=False)

# Get the rank of the matrix
rank = np.sum(D > 1e-10)

# Rank reduce U
U = U[:, :rank]

# Compute U'Y
UtY = U.T @ My

# Get deconfounding variable predicted values to regress out
deconf_pred2 = U @ UtY

t2 = time.time()

# -----------------------------------------------------------------------------
# Step 2: Outside loop (numba)
# -----------------------------------------------------------------------------
# Same prediction, obtained from a single call to numba_predict
deconf_pred = numba_predict(conf2, My)

t3 = time.time()

# -----------------------------------------------------------------------------
# Step 3: Outside loop (numba compiled)
# -----------------------------------------------------------------------------
# Repeat the numba_predict call 20 times so that an average per-call time can
# be reported below
for i in np.arange(20):
    
    deconf_pred3 = numba_predict(conf2, My)

t4 = time.time()

# Step 3 is reported per call; the total covers all three steps
print('Step 1 time: ', t2-t1)
print('Step 2 time: ', t3-t2)
print('Step 3 time: ', (t4-t3)/20)
print('Total: ', t4-t1)

In [None]:
!pip uninstall torch

# 

In [None]:

from numba import njit
import numba as nb

def non_numba_predict(X,Y):
    """
    Project Y onto the column space of X using a rank-reduced thin SVD.

    Parameters
    ----------
    X : 2D array whose columns span the space to project onto.
    Y : array with the same number of rows as X.

    Returns
    -------
    U @ (U.T @ Y), where U holds the left-singular vectors of X whose
    singular value exceeds 1e-10.
    """

    # Thin SVD; the right-singular vectors are not needed
    left_vecs, sing_vals, _ = np.linalg.svd(X, full_matrices=False)

    # Numerical rank: number of singular values above the tolerance
    n_keep = np.sum(sing_vals > 1e-10)

    # Orthonormal basis for the column space of X
    basis = left_vecs[:, :n_keep]

    # Coefficients of Y in that basis (the transpose is copied before the
    # product, as in the pure-NumPy reference implementation)
    coeffs = basis.T.copy() @ Y

    # Predicted values to regress out
    return basis @ coeffs

# `njit` already implies nopython mode; passing nopython=True as well only
# makes numba emit a "nopython is set for njit and is ignored" warning.
@njit(nb.float64[:,:](nb.float64[:,:],nb.float64[:,:]))
def numba_predict(X,Y):
    """
    Numba-compiled projection of Y onto the column space of X.

    Parameters
    ----------
    X : 2D float64 array whose columns span the space to project onto.
    Y : 2D float64 array with the same number of rows as X.

    Returns
    -------
    2D float64 array U @ (U.T @ Y), where U holds the left-singular vectors
    of X whose singular value exceeds 1e-10.
    """
    
    # Thin SVD; the right-singular vectors are not needed
    U, D, _ = np.linalg.svd(X, full_matrices=False)

    # Fix ordering of U (needed for fast numba compile)
    U=U[::1]
    
    # Get the rank of the matrix
    rank = np.sum(D > 1e-10)

    # Rank reduce U
    U = U[:, :rank]
    
    # Compute U'Y
    UtY = U.T @ Y
    
    # Get deconfounding variable predicted values to regress out
    pred = U @ UtY
    
    return(pred)

In [None]:
conf.dtype

In [None]:
# Timing comparison between non_numba_predict and numba_predict on conf.
# NOTE(review): both functions are called with a single argument here, so this
# cell presumably targets one-argument definitions from another cell rather
# than two-argument (X, Y) versions -- confirm which definitions are active
# when this runs.

# Average time of five non_numba_predict calls
t_total = 0
for i in np.arange(5):
    
    t1_running = time.time()
    pred = non_numba_predict(conf)
    t2_running = time.time()
    
    t_total = t_total + t2_running-t1_running

print(t_total/5)

# Time of a single numba_predict call
t1 = time.time()
pred = numba_predict(conf)
t2 = time.time()

print(t2-t1)

# Average time of five further numba_predict calls
t_total = 0
for i in np.arange(5):
    
    t1_running = time.time()
    pred = numba_predict(conf)
    t2_running = time.time()

    t_total = t_total + t2_running-t1_running

print(t_total/5)

In [None]:
X.shape

In [None]:
print('X: ',nb.typeof(X))
print('Y: ',nb.typeof(Y))
print('U1: ',nb.typeof(U1))
print('D: ',nb.typeof(D))
print('U2: ',nb.typeof(U2))
print('rank: ',nb.typeof(rank))
print('U: ',nb.typeof(U))
print('UtY: ',nb.typeof(UtY))
print('pred: ',nb.typeof(pred))

In [None]:
nb.typeof(np.transpose(U))

In [None]:
import numpy as np
from numba import jit, njit
import numba as nb
from timeit import default_timer as timer
import scipy.linalg as spl

# Function to generate a random matrix
def random_matrix(m, n):
    """Return an m-by-n array of samples drawn from NumPy's global RNG."""
    shape = (m, n)
    return np.random.random_sample(shape)

def non_numba_predict(X):
    """Return the left-singular vectors of X as computed by np.linalg.svd."""
    # Only the first element of the (U, S, Vh) result is needed
    return np.linalg.svd(X)[0]

# Numba-compiled version of np.linalg.svd
# The signature uses the `nb` alias imported in this cell (a bare `float64` is
# not defined here) and is 2D on both sides: the SVD input is a matrix and so
# are the returned left-singular vectors.
@njit(nb.float64[:, :](nb.float64[:, :]), cache=True)
def numba_predict(X):
    """
    Numba-compiled SVD benchmark: return the left-singular vectors of X.

    Parameters
    ----------
    X : 2D float64 array.

    Returns
    -------
    2D float64 array holding the left-singular vectors of X.
    """
    U1, _, _ = np.linalg.svd(X)
    return U1

# SciPy version of SVD
def scipy_predict(X):
    """Return the left-singular vectors of X as computed by scipy.linalg.svd."""
    # Only the first element of the (U, s, Vh) result is needed
    return spl.svd(X)[0]

# Matrix size
m, n = 67470, 378

# Generate random matrices (only X is passed to the benchmarks below)
X = random_matrix(m, n)
Y = random_matrix(m, n)

# Each predictor in this cell takes a single matrix, so only X is passed;
# calling them with (X, Y) raises a TypeError.

# Benchmark non_numba_predict
start = timer()
U1 = non_numba_predict(X)
end = timer()
numpy_time = end - start
print(f"NumPy SVD time: {numpy_time:.6f} seconds")

# Benchmark numba_predict
start = timer()
U1 = numba_predict(X)
end = timer()
numba_time = end - start
print(f"Numba SVD time: {numba_time:.6f} seconds")

# Benchmark scipy_predict
start = timer()
U1 = scipy_predict(X)
end = timer()
scipy_time = end - start
print(f"SciPy SVD time: {scipy_time:.6f} seconds")

# Ratios relative to the NumPy timing (values above 1 mean faster than NumPy)
print(f"Speedup (Numba over NumPy): {numpy_time / numba_time:.2f}x")
print(f"Speedup (SciPy over NumPy): {numpy_time / scipy_time:.2f}x")

# Debugging Numba function: print the types numba inferred for each variable
print("\nDebugging Numba function:")
numba_predict.inspect_types()

In [None]:
import dask

# Compiled with an explicit signature: 2D float64 array in, 2D float64 array out
@njit(nb.float64[:,:](nb.float64[:,:]))
def numba_svd(X):
    """Return the left-singular vectors from the thin SVD of X."""
    # Singular values and right-singular vectors are discarded
    U, _, _ = np.linalg.svd(X, full_matrices=False)
    return(U)
    
    
def dask_svd(X,chunks=(1000, 100)):
    """
    Compute the left-singular vectors of X with dask.

    Parameters
    ----------
    X : 2D array-like to decompose.
    chunks : chunk shape used when wrapping X as a dask array.

    Returns
    -------
    The computed (in-memory) array of left-singular vectors.
    """
    # Wrap the input and keep the wrapped array: the SVD must be run on the
    # dask array, not on the raw input.
    X_da = da.from_array(X, chunks=chunks)

    # da.linalg.svd only accepts arrays chunked along a single dimension, so
    # when the requested chunking splits both axes, merge the column chunks.
    if X_da.numblocks[0] > 1 and X_da.numblocks[1] > 1:
        X_da = X_da.rechunk({1: -1})

    # da.linalg.svd has no full_matrices argument
    U, _, _ = da.linalg.svd(X_da)

    # Call compute() so that values are returned rather than the bound method
    return U.compute()


# Number of repetitions used for each timing average
n_rep = 50

# Average time of a thin NumPy SVD of conf
t1 = time.time()
for i in np.arange(n_rep):
    U, _, _ = np.linalg.svd(conf, full_matrices=False)
t2 = time.time()

print((t2-t1)/n_rep)

# Call numba_svd once outside the timed region
U = numba_svd(conf)

# Average time of the numba SVD of conf
t1 = time.time()
for i in np.arange(n_rep):
    U = numba_svd(conf)
t2 = time.time()
    
print((t2-t1)/n_rep)

In [None]:
type(conf)

In [None]:
import dask.array as da
conf_da = da.from_array(conf_reduced, chunks=(1000, None)).persist()
U1, D1, V1 = da.linalg.svd(conf_da)

In [None]:
import dask
dask.visualize(U1, D1, V1)

In [None]:
t1 = time.time()
U1 = U1.compute()
D1 = D1.compute()
V1 = V1.compute()
t2 = time.time()
print(t2-t1)

In [None]:
# Largest absolute elementwise difference between the projection of My built
# from tmp and the one built from tmp2
print(np.amax(np.abs((tmp @ (tmp.T @ My))- (tmp2 @ (tmp2.T @ My)))))
# Largest absolute elementwise difference between My and its tmp2 projection
# NOTE(review): the next two lines are identical; one of them was presumably
# meant to use tmp instead of tmp2 -- confirm.
print(np.amax(np.abs(My- (tmp2 @ (tmp2.T @ My)))))
print(np.amax(np.abs(My- (tmp2 @ (tmp2.T @ My)))))

In [None]:
np.any(np.isnan(tmp2))

In [None]:
t1 = time.time()
U, D, V = np.linalg.svd(conf_reduced, full_matrices=False)

# Get the rank of the matrix
# rank = np.sum(D > 1e-10)

# Multiply the left-singular values which contribute to the rank of conf
# by the corresponding singular values to rank reduce conf
# tmp = tmp[:, :rank]
t2 = time.time()
print(t2-t1)

In [None]:
U1.dtype

In [None]:
print(np.amax(np.abs(U @ np.diag(D) @ V - conf_reduced)))
print(np.amax(np.abs(U1 @ np.diag(D1) @ V1 - conf_reduced)))

In [None]:
conf_reduced = conf[:,~np.all(conf==0,axis=0)]

In [None]:
!pip install dask --upgrade