## Sandbox notebook


In [None]:
import os
import time 
import shutil 
import numpy as np
import pandas as pd
pd.set_option("display.precision", 20)

from lib.script_01_00 import generate_initial_variables
from lib.script_01_01 import generate_raw_confounds
from lib.script_01_02 import generate_nonlin_confounds

from src.nets.nets_load_match import nets_load_match
from src.nets.nets_inverse_normal import nets_inverse_normal 
from src.nets.nets_normalise import nets_normalise 
from src.nets.nets_demean import nets_demean
from src.nets.nets_deconfound import nets_deconfound

from src.duplicate.duplicate_categorical import duplicate_categorical
from src.duplicate.duplicate_demedian_norm_by_site import duplicate_demedian_norm_by_site

from src.preproc.datenum import datenum
from src.preproc.days_in_year import days_in_year

from src.memmap.MemoryMappedDF import MemoryMappedDF
from src.memmap.read_memmap_df import read_memmap_df
from src.memmap.addBlockToMmap import addBlockToMmap

In [None]:
# Directory the input data is read from (a later cell loads ID_SITE.txt from here).
data_dir = '/well/win/projects/ukbiobank/fbp/confounds/data/72k_data/'

# Output directory (will eventually be equal to data_dir)
out_dir = '/well/nichols/users/inf852/confounds/data/'

In [None]:

# Load the four precomputed memory-mapped dataframes stored in the
# 'saved_memmaps' folder of the current working directory.
IDPs, nonIDPs, misc, confounds = (
    read_memmap_df(os.path.join(os.getcwd(), 'saved_memmaps', fname))
    for fname in ('IDPs.npz', 'nonIDPs.npz', 'misc.npz', 'confounds.npz')
)

In [None]:
import os
import numpy as np
import pandas as pd
from src.nets.nets_svd import nets_svd
from src.nets.nets_demean import nets_demean
from src.memmap.MemoryMappedDF import MemoryMappedDF
from src.nantools.all_non_nan_inds import all_non_nan_inds
from src.nantools.create_nan_patterns import create_nan_patterns


In [None]:
cluster_cfg = {'cluster_type':'local','num_nodes':12}

In [None]:

# Pull in the dask.distributed pieces this notebook relies on.
from dask.distributed import Client, LocalCluster, as_completed

# Start a local cluster, connect a client to it, then scale the cluster
# to the size requested in cluster_cfg['num_nodes'].
cluster = LocalCluster()
client = Client(cluster)
cluster.scale(cluster_cfg['num_nodes'])


In [None]:
cluster

In [None]:
# Manually chosen inputs for the deconfounding experiment in the cells below.

# Index used by the exploratory cells (nan_patterns_y[i], IDPs[:,i]).
i = 3
# Full [:, :] slice of the memory-mapped IDPs and confounds.
y = IDPs[:,:]
conf = confounds[:,:]
# Regression backend: inside_loop handles 'pinv', 'nets_svd' and 'svd';
# any other value falls through to its QR branch.
mode = 'nets_svd'
# Whether the deconfounding code demeans its inputs and outputs.
demean = True
# dtype used when allocating the output dataframe.
dtype = 'float64'
# False makes the deconfounding cell skip both NaN detection and NaN-row removal.
conf_has_nans = False


In [None]:
from src.preproc.switch_type import switch_type

In [None]:

# Driver for the dask-based deconfounding experiment: drops confound rows with
# NaNs, groups the columns of y by nan-pattern, saves y and conf as memory
# maps and submits one inside_loop job per nan-pattern to the dask client.
#
# NOTE(review): inside_loop is defined in a later cell of this notebook, so
# that cell has to be run before this one.

# Save original index
original_index = y.index

# Check if confounds have NaNs (None means it has not been worked out yet)
if conf_has_nans is None:

    # If the type is memory mapped
    if isinstance(conf, MemoryMappedDF):

        # Work out if the confounds have nans
        conf_has_nans = conf[:,:].isna().sum().sum()

    else:
        # Work out if the confounds have nans
        conf_has_nans = conf.isna().sum().sum()

# If the confounds have nans
if conf_has_nans:

    # If the type is memory mapped
    if isinstance(conf, MemoryMappedDF):

        # We are trying to avoid reading everything in at once
        conf_non_nan_inds = all_non_nan_inds(conf, safeMode=True)

    else:

        # Otherwise, we can get the indices for non-nan rows in conf directly
        conf_non_nan_inds = all_non_nan_inds(conf)

    # Reduce conf and y down, ignoring the nan rows for conf
    conf = conf[conf_non_nan_inds]
    y = y[conf_non_nan_inds]

    # If we have subset the data we need to demean again
    if demean:

        # Demean y and conf
        y = nets_demean(y)
        conf = nets_demean(conf)


# We now need to get the nan-patterns for y (we don't include
# columns with 5 or less values).
nan_patterns_y = create_nan_patterns(y, thresh=5)

# Column names that survived nan-pattern identification, in pattern order.
# Built once and reused for both the column count and the column labels.
kept_columns = [col for key in nan_patterns_y for col in nan_patterns_y[key]['columns']]

# Number of columns which meet our nan-thresholding requirements
n_cols = len(kept_columns)

# Initialize the (zero-filled) output dataframe
y_deconf = pd.DataFrame(np.zeros((y.shape[0],n_cols),dtype=dtype),index=y.index)

# We're only including column names for the variables that were not removed during nan pattern
# identification.
y_deconf.columns = kept_columns

# Save y and conf as memory maps so that only file names are sent to the workers
y_fname = os.path.join(os.getcwd(),'temp_mmap','y.npz')
conf_fname = os.path.join(os.getcwd(),'temp_mmap','conf.npz')
MemoryMappedDF(y).save(y_fname)
MemoryMappedDF(conf).save(conf_fname)
# conf = switch_type(conf,out_type="filename",fname=os.path.join(os.getcwd(),'temp_mmap','y.conf'))

t1 = time.time()

# Empty futures list
futures = []

# Loop through all unique nan patterns in y
for i in nan_patterns_y:

    print('Deconfounding: ', i+1, '/', len(nan_patterns_y))

    # Get the pattern
    non_nan = ~np.array(nan_patterns_y[i]['pattern'],dtype=bool)

    # Submit a job to the local cluster and keep hold of its future
    futures.append(client.submit(inside_loop, y_fname, conf_fname,
                                 non_nan, mode, pure=False))

# Completed jobs
completed = as_completed(futures)

# Wait for results (result() re-raises any exception raised inside a job)
for i in completed:
    i.result()

t2 = time.time()
print('dask time: ', t2-t1)

# Drop the remaining references to the future objects. The original statement
# also listed `future_b`, which is never defined and raised a NameError.
del i, completed, futures


# # Get the list of columns in y that are also in y_deconf
# common_columns = [col for col in y.columns if col in y_deconf.columns]

# # Reorder y_deconf columns to match the order of common columns in y
# y_deconf = y_deconf[common_columns]
    
# # Initialise output dataframe
# deconf_out = pd.DataFrame(index=original_index,columns=y_deconf.columns,dtype=dtype)

# # Restore the nan rows
# if conf_has_nans:
#     deconf_out[conf_non_nan_inds] = np.array(y_deconf.values,dtype=dtype)
# else:
#     deconf_out[:] = np.array(y_deconf.values,dtype=dtype)


In [None]:
len(non_nan)

In [None]:

def inside_loop(y, conf, non_nan, mode):
    """
    Deconfound the columns of y belonging to one nan-pattern.

    The rows selected by `non_nan` are regressed on the matching rows of
    `conf`, the fitted values are subtracted, and the result is written into
    the module-level dataframe `y_deconf` (rows outside `non_nan` are NaN).

    Parameters
    ----------
    y, conf :
        Anything `switch_type` can turn into a MemoryMappedDF (the callers in
        this notebook pass file names or dataframes).
    non_nan : boolean array
        Row mask; True marks the rows used for this nan-pattern.
    mode : str
        'pinv', 'nets_svd' or 'svd' (case-insensitive); any other value uses
        the QR branch.

    Returns
    -------
    None. Output goes to `y_deconf` and a timing line is printed.

    Notes
    -----
    NOTE(review): besides its arguments this function reads the module-level
    names `i`, `nan_patterns_y`, `demean`, `y_deconf` and `t1`. When it is run
    through `client.submit` on separate worker processes, the update of
    `y_deconf` presumably acts on the worker's copy only - confirm that
    results actually reach the submitting process.
    """

    # Change types to memory mapped dfs
    y = switch_type(y,out_type="MemoryMappedDF")
    conf = switch_type(conf,out_type="MemoryMappedDF")

    # Get the y's we're interested in (uses the global pattern index i)
    y_current = y[nan_patterns_y[i]['columns']]

    # Subset y and conf to the appropriate rows
    y_current = y_current[non_nan]
    conf_current = conf[non_nan]

    # Save y index and columns
    y_index = y_current.index
    y_columns = y_current.columns

    # If we are demeaning
    if demean:

        # Demean conf_current
        conf_current = nets_demean(conf_current)

    # We don't want to work on views of the data as it will slow the computation
    conf_current = np.array(conf_current.values)
    y_current = np.array(y_current.values)

    # Normalise the mode name once
    method = mode.lower()

    # Check if we are using pseudo inverse
    if method == 'pinv':

        # Regress conf out of y_current - we perform the pseudo inverse on
        # conf^T @ conf as we expect the number of columns to be much(!) less
        # than the number of rows and thus this ends up being more numerically
        # stable than trying to invert, or approximately invert, conf itself.
        betahat = np.linalg.pinv(conf_current.T @ conf_current) @ conf_current.T @ y_current

        # Set computational zeros to actual zeros
        betahat[np.abs(betahat) < 1e-10] = 0

        # Predicted values to regress out
        pred = conf_current @ betahat

    # Use the project's nets_svd
    elif method == 'nets_svd':

        # Only the left singular vectors are needed; no rank reduction is
        # applied in this branch.
        U, S, _ = nets_svd(conf_current, reorder=False)

        # Predicted values to regress out
        pred = U @ (U.T @ y_current)

    # Use numpy's svd
    elif method == 'svd':

        U, S, _ = np.linalg.svd(conf_current, full_matrices=False)

        # Get the rank of the matrix
        rank = np.sum(S > 1e-10)

        # Keep only the singular vectors that contribute to the rank
        U = U[:, :rank]

        # Predicted values to regress out
        pred = U @ (U.T @ y_current)

    else:

        # Perform qr decomposition
        Q, R = np.linalg.qr(conf_current)
        betahat = np.linalg.pinv(R) @ (Q.T @ y_current)

        # Set computational zeros to actual zeros
        betahat[np.abs(betahat) < 1e-10] = 0

        # Predicted values to regress out
        pred = conf_current @ betahat

    # Label the predictions like y_current (done once for all branches)
    deconf_pred = pd.DataFrame(pred, index=y_index, columns=y_columns)

    # Get deconfounded y
    y_deconf_current = pd.DataFrame(y_current, index=y_index, columns=y_columns) - deconf_pred

    # If we are demeaning, demean y
    if demean:
        y_deconf_current = nets_demean(y_deconf_current)

    # NaN-filled block with one row per row of y_deconf. np.nan is used
    # because the np.NaN alias no longer exists in NumPy 2.0.
    y_deconf_current_with_nans = np.full((len(y_deconf.index),
                                          len(y_deconf_current.columns)), np.nan)

    # Update with current values
    y_deconf_current_with_nans[non_nan,:] = y_deconf_current.values[:,:]

    # Make into a dataframe with correct index and rows
    y_deconf_current_with_nans = pd.DataFrame(y_deconf_current_with_nans,
                                             index=y_deconf.index,
                                             columns=y_deconf_current.columns)

    # Write the non-NaN values into the matching columns of y_deconf
    y_deconf.update(y_deconf_current_with_nans)

    # Elapsed time since the global t1 was set by the submitting cell
    t2 = time.time()

    print('iteration time: ', t2-t1)


In [None]:

# Get the y's we're interested in
y_current = y[nan_patterns_y[i]['columns']]

# Boolean row mask: True wherever the stored nan-pattern entry is False.
non_nan = ~np.array(nan_patterns_y[i]['pattern'],dtype=bool)

# Shown for inspection: mask length, number of True entries and the index
# labels selected by the mask.
len(non_nan),sum(non_nan),y_current.index[non_nan]

In [None]:
# confounds = mmap
# Working copy of the full confounds slice (modified in place by a later cell).
confounds_np_copy = np.array(confounds[:,:])
# Second copy made from the working copy.
confounds_np = np.array(confounds_np_copy)
# Boolean mask of entries whose magnitude exceeds 1e-10, and a copy of it.
confounds_mask = np.array(np.abs(confounds_np_copy)>1e-10,dtype=bool)
confounds_mask_copy = np.array(confounds_mask)

In [None]:
# NOTE(review): confounds_copy is not assigned in any visible cell (the arrays
# created earlier are named confounds_np_copy / confounds_np) - confirm the
# intended name.
confounds_copy.shape

# Time reading column i of the IDPs.
t1 = time.time()
y_copy = IDPs[:,i].values
t2 = time.time()
print(t2-t1)

# Time zeroing the rows excluded by non_nan in the confounds, the mask and y.
t1 = time.time()
confounds_np_copy[~non_nan,:]=0
confounds_mask_copy[~non_nan,:]=0
y_copy[~non_nan,:]=0
t2 = time.time()
print(t2-t1)


# Time a masked demean: per-column sum divided by the per-column count of
# masked-in entries.
t1 = time.time()
confounds_num_vals = np.sum(confounds_mask_copy,axis=0)
# Replace zero counts by one so the division below cannot divide by zero.
confounds_num_vals[confounds_num_vals==0]=1
confounds_means = np.sum(confounds_np_copy,axis=0)/confounds_num_vals
# NOTE(review): the mean is subtracted from every row, including the rows that
# were set to zero above - confirm that is intended.
confounds_np_copy[:,:] = confounds_np_copy - confounds_means
t2 = time.time()
print(t2-t1)

In [None]:
# Time nets_svd on the full working copy of the confounds.
t1 = time.time()
u,d,vt = nets_svd(confounds_np_copy)
t2 = time.time()
print(t2-t1)

# Time nets_svd on the columns whose count is positive.
# NOTE(review): zero counts are replaced by 1 where confounds_num_vals is
# built, so this mask presumably keeps every column - confirm.
t1 = time.time()
u,d,vt = nets_svd(confounds_np_copy[:,confounds_num_vals>0])
t2 = time.time()
print(t2-t1)

In [None]:
confounds_num_vals[confounds_num_vals==0]=1

In [None]:

# Convert both inputs to plain numpy arrays, then count the NaNs in
# U @ (U.T @ y_current). U has to exist already from an earlier cell.
conf_current = np.array(conf_current.values)
y_current = np.array(y_current.values)
np.sum(np.isnan(U @ (U.T @ y_current)))

In [None]:
# type(U), y_current.shape
# Time the product U @ (U.T @ y_current).
t1 = time.time()
U @ (U.T @ y_current)
t2 = time.time()
print(t2-t1)

# NOTE(review): if y_current has already been converted to a numpy array by
# another cell it has no .values attribute - confirm the intended run order.
y_current2 = np.array(y_current.values)

# Time the same product on the freshly created array.
t1 = time.time()
U @ (U.T @ y_current2)
t2 = time.time()
print(t2-t1)


In [None]:
!pip install -U jax jaxlib

In [None]:

import jax.numpy as jnp

# Make sure conf_current is a numpy array.
conf_current = np.array(conf_current)

# Time numpy's thin SVD.
t1 = time.time()
u,d,v = np.linalg.svd(conf_current, full_matrices=False)
t2 = time.time()
print(t2-t1)


# Time the jax.numpy thin SVD on the same input.
t1 = time.time()
u2,d2,v2 = jnp.linalg.svd(conf_current, full_matrices=False)
t2 = time.time()
print(t2-t1)

# NOTE(review): singular vectors are only determined up to sign and JAX may be
# computing in 32-bit precision, so a False here does not necessarily mean
# either decomposition is wrong - confirm before drawing conclusions.
print(np.allclose(u,u2))

In [None]:
conf_current.dtype

In [None]:
#conf_current.shape, y_current.shape
# Regression coefficients via the pseudo-inverse of conf^T @ conf.
betahat = np.linalg.pinv(conf_current.T @ conf_current) @ conf_current.T @ y_current

# Set computational zeros to actual zeros
betahat[np.abs(betahat) < 1e-10] = 0

print(conf_current.shape, betahat.shape)

# Get deconfounding variable predicted values to regress out
deconf_pred = pd.DataFrame(conf_current @ betahat)
# deconf_pred.index = y_current.index

In [None]:
y = tmp2[:,0]

In [None]:
import numpy as np
from numba import njit

@njit
def tmp_svd(x):
    """Return the thin SVD factors of x exactly as np.linalg.svd yields them."""
    return np.linalg.svd(x, full_matrices=False)

# Benchmark: average wall-clock time of an SVD-based versus a QR-based
# computation of the same kind of product, over nrep repetitions.
# NOTE(review): x and y are not assigned in this cell; they come from
# whichever cells were run before - confirm they hold the intended data.
nrep = 10
t_total1 = 0
t_total2 = 0
for i in range(nrep):
    # SVD route: u @ (u.T @ y)
    t1 = time.time()
    u, d, v = np.linalg.svd(x,full_matrices=False)
    tmp1 = u @ (u.T @ y)
    t2 = time.time()
    t_total1 = t_total1+t2-t1
    
    # QR route: q @ (q.T @ y). This rebinds the global name tmp2.
    t1 = time.time()
    q,r = np.linalg.qr(x)
    tmp2 = q @ (q.T @ y)
    t2 = time.time()
    t_total2 = t_total2+t2-t1

    # Report whether both routes agree for this repetition
    print(np.allclose(tmp1,tmp2))

# Mean time per repetition: (SVD route, QR route)
print(t_total1/nrep, t_total2/nrep)

In [None]:
# `matrix` is used below, so it has to be imported alongside spmatrix/cholmod.
from cvxopt import matrix, spmatrix, cholmod
from scipy.sparse import csr_matrix

# Experiment: time building a CVXOPT sparse system from `tmp` and solving it
# with CHOLMOD. `tmp` and `y` must already exist from earlier cells.
t1 = time.time()

# Convert to scipy.sparse CSR matrix
scipy_sparse_matrix = csr_matrix(tmp)

# Convert to CVXOPT spmatrix. np.nonzero returns the row and column indices of
# the non-zero entries in the same (row-major) order as tmp[tmp!=0]; the
# original passed the row indices for both I and J.
nz_rows, nz_cols = np.nonzero(tmp)
A = spmatrix(tmp[nz_rows, nz_cols], nz_rows, nz_cols)
B = matrix(y)
t2 = time.time()
print(t2-t1)

# NOTE(review): cholmod.linsolve expects a square, symmetric positive definite
# A - confirm that tmp satisfies this before trusting the result.
t1 = time.time()
cholmod.linsolve(A,B)
t2 = time.time()
print(t2-t1)

# linsolve overwrites B with the solution; the original printed an undefined X.
print(B)



In [None]:
list(tmp[tmp!=0][:]),list(np.where(tmp!=0)[0]),list(np.where(tmp!=0)[0])

In [None]:
import os
import shutil
import numpy as np
import pandas as pd

from src.nets.nets_load_match import nets_load_match
from src.nets.nets_normalise import nets_normalise
from src.nets.nets_inverse_normal import nets_inverse_normal
from src.nets.nets_deconfound import nets_deconfound

from src.preproc.filter_columns_by_site import filter_columns_by_site

from src.memmap.MemoryMappedDF import MemoryMappedDF

In [None]:

# Confound groups we are interested in.
conf_name = ['AGE', 'AGE_SEX', 'HEAD_SIZE',  'TE', 'STRUCT_MOTION', 
             'DVARS', 'HEAD_MOTION', 'HEAD_MOTION_ST', 'TABLE_POS', 
             'EDDY_QC']

# Get all the confounds in the group
# NOTE(review): all_conf is not assigned in any visible cell - confirm where
# it is defined.
conf_group = all_conf.get_groups(conf_name)

# Get the subject ids
sub_ids = IDPs.index

# Read in the IDs for site
site_ids = nets_load_match(os.path.join(data_dir, 'ID_SITE.txt'), sub_ids)

# Get the unique site ids
unique_site_ids = np.unique(site_ids)

# Initialize inds_per_site as a list to hold one index array per site
inds_per_site = []

# Loop over each value in site ids
for site_id in unique_site_ids:

    # Find the row positions where all elements in a row of site_ids match the current site_id
    # Note: This assumes site_ids is two-dimensional, as .all(axis=1) reduces over its columns
    indices = np.where((site_ids == site_id).all(axis=1))[0]

    # Append the found indices to the inds_per_site list
    inds_per_site.append(indices)

# Delete the indices
del indices

# Initialise empty dataframe (same index as conf_group) to store results
conf_nonlin = pd.DataFrame(index=conf_group.index)

In [None]:
#nets_deconfound(IDPs[:,:], all_conf[:,:], 'svd')
import os
import numpy as np
import pandas as pd
from src.nantools.create_nan_patterns import create_nan_patterns
from src.nantools.all_non_nan_inds import all_non_nan_inds
from src.nets.nets_demean import nets_demean

import time
y = IDPs
conf = confounds

In [None]:

# Start the timer for the initialisation step
t1 = time.time()
# Save original index
original_index = y.index

# Get the indices for non-nan rows in conf
conf_non_nan_inds = all_non_nan_inds(conf, safeMode=True)

# Reduce conf and y down, ignoring the nan rows for conf.
# The guard below is commented out along with its body: an `if` followed only
# by comments is a syntax error, and `safeMode` is not defined at cell level.
# if not safeMode:
# conf = conf[conf_non_nan_inds]
# y = y[conf_non_nan_inds]

# # Initialize empty nan dataframe
# y_deconf = pd.DataFrame(index=y.index,dtype='float64')

# # If we are demeaning
# if demean:
    
#     # Demean y and conf
#     y = nets_demean(y)
#     conf = nets_demean(conf)
    
# # We now need to get the nan-patterns for y
# nan_patterns_y = create_nan_patterns(y)

# t2 = time.time()
# print('init time: ', t2-t1)

# # Loop through all unique nan patterns in y
# for i in nan_patterns_y:
    
#     t1 = time.time()
#     print('Deconfounding: ', i+1, '/', len(nan_patterns_y))

#     # Get the pattern
#     non_nan = ~np.array(nan_patterns_y[i]['pattern'],dtype=bool)

#     t2 = time.time()
#     print('nonnan time: ', t2-t1)
    
#     # Check if we have at least 5 non-nan values
#     if np.sum(1*non_nan) > 5:

#         t1 = time.time()
        
#         # Subset y to the appropriate columns
#         cols = nan_patterns_y[i]['columns']

#         # Get the y's we're interested in
#         y_current = y[nan_patterns_y[i]['columns']]

#         # Subset y and conf to the appropriate rows
#         y_current = y_current[non_nan]
#         conf_current = conf[non_nan]
        
#         t2 = time.time()
#         print('subsetting time: ', t2-t1)
    
#         t1 = time.time()
#         # If we are demeaning
#         if demean:
            
#             # Demean conf_current
#             conf_current = nets_demean(conf_current)

#         t2 = time.time()
#         print('demean time: ', t2-t1)
        
#         t1 = time.time()
#         # Increase the precision on conf_current (just in case overflow
#         # becomes a risk)
#         conf_current = np.array(conf_current,dtype=np.float64)

#         t2 = time.time()
#         print('copy time: ', t2-t1)
        
#         # Check if we are using psuedo inverse
#         if mode.lower() == 'pinv':

#             # Regress conf out of y_current - we perform the pseudo inverse on
#             # conf^T @ conf as we expect the number of columns to be much(!) less
#             # than the number of rows and thus this ends up being more numerically
#             # stable than trying to invert, or approximately invert, conf itself.
#             betahat = np.linalg.pinv(conf_current.T @ conf_current) @ conf_current.T @ y_current

#             # Set computational zeros to actual zeros
#             betahat[np.abs(betahat) < 1e-10] = 0

#             # Get deconfounding variable predicted values to regress out
#             deconf_pred = pd.DataFrame(conf_current @ betahat)
#             deconf_pred.index = y_current.index

#         # Otherwise use svd
#         elif mode.lower() == 'svd':

#             t1 = time.time()
#             # Multiply the left-singular values which contribute to the rank of conf
#             # by the corresponding singular values to rank reduce conf
#             U, S, Vt = np.linalg.svd(conf_current, full_matrices=False)

#             # Get the rank of the matrix
#             rank = np.sum(S > 1e-10)

#             # Rank reduce U
#             U = U[:, :rank] 

#             # Get deconfounding variable predicted values to regress out
#             deconf_pred = pd.DataFrame(U @ (U.T @ y_current))
#             deconf_pred.index = y_current.index
            
#             t2 = time.time()
#             print('svd time: ', t2-t1)
            
#         else:

#             # Perform qr decomposition
#             Q, R = np.linalg.qr(conf_current)
#             betahat = np.linalg.pinv(R) @ (Q.T @ y_current)

#             # Set computational zeros to actual zeros
#             betahat[np.abs(betahat) < 1e-10] = 0

#             # Get deconfounding variable predicted values to regress out
#             deconf_pred = pd.DataFrame(conf_current @ betahat)
#             deconf_pred.index = y_current.index

#         t1 = time.time()
#         # Get deconfounded y
#         y_deconf_current = y_current - deconf_pred
#         t2 = time.time()
#         print('deconf_current time: ', t2-t1)
    
#         t1 = time.time()
#         # If we are demeaning, demean y
#         if demean:
#             y_deconf_current = nets_demean(y_deconf_current)
#         t2 = time.time()
#         print('demean time 2: ', t2-t1)
        
#         t1 = time.time()
#         # Update deconfounded y (v2)
#         y_deconf_current_with_nans = np.ones((len(y_deconf.index), 
#                                               len(y_deconf_current.columns)))*np.NaN
        
#         # Update with current values
#         y_deconf_current_with_nans[non_nan,:] = y_deconf_current.values[:,:]
        
#         # Make into a dataframe with correct index and rows
#         y_deconf_current_with_nans = pd.DataFrame(y_deconf_current_with_nans,
#                                                  index=y_deconf.index,
#                                                  columns=y_deconf_current.columns)
        
#         # Horizontal concatenate
#         y_deconf = pd.concat((y_deconf_current_with_nans, y_deconf), axis=1)
        
#         t2 = time.time()
#         print('update deconfounded: ', t2-t1)

# t1 = time.time()
# # Get the list of columns in y that are also in y_deconf
# common_columns = [col for col in y.columns if col in y_deconf.columns]

# # Reorder y_deconf columns to match the order of common columns in y
# y_deconf = y_deconf[common_columns]
    
# # Remove columns where all values are NaN
# y_deconf = y_deconf.dropna(axis=1, how='all')

# # Restore the nan rows
# deconf_out = pd.DataFrame(index=original_index,columns=y_deconf.columns,dtype=dtype)
# deconf_out[conf_non_nan_inds] = np.array(y_deconf.values,dtype='float64')
# t2 = time.time()
# print('deconf out: ', t2-t1)

In [None]:
from src.memmap.MemoryMappedDF import MemoryMappedDF

type(conf)==MemoryMappedDF


In [None]:
# Time the safe-mode (column-by-column) NaN scan of conf. The elapsed time is
# t2-t1; this cell does not print it.
t1 = time.time()
conf_non_nan_inds = all_non_nan_inds(conf, safeMode=True)
t2 = time.time()

In [None]:
# Prototype of a column-by-column NaN scan over conf.
x = conf


# Create an empty boolean array with one entry per column of x
nan_array = np.zeros(x.shape[1], dtype=bool)

# Loop through columns one by one, recording whether the column holds any NaN
# NOTE(review): .any() here may not yield a scalar, depending on what
# x[:, col] returns (the following cells inspect its shape) - confirm.
for col in range(x.shape[1]):
    nan_array[col] = np.isnan(x[:, col]).any()


In [None]:
nan_array.shape, np.isnan(x[:, col]).any().shape

In [None]:
col

In [None]:
np.isnan(x[:, 0]).any()

In [None]:
def all_non_nan_inds(x, safeMode=False):
    """
    Return a boolean row mask that is True for rows of x without any NaN.

    Parameters
    ----------
    x :
        Two-dimensional data. With safeMode=False this may be a
        MemoryMappedDF or anything np.isnan accepts; with safeMode=True it
        must expose `.shape` and support `x[:, col].values`.
    safeMode : bool, optional
        If False (default) all values are examined in one go. If True the
        data is scanned one column at a time so that only a single column is
        held in memory at once.

    Returns
    -------
    Boolean mask with one entry per row of x; True means the row has no NaN.
    Both modes return a row mask (the previous safe-mode code returned one
    "has NaN" flag per column, which cannot be used to subset rows).
    """

    # If we aren't in safe mode just read everything in.
    if not safeMode:

        # If the type is memory mapped
        if isinstance(x, MemoryMappedDF):

            # Get the values
            x = x[:,:].values

        return(~np.isnan(x).any(axis=1))

    # Assume we can't load all data in at once: accumulate, column by column,
    # which rows have been seen to contain a NaN.
    row_has_nan = np.zeros(x.shape[0], dtype=bool)

    # Loop through columns one by one
    for col in range(x.shape[1]):

        # ravel copes with the column coming back as (n,) or (n, 1)
        row_has_nan |= np.ravel(np.isnan(np.asarray(x[:, col].values)))

    # Rows that never showed a NaN
    return(~row_has_nan)

In [None]:
import numpy as np
import time
from src.nets.nets_svd import nets_svd

# Random 60000 x 400 test matrix for timing the two SVD routines.
tmp = np.random.randn(60000,400)


# Time numpy's thin SVD.
t1 = time.time()
u,d,v = np.linalg.svd(tmp, full_matrices=False)
t2 = time.time()
print(t2-t1)

# Time the project's nets_svd (called with reorder=False) on the same matrix.
t1 = time.time()
u2,d2,v2 = nets_svd(tmp, reorder=False)
t2 = time.time()
print(t2-t1)

In [None]:
np.amax(np.abs(tmp - (u @ np.diag(d) @ v))), np.amax(np.abs(tmp - (u2 @ np.diag(d2) @ v2)))

In [None]:
u2.shape

In [None]:
from dask.distributed import LocalCluster
cluster = LocalCluster()

In [None]:
cluster


In [None]:
import dask
dask.config.config

In [None]:
print(dask.config.paths)


In [None]:
from dask.distributed import LocalCluster
from dask.distributed import Client, as_completed
# Start a local dask cluster
cluster = LocalCluster()

# Connect to cluster
client = Client(cluster)   

# Number of nodes we need (hard-coded here rather than read from cluster_cfg)
num_nodes = 12

# Scale the cluster
cluster.scale(num_nodes)
    

In [None]:
# MARKER: DASK

In [None]:

# Second version of the dask deconfounding experiment: passes dataframes
# (rather than file names) to inside_loop and skips patterns with 5 or fewer
# non-nan rows here instead of inside create_nan_patterns.

# Inputs
y = IDPs[:,:]
conf = confounds[:,:]
mode='nets_svd'
# The original line ended in a comma, which bound the tuple (True,) instead
# of the boolean True.
demean=True
dtype='float64'
conf_has_nans=None

t1 = time.time()
# Save original index
original_index = y.index

# Check if confounds have NaNs (None means it has not been worked out yet)
if conf_has_nans is None:

    # If the type is memory mapped
    if isinstance(conf, MemoryMappedDF):

        # Work out if the confounds have nans
        conf_has_nans = conf[:,:].isna().sum().sum()

    else:
        # Work out if the confounds have nans
        conf_has_nans = conf.isna().sum().sum()

# If the confounds have nans
if conf_has_nans:

    # If the type is memory mapped
    if isinstance(conf, MemoryMappedDF):

        # We are trying to avoid reading everything in at once
        conf_non_nan_inds = all_non_nan_inds(conf, safeMode=True)

    else:

        # Otherwise, we can get the indices for non-nan rows in conf directly
        conf_non_nan_inds = all_non_nan_inds(conf)

    # Reduce conf and y down, ignoring the nan rows for conf
    conf = conf[conf_non_nan_inds]
    y = y[conf_non_nan_inds]

    # If we have subset the data we need to demean again
    if demean:

        # Demean y and conf
        y = nets_demean(y)
        conf = nets_demean(conf)

# Initialize empty output dataframe (index only, no columns yet)
y_deconf = pd.DataFrame(index=y.index,dtype=dtype)


# We now need to get the nan-patterns for y
nan_patterns_y = create_nan_patterns(y)

t2 = time.time()

print('init time: ', t2-t1)

# Empty futures list
futures = []

# Submit jobs
for i in nan_patterns_y:

    print('Deconfounding: ', i+1, '/', len(nan_patterns_y))

    # Get the pattern
    non_nan = ~np.array(nan_patterns_y[i]['pattern'],dtype=bool)

    # Only submit patterns with more than 5 non-nan values
    if np.sum(1*non_nan) > 5:

        # Run the i^{th} job.
        future_i = client.submit(inside_loop,
                                 y[nan_patterns_y[i]['columns']],
                                 conf, non_nan, mode, pure=False)

        # Append to list
        futures.append(future_i)

# Iterator over the jobs as they complete.
# NOTE(review): nothing in this cell iterates over `completed`, so the cell
# finishes without waiting for (or surfacing errors from) the jobs - confirm.
completed = as_completed(futures)

    
# # Get the list of columns in y that are also in y_deconf
# common_columns = [col for col in y.columns if col in y_deconf.columns]

# # Reorder y_deconf columns to match the order of common columns in y
# y_deconf = y_deconf[common_columns]
    
# # Remove columns where all values are NaN
# y_deconf = y_deconf.dropna(axis=1, how='all')

# # Initialise output dataframe
# deconf_out = pd.DataFrame(index=original_index,columns=y_deconf.columns,dtype=dtype)

# # Restore the nan rows
# if conf_has_nans:
#     deconf_out[conf_non_nan_inds] = np.array(y_deconf.values,dtype=dtype)
# else:
#     deconf_out[:] = np.array(y_deconf.values,dtype=dtype)


In [None]:
from src.preproc.switch_type import switch_type

#nonIDPs.save(os.path.join(os.getcwd(),'tmp_file2'))
x=os.path.join(os.getcwd(),'tmp_file2')


In [None]:
x=switch_type(x,'pandas')

In [None]:
x

In [None]:

from src.memmap.read_memmap_df import read_memmap_df
y = read_memmap_df(x)

In [None]:
y[:,:]