## Sandbox notebook


In [None]:
import os
import time 
import shutil 
import numpy as np
import pandas as pd
pd.set_option("display.precision", 20)

from lib.script_01_00 import generate_initial_variables
from lib.script_01_01 import generate_raw_confounds
from lib.script_01_02 import generate_nonlin_confounds

from src.nets.nets_load_match import nets_load_match
from src.nets.nets_inverse_normal import nets_inverse_normal 
from src.nets.nets_normalise import nets_normalise 
from src.nets.nets_demean import nets_demean
# from src.nets.nets_deconfound import nets_deconfound

from src.duplicate.duplicate_categorical import duplicate_categorical
from src.duplicate.duplicate_demedian_norm_by_site import duplicate_demedian_norm_by_site

from src.preproc.datenum import datenum
from src.preproc.switch_type import switch_type
from src.preproc.days_in_year import days_in_year
from src.preproc.filter_columns_by_site import filter_columns_by_site

from src.memmap.MemoryMappedDF import MemoryMappedDF
from src.memmap.read_memmap_df import read_memmap_df
from src.memmap.addBlockToMmap import addBlockToMmap

In [None]:
# Input directory; later cells read 'ID_SITE.txt' from here
data_dir = '/well/win/projects/ukbiobank/fbp/confounds/data/72k_data/'

# Output directory (will eventually be equal to data_dir)
out_dir = '/well/nichols/users/inf852/confounds/data/'

In [None]:

# All precomputed memory-mapped dataframes are stored under
# <current working directory>/saved_memmaps
memmap_dir = os.path.join(os.getcwd(), 'saved_memmaps')

# Load each saved dataframe by file name
IDPs = read_memmap_df(os.path.join(memmap_dir, 'IDPs.npz'))
IDPs_deconf = read_memmap_df(os.path.join(memmap_dir, 'IDPs_deconf.npz'))
nonIDPs = read_memmap_df(os.path.join(memmap_dir, 'nonIDPs.npz'))
misc = read_memmap_df(os.path.join(memmap_dir, 'misc.npz'))
confounds = read_memmap_df(os.path.join(memmap_dir, 'confounds.npz'))
p1 = read_memmap_df(os.path.join(memmap_dir, 'p.npz'))

# NOTE: the reduced file is loaded here; the full 'nonlinear_confounds.npz'
# is intentionally not read in this notebook.
nonlinear_confounds = read_memmap_df(os.path.join(memmap_dir, 'nonlinear_confounds_reduced.npz'))


In [None]:

# Make sure both inputs are MemoryMappedDF objects (switch_type is asked for
# that output type; `confounds` keeps its name, the result is `all_conf`)
all_conf = switch_type(confounds, out_type='MemoryMappedDF')
IDPs = switch_type(IDPs, out_type='MemoryMappedDF')

# Names of the confound groups we are interested in.
conf_name = ['AGE', 'AGE_SEX', 'HEAD_SIZE',  'TE', 'STRUCT_MOTION', 
             'DVARS', 'HEAD_MOTION', 'HEAD_MOTION_ST', 'TABLE_POS', 
             'EDDY_QC']

# Look up the confounds belonging to those groups
conf_group = all_conf.get_groups(conf_name)

# Subject ids are taken from the index of the IDP table
sub_ids = IDPs.index


In [None]:
# Display the nonlinear confounds using a full row/column slice
nonlinear_confounds[:,:]

In [None]:
import os  
import time
import numpy as np  
import pandas as pd
from scipy.stats import f  
from scipy.stats import t 
from scipy.linalg import pinv, lstsq  
from src.nets.nets_demean import nets_demean
from src.nets.nets_pearson import nets_pearson
from src.preproc.switch_type import switch_type
from src.memmap.read_memmap_df import read_memmap_df
from src.nets.nets_load_match import nets_load_match
from src.memmap.addBlockToMmap import addBlockToMmap


## V2

In [None]:
# --------------------------------------------------------------------------------
# V2: reference implementation, processing one IDP at a time.
#
# For every IDP index in `IDP_indices` and every site, each site-specific
# nonlinear confound column is regressed (on its own) against the IDP, and the
# p-value and percentage of variance explained are stored in row i of
# `p_df2` / `ve_df2` (i = position of the IDP within `IDP_indices`).
#
# The assignments below stand in for the arguments of the function this cell
# was unrolled from:
#   data_dir            - directory containing 'ID_SITE.txt'
#   IDP_indices         - column positions of the IDPs to process
#   nonlinear_confounds - confound table (filename, MemoryMappedDF or DataFrame)
#   IDPs_deconf         - IDP table (filename, MemoryMappedDF or DataFrame)
#   method              - 1 (F test), 2 (OLS with intercept, t test) or
#                         3 (Pearson correlation, t test)
#   dtype               - dtype used when writing results to memory maps
#   p_fname, ve_fname   - memory map filenames (defaults chosen if None)
#   return_df           - if False, results are written to memory maps
# --------------------------------------------------------------------------------
data_dir = data_dir
IDP_indices = np.arange(410,510)
nonlinear_confounds = nonlinear_confounds  # alternatives tried: [:,:], conf_ct
IDPs_deconf = IDPs_deconf  # alternative tried: IDPs
method=1
dtype=np.float64
p_fname=None
ve_fname=None
return_df=True

# Fail early on an unknown method instead of storing stale/undefined results
if method not in (1, 2, 3):
    raise ValueError('method must be 1, 2 or 3')

# --------------------------------------------------------------------------------
# Convert to appropriate datatype. If we have a filename for a memory mapped 
# dataframe we want to read it in as a memory mapped df (after all it is already 
# saved on disk so no extra memory usage there), otherwise if it is a pandas 
# dataframe we leave it as it is (after all it is already in memory so we are not
# increasing usage).
# --------------------------------------------------------------------------------
t1_total = time.time()

# If we only have a filename, read it in as a memory mapped dataframe. This has
# to happen before the result tables are created, as they need `.columns`.
if isinstance(nonlinear_confounds, str):
    nonlinear_confounds = switch_type(nonlinear_confounds, out_type='MemoryMappedDF')

if isinstance(IDPs_deconf, str):
    IDPs_deconf = switch_type(IDPs_deconf, out_type='MemoryMappedDF')

# Get the number of nonlinear confounds (needed to size the result tables)
num_conf_nonlin = nonlinear_confounds.shape[1]

# Get the number of IDPs
num_IDPs = IDPs_deconf.shape[1]

# Initialise NaN-filled table for p values (one row per IDP in the block)
p_df2 = pd.DataFrame(np.full((len(IDP_indices), num_conf_nonlin), np.nan),
                     columns=nonlinear_confounds.columns)

# Initialise NaN-filled table for explained variances
ve_df2 = pd.DataFrame(np.full((len(IDP_indices), num_conf_nonlin), np.nan),
                      columns=nonlinear_confounds.columns)

for i, IDP_index in enumerate(IDP_indices):

    # NOTE(review): sub_ids / site_ids do not depend on IDP_index. They are
    # left inside the loop so that the time printed below still measures the
    # per-IDP routine being benchmarked - confirm before hoisting.

    # Get the subject ids
    sub_ids = IDPs_deconf.index

    # Read in the IDs for site
    site_ids = nets_load_match(os.path.join(data_dir, 'ID_SITE.txt'), sub_ids)

    # Get the unique site ids
    unique_site_ids = np.unique(site_ids)

    # List holding, for each site, the row indices usable for this IDP
    inds_per_site = []

    # Loop over each value in site ids
    for site_id in unique_site_ids:

        # Values of the current IDP as a flat array
        if type(IDPs_deconf) == pd.core.frame.DataFrame:
            IDPs_values = IDPs_deconf.iloc[:, IDP_index].values.flatten()
        else:
            IDPs_values = IDPs_deconf[:, IDP_index].values.flatten()

        # Rows that are non-NaN for this IDP and belong to this site
        indices = np.where(~np.isnan(IDPs_values) & (site_ids == site_id).all(axis=1))[0]

        # Append the found indices to the per-site list
        inds_per_site.append(indices)

    # Delete the indices
    del indices

    # Get IDP
    if type(IDPs_deconf) == pd.core.frame.DataFrame:
        IDP = IDPs_deconf.iloc[:, IDP_index].values
    else:
        IDP = IDPs_deconf[:, IDP_index].values

    # Site numbers are the unique site ids plus one; `site_no-1` is used to
    # index inds_per_site.
    # NOTE(review): this assumes the unique site ids are 0,1,2,... - confirm.
    for site_no in (unique_site_ids + 1):

        # Get the columns of nonlinear_confounds for this site
        nonlinear_confounds_site = filter_columns_by_site(nonlinear_confounds, site_no)

        # Check if we have enough values to perform the comparison
        if len(inds_per_site[site_no-1])!=0:

            # Subset to just this site (remembering zero indexing)
            nonlinear_confounds_site = nonlinear_confounds_site.iloc[inds_per_site[site_no-1],:]

            # --------------------------------------------------------
            # Get X,Y and predicted Y
            # --------------------------------------------------------
            # Demean the confound data for the current site and nonlinear confound
            X = nets_demean(nonlinear_confounds_site).values

            # Get the IDP
            Y = IDP[inds_per_site[site_no-1]]

            # Get predicted Y = Xbeta (note this is being done separately for each
            # column so we are not doing the usual inv(X.T @ X) @ X.T @ Y
            pred_Y = ((np.sum(X*Y,axis=0)/np.sum(X*X,axis=0))*X)

            # Compute the residuals
            resids = Y - pred_Y

            # First method
            if method==1:

                # --------------------------------------------------------
                # Variance explained version 1
                # --------------------------------------------------------
                # Get variance explained by pred_Y
                ve = 100*((np.std(pred_Y,axis=0)/np.std(Y,axis=0))**2)

                # --------------------------------------------------------
                # P Version 1
                # --------------------------------------------------------

                # Compute the sum of squares for the effect
                SSeffect = np.linalg.norm(pred_Y - np.mean(pred_Y),axis=0)**2

                # Compute the sum of squares for the error
                SSerror = np.linalg.norm(resids - np.mean(resids),axis=0)**2

                # Degrees of freedom for the effect should be one as we are only 
                # regressing the one column, unless a column has no observations
                # at all
                df = 1*np.any(np.abs(X)>1e-8,axis=0)

                # Compute the degrees of freedom for the error
                dferror = len(Y) - df

                # Compute the F-statistic
                F = (SSeffect / df) / (SSerror / dferror)

                # Compute p using the F-distribution
                p = 1 - f.cdf(F, df, dferror)

            # Second method
            elif method==2:

                # --------------------------------------------------------
                # Variance explained version 2
                # --------------------------------------------------------

                # Construct new design matrix: one (n x 2) design per confound
                # column, holding an intercept and that column
                XplusIntercept = np.ones((X.shape[1], X.shape[0], 2))
                XplusIntercept[:,:,1] = X.T[:]

                # Perform OLS regression
                U, D, Vt = np.linalg.svd(XplusIntercept, full_matrices=False)

                # Get the rank of the matrix
                rank = np.sum(D > 1e-10,axis=1)

                # Rank reduce U, D and Vt. The loop index must not be called
                # `i`: that name is the row of ve_df2/p_df2 being filled.
                for col_no, rank_current in enumerate(rank):
                    U[col_no,:, rank_current:]=0
                    Vt[col_no,rank_current:,:]=0
                    D[col_no,rank_current:]=0

                # Get betahat
                beta = (Vt.transpose((0,2,1))/D.reshape(*D.shape,1)) @ (U.transpose((0,2,1)) @ Y)

                # Get residuals
                resids = Y - XplusIntercept @ beta

                # Get sigma^2 estimator
                sigma2 = np.sum(resids**2,axis=1)/Y.shape[0]

                # Contrast for beta2
                L = np.array([0,1]).reshape((1,2,1))

                # Contrast standard error (square root of the contrast variance)
                invDVtL = Vt/D.reshape(*D.shape,1) @ L
                varLtBeta = np.sqrt(sigma2.reshape(*sigma2.shape,1)*(invDVtL.transpose((0,2,1)) @ invDVtL))

                # T statistic for contrast
                T = L.transpose((0,2,1)) @ beta / varLtBeta

                # Second version of variance explained
                ve = 100*(1-(np.std(resids,axis=1)**2/np.std(Y,axis=0)**2)).flatten()

                # --------------------------------------------------------
                # P-value version 2
                # --------------------------------------------------------

                # Error degrees of freedom. Previously this branch read a
                # `dferror` that only method 1 defines; it is recomputed here
                # with the same formula so the branch is self-contained.
                # NOTE(review): with an intercept in the design, n - rank may
                # be the intended value - confirm.
                dferror = len(Y) - 1*np.any(np.abs(X)>1e-8,axis=0)

                # P value
                p = 2*t.sf(np.abs(T.flatten()), dferror)

            # Third method
            elif method==3:

                # --------------------------------------------------------
                # P-value version 3
                # --------------------------------------------------------

                # Number of elements
                n = X.shape[0]

                # Compute numerator
                numerator = np.sum(X*Y,axis=0) - n*np.mean(X,axis=0)*np.mean(Y,axis=0)

                # Compute denominator
                denom_X = np.sqrt(np.linalg.norm(X,axis=0)**2 - n*np.mean(X,axis=0)**2)
                denom_Y = np.sqrt(np.linalg.norm(Y,axis=0)**2 - n*np.mean(Y,axis=0)**2)

                # Compute correlation coefficient
                R = numerator/(denom_X*denom_Y)

                # Get T statistic
                T = R*np.sqrt((n-2)/(1-R**2))

                # Two-sided p value with n-2 degrees of freedom
                p = 2*t.sf(np.abs(T), n-2)

                # --------------------------------------------------------
                # Variance explained version 3
                # --------------------------------------------------------

                # Compute version 3 of variance explained
                ve = 100*R**2

            # Save p values and variance explained in the row of this IDP
            ve_df2.loc[i,nonlinear_confounds_site.columns] = ve
            p_df2.loc[i,nonlinear_confounds_site.columns] = p

    # Convert back to numpy (flattened copy of the whole result table so far)
    ve = ve_df2.values.flatten()
    p = p_df2.values.flatten()

    # Check if we are returning the result
    if not return_df:

        # Get the memmap filename for p values
        if p_fname is None:
            p_fname = os.path.join(os.getcwd(),'temp_mmap', 'p.npy')

        # Get the memmap filename for explained variances
        if ve_fname is None:
            ve_fname = os.path.join(os.getcwd(),'temp_mmap', 've.npy')

        # Indices for where to add to memmap (the row of this IDP)
        indices = np.ix_([IDP_index],np.arange(num_conf_nonlin))

        # NOTE(review): `p` / `ve` hold the flattened table for the whole
        # block while `indices` addresses a single row - confirm the intended
        # block before running with return_df=False.

        # Add p values to memory maps
        addBlockToMmap(p_fname, p, indices,(num_IDPs, num_conf_nonlin),dtype=dtype)

        # Add explained variance values to memory maps
        addBlockToMmap(ve_fname, ve, indices,(num_IDPs, num_conf_nonlin),dtype=dtype)

    else:

        # Results stay in ve_df2 / p_df2
        pass

t2_total = time.time()
print('Time to beat: ', t2_total-t1_total)

In [None]:
# Display the flattened variance-explained values left by the cell above
ve

## V_new


In [None]:


# --------------------------------------------------------------------------------
# V_new: block implementation of the method-1 (F test) computation of V2.
#
# All IDPs in `IDP_indices` are handled together for each site. The only
# per-IDP loop left is the one that drops the NaN rows of each IDP, because
# every IDP can have a different missingness pattern.
#
# Results are stored in `p_df` / `ve_df` (one row per IDP in the block, one
# column per nonlinear confound), which is what the later comparison against
# `p_df2` expects.
#
# The assignments below stand in for function arguments:
#   data_dir            - directory containing 'ID_SITE.txt'
#   IDP_indices         - column positions of the IDPs to process
#   nonlinear_confounds - confound table (filename, MemoryMappedDF or DataFrame)
#   IDPs_deconf         - IDP table (filename, MemoryMappedDF or DataFrame)
#   method              - unused here (only the F test is implemented)
#   dtype               - dtype used when writing results to memory maps
#   p_fname, ve_fname   - memory map filenames (defaults chosen if None)
#   return_df           - if False, results are written to memory maps
# --------------------------------------------------------------------------------
data_dir = data_dir
IDP_indices = np.arange(410,510)
nonlinear_confounds = nonlinear_confounds  # alternatives tried: [:,:], conf_ct
IDPs_deconf = IDPs_deconf  # alternative tried: IDPs
method=1
dtype=np.float64
p_fname=None
ve_fname=None
return_df=True
# --------------------------------------------------------------------------------
# Convert to appropriate datatype. If we have a filename for a memory mapped 
# dataframe we want to read it in as a memory mapped df (after all it is already 
# saved on disk so no extra memory usage there), otherwise if it is a pandas 
# dataframe we leave it as it is (after all it is already in memory so we are not
# increasing usage).
# --------------------------------------------------------------------------------
t1_total = time.time()

# If we only have a filename, read it in as a memory mapped dataframe
if isinstance(nonlinear_confounds, str):
    nonlinear_confounds = switch_type(nonlinear_confounds, out_type='MemoryMappedDF')

if isinstance(IDPs_deconf, str):
    IDPs_deconf = switch_type(IDPs_deconf, out_type='MemoryMappedDF')

# Get the subject ids
sub_ids = IDPs_deconf.index

# Read in the IDs for site
site_ids = nets_load_match(os.path.join(data_dir, 'ID_SITE.txt'), sub_ids)

# Get the unique site ids
unique_site_ids = np.unique(site_ids)

# List holding the row indices of each site
inds_per_site = []

# Loop over each value in site ids
for site_id in unique_site_ids:

    # Find the indices for this site
    indices = np.where((site_ids == site_id).all(axis=1))[0]

    # Append the found indices to the per-site list
    inds_per_site.append(indices)

# Delete the indices
del indices

# Get the number of nonlinear confounds
num_conf_nonlin = nonlinear_confounds.shape[1]

# Get the number of IDPs
num_IDPs = IDPs_deconf.shape[1]

# Get the number of IDPs in the block
num_IDPs_block = len(IDP_indices)

# Initialise NaN-filled table for p values
p_df = pd.DataFrame(np.full((num_IDPs_block, num_conf_nonlin), np.nan),
                    columns=nonlinear_confounds.columns)

# Initialise NaN-filled table for explained variances
ve_df = pd.DataFrame(np.full((num_IDPs_block, num_conf_nonlin), np.nan),
                     columns=nonlinear_confounds.columns)

# Get the block of IDPs
if type(IDPs_deconf) == pd.core.frame.DataFrame:
    IDP_block = IDPs_deconf.iloc[:, IDP_indices].values
else:
    IDP_block = IDPs_deconf[:, IDP_indices].values

# Site numbers are the unique site ids plus one; `site_no-1` is used to index
# inds_per_site.
# NOTE(review): this assumes the unique site ids are 0,1,2,... - confirm.
for site_no in (unique_site_ids + 1):

    # Get the columns of nonlinear_confounds for this site
    nonlinear_confounds_site = filter_columns_by_site(nonlinear_confounds, site_no)

    # Check if we have enough values to perform the comparison
    if len(inds_per_site[site_no-1])!=0:

        # Subset to just this site (remembering zero indexing)
        nonlinear_confounds_site = nonlinear_confounds_site.iloc[inds_per_site[site_no-1],:]

        # Demean the confound data for the current site and nonlinear confound
        X = nets_demean(nonlinear_confounds_site).values

        # Number of confounds for site
        num_conf_site = nonlinear_confounds_site.shape[1]

        # --------------------------------------------------------
        # Get X, Y and the sums of squares
        # --------------------------------------------------------

        # Get the IDPs for this site
        Y = IDP_block[inds_per_site[site_no-1],:]

        # X'Y for each (confound column, IDP column) pair
        XtY = np.zeros((num_conf_site, num_IDPs_block))

        # X'X for each pair. X has to be re-demeaned for each pattern of NaN
        # values in Y, so this cannot be broadcast.
        XtX = np.zeros((num_conf_site, num_IDPs_block))

        # Centred sum of squares of each IDP column (n * np.std(Y)**2 in V2)
        SStotal = np.zeros(num_IDPs_block)

        # Effect degrees of freedom: 1 unless the demeaned confound column is
        # numerically all zero on the rows used for that IDP (as in V2)
        df = np.zeros((num_conf_site, num_IDPs_block), dtype=int)

        # Number of observations for each IDP column
        n_per_col = np.zeros(IDP_block.shape[1])

        # Loop through y columns (the nan removal we cannot broadcast)
        for IDP_no in range(IDP_block.shape[1]):

            # Get current Y, kept two dimensional
            Y_current = Y[:,IDP_no:(IDP_no+1)]

            # Find the non-nan rows of this IDP within the site
            indices = np.where(~np.isnan(Y_current))[0]

            # Subset X and Y
            X_current = X[indices,:]
            Y_current = Y_current[indices,:]

            # Save n
            n_per_col[IDP_no] = Y_current.shape[0]

            # Demean X over the rows actually used
            X_current = X_current - np.mean(X_current, axis=0)

            # Compute XtX current
            XtX[:,IDP_no] = np.einsum('ij,ij->j', X_current, X_current)

            # Compute XtY current
            XtY[:,IDP_no] = np.einsum('ij,ik->j', X_current, Y_current)

            # Centred sum of squares of Y
            SStotal[IDP_no] = np.sum((Y_current - np.mean(Y_current))**2)

            # Effect degrees of freedom for this IDP
            df[:,IDP_no] = np.any(np.abs(X_current)>1e-8,axis=0)

        # Get betahat (not used below; kept for inspection)
        betahat = XtY/XtX

        # Sum of squares explained by each single-column fit. X is demeaned,
        # so the fitted values have zero mean and this equals
        # ||pred_Y - mean(pred_Y)||^2 from V2.
        SSeffect = (XtY**2)/XtX

        # Residual sum of squares, ||resids - mean(resids)||^2 in V2
        SStotal = SStotal.reshape((1, num_IDPs_block))
        SSerror = SStotal - SSeffect

        # Variance explained, 100*(std(pred_Y)/std(Y))^2 in V2. The previous
        # expression divided by n a second time and used the uncentred Y'Y.
        ve = 100*SSeffect/SStotal

        # Get error degrees of freedom
        dferror = n_per_col.reshape((1, num_IDPs_block)) - df

        # F stat. The previous expression left out SSerror, so it did not
        # reproduce the V2 statistic.
        F = (SSeffect/df)/(SSerror/dferror)

        # Compute p using the F-distribution
        p = 1 - f.cdf(F, df, dferror)

        # Save p values and variance explained (tables are IDP x confound)
        ve_df[[*nonlinear_confounds_site.columns]] = ve.T
        p_df[[*nonlinear_confounds_site.columns]] = p.T

# Check if we are returning the result
if not return_df:

    # Get the memmap filename for p values
    if p_fname is None:
        p_fname = os.path.join(os.getcwd(),'temp_mmap', 'p.npy')

    # Get the memmap filename for explained variances
    if ve_fname is None:
        ve_fname = os.path.join(os.getcwd(),'temp_mmap', 've.npy')

    # Indices for where to add to memmap: the rows of every IDP in the block.
    # (This previously used `IDP_index`, which is not set in this cell, and
    # the `p` / `ve` of the last site only.)
    indices = np.ix_(IDP_indices,np.arange(num_conf_nonlin))

    # Add p values to memory maps
    addBlockToMmap(p_fname, p_df.values, indices,(num_IDPs, num_conf_nonlin),dtype=dtype)

    # Add explained variance values to memory maps
    addBlockToMmap(ve_fname, ve_df.values, indices,(num_IDPs, num_conf_nonlin),dtype=dtype)

else:

    # Results stay in ve_df / p_df
    pass

t2_total = time.time()
print('Time: ', t2_total-t1_total)

In [None]:
# Check whether IDP_index is one of the numpy integer types.
# NOTE(review): the string entries ('int64', ...) can never equal a type
# object, so only the first three entries can match.
type(IDP_index) in (np.int64, np.int32, np.int16, 'int64', 'int32', 'int16')

In [None]:
# Largest absolute difference between the V2 and V_new p-value tables
(p_df2-p_df).abs().max().max()

In [None]:
# Elementwise product of the first Y column with confound column j
Y_current[:,0]*X_current[:,j]

In [None]:
# Column-wise sum of squares of X_current (one value per column)
result=np.einsum('ij,ij->j',X_current, X_current)

In [None]:
XtY.shape

In [None]:
# Rebuild an XtX array of the same shape as XtY and fill column IDP_no
XtX = np.zeros(XtY.shape)
XtX[:,IDP_no]=result

In [None]:
j = 29
k = 0

# Compare one entry of `result` against the explicit sum of products.
# NOTE(review): `result` as computed by the einsum cell above is 1-D, so the
# two-index lookup presumably refers to an earlier 2-D result - confirm.
result[j,k]-np.sum(X_current[:,j]*Y_current[:,k])

In [None]:
p.shape

In [None]:
# NOTE(review): `p2` is not defined in any visible cell
p2.flatten().shape

In [None]:

# Scratch copy of the method-2 contrast test. It relies on resids, Y, Vt, D
# and beta already existing in the notebook state.

# Get sigma^2 estimator (sum of squared residuals divided by the row count of Y)
sigma2 = np.sum(resids**2,axis=1)/Y.shape[0]

# Contrast for beta2 (selects the second coefficient)
L = np.array([0,1]).reshape((1,2,1))

# Contrast variance
# `/` and `@` share precedence, so this evaluates as (Vt / D[..., None]) @ L.
# Note varLtBeta holds the square root of sigma2 times the quadratic form.
invDVtL = Vt/D.reshape(*D.shape,1) @ L 
varLtBeta = np.sqrt(sigma2.reshape(*sigma2.shape,1)*(invDVtL.transpose((0,2,1)) @ invDVtL))

# T statistic for contrast
T = L.transpose((0,2,1)) @ beta / varLtBeta

In [None]:
# NOTE(review): ve1, ve2 and ve3 are not assigned in any visible cell;
# presumably they hold the variance explained from methods 1-3 - confirm.
ve1

In [None]:
ve2

In [None]:
ve3

In [None]:
# Two-sided p value from the t statistic, using whatever `dferror` is
# currently in the notebook state
p = 2*t.sf(np.abs(T.flatten()), dferror)

In [None]:
p.shape

In [None]:
# --------------------------------------------------------
# Variance explained version 2
# --------------------------------------------------------

# Construct new design matrix: for each column of X, a two-column design
# whose first column is ones and whose second column is that column of X
XplusIntercept = np.ones((X.shape[1], X.shape[0], 2))
XplusIntercept[:,:,1] = X.T[:]


# Perform OLS regression (reduced SVD of every stacked design matrix)
U, D, Vt = np.linalg.svd(XplusIntercept, full_matrices=False)

In [None]:
# Sum of squared residuals divided by the number of rows of Y
sigma2 = np.sum(resids**2,axis=1)/Y.shape[0]

In [None]:

# Contrast for beta2
L = np.array([0,1]).reshape((1,2,1))

# Contrast variance
# NOTE(review): here the square root covers only sigma2, whereas the method-2
# code in the V2 cell takes the square root of the whole product - confirm
# which is intended.
invDVtL = Vt/D.reshape(*D.shape,1) @ L 
varLtBeta = np.sqrt(sigma2.reshape(*sigma2.shape,1))*(invDVtL.transpose((0,2,1)) @ invDVtL)


In [None]:
# T statistic for the contrast
T = L.transpose((0,2,1)) @ beta / varLtBeta

In [None]:
Y.shape

In [None]:

# Scratch copy of method 3: correlation between each column of X and Y,
# followed by a two-sided t test. Relies on X and Y from the notebook state.

# Number of elements (rows of X)
n = X.shape[0]

# Compute numerator: sum of products minus n times the product of the means
numerator = np.sum(X*Y,axis=0) - n*np.mean(X,axis=0)*np.mean(Y,axis=0)

# Compute denominator: square roots of the centred sums of squares
denom_X = np.sqrt(np.linalg.norm(X,axis=0)**2 - n*np.mean(X,axis=0)**2)
denom_Y = np.sqrt(np.linalg.norm(Y,axis=0)**2 - n*np.mean(Y,axis=0)**2)

# Compute coefficient
R = numerator/(denom_X*denom_Y)
 
# Get T statistic
T = R*np.sqrt((n-2)/(1-R**2))

# Two-sided p value from the t distribution with n-2 degrees of freedom
p = 2*t.sf(np.abs(T), n-2)

In [None]:
# Display the p values computed above
p

In [None]:
# Index arrays of the entries of crossed_inds that are not NaN
tmp=np.where(~np.isnan(crossed_inds))

In [None]:
# Time how long it takes to build, for a single IDP, the list of usable
# (non-NaN) row indices of every site.
import time
IDP_index = 2000
t1 = time.time()

# Get the subject ids
sub_ids = IDPs_deconf.index

# Read in the IDs for site
site_ids = nets_load_match(os.path.join(data_dir, 'ID_SITE.txt'), sub_ids)

# Get the unique site ids
unique_site_ids = np.unique(site_ids)

# Read the IDP column once: it does not depend on the site, so there is no
# need to fetch it again for every site.
if isinstance(IDPs_deconf, pd.DataFrame):
    IDPs_values = IDPs_deconf.iloc[:, IDP_index].values.flatten()
else:
    IDPs_values = IDPs_deconf[:, IDP_index].values.flatten()

# Rows where this IDP is observed
not_nan = ~np.isnan(IDPs_values)

# List holding the usable row indices of each site
inds_per_site = []

# Loop over each value in site ids
for site_id in unique_site_ids:

    # Find the non-nan indices for this site
    indices = np.where(not_nan & (site_ids == site_id).all(axis=1))[0]

    # Append the found indices to the per-site list
    inds_per_site.append(indices)

# Delete the indices
del indices
t2 = time.time()
print(t2-t1)

In [None]:
IDPs_deconf.shape

In [None]:
# Read the whole IDP table and keep its transposed values
tmp = IDPs_deconf[:,:].values.T

In [None]:

# Time a single-column read from IDPs_deconf
t1 = time.time()
IDPs_deconf[:, IDP_index]
t2 = time.time()
print(t2-t1)

In [None]:
# Display the flattened values of the last IDP column that was read
IDPs_values

In [None]:
import time
t1 = time.time()

# Subject ids come from the nonlinear confound table this time
sub_ids = nonlinear_confounds.index

# Site id of every subject, matched to sub_ids
site_ids = nets_load_match(os.path.join(data_dir, 'ID_SITE.txt'), sub_ids)

# Distinct site ids
unique_site_ids = np.unique(site_ids)

# Dictionary mapping (site id + 1) to the row indices of that site
inds_per_site = {}

# Keys are the site ids shifted up by one
for site_id in (unique_site_ids + 1):

    # Rows in which every entry of site_ids equals the unshifted site id
    row_matches = (site_ids == site_id-1).all(axis=1)
    indices = np.flatnonzero(row_matches)

    # Store the indices under the shifted site id
    inds_per_site[site_id] = indices

# Remove the temporary index array
del indices
t2 = time.time()
print(t2-t1)

In [None]:

# Rough estimate of maximum memory (bytes)
MAXMEM = 2**32

# Get the number of subjects
n_sub = len(sub_ids)

# Block size computation: the memory budget divided by the number of subjects,
# then by 8 and by 64.
# NOTE(review): presumably 8 is bytes per float64 and 64 a safety factor - confirm.
blksize = int(MAXMEM/n_sub/8/64)

In [None]:
# Per-site bookkeeping, every dict keyed by (unique site id + 1):
#   columns_for_sites - column headers of the confounds for the site
#   n_conf_per_site   - number of those columns
#   n_ct_per_site     - number of crossed terms (unordered column pairs)
columns_for_sites = {}
n_conf_per_site = {}
n_ct_per_site = {}

# Running total of crossed terms over all sites
n_ct = 0

for site_index in (unique_site_ids + 1):

    # Headers of the linear confounds, then of the nonlinear confounds
    linear_cols = filter_columns_by_site(confounds, site_index, return_df=False)
    nonlinear_cols = filter_columns_by_site(nonlinear_confounds, site_index, return_df=False)

    # Linear headers first, nonlinear headers after
    site_cols = linear_cols + nonlinear_cols
    columns_for_sites[site_index] = site_cols

    # Number of confounds for this site
    n_site = int(len(site_cols))
    n_conf_per_site[site_index] = n_site

    # Number of distinct pairs of confounds: n*(n-1)/2
    n_ct_per_site[site_index] = int((n_site-1)*n_site/2)
    n_ct = n_ct + n_ct_per_site[site_index]

In [None]:
# Display the number of crossed terms per site
n_ct_per_site

In [None]:
# One row per crossed term, three int16 columns, initialised to zero
crossed_inds = np.array(np.zeros((n_ct,3)),dtype='int16')

In [None]:

# Offsets of each site's crossed terms within the crossed-term confound
# matrix: site_idx starts at 0 and then holds the running total of crossed
# terms, so crossed_terms[:,site_idx[i]:site_idx[i+1]] are the crossed terms
# for site i.
ct_counts = list(n_ct_per_site.values())
site_idx = np.insert(np.cumsum(ct_counts), 0, 0)

In [None]:
# We now construct the crossed_inds matrix. This is interpreted as follows: row
# k represents the k-th confound - it is constructed from the product of the
# crossed_inds[k,1]^th and crossed_inds[k,2]^th terms from site number
# crossed_inds[k,0].
for i in range(len(site_idx)-1):

    # Rows of crossed_inds belonging to this site
    rows = slice(site_idx[i], site_idx[i+1])

    # Number of confounds for this site (n_conf_per_site is keyed by i+1)
    n_conf = n_conf_per_site[i+1]

    # All pairs (first, second) with first > second, in row-major order:
    # (1,0), (2,0), (2,1), (3,0), ... This is the strict lower triangle and
    # yields an empty result, instead of an error, when n_conf < 2.
    first, second = np.tril_indices(n_conf, k=-1)

    # Set the site indices
    crossed_inds[rows,0] = i

    # Set the indices for the first crossed factor
    crossed_inds[rows,1] = first

    # Set the indices for the second crossed factor
    crossed_inds[rows,2] = second

In [None]:
# Display the last ten rows of crossed_inds
crossed_inds[-10:,:]

In [None]:

# Combine the two (column-wise concatenation of the full slices)
confounds_full = pd.concat([confounds[:,:],nonlinear_confounds[:,:]], axis=1)

In [None]:
# Wrap the combined table in a MemoryMappedDF
confounds_full=MemoryMappedDF(confounds_full)

In [None]:
nonlinear_confounds.list_groups()

In [None]:
# Read the 'groups' attribute of each table directly from its __dict__
tmp=confounds.__dict__['groups']
tmp2=nonlinear_confounds.__dict__['groups']

In [None]:

# Merge the group dictionaries of both tables.
# NOTE(review): for a group name present in both, the entry from
# nonlinear_confounds replaces the one from confounds - confirm this is wanted.
groups = {**confounds.__dict__['groups'],**nonlinear_confounds.__dict__['groups']}

# Copy every group across, keeping only variables that are still columns
for group_name, current_group in groups.items():

    # Variables of this group found in either table's columns
    updated_group = [variable for variable in current_group
                     if (variable in nonlinear_confounds.columns) or (variable in confounds.columns)]

    # Only register groups that still have at least one variable
    if updated_group:
        confounds_full.set_group(group_name, updated_group)

In [None]:
confounds_full.list_groups()

In [None]:
# Register every nonlinear confound column under the group name 'nonlin'
confounds_full.set_group('nonlin', list(nonlinear_confounds.columns))

In [None]:
confounds_full.get_groups(['nonlin'])

In [None]:
# crossed_inds is created with an integer dtype (int16), so no entry can be
# NaN and this mask selects every element
np.where(~np.isnan(crossed_inds)) 

In [None]:
# Write crossed_inds to a file under ./temp_mmap, passing the index arrays of
# all its elements and its full shape
addBlockToMmap(os.path.join(os.getcwd(),'temp_mmap', 'crossed_inds.dat'), 
                   crossed_inds, np.where(~np.isnan(crossed_inds)),
                   crossed_inds.shape, dtype='int16')

In [None]:
# Open the file that was just written as an (n_ct, 3) int16 memory map
tmp4=np.memmap(os.path.join(os.getcwd(),'temp_mmap', 'crossed_inds.dat'), shape=(n_ct,3),dtype='int16') 

In [None]:
tmp4

In [None]:

    # Rough estimate of maximum memory (bytes)
    MAXMEM = 2**32

    # Get the number of subjects
    n_sub = len(sub_ids)

    # Block size computation: the memory budget divided by the number of
    # subjects, then by 8 and by 64.
    # NOTE(review): presumably 8 is bytes per float64 and 64 a safety factor - confirm.
    blksize = int(MAXMEM/n_sub/8/64)
    

In [None]:
# Index of every crossed term
idx = np.arange(n_ct)

In [None]:

    
    # Get the number of blocks we are breaking computation into
    num_blks = int(np.ceil(n_ct/blksize))

In [None]:
# Split idx into consecutive chunks of at most blksize entries
blocks = [idx[i*blksize:min((i+1)*blksize,n_ct)] for i in range(num_blks)]

In [None]:
len(blocks)

In [None]:
# Keep only the rows of crossed_inds selected by `block`. This overwrites
# crossed_inds itself.
# NOTE(review): `block` is not assigned in any visible cell; presumably one
# element of `blocks` - confirm.
crossed_inds = crossed_inds[block,:]
n_ct_block = crossed_inds.shape[0]

In [None]:
n_ct_block

In [None]:

    # Set nonlinear confound group
    confounds_full.set_group('nonlin', list(nonlinear_confounds.columns))

In [None]:
# Column headers of confounds_full for site number 4
confounds_for_site=filter_columns_by_site(confounds_full, 4, return_df=False)
# filter_columns_by_site(confounds, site_index, return_df=False)

In [None]:
# Read the column of `confounds` named by the 11th site-4 header
confounds[:,confounds_for_site[10]]

In [None]:
# Number of nonlinear confound columns for site number 1
len(filter_columns_by_site(nonlinear_confounds, 1, return_df=False))


In [None]:
# Block number, derived from the first index in `block`
int(block[0]/blksize)