# UK Biobank Confounds Processing

## Imports

In [None]:
import os
import time 
import shutil
import numpy as np
import pandas as pd

from lib.script_01_00 import generate_initial_variables
from lib.script_01_01 import generate_raw_confounds
from lib.script_01_02 import generate_nonlin_confounds

from src.nets.nets_load_match import nets_load_match
from src.nets.nets_inverse_normal import nets_inverse_normal 
from src.nets.nets_normalise import nets_normalise 
from src.nets.nets_demean import nets_demean
from src.nets.nets_deconfound import nets_deconfound

from src.duplicate.duplicate_categorical import duplicate_categorical
from src.duplicate.duplicate_demedian_norm_by_site import duplicate_demedian_norm_by_site

from src.preproc.datenum import datenum
from src.preproc.days_in_year import days_in_year

from src.memmap.MemoryMappedDF import MemoryMappedDF
from src.memmap.read_memmap_df import read_memmap_df
from src.memmap.addBlockToMmap import addBlockToMmap

In [None]:
# Input directory: passed to the generate_* scripts below and used to locate
# files such as ID_SITE.txt
data_dir = '/well/win/projects/ukbiobank/fbp/confounds/data/72k_data/'

# Output directory (will eventually be equal to data_dir)
out_dir = '/well/nichols/users/inf852/confounds/data/'

## Script 01_00: gen_init_vars

In [None]:
# Toggle for script 01_00: when True the script is not run and its presaved
# outputs are loaded from ./saved_memmaps instead.
skip = True

if skip:

    # Rebuild the memory mapped dataframes from the presaved files
    IDPs = read_memmap_df(os.path.join(os.getcwd(),'saved_memmaps','IDPs.npz'))
    nonIDPs = read_memmap_df(os.path.join(os.getcwd(),'saved_memmaps','nonIDPs.npz'))
    misc = read_memmap_df(os.path.join(os.getcwd(),'saved_memmaps','misc.npz'))

else:

    # Run script 01_00, recording wall-clock time either side of the call
    t1 = time.time()
    IDPs, nonIDPs, misc = generate_initial_variables(data_dir, out_dir)
    t2 = time.time()

    # Report the elapsed time in seconds
    print(t2-t1)

*The previous run of notebook zero took 246.62735080718994 seconds ≈ 4.1 minutes.*

## Script 01_01: gen_raw_conf_gpu

In [None]:
# Subject IDs are taken from the index of the IDPs dataframe
sub_ids = IDPs.index

# Toggle for script 01_01: when True the script is not run and its presaved
# output is loaded from ./saved_memmaps instead.
skip = True

if skip:

    # Rebuild the confounds dataframe from the presaved file
    confounds = read_memmap_df(os.path.join(os.getcwd(),'saved_memmaps','confounds.npz'))

else:

    # Run script 01_01, recording wall-clock time either side of the call
    t1 = time.time()
    confounds = generate_raw_confounds(data_dir, sub_ids)
    t2 = time.time()

    # Report the elapsed time in seconds
    print(t2-t1)

*Previous run took 12.58459210395813 seconds.*

## Script 01_02: gen_nonlin_conf_gpu

In [None]:
# Toggle for script 01_02: when True the script is not run and its presaved
# outputs are loaded instead.
skip = False

# Files from which the memory mapped dataframes can be reconstructed; the same
# paths are written when the script runs and read when it is skipped.
nonlinear_confounds_fname = os.path.join(os.getcwd(),'saved_memmaps','nonlinear_confounds.npz')
IDPs_deconf_fname = os.path.join(os.getcwd(),'saved_memmaps','IDPs_deconf.npz')

if skip:

    # Read in the precomputed nonlinear confounds and deconfounded IDPs
    nonlinear_confounds = read_memmap_df(nonlinear_confounds_fname)
    IDPs_deconf = read_memmap_df(IDPs_deconf_fname)

else:

    # Cluster configuration handed to the script
    local_cluster = {'cluster_type':'local','num_nodes':12}

    # Generate the nonlinear confounds and deconfound the IDPs, timing the call
    t1 = time.time()
    nonlinear_confounds, IDPs_deconf = generate_nonlin_confounds(data_dir, confounds, IDPs, local_cluster)
    t2 = time.time()

    # Report the elapsed time in seconds
    print(t2-t1)

    # Persist both results so later sessions can skip this step
    nonlinear_confounds.save(nonlinear_confounds_fname)
    IDPs_deconf.save(IDPs_deconf_fname)

*Last local cluster run (12 nodes) took 1225.3047969341278 seconds ≈ 20.4 minutes.*

*Comparison on the same machine: the MATLAB local cluster run (12 nodes) took 2305.303473 seconds ≈ 38.4 minutes.*

In [None]:
# Display a small window of the nonlinear confounds as a sanity check
# (presumably rows 1-19 and columns 1-199; indexing semantics are defined by
# MemoryMappedDF, which is not shown here)
nonlinear_confounds[1:20,1:200]

In [None]:
import pandas as pd
import numpy as np

# Start from the full deconfounded IDP table
B = IDPs_deconf[:,:]

# Columns present in both the deconfounded and the original IDPs
deconf_cols = set(IDPs_deconf.columns)
raw_cols = set(IDPs.columns)
common_cols = list(deconf_cols & raw_cols)

# Overwrite the shared columns with the original IDP values, keeping NaN
# wherever the deconfounded IDPs are missing
missing_after_deconf = IDPs_deconf.loc[:, common_cols].isna()
B.loc[:, common_cols] = IDPs.loc[:, common_cols].where(~missing_after_deconf, np.nan)

print(B)

In [None]:
# Display the `mode` attribute of the deconfounded IDPs object (its meaning is
# defined by MemoryMappedDF, which is not shown here)
IDPs_deconf.mode

In [None]:
# Two scratch arrays of standard-normal draws
a = np.random.randn(100,100)
b = np.random.randn(100,100)

# Replace the positive entries of `a` with NaN. `np.nan` is used because the
# `np.NaN` alias was removed in NumPy 2.0.
a[a>0] = np.nan


In [None]:
# Take a small window of the nonlinear confounds
tmp = nonlinear_confounds[1:10,300:310]

# Select a single confound column by name (list selector, so presumably the
# result stays two-dimensional)
y_current = tmp[['HeadMotion_mean_dMRI_rel_Site_1_squared']]

# Boolean array that is True wherever the confound is not exactly zero
is_zero = np.array((y_current==0).values, dtype=bool)
non_nan = np.logical_not(is_zero)

In [None]:
# Overwrite the rows where the selected confound was exactly zero with NaN
# (presumably `tmp` is a pandas DataFrame, so the boolean array selects rows).
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
tmp[~non_nan.flatten()]=np.nan
tmp

## Script 01_03-4: gen_jobs/gen_nonlin_conf


In [None]:
# If you set skip to True, we will skip script 01_03-4. No presaved output is
# loaded yet (the loading code below is still commented out).
skip = True

# NOTE(review): get_p_vals_and_ve is not imported anywhere in the visible part
# of this notebook; it must be imported before skip is set to False.

# Run script 01_03-4
if not skip:

    # Set cluster configuration
    dask_cluster = {'cluster_type':'local','num_nodes':12}

    # Run get_p_vals_and_ve on the nonlinear confounds and deconfounded IDPs,
    # timing the call. The configuration built above is passed through; it was
    # previously defined but never used (cluster_cfg=None was passed instead).
    t1 = time.time()
    get_p_vals_and_ve(data_dir, out_dir, nonlinear_confounds, IDPs_deconf, cluster_cfg=dask_cluster)
    t2 = time.time()

    # Print the elapsed time in seconds
    print(t2-t1)

    # # Save the results as files we can reconstruct memory mapped dataframes from
    # nonlinear_confounds_fname = os.path.join(os.getcwd(),'saved_memmaps','nonlinear_confounds.npz')
    # nonlinear_confounds.save(nonlinear_confounds_fname)

    # # Save the results as files we can reconstruct memory mapped dataframes from
    # IDPs_deconf_fname = os.path.join(os.getcwd(),'saved_memmaps','IDPs_deconf.npz')
    # IDPs_deconf.save(IDPs_deconf_fname)

else:

    # Nothing to load yet. An `else:` containing only comments does not parse,
    # so keep an explicit no-op until the loading code is enabled.
    pass

    # # Read in precomputed confounds
    # nonlinear_confounds_fname = os.path.join(os.getcwd(),'saved_memmaps','nonlinear_confounds.npz')
    # nonlinear_confounds = read_memmap_df(nonlinear_confounds_fname)

    # # Read in precomputed IDPs
    # IDPs_deconf_fname = os.path.join(os.getcwd(),'saved_memmaps','IDPs_deconf.npz')
    # IDPs_deconf = read_memmap_df(IDPs_deconf_fname)

## Script 01_05: gen_nonlin_conf

This code is called by `script_01_03-04.py` and does not need to be run at this level.

## Script 01_06: gen_nonlin_conf

This script has been skipped, as it is only used for generating plots in MATLAB.

In [None]:
# Column index of the nonlinear confound examined in the following cells
i = 100

# Column index of the IDP examined in the following cells
num_IDP = 55

# Get the subject ids
sub_ids = IDPs_deconf.index

# Read in the IDs for site
site_ids = nets_load_match(os.path.join(data_dir, 'ID_SITE.txt'), sub_ids)

# Get the unique site ids
unique_site_ids = np.unique(site_ids)

# Subjects with a non-nan value for this IDP. This does not depend on the site,
# so the column is read once rather than once per site.
idp_observed = ~np.isnan(IDPs_deconf[:, num_IDP].values.flatten())

# For each site (in the order of unique_site_ids), the positions of the
# subjects that belong to the site and have a non-nan IDP value
inds_per_site = [
    np.where(idp_observed & (site_ids == site_id).all(axis=1))[0]
    for site_id in unique_site_ids
]

In [None]:
# Get the site of the non-linear confound
site_no = int(nonlinear_confounds.columns[i].split('Site_')[1][0])

# Get the nonlinear confound for this site
nonlinear_confound = nonlinear_confounds[:,i].values

# Subset to just this site (remembering zero indexing)
nonlinear_confound = nonlinear_confound[inds_per_site[site_no-1]]

# Get X
X = nets_demean(pd.DataFrame(nonlinear_confound)).values

# Get Y
Y = IDPs_deconf[:, num_IDP].values
Y = Y[inds_per_site[site_no-1]]

# Get predicted Y = Xbeta
pred_Y = np.nansum(X*Y)/np.nansum(X**2)*X

# Get variance explained by pred_Y
ve1 = 100*((np.nanstd(pred_Y)/np.std(Y[~np.isnan(X)]))**2)

In [None]:

from scipy.stats import f
from scipy.stats import t
from scipy.linalg import pinv, lstsq
from src.nets.nets_demean import nets_demean
from src.nets.nets_pearson import nets_pearson
from src.nets.nets_load_match import nets_load_match

# This cell computes the p-value and the variance explained for one
# (IDP, nonlinear confound) pair in three different ways and prints the three
# p-values side by side.

# Demean the confound data for the current site and nonlinear confound
X = nets_demean(pd.DataFrame(nonlinear_confound)).values

# Get Y
Y = IDPs_deconf[:, num_IDP].values
Y = Y[inds_per_site[site_no-1]]

# Remove potential nans from X (boolean masking also flattens both to 1-D)
Y = Y[~np.isnan(X)]
X = X[~np.isnan(X)]

# Get predicted Y = Xbeta
pred_Y = np.nansum(X*Y)/np.nansum(X**2)*X

# Compute the residuals
resids = Y - pred_Y

# --------------------------------------------------------
# Variance explained version 1
# --------------------------------------------------------
# Get variance explained by pred_Y
ve1 = 100*((np.nanstd(pred_Y)/np.std(Y[~np.isnan(X)]))**2)

# --------------------------------------------------------
# P Version 1
# --------------------------------------------------------

# Compute the sum of squares for the effect
SSeffect = np.linalg.norm(pred_Y - np.mean(pred_Y))**2

# Compute the sum of squares for the error
SSerror = np.linalg.norm(resids - np.mean(resids))**2

# Compute the degrees of freedom for the effect
df = np.linalg.matrix_rank(X)

# Compute the degrees of freedom for the error
dferror = len(Y) - df

# Compute the F-statistic
F = (SSeffect / df) / (SSerror / dferror)

# Compute p using the F-distribution
p = 1 - f.cdf(F, df, dferror)


# --------------------------------------------------------
# Variance explained version 2
# --------------------------------------------------------

# Construct new design matrix: column 0 is the intercept, column 1 is X
XplusIntercept = np.ones((X.shape[0],2))
XplusIntercept[:,1] = X[:]

# Perform OLS regression via the SVD, XplusIntercept = U diag(D) Vt
U, D, Vt = np.linalg.svd(XplusIntercept, full_matrices=False)

# Get the rank of the matrix
rank = np.sum(D > 1e-10)

# Rank reduce U, D and Vt
U = U[:, :rank]
Vt = Vt[:rank,:]
D = D[:rank]

# Get betahat = V diag(1/D) U^T Y (Vt.T/D scales the columns of V by 1/D)
beta = (Vt.T/D) @ (U.T @ Y)

# Get residuals
resids = Y - XplusIntercept @ beta

# Get sigma^2 estimator
# NOTE(review): this divides by n rather than by n - rank - confirm whether
# the unbiased estimator was intended.
sigma2 = np.sum(resids**2)/Y.shape[0]

# Contrast for beta2
L = np.array([[0],[1]])

# Contrast variance: L^T (X^T X)^{-1} L = ||diag(1/D) Vt L||^2. The ROWS of Vt
# must be scaled by 1/D; plain `Vt/D` broadcasts over columns, which gives the
# wrong value whenever the rank is 2.
invDVtL = (Vt / D[:, np.newaxis]) @ L

# Despite its name this is the standard error (square root of the variance)
varLtBeta = np.sqrt(sigma2*invDVtL.T @ invDVtL)

# T statistic for contrast
T = L.T @ beta / varLtBeta

# Second version of variance explained
ve2 = 100*(1-(np.std(resids)**2/np.std(Y)**2))


# --------------------------------------------------------
# P-value version 2
# --------------------------------------------------------

# P value
# NOTE(review): this is a one-sided p-value and it reuses dferror from the
# no-intercept model above - confirm both are intended.
p2 = 1 - t.cdf(T, dferror)[0,0]

# --------------------------------------------------------
# P-value version 3
# --------------------------------------------------------

# Compute pearson coefficient
R, p3 = nets_pearson(X,Y)

# --------------------------------------------------------
# Variance explained version 3
# --------------------------------------------------------

# Compute version 3 of variance explained
ve3 = 100*R**2

print(p, p2, p3)

In [None]:
import time
from lib.func_01_05 import func_01_05_gen_nonlin_conf

# Time one call of func_01_05_gen_nonlin_conf for the IDP selected by num_IDP
t1 = time.time()
func_01_05_gen_nonlin_conf(
    data_dir,
    out_dir,
    num_IDP,
    nonlinear_confounds,
    IDPs_deconf,
)
t2 = time.time()

# Report the elapsed time in seconds
elapsed = t2 - t1
print(elapsed)

In [None]:
# Display the index of the IDP currently selected
num_IDP

In [None]:

import os
import shutil
import numpy as np
import pandas as pd
from dask.distributed import Client, as_completed
from lib.script_01_05 import func_01_05_gen_nonlin_conf

# This cell runs func_01_05_gen_nonlin_conf once per IDP on a dask cluster,
# then loads the six result files (p1-p3, ve1-ve3) from out_dir into memory
# mapped dataframes and removes the original files.

cluster_cfg = {'cluster_type':'slurm','num_nodes':100}

# --------------------------------------------------------------------------------
# Handle empty configuration
# --------------------------------------------------------------------------------
if cluster_cfg is None:

    # Set new local configuration
    cluster_cfg = {'cluster_type':'local','num_nodes':1}

# --------------------------------------------------------------------------------
# Set up cluster
# --------------------------------------------------------------------------------

# Raise a value error if the cluster type was not specified
if 'cluster_type' not in cluster_cfg:
    raise ValueError('Please specify "cluster_type" in the cluster configuration.')

# The cluster type is matched case-insensitively
cluster_type = cluster_cfg['cluster_type'].lower()

# dask_jobqueue cluster class for each supported scheduler
jobqueue_classes = {
    'htcondor': 'HTCondorCluster',
    'lsf': 'LSFCluster',
    'moab': 'MoabCluster',
    'oar': 'OARCluster',
    'pbs': 'PBSCluster',
    'sge': 'SGECluster',
    'slurm': 'SLURMCluster',
}

if cluster_type == 'local':

    # Load the Local Cluster
    from dask.distributed import LocalCluster
    cluster = LocalCluster()

elif cluster_type in jobqueue_classes:

    # dask_jobqueue is only imported when a job-queue cluster is requested, so
    # a local run does not need it installed
    import dask_jobqueue
    cluster = getattr(dask_jobqueue, jobqueue_classes[cluster_type])()

else:
    # Raise a value error if none of the above
    raise ValueError('The cluster type, ' + cluster_cfg['cluster_type'] + ', is not recognized.')

# --------------------------------------------------------------------------------
# Connect to client
# --------------------------------------------------------------------------------

# Connect to cluster
client = Client(cluster)

try:

    # Read in number of nodes we need
    num_nodes = int(cluster_cfg['num_nodes'])

    # Scale the cluster
    cluster.scale(num_nodes)

    # Get the dashboard link
    dashboard_link = client.cluster.dashboard_link
    dashboard_port = dashboard_link.split(':')[-1].split('/')[0]

    print("Dask distributed is now running at the following address on your cluster: " +
          dashboard_link + ". If you wish to run locally, port " + str(dashboard_port) +
          " to your machine. e.g. run something like: \n \n ssh -L "+ str(dashboard_port) + ':localhost:' +
          str(dashboard_port) + " username@cluster_address \n \nand then navigate to http://localhost:" +
          str(dashboard_port) +"/status to view the console.")

    # ----------------------------------------------------------------------------
    # Run cluster jobs
    # ----------------------------------------------------------------------------

    # Get the number of nonlinear confounds
    num_conf_nonlin = nonlinear_confounds.shape[1]

    # Get the number of IDPs
    num_IDPs = IDPs_deconf.shape[1]

    # Submit one job per IDP. The loop variable is deliberately not called `i`,
    # so that the notebook-level `i` (the confound index used by earlier cells)
    # is neither overwritten nor deleted by this cell.
    futures = []
    for idp_index in np.arange(num_IDPs):

        # Run the job for this IDP. The workers are given the saved file names
        # rather than the dataframe objects.
        futures.append(client.submit(func_01_05_gen_nonlin_conf, data_dir, out_dir, idp_index,
                                     nonlinear_confounds_fname, IDPs_deconf_fname,
                                     pure=False))

    # Completed jobs
    completed = as_completed(futures)

    # Wait for results, dropping each future as soon as it has been collected
    for finished in completed:
        finished.result()
        del finished

    # Delete the future objects (NOTE: This is important! If you don't delete the
    # futures dask tries to rerun them every time you call the result function).
    del completed, futures

finally:

    # Release the workers whether or not the jobs succeeded; otherwise every
    # run of this cell leaves a cluster running
    client.close()
    cluster.close()


def _load_result(fname, shape, index, columns):
    """Read a float32 result matrix from fname into a MemoryMappedDF."""
    raw = np.memmap(fname, dtype=np.float32, shape=shape, mode='r')[:,:]
    return MemoryMappedDF(pd.DataFrame(raw, index=index, columns=columns))


# Work out columns and index for dataframes (one row per IDP, one column per
# nonlinear confound)
indices = IDPs_deconf.columns
columns = nonlinear_confounds.columns

# Original result files, in the order p1, p2, p3, ve1, ve2, ve3
fnames = [os.path.join(out_dir, 'p1.npy'), os.path.join(out_dir, 'p2.npy'),
          os.path.join(out_dir, 'p3.npy'), os.path.join(out_dir, 've1.npy'),
          os.path.join(out_dir, 've2.npy'), os.path.join(out_dir, 've3.npy')]

# Create the memory mapped dfs
p1, p2, p3, ve1, ve2, ve3 = [
    _load_result(fname, (num_IDPs, num_conf_nonlin), indices, columns)
    for fname in fnames
]

# Loop through the original files removing each
for fname in fnames:
    os.remove(fname)

#return(p1,p2,p3,ve1,ve2,ve3)

In [None]:

# Result matrices have one row per IDP and one column per nonlinear confound
num_conf_nonlin = nonlinear_confounds.shape[1]
num_IDPs = IDPs_deconf.shape[1]

# Open the p1 result file read-only and keep a full slice of it
tmp = np.memmap(
    os.path.join(out_dir, 'p1.npy'),
    dtype=np.float32,
    shape=(num_IDPs, num_conf_nonlin),
    mode='r',
)[:,:]

In [None]:

# Row labels are the IDP names and column labels the nonlinear confound names
indices = IDPs_deconf.columns
columns = nonlinear_confounds.columns

# Load each result file in turn: read-only memmap -> DataFrame -> MemoryMappedDF
loaded = []
for npy_name in ('p1.npy', 'p2.npy', 'p3.npy', 've1.npy', 've2.npy', 've3.npy'):

    # Open the raw float32 matrix for this result
    raw = np.memmap(os.path.join(out_dir, npy_name), dtype=np.float32,
                    shape=(num_IDPs, num_conf_nonlin), mode='r')[:,:]

    # Attach labels and wrap as a memory mapped dataframe
    frame = pd.DataFrame(raw, index=indices, columns=columns)
    loaded.append(MemoryMappedDF(frame))

# Bind the results to their usual names
p1, p2, p3, ve1, ve2, ve3 = loaded

# # Remove original files
# fnames = [os.path.join(out_dir, 'p1.npy'), os.path.join(out_dir, 'p2.npy'),
#           os.path.join(out_dir, 'p3.npy'), os.path.join(out_dir, 've1.npy'),
#           os.path.join(out_dir, 've2.npy'), os.path.join(out_dir, 've3.npy')]

# # Loop through files removing each
# for fname in fnames:
#     os.remove(fname)

In [None]:
# Number of rows minus the per-column NaN count for the columns returned by
# search_cols('ASL_region*'), i.e. presumably the number of subjects with a
# value for each ASL region IDP (search_cols is defined elsewhere - confirm)
IDPs_deconf.shape[0]-IDPs_deconf.search_cols('ASL_region*').isna().sum()

In [None]:

# Directory holding the files we can reconstruct memory mapped dataframes from
saved_dir = os.path.join(os.getcwd(),'saved_memmaps')

# One output file per result
p1_fname = os.path.join(saved_dir,'p1.npz')
p2_fname = os.path.join(saved_dir,'p2.npz')
p3_fname = os.path.join(saved_dir,'p3.npz')
ve1_fname = os.path.join(saved_dir,'ve1.npz')
ve2_fname = os.path.join(saved_dir,'ve2.npz')
ve3_fname = os.path.join(saved_dir,'ve3.npz')

# Save each result to its file, in the order p1, p2, p3, ve1, ve2, ve3
results_to_save = (
    (p1, p1_fname),
    (p2, p2_fname),
    (p3, p3_fname),
    (ve1, ve1_fname),
    (ve2, ve2_fname),
    (ve3, ve3_fname),
)
for result, result_fname in results_to_save:
    result.save(result_fname)

## Garbage Collection

In [None]:
# Note this won't execute in Jupyter until the code is restarted.
#del IDPs, nonIDPs, misc, categorical_IDPs, continuous_IDPs, other_IDPs