# UK Biobank Confounds Processing

## Imports

In [None]:
import os
import time
import shutil
import numpy as np
import pandas as pd

from lib.script_01_00 import generate_initial_variables
from lib.script_01_01 import generate_raw_confounds
from lib.script_01_02 import generate_nonlin_confounds

from src.nets.nets_load_match import nets_load_match
from src.nets.nets_inverse_normal import nets_inverse_normal 
from src.nets.nets_normalise import nets_normalise 
from src.nets.nets_demean import nets_demean
from src.nets.nets_deconfound import nets_deconfound

from src.duplicate.duplicate_categorical import duplicate_categorical
from src.duplicate.duplicate_demedian_norm_by_site import duplicate_demedian_norm_by_site

from src.preproc.datenum import datenum
from src.preproc.days_in_year import days_in_year

from src.memmap.MemoryMappedDF import MemoryMappedDF
from src.memmap.read_memmap_df import read_memmap_df
from src.memmap.addBlockToMmap import addBlockToMmap

In [None]:
# Input data directory; passed to the generate_* scripts and used to locate
# ID_SITE.txt further down
data_dir = "/well/win/projects/ukbiobank/fbp/confounds/data/72k_data/"

# Directory that outputs are written to. Kept separate for now, but intended
# to eventually be the same location as data_dir.
out_dir = "/well/nichols/users/inf852/confounds/data/"

## Script 01_00: gen_init_vars

In [None]:
# Set skip to True to bypass script 01_00 and load its presaved outputs instead
skip = True

if skip:

    # Load the three precomputed memmaps from ./saved_memmaps, in the same
    # order as the values returned by generate_initial_variables
    IDPs, nonIDPs, misc = (
        read_memmap_df(os.path.join(os.getcwd(), 'saved_memmaps', fname))
        for fname in ('IDPs.npz', 'nonIDPs.npz', 'misc.npz')
    )

else:

    # Run script 01_00, timing the call
    t1 = time.time()
    IDPs, nonIDPs, misc = generate_initial_variables(data_dir, out_dir)
    t2 = time.time()

    # Report the elapsed time in seconds
    print(t2-t1)

*The previous run of script 01_00 took 234.92218828201294 seconds ≈ 4 minutes.*

## Script 01_01: gen_raw_conf_gpu

In [None]:
# Subject IDs are taken from the index of the IDPs
sub_ids = IDPs.index

# Set skip to True to bypass script 01_01 and load a presaved output instead
skip = True

if skip:

    # Load the precomputed confounds from ./saved_memmaps
    confounds = read_memmap_df(
        os.path.join(os.getcwd(), 'saved_memmaps', 'confounds.npz')
    )

else:

    # Run script 01_01 (raw confound generation), timing the call
    t1 = time.time()
    confounds = generate_raw_confounds(data_dir, sub_ids)
    t2 = time.time()

    # Report the elapsed time in seconds
    print(t2-t1)

*Previous run took 9.763055801391602 seconds.*

## Script 01_02: gen_nonlin_conf_gpu

In [None]:
# Set skip to True to bypass script 01_02 and load presaved outputs instead
skip = True

if skip:

    # Load the precomputed nonlinear confounds and deconfounded IDPs from
    # ./saved_memmaps, in the order generate_nonlin_confounds returns them
    nonlinear_confounds, IDPs_deconf = (
        read_memmap_df(os.path.join(os.getcwd(), 'saved_memmaps', fname))
        for fname in ('nonlinear_confounds.npz', 'IDPs_deconf.npz')
    )

else:

    # Run script 01_02 (generate nonlinear confounds and deconfound the
    # IDPs), timing the call
    t1 = time.time()
    nonlinear_confounds, IDPs_deconf = generate_nonlin_confounds(data_dir, confounds, IDPs)
    t2 = time.time()

    # Report the elapsed time in seconds
    print(t2-t1)
    

*Last run took 13329.225360631943 seconds ≈ 3.7 hours.*

## Scripts 01_03 and 01_04: gen_jobs/gen_nonlin_conf


In [None]:
# Column index of the nonlinear confound (i) and of the IDP (num_IDP) that
# the following cells examine
i = 100
num_IDP = 55

# Get the subject ids
sub_ids = IDPs_deconf.index

# Read in the IDs for site
site_ids = nets_load_match(os.path.join(data_dir, 'ID_SITE.txt'), sub_ids)

# Get the unique site ids
unique_site_ids = np.unique(site_ids)

# Mask of subjects with a non-nan value for this IDP. This does not depend on
# the site, so the column is read and tested once rather than once per site.
idp_not_nan = ~np.isnan(IDPs_deconf[:, num_IDP].values.flatten())

# For each site (in the order of unique_site_ids), the positions of the
# subjects that belong to that site and have a non-nan value for this IDP
inds_per_site = [
    np.where(idp_not_nan & (site_ids == site_id).all(axis=1))[0]
    for site_id in unique_site_ids
]

# The mask is only needed to build inds_per_site
del idp_not_nan

In [None]:
import re

# Name of the nonlinear confound column being examined
conf_name = nonlinear_confounds.columns[i]

# Get the site of the non-linear confound. All digits following 'Site_' are
# read, so that site numbers with more than one digit are parsed correctly.
site_match = re.search(r'Site_(\d+)', conf_name)
if site_match is None:
    raise ValueError('No site number found in confound name {!r}'.format(conf_name))
site_no = int(site_match.group(1))

# Get the nonlinear confound for this site
nonlinear_confound = nonlinear_confounds[:,i].values

# Subset to just this site (remembering zero indexing)
nonlinear_confound = nonlinear_confound[inds_per_site[site_no-1]]

# Get X (the demeaned confound)
X = nets_demean(pd.DataFrame(nonlinear_confound)).values

# Get Y (the IDP, restricted to the same subjects as X)
Y = IDPs_deconf[:, num_IDP].values
Y = Y[inds_per_site[site_no-1]]

# Get predicted Y = Xbeta, where beta is the no-intercept least squares
# slope sum(X*Y)/sum(X**2); nansum skips entries where X is nan
pred_Y = np.nansum(X*Y)/np.nansum(X**2)*X

# Get variance explained by pred_Y, as a percentage of the variance of Y
# over the entries where X is not nan
ve1 = 100*((np.nanstd(pred_Y)/np.std(Y[~np.isnan(X)]))**2)

In [None]:

from scipy.stats import f
from scipy.stats import t
from scipy.linalg import pinv, lstsq
from src.nets.nets_demean import nets_demean
from src.nets.nets_pearson import nets_pearson
from src.nets.nets_load_match import nets_load_match

# This cell computes the p-value and variance explained for one
# (nonlinear confound, IDP) pair in three different ways and prints the
# three p-values so they can be compared.

# Demean the confound data for the current site and nonlinear confound
X = nets_demean(pd.DataFrame(nonlinear_confound)).values

# Get Y
Y = IDPs_deconf[:, num_IDP].values
Y = Y[inds_per_site[site_no-1]]

# Remove potential nans from X. The mask is computed once and applied to
# both arrays; boolean-mask indexing returns 1D arrays.
not_nan = ~np.isnan(X)
Y = Y[not_nan]
X = X[not_nan]

# Get predicted Y = Xbeta (no-intercept least squares slope)
pred_Y = np.nansum(X*Y)/np.nansum(X**2)*X

# Compute the residuals
resids = Y - pred_Y

# --------------------------------------------------------
# Variance explained version 1
# --------------------------------------------------------
# Get variance explained by pred_Y (X has no nans left at this point, so Y
# needs no further masking)
ve1 = 100*((np.nanstd(pred_Y)/np.std(Y))**2)

# --------------------------------------------------------
# P Version 1
# --------------------------------------------------------

# Compute the sum of squares for the effect
SSeffect = np.linalg.norm(pred_Y - np.mean(pred_Y))**2

# Compute the sum of squares for the error
SSerror = np.linalg.norm(resids - np.mean(resids))**2

# Compute the degrees of freedom for the effect
df = np.linalg.matrix_rank(X)

# Compute the degrees of freedom for the error
dferror = len(Y) - df

# Compute the F-statistic
F = (SSeffect / df) / (SSerror / dferror)

# Compute p using the F-distribution. The survival function sf equals
# 1 - cdf but keeps precision when p is very small.
p = f.sf(F, df, dferror)


# --------------------------------------------------------
# Variance explained version 2
# --------------------------------------------------------

# Construct new design matrix (column 0: intercept, column 1: X)
XplusIntercept = np.ones((X.shape[0],2))
XplusIntercept[:,1] = X[:]

# Perform OLS regression via the SVD of the design, X = U diag(D) Vt
U, D, Vt = np.linalg.svd(XplusIntercept, full_matrices=False)

# Get the rank of the matrix
rank = np.sum(D > 1e-10)

# Rank reduce U, D and Vt
U = U[:, :rank]
Vt = Vt[:rank,:]
D = D[:rank]

# Get betahat = V D^{-1} U' Y (Vt.T/D scales column j of V by 1/D[j])
beta = (Vt.T/D) @ (U.T @ Y)

# Get residuals
resids = Y - XplusIntercept @ beta

# Residual degrees of freedom of this model (intercept + slope)
dferror2 = Y.shape[0] - rank

# Get sigma^2 estimator, dividing by the residual degrees of freedom so the
# estimator is unbiased
sigma2 = np.sum(resids**2)/dferror2

# Contrast for beta2
L = np.array([[0],[1]])

# Contrast variance: var(L'betahat) = sigma^2 * ||D^{-1} V' L||^2.
# D^{-1} V' scales ROW j of Vt by 1/D[j], hence D[:, None]; a plain Vt/D
# would broadcast D across the columns instead.
invDVtL = (Vt / D[:, None]) @ L

# Note: despite its name this holds the standard error (square root of the
# contrast variance), which is what the T statistic divides by
varLtBeta = np.sqrt(sigma2*invDVtL.T @ invDVtL)

# T statistic for contrast
T = L.T @ beta / varLtBeta

# Second version of variance explained
ve2 = 100*(1-(np.std(resids)**2/np.std(Y)**2))


# --------------------------------------------------------
# P-value version 2
# --------------------------------------------------------

# Two-sided P value, so that it is comparable with the F-test p-value of
# version 1 (an F-test on one regressor is a two-sided test of the slope).
# Uses this model's own residual degrees of freedom.
p2 = 2*t.sf(np.abs(T), dferror2)[0,0]

# --------------------------------------------------------
# P-value version 3
# --------------------------------------------------------

# Compute pearson coefficient
R, p3 = nets_pearson(X,Y)

# --------------------------------------------------------
# Variance explained version 3
# --------------------------------------------------------

# Compute version 3 of variance explained
ve3 = 100*R**2

print(p, p2, p3)

In [None]:
import time
from lib.func_01_05 import func_01_05_gen_nonlin_conf

# Time one call of func_01_05 for the IDP selected by num_IDP
t1 = time.time()
func_01_05_gen_nonlin_conf(
    data_dir,
    out_dir,
    num_IDP,
    nonlinear_confounds,
    IDPs_deconf,
)
t2 = time.time()

# Elapsed time in seconds
print(t2-t1)

In [None]:

# Get the number of nonlinear confounds
num_conf_nonlin = nonlinear_confounds.shape[1]

# Get the number of IDPs
num_IDPs = IDPs_deconf.shape[1]

# Map p1.npy from the output directory as a (num_IDPs, num_conf_nonlin)
# float32 array. It is only inspected here, so it is opened read-only to
# rule out accidentally modifying the results on disk.
tmp = np.memmap(os.path.join(out_dir, 'p1.npy'), mode='r', shape=(num_IDPs, num_conf_nonlin), dtype=np.float32)

In [None]:
# Display the row of the mapped array for the IDP selected by num_IDP
# (previously hard-coded as 55, the value num_IDP is set to)
tmp[num_IDP, :]

In [None]:
# Display the index of the IDP currently being examined
num_IDP

In [None]:
import pickle

# Path of the saved IDPs file
filename = os.path.join(os.getcwd(), 'saved_memmaps', 'IDPs.npz')
print(filename)

# Read in self_copy from the pickled file.
# NOTE(review): pickle.load can execute arbitrary code; only use it on files
# written by this project.
# The file handle is deliberately not named `f`: that would shadow the
# scipy.stats `f` distribution imported by the statistics cell.
with open(filename, 'rb') as fh:
    self_copy = pickle.load(fh)

## Garbage Collection

In [None]:
# Note this won't execute in Jupyter until the code is restarted.
#del IDPs, nonIDPs, misc, categorical_IDPs, continuous_IDPs, other_IDPs