# UK Biobank Confounds Processing

## Imports

In [1]:
import os
import shutil
import numpy as np
import pandas as pd

from lib.script_01_00 import generate_initial_variables
from lib.script_01_01 import generate_raw_confounds
from lib.script_01_02 import generate_nonlin_confounds

from src.nets import nets_load_match, nets_inverse_normal, nets_normalise, nets_demean, nets_deconfound
from src.duplicate import duplicate_categorical, duplicate_demedian_norm_by_site
from src.preproc import datenum, days_in_year

from src.memmap import MemoryMappedDF

In [2]:
# Data directory handed to the generate_* pipeline functions in the cells below
data_dir = "/well/win/projects/ukbiobank/fbp/confounds/data/72k_data/"

# Directory for outputs; the plan is for this to become the same as data_dir
out_dir = "/well/nichols/users/inf852/confounds/data/"

## Script 01_00: gen_init_vars

In [3]:
import time

# Run script 01_00 and time it. perf_counter() is used instead of time()
# because it is a monotonic clock intended for measuring elapsed intervals,
# so system clock adjustments during a long run cannot distort the result.
t1 = time.perf_counter()
IDPs, nonIDPs, misc = generate_initial_variables(data_dir, out_dir)
t2 = time.perf_counter()

# Elapsed time in seconds
print(t2-t1)

KeyboardInterrupt: 

## Script 01_01: gen_raw_conf_gpu

In [None]:
# Get the subject IDs (taken from the index of IDPs)
sub_ids = IDPs.index

# Generate raw confounds and time the call. perf_counter() is used instead
# of time() because it is a monotonic clock intended for measuring elapsed
# intervals.
t1 = time.perf_counter()
confounds = generate_raw_confounds(data_dir, sub_ids)
t2 = time.perf_counter()

# Elapsed time in seconds
print(t2-t1)

## Script 01_02: gen_nonlin_conf_gpu

In [None]:
# Generate non linear confounds and deconfound IDPs, timing the call.
# perf_counter() is used instead of time() because it is a monotonic clock
# intended for measuring elapsed intervals.
t1 = time.perf_counter()
nonlinear_confounds, IDPs_deconf = generate_nonlin_confounds(data_dir, confounds, IDPs)
t2 = time.perf_counter()

# Elapsed time in seconds
print(t2-t1)

In [None]:
# Display the result of get_groups('age_nonlin') on the non-linear confounds
# returned by generate_nonlin_confounds.
# NOTE(review): presumably this selects the variables belonging to the
# 'age_nonlin' group - confirm against the get_groups implementation.
nonlinear_confounds.get_groups('age_nonlin')


In [None]:
# Notes:
#  - lib = functions
#  - src = common_matlab
#  - ALL_IDs = sub_ids
#  - N = n
#  - var_QC_IDPs = QC_IDPs
#  - siteDATA = site_ids

In [None]:

# Scratch cell: robustly standardise each column of `values` by subtracting
# its median and dividing by 1.48 times its median absolute deviation,
# falling back to the standard deviation where that scale is numerically zero.
# NOTE(review): `values_file` and `ids` are not assigned in any visible cell -
# they must be defined before this cell is run.

# Get the variable values for the IDs we have
values = nets_load_match(values_file, ids)


# Demedian globally each column (note this automatically
# ignores nans)
# NOTE(review): the nan handling relies on `values` being a pandas object,
# whose median() skips nans by default - confirm the return type of
# nets_load_match.
medians = values.median()
values = values - medians

# Debug output: shape and contents of the demedianed data
print(values.shape, values)

# Get the median of the absolute values of the demedianed data for each
# column (i.e. the median absolute deviation), multiplied by 1.48
mads = 1.48 * values.abs().median()

# Debug output: scale estimates before the fallback is applied
print(mads)

# Identify columns where the absolute value of mads is less than np.finfo(float).eps
mask = mads.abs() < np.finfo(float).eps

# Calculate standard deviation for columns where mads is too small
std_devs = values.loc[:, mask].std()

# Update mads to use standard deviation for these columns
mads[mask] = std_devs

# Standardise with the final mads
values = values/mads

# Debug output: scale estimates after the fallback is applied
print(mads)

# Debug output: counts of the distinct values in the standardised data
print(values.value_counts())

In [None]:
# Spot check: display the slice 35264:35274 of values[2]
values[2][35264:35274]

In [None]:
# Display the 'TE_tfMRI_Site_1' entry of `duplicates`.
# NOTE(review): `duplicates` is not assigned in any visible cell - confirm
# where it is created before relying on this cell.
duplicates['TE_tfMRI_Site_1']

In [None]:
# Benchmark: regress random data y on random confounds x in two ways
# (rank-reduced SVD vs QR), time each approach, and keep the fitted values
# (deconf1, deconf2) so the two results can be compared afterwards.

# x is 6000 x 700; y is 6000 x 3000. The product x @ randn(700,1) is a
# single 6000 x 1 column, which is broadcast across every column of y.
x = np.random.randn(6000,700)
y = np.random.randn(6000,3000) + x @ np.random.randn(700,1)


# --- Approach 1: SVD-based rank reduction ---
t1 = time.time()
# Compute the thin singular value decomposition of x
# (full_matrices=False)
U, S, Vt = np.linalg.svd(x, full_matrices=False)

# Get the rank of the matrix (number of singular values above 1e-10)
rank = np.sum(S > 1e-10)

# Multiply the left-singular vectors which contribute to the rank of x
# by the corresponding singular values to rank reduce x
x_reduced = U[:, :rank] @ np.diag(S[:rank])

# Compute the betahat estimator
betahat = np.linalg.pinv(x_reduced) @ y

# Set computational zeros to actual zeros
betahat[np.abs(betahat) < 1e-10] = 0

# Get deconfounding variable predicted values to regress out
deconf1 = x_reduced @ betahat

t2 = time.time()

# Elapsed time in seconds for the SVD approach
print(t2-t1)

# --- Approach 2: QR decomposition ---
t1 = time.time()
# Factorise x as Q @ R, then solve for betahat using the pseudo-inverse of R
Q, R = np.linalg.qr(x)
betahat = np.linalg.pinv(R) @ (Q.T @ y)

# Set computational zeros to actual zeros
betahat[np.abs(betahat) < 1e-10] = 0

# Get deconfounding variable predicted values to regress out
deconf2 = x @ betahat

t2 = time.time()
# Elapsed time in seconds for the QR approach
print(t2-t1)


In [None]:
# Largest absolute element-wise difference between the QR-based (deconf2)
# and SVD-based (deconf1) fitted values
np.amax(np.abs(deconf2-deconf1))

In [None]:
# Display the shape attribute of the non-linear confounds object
nonlinear_confounds.shape

In [None]:
# Scratch call to constant_cols on df_nan with c=8 and mode='preserve'.
# NOTE(review): neither `constant_cols` nor `df_nan` is defined or imported
# in the visible part of this notebook - confirm where they come from.
constant_cols(df_nan, c=8, mode='preserve')

In [None]:
# Largest absolute element-wise difference between deconf1 and deconf2
# (the order of the operands does not matter because abs is taken)
np.amax(np.abs(deconf1-deconf2))

## Garbage Collection

In [None]:
# Note this won't execute in Jupyter until the code is restarted.
#del IDPs, nonIDPs, misc, categorical_IDPs, continuous_IDPs, other_IDPs