## Sandbox notebook


In [None]:
import os
import time 
import shutil 
import numpy as np
import pandas as pd
pd.set_option("display.precision", 20)

from lib.script_01_00 import generate_initial_variables
from lib.script_01_01 import generate_raw_confounds
from lib.script_01_02 import generate_nonlin_confounds

from src.nets.nets_load_match import nets_load_match
from src.nets.nets_inverse_normal import nets_inverse_normal 
from src.nets.nets_normalise import nets_normalise 
from src.nets.nets_demean import nets_demean
from src.nets.nets_deconfound import nets_deconfound

from src.duplicate.duplicate_categorical import duplicate_categorical
from src.duplicate.duplicate_demedian_norm_by_site import duplicate_demedian_norm_by_site

from src.preproc.datenum import datenum
from src.preproc.days_in_year import days_in_year

from src.memmap.MemoryMappedDF import MemoryMappedDF
from src.memmap.read_memmap_df import read_memmap_df
from src.memmap.addBlockToMmap import addBlockToMmap

In [None]:
# Directory the input data files are read from
data_dir = "/well/win/projects/ukbiobank/fbp/confounds/data/72k_data/"

# Directory for outputs (intended to eventually be the same as data_dir)
out_dir = "/well/nichols/users/inf852/confounds/data/"

In [None]:

# Load the precomputed memory-mapped dataframes stored under
# <current working directory>/saved_memmaps
memmap_dir = os.path.join(os.getcwd(), 'saved_memmaps')

IDPs = read_memmap_df(os.path.join(memmap_dir, 'IDPs.npz'))
nonIDPs = read_memmap_df(os.path.join(memmap_dir, 'nonIDPs.npz'))
misc = read_memmap_df(os.path.join(memmap_dir, 'misc.npz'))
confounds = read_memmap_df(os.path.join(memmap_dir, 'confounds.npz'))

In [None]:
import os
import numpy as np
import pandas as pd
from src.nets.nets_svd import nets_svd
from src.nets.nets_demean import nets_demean
from src.memmap.MemoryMappedDF import MemoryMappedDF
from src.nantools.all_non_nan_inds import all_non_nan_inds
from src.nantools.create_nan_patterns import create_nan_patterns


In [None]:
from src.nets.nets_deconfound_once import inside_loop

In [None]:
from src.nets.nets_load_match import nets_load_match
from src.preproc.filter_columns_by_site import filter_columns_by_site

# Subject ids are the index of the reduced nonlinear confounds
sub_ids = nonlinear_confounds_reduced.index

# Read in the site ids from ID_SITE.txt for these subjects
site_ids = nets_load_match(os.path.join(data_dir, 'ID_SITE.txt'), sub_ids)

# Distinct site ids present in the data
unique_site_ids = np.unique(site_ids)

# Map each site number to the row positions of its subjects. Sites are keyed
# by (raw site id + 1), the same numbering that is passed to
# filter_columns_by_site below. A row belongs to a site when every entry in
# that row of site_ids equals the raw site id. Building the dict in one
# expression leaves no temporary behind and also works when there are no
# sites at all.
inds_per_site = {
    raw_site_id + 1: np.where((site_ids == raw_site_id).all(axis=1))[0]
    for raw_site_id in unique_site_ids
}

# Site number -> column headers of the confounds specific to that site
columns_for_sites = {}

# Site number -> number of crossed terms considered for that site
n_ct_per_site = {}

# Collect the site-specific column headers and count the crossed terms
for site_index in (unique_site_ids + 1):

    # Linear confound columns for this site followed by the nonlinear ones
    site_columns = filter_columns_by_site(confounds,
                                          site_index, return_df=False)
    site_columns = site_columns + \
                   filter_columns_by_site(nonlinear_confounds_reduced,
                                          site_index, return_df=False)
    columns_for_sites[site_index] = site_columns

    # One crossed term per unordered pair of distinct columns: n*(n-1)/2,
    # computed with integer arithmetic so no float rounding is involved
    n_cols = len(site_columns)
    n_ct_per_site[site_index] = n_cols * (n_cols - 1) // 2

# Total number of crossed terms across all sites
n_ct = sum(n_ct_per_site.values())

# Get number of subjects
n_sub = len(sub_ids)

In [None]:
# The full crossed-terms array would have shape (n_sub, n_ct); its allocation
# is currently disabled:
#ct = np.zeros((n_sub,n_ct))

# Sandbox shortcut: instead of looping over every site, i.e.
# for site_index in (unique_site_ids + 1):
# a single hard-coded site is selected for the cells that follow.
site_index = 1


In [None]:
import time
# Row positions of the subjects belonging to the current site
site_rows = inds_per_site[site_index]

# Number of subjects for this site
n_sub_site_i = len(site_rows)

# Time the extraction of the non-crossed confounds for site i
t1 = time.time()
conf_site_i = filter_columns_by_site(confounds[site_rows, :], site_index)
conf_nonlin_site_i = filter_columns_by_site(
    nonlinear_confounds_reduced[site_rows, :], site_index)
t2 = time.time()
print(t2 - t1)

# Put the linear and nonlinear confounds side by side
conf_site_i = pd.concat([conf_site_i, conf_nonlin_site_i], axis=1)

# Zero-filled dataframe with one row per subject of this site and one column
# per crossed term. It is a plain in-memory DataFrame with a default index.
ct_shape = (n_sub_site_i, n_ct_per_site[site_index])
ct_site_i = pd.DataFrame(np.zeros(ct_shape))

print(ct_site_i.shape, conf_site_i.shape)

In [None]:
# Column labels of the site-specific confounds
site_cols = conf_site_i.columns

# Position of the next crossed-term column to fill
current_col = 0

# Labels of the crossed-term columns, in the order they are filled
col_names = []

# Visit every pair of distinct confound columns (i, j) with j < i
for i, name_i in enumerate(site_cols):
    for j in range(i):

        # Label the product as '<column i>__x__<column j>'
        col_names.append(name_i + '__x__' + site_cols[j])

        # Elementwise product of the two confound columns
        ct_site_i.iloc[:, current_col] = conf_site_i.iloc[:, i] * conf_site_i.iloc[:, j]

        # Move on to the next output column
        current_col += 1

# Attach the crossed-term labels to the dataframe
ct_site_i.columns = col_names

In [None]:
t1 = time.time()

# Conversion to memory-mapped dataframes is currently disabled:
# ct_site_i = MemoryMappedDF(ct_site_i)
# conf_site_i = MemoryMappedDF(conf_site_i)

# Local cluster configuration
# NOTE(review): this is not passed to the call below - confirm whether that
# is intended
local_cluster = dict(cluster_type='local', num_nodes=12)

# Deconfound the crossed terms using the site-specific confounds
conf_ct_site_i = nets_deconfound(
    ct_site_i,
    conf_site_i,
    'svd',
    check_nan_patterns=True,
)

t2 = time.time()
print(t2 - t1)

In [None]:
# Set every entry whose absolute value is below 1e-10 to exactly zero
# (presumably to remove numerical noise left by the deconfounding - confirm)
conf_ct_site_i[conf_ct_site_i.abs()<1e-10]=0

In [None]:
# Relabel the rows with this site's entries from inds_per_site
conf_ct_site_i.index = inds_per_site[site_index]

In [None]:
# Display the deconfounded crossed terms for this site
conf_ct_site_i

In [None]:
# Create a dataframe with no columns, sharing the index of confounds, which
# the crossed terms are collected into
conf_ct = pd.DataFrame(index=confounds.index)

In [None]:
# Time adding this site's crossed terms as new columns; any entries left
# missing after the column-wise concatenation are filled with 0
t1 = time.time()
conf_ct = pd.concat([conf_ct, conf_ct_site_i], axis=1).fillna(0)
t2 = time.time()
print(t2 - t1)

In [None]:
# Column labels of the crossed terms, in sorted order
sorted_columns = np.sort(conf_ct.columns)

# Display the crossed terms with their columns in sorted order
conf_ct[list(sorted_columns)]

In [None]:
from lib.script_01_09 import gen_ct_conf

# Time the library routine on the same inputs used in the cells above:
# the confounds, the reduced nonlinear confounds and the data directory
t1 = time.time()
confounds_crossed_terms = gen_ct_conf(confounds, nonlinear_confounds_reduced, data_dir)
t2 = time.time()
print(t2-t1)

In [None]:
t1 = time.time()

# Full confounds: all of confounds followed, column-wise, by all of the
# reduced nonlinear confounds
conf_full = pd.concat(
    [confounds[:, :], nonlinear_confounds_reduced[:, :]],
    axis=1,
)

# Wrap the result as a memory-mapped dataframe
conf_full = MemoryMappedDF(conf_full)

t2 = time.time()
print(t2 - t1)

print(conf_full.shape)

In [None]:
# Display the [1:10, 375:380] slice of the full confounds for inspection
conf_full[1:10,375:380]

In [None]:
t1 = time.time()

# Local cluster configuration handed to nets_deconfound
local_cluster = dict(cluster_type='local', num_nodes=12)

# Deconfound the IDPs using the full confounds (but not the crossed terms)
IDPs_deconf_w_nonlin = nets_deconfound(
    IDPs_deconf,
    conf_full,
    'nets_svd',
    conf_has_nans=False,
    check_nan_patterns=False,
    cluster_cfg=local_cluster,
)

t2 = time.time()
print(t2 - t1)