# UK Biobank Confounds Processing

## Imports

In [None]:
import os
import time 
import shutil
import numpy as np
import pandas as pd

from lib.script_01_00 import generate_initial_variables
from lib.script_01_01 import generate_raw_confounds
from lib.script_01_02 import generate_nonlin_confounds
from lib.script_01_03_to_4 import get_p_vals_and_ve
from lib.script_01_06_to_8 import threshold_ve
from lib.script_01_09_to_12 import generate_crossed_confounds_cluster
from lib.script_01_16 import generate_smoothed_confounds

from src.nets.nets_load_match import nets_load_match
from src.nets.nets_inverse_normal import nets_inverse_normal 
from src.nets.nets_normalise import nets_normalise 
from src.nets.nets_demean import nets_demean

from src.memmap.MemoryMappedDF import MemoryMappedDF
from src.memmap.read_memmap_df import read_memmap_df 
from src.memmap.addBlockToMmap import addBlockToMmap

In [None]:
# Input data directory; passed as data_dir to the generate_* calls below
data_dir = '/well/win/projects/ukbiobank/fbp/confounds/data/72k_data/'

# Output directory (will eventually be equal to data_dir)
out_dir = '/well/nichols/users/inf852/confounds/data/'

## Script 01_00: gen_init_vars

In [None]:
# If you set skip to True, we will skip script 01_00 and load in a presaved output
skip = True

# Run script 01_00
if not skip:

    # Time the run
    t1 = time.time()
    IDPs, nonIDPs, misc = generate_initial_variables(data_dir, out_dir)
    t2 = time.time()

    # Print the elapsed time in seconds
    print(t2-t1)

    # Files we can reconstruct memory mapped dataframes from
    IDPs_fname = os.path.join(os.getcwd(),'saved_memmaps','IDPs.npz')
    nonIDPs_fname = os.path.join(os.getcwd(),'saved_memmaps','nonIDPs.npz')
    misc_fname = os.path.join(os.getcwd(),'saved_memmaps','misc.npz')

    # Save the results
    IDPs.save(IDPs_fname)
    nonIDPs.save(nonIDPs_fname)
    misc.save(misc_fname)

else:

    # Precomputed filenames (same names as in the save branch, so the
    # *_fname variables are defined whichever branch runs)
    IDPs_fname = os.path.join(os.getcwd(),'saved_memmaps','IDPs.npz')
    nonIDPs_fname = os.path.join(os.getcwd(),'saved_memmaps','nonIDPs.npz')
    misc_fname = os.path.join(os.getcwd(),'saved_memmaps','misc.npz')

    # Read in precomputed memmaps
    IDPs = read_memmap_df(IDPs_fname)
    nonIDPs = read_memmap_df(nonIDPs_fname)
    misc = read_memmap_df(misc_fname)

*The previous run of notebook zero took 246.62735080718994 seconds ≈ 4.1 minutes.*

## Script 01_01: gen_raw_conf_gpu

In [None]:
# Get the subject IDs
sub_ids = IDPs.index

# If you set skip to True, we will skip script 01_01 and load in a presaved output
skip = True

# Run script 01_01
if not skip:

    # Generate raw confounds
    t1 = time.time()
    confounds = generate_raw_confounds(data_dir, sub_ids)
    t2 = time.time()

    # Print the elapsed time in seconds
    print(t2-t1)

    # File we can reconstruct the memory mapped dataframe from
    confounds_fname = os.path.join(os.getcwd(),'saved_memmaps','confounds.npz')

    # Save the results
    confounds.save(confounds_fname)

else:

    # Precomputed filename (same name as in the save branch, so
    # confounds_fname is defined whichever branch runs)
    confounds_fname = os.path.join(os.getcwd(),'saved_memmaps','confounds.npz')

    # Read in precomputed confounds
    confounds = read_memmap_df(confounds_fname)

*Previous run took 12.58459210395813 seconds.*

## Script 01_02: gen_nonlin_conf_gpu

In [None]:
# Set skip to True to bypass script 01_02 and reload its presaved outputs
skip = True

if skip:

    # Reload the stored nonlinear confounds
    nonlinear_confounds_fname = os.path.join(os.getcwd(),'saved_memmaps','nonlinear_confounds.npz')
    nonlinear_confounds = read_memmap_df(nonlinear_confounds_fname)

    # Reload the stored deconfounded IDPs
    IDPs_deconf_fname = os.path.join(os.getcwd(),'saved_memmaps','IDPs_deconf.npz')
    IDPs_deconf = read_memmap_df(IDPs_deconf_fname)

else:

    # Cluster configuration for this step
    local_cluster = {
        'cluster_type':'local',
        'num_nodes':12,
    }

    # Build the nonlinear confounds and the deconfounded IDPs, timing the call
    t1 = time.time()
    nonlinear_confounds, IDPs_deconf = generate_nonlin_confounds(
        data_dir, confounds, IDPs, local_cluster
    )
    t2 = time.time()

    # Report the elapsed time in seconds
    print(t2-t1)

    # Store the nonlinear confounds so they can be reloaded as a memory mapped dataframe
    nonlinear_confounds_fname = os.path.join(os.getcwd(),'saved_memmaps','nonlinear_confounds.npz')
    nonlinear_confounds.save(nonlinear_confounds_fname)

    # Store the deconfounded IDPs in the same way
    IDPs_deconf_fname = os.path.join(os.getcwd(),'saved_memmaps','IDPs_deconf.npz')
    IDPs_deconf.save(IDPs_deconf_fname)

*Last local cluster run (12 nodes) took 405.12547612190247 seconds ≈ 6.75 minutes.*

*Comparison on the same machine: the MATLAB local cluster run (12 nodes) took 2305.303473 seconds ≈ 38.4 minutes.*

## Script 01_03_to_4: gen_jobs/gen_nonlin_conf


In [None]:
# Set skip to True to bypass scripts 01_03-04 and reload their presaved outputs
skip = True

if skip:

    # Locations of the stored p and ve memory mapped dataframes
    p_fname = os.path.join(os.getcwd(),'saved_memmaps','p.npz')
    ve_fname = os.path.join(os.getcwd(),'saved_memmaps','ve.npz')

    # Reload them
    p = read_memmap_df(p_fname)
    ve = read_memmap_df(ve_fname)

else:

    # Cluster configuration for this step
    dask_cluster = {
        'cluster_type':'slurm',
        'num_nodes':100,
    }

    # Compute p and ve from the nonlinear confounds and deconfounded IDPs,
    # timing the call
    t1 = time.time()
    p, ve = get_p_vals_and_ve(
        data_dir, nonlinear_confounds, IDPs_deconf, cluster_cfg=dask_cluster
    )
    t2 = time.time()

    # Report the elapsed time in seconds
    print(t2-t1)

    # Locations to store the p and ve memory mapped dataframes
    p_fname = os.path.join(os.getcwd(),'saved_memmaps','p.npz')
    ve_fname = os.path.join(os.getcwd(),'saved_memmaps','ve.npz')

    # Store them
    p.save(p_fname)
    ve.save(ve_fname)

*Last SGE cluster run (100 nodes) took 2147.7868587970734 seconds ≈ 36 minutes.*

*The MATLAB version of 01_05 took approximately 15-20 minutes to run most of the analyses, but several jobs did not complete. The code in the MATLAB script_01_06 was then run to regenerate the remaining p-values. This took approximately 2 days (waiting for the cluster jobs to time out) plus 2 hours (regenerating the remaining values).*

A large factor in the above computation times is the method used to submit jobs. Iteration for iteration, the MATLAB code took 30-60 seconds and the Python code took 20-40 seconds.

## Script 01_05: gen_nonlin_conf

This code is called by `script_01_03_to_4.py` and does not need to be run at this level.

## Script 01_06_to_08: gen_nonlin_conf

In [None]:
# Set skip to True to bypass scripts 01_06-08 and reload their presaved output
skip = True

if skip:

    # Location of the stored reduced nonlinear confounds
    nonlinear_confounds_reduced_fname = os.path.join(os.getcwd(),'saved_memmaps','nonlinear_confounds_reduced.npz')

    # Reload them
    nonlinear_confounds_reduced = read_memmap_df(nonlinear_confounds_reduced_fname)

# Otherwise run scripts 06-08
else:

    # Threshold on ve to get the reduced nonlinear confounds, timing the call
    t1 = time.time()
    nonlinear_confounds_reduced = threshold_ve(
        ve, nonlinear_confounds, out_dir
    )
    t2 = time.time()

    # Report the elapsed time in seconds
    print(t2-t1)

    # Location to store the reduced nonlinear confounds
    nonlinear_confounds_reduced_fname = os.path.join(os.getcwd(),'saved_memmaps','nonlinear_confounds_reduced.npz')

    # Store the memory mapped dataframe
    nonlinear_confounds_reduced.save(nonlinear_confounds_reduced_fname)

*Previous run took 9.174669981002808 seconds*

## Script 01_09-01_15: gen_ct_conf_gpu

In [None]:
# Set skip to True to bypass the crossed confounds scripts (01_09 onwards)
# and reload their presaved outputs
skip = True

if skip:

    # Locations of the stored memory mapped dataframes
    IDPs_deconf_ct_fname = os.path.join(os.getcwd(),'saved_memmaps','IDPs_deconf_ct.npz')
    confounds_with_ct_fname = os.path.join(os.getcwd(),'saved_memmaps','confounds_with_ct.npz')

    # Reload them
    IDPs_deconf_ct = read_memmap_df(IDPs_deconf_ct_fname)
    confounds_with_ct = read_memmap_df(confounds_with_ct_fname)

# Otherwise run generate_crossed_confounds_cluster
else:

    # Cluster configuration for this step
    dask_cluster = {
        'cluster_type':'slurm',
        'num_nodes':100,
    }

    # Generate the crossed confounds, timing the call
    t1 = time.time()
    IDPs_deconf_ct, confounds_with_ct = generate_crossed_confounds_cluster(
        IDPs,
        confounds,
        nonlinear_confounds_reduced,
        data_dir,
        out_dir,
        cluster_cfg=dask_cluster,
    )
    t2 = time.time()

    # Report the elapsed time in seconds
    print(t2-t1)

    # Locations to store the memory mapped dataframes
    IDPs_deconf_ct_fname = os.path.join(os.getcwd(),'saved_memmaps','IDPs_deconf_ct.npz')
    confounds_with_ct_fname = os.path.join(os.getcwd(),'saved_memmaps','confounds_with_ct.npz')

    # Store them
    IDPs_deconf_ct.save(IDPs_deconf_ct_fname)
    confounds_with_ct.save(confounds_with_ct_fname)

*Previous run took 3187.9112865924835 seconds ≈ 53 minutes*

## Script 01_16: gen_date_time_conf_gpu

In [None]:
# Set skip to True to bypass script 01_16 and reload its presaved outputs
skip = True

if skip:

    # Locations of the stored memory mapped dataframes
    IDPs_deconf_smooth_fname = os.path.join(os.getcwd(),'saved_memmaps','IDPs_deconf_smooth.npz')
    confounds_with_smooth_fname = os.path.join(os.getcwd(),'saved_memmaps','confounds_with_smooth.npz')

    # Reload them
    IDPs_deconf_smooth = read_memmap_df(IDPs_deconf_smooth_fname)
    confounds_with_smooth = read_memmap_df(confounds_with_smooth_fname)

# Otherwise run script 16
else:

    # Cluster configuration for this step
    dask_cluster = {
        'cluster_type':'slurm',
        'num_nodes':100,
    }

    # Generate the smoothed confounds, timing the call
    t1 = time.time()
    IDPs_deconf_smooth, confounds_with_smooth = generate_smoothed_confounds(
        IDPs,
        confounds_with_ct,
        nonIDPs,
        data_dir,
        out_dir,
        dask_cluster,
    )
    t2 = time.time()

    # Report the elapsed time in seconds
    print(t2-t1)

    # Locations to store the memory mapped dataframes
    IDPs_deconf_smooth_fname = os.path.join(os.getcwd(),'saved_memmaps','IDPs_deconf_smooth.npz')
    confounds_with_smooth_fname = os.path.join(os.getcwd(),'saved_memmaps','confounds_with_smooth.npz')

    # Store them
    IDPs_deconf_smooth.save(IDPs_deconf_smooth_fname)
    confounds_with_smooth.save(confounds_with_smooth_fname)

*Last MATLAB script_01_16 run took 22785.612559 seconds ≈ 6.3 hours.*

*Last Python run (with around 15 SGE nodes) took 4175.982312679291 seconds ≈ 1.2 hours.*

## Garbage Collection

In [None]:
# Note this won't execute in Jupyter until the code is restarted.
#del IDPs, nonIDPs, misc, categorical_IDPs, continuous_IDPs, other_IDPs