Take the netstack and linkage out of the rest of the pipeline and instead save the intermediate arrays to disk (as `.npy` files) so that later steps can load them quickly.

In [1]:
# Imports
import os
import sys
import glob
import numpy as np
import pandas as pd
import nibabel as nib
import brainbox as bb

In [2]:
# Candidate settings to pick from (placeholder configuration)
# Phenotype tables; entries line up one-to-one with sample_names
pheno_list = [
    '/data1/abide/Pheno/combined_406.csv',
    '/data1/abide/Pheno/site_balanced_279.csv',
    '/data1/abide/Pheno/site_and_age_balanced_194.csv',
]
sample_names = [
    'combined_406_sample',
    'site_279_sample',
    'site_age_194_sample',
]
scale_list = [7, 12, 20, 36, 64]
mtp_list = ['rmap_part', 'dual_regression', 'stability_maps']
warn = True

# Indices of the chosen phenotype table, scale and metric
phen_id, sc_id, mtp_id = 1, 0, 0

# Resolve the indices into the actual values used further down
pheno_path, name = pheno_list[phen_id], sample_names[phen_id]
scale, mtp = scale_list[sc_id], mtp_list[mtp_id]

# Where the inputs live
mask_path = '/data1/abide/Mask/mask_data_specific.nii.gz'
data_path = '/data1/abide/Out/Scores/sc{:02d}/time'.format(scale)
prior_path = '/data1/cambridge/template/template_cambridge_basc_multiscale_sym_scale{:03d}.nii.gz'.format(scale)
# Glob pattern for one scores file: a zero-padded 7-digit number followed by
# a free-text field, behind a wildcard prefix
tpl = '*_fmri_{:07d}_session_1_run1_{}.nii.gz'

# Where the outputs go; every output file shares the same descriptive stem
out_path = '/data1/subtypes/serial_preps'
out_default = '{}_{}_scale_{:03d}'.format(mtp, name, scale)

# Raw netstack
net_raw_name = 'netstack_raw_{}.npy'.format(out_default)
net_raw_path = os.path.join(out_path, net_raw_name)
# Demeaned netstack
net_demean_name = 'netstack_dmn_{}.npy'.format(out_default)
net_demean_path = os.path.join(out_path, net_demean_name)
# Correlation matrices
sim_mat_name = 'correlation_matrix_{}.npy'.format(out_default)
sim_mat_path = os.path.join(out_path, sim_mat_name)

# Create the output directory on first use
if not os.path.isdir(out_path):
    print('Creating new output path {}'.format(out_path))
    os.makedirs(out_path)

In [3]:
# Get the mask
m_img = nib.load(mask_path)
# Read the stored array through ``dataobj``: ``get_data()`` is deprecated
# since nibabel 3.0 and raises as of 5.0; nibabel documents
# ``np.asanyarray(img.dataobj)`` as the way to keep the old behavior
mask_data = np.asanyarray(m_img.dataobj)
# Boolean selector that is True wherever the mask image is non-zero
mask = mask_data != 0

In [4]:
# Get the phenotype
pheno = pd.read_csv(pheno_path)
# Grab the corresponding file paths, one per row of the phenotype table.
# Every pattern that matches nothing is collected so that a single error can
# name all of them, instead of dying on a bare IndexError at the first gap.
path_list = []
missing = []
for _, row in pheno.iterrows():
    pattern = os.path.join(data_path, mtp, tpl.format(row['SUB_ID'], mtp))
    matches = glob.glob(pattern)
    if matches:
        # Same choice as before: take the first match of the pattern
        path_list.append(matches[0])
    else:
        missing.append(pattern)
if missing:
    raise IOError('No file found for {} of {} subjects:\n{}'.format(
        len(missing), len(pheno), '\n'.join(missing)))

In [5]:
# How many rows of the phenotype table belong to each site
pheno['SITE_ID'].value_counts()

NYU       130
USM        76
PITT       39
UCLA_1     26
UCLA_2      8
Name: SITE_ID, dtype: int64

In [6]:
# Display the resolved file paths for a visual check
path_list

['/data1/abide/Out/Scores/sc07/time/rmap_part/USM_fmri_0050439_session_1_run1_rmap_part.nii.gz',
 '/data1/abide/Out/Scores/sc07/time/rmap_part/USM_fmri_0050466_session_1_run1_rmap_part.nii.gz',
 '/data1/abide/Out/Scores/sc07/time/rmap_part/USM_fmri_0050440_session_1_run1_rmap_part.nii.gz',
 '/data1/abide/Out/Scores/sc07/time/rmap_part/USM_fmri_0050467_session_1_run1_rmap_part.nii.gz',
 '/data1/abide/Out/Scores/sc07/time/rmap_part/USM_fmri_0050450_session_1_run1_rmap_part.nii.gz',
 '/data1/abide/Out/Scores/sc07/time/rmap_part/NYU_fmri_0051148_session_1_run1_rmap_part.nii.gz',
 '/data1/abide/Out/Scores/sc07/time/rmap_part/USM_fmri_0050441_session_1_run1_rmap_part.nii.gz',
 '/data1/abide/Out/Scores/sc07/time/rmap_part/NYU_fmri_0051147_session_1_run1_rmap_part.nii.gz',
 '/data1/abide/Out/Scores/sc07/time/rmap_part/USM_fmri_0050452_session_1_run1_rmap_part.nii.gz',
 '/data1/abide/Out/Scores/sc07/time/rmap_part/USM_fmri_0050443_session_1_run1_rmap_part.nii.gz',
 '/data1/abide/Out/Scores/sc07

In [14]:
# Report every row of the phenotype table whose scores file cannot be found
for i, r in pheno.iterrows():
    # The template has two placeholders, so both values must be supplied;
    # formatting it with SUB_ID alone raises IndexError instead of printing
    f_name = tpl.format(r['SUB_ID'], mtp)
    if not glob.glob(os.path.join(data_path, mtp, f_name)):
        print('{} not here'.format(f_name))

In [15]:
# Get the prior
p_img = nib.load(prior_path)
# ``get_data()`` is deprecated since nibabel 3.0 and raises as of 5.0;
# ``np.asanyarray(img.dataobj)`` is the documented equivalent
prior = np.asanyarray(p_img.dataobj)

In [16]:
# One file per subject, and one voxel per True entry of the mask
n_sub = len(path_list)
n_vox = mask.sum()

In [17]:
# Storage
# The demeaned stack is not pre-allocated: it is produced in one go by the
# subtraction further down, so a zero-filled placeholder would only cost memory
netstack_raw = np.zeros((scale, n_vox, n_sub))
sim_mat = np.zeros((scale, n_sub, n_sub))

# NOTE(review): net_stack is not used by any cell visible here; it is kept
# only in case a later cell relies on it - confirm and drop if not
net_stack = np.zeros((n_vox, n_sub))

# Load every subject's file into its own slice along the last axis
timer = bb.tools.Counter(n_sub)
for sub_id, s_path in enumerate(path_list):
    timer.tic()
    # ``get_data()`` is deprecated since nibabel 3.0 and raises as of 5.0;
    # ``np.asanyarray(img.dataobj)`` is the documented equivalent.
    # Masking keeps the in-mask voxels and the transpose puts the remaining
    # axis first, matching the (scale, n_vox) slice being filled
    netstack_raw[..., sub_id] = np.asanyarray(nib.load(s_path).dataobj)[mask].T
    timer.toc()
    timer.progress()

# Save the netstack
np.save(net_raw_path, netstack_raw)

# Take the grand average across subjects (last axis)
grand_average = np.mean(netstack_raw, axis=2)
# Demean the netstack; the trailing singleton axis lets numpy broadcast the
# average over all subjects without building a tiled full-size copy first
netstack_demean = netstack_raw - grand_average[..., None]

# Save the demeaned netstack
np.save(net_demean_path, netstack_demean)

 100.0 % done 0.00 seconds to go. One step takes 0.04907 and we ran for 20.15 s so far

In [18]:
# One subject-by-subject correlation matrix per network
timer = bb.tools.Counter(scale)
for net_id in range(scale):
    timer.tic()
    # Voxel-by-subject slice for this network
    net_maps = netstack_demean[net_id]
    # Columns (subjects) are the variables, rows (voxels) the observations
    sim_mat[net_id] = np.corrcoef(net_maps, rowvar=False)
    timer.toc()
    timer.progress()

# Write the stacked correlation matrices to disk
np.save(sim_mat_path, sim_mat)

 100.0 % done 0.00 seconds to go. One step takes 99.28215 and we ran for 794.26 s so far