# Detect Hits
Detect hits and write them out to a CSV file.

In [1]:
# Imports
import os
import glob
import numpy as np
import pandas as pd
import nibabel as nib
import brainbox as bb
import nilearn as nil
import statsmodels.api as sm
from scipy import stats as st
from matplotlib import gridspec
from scipy import cluster as scl
from multiprocessing import Pool
from nilearn import plotting as nlp
from matplotlib import pyplot as plt
from sklearn import linear_model as slin
from statsmodels.sandbox import stats as sts
from matplotlib.colors import LinearSegmentedColormap
from statsmodels.sandbox.stats import multicomp as smi

In [2]:
%matplotlib inline

# Paths

In [3]:
# Candidate settings to choose from
mtp_list = ['rmap_part', 'dual_regression', 'stability_maps']
pheno_list = ['/data1/abide/Pheno/combined_406.csv',
              '/data1/abide/Pheno/site_balanced_279.csv',
              '/data1/abide/Pheno/site_and_age_balanced_194.csv']
sample_names = ['combined_406_sample',
                'site_279_sample',
                'site_age_194_sample']
scale_list = np.array([7, 12, 20, 36, 64])
# Alternative covariate selection, kept for reference:
#cov_list = ['VIQ', 'DX_GROUP', 'ADOS_SOCOM_SEV', 'VPR', 'EYE_STATUS_AT_SCAN']
cov_list = ['EYE_STATUS_AT_SCAN', 'ADOS_SOCOM_SEV', 'DX_GROUP', 'VIQ']
out_str = 'Network,VIQ,DX_GROUP,ADOS_SOCOM_SEV,VPR,EYE_STATUS_AT_SCAN'

# Locations that do not depend on the selection below
mask_path = '/data1/abide/Mask/mask_data_specific.nii.gz'
in_path = '/data1/subtypes/serial_preps/'

# Selection: number of permutations and the indices into the lists above
n_iter = 1000
mtp_id, phen_id = 2, 1
sc_id = list(range(4))

# Values derived from the selection
mtp = mtp_list[mtp_id]
name, pheno_path = sample_names[phen_id], pheno_list[phen_id]
scales = scale_list[sc_id]

In [4]:
# Get the mask
m_img = nib.load(mask_path)
# Read the voxel array through `dataobj`: `Nifti1Image.get_data()` is
# deprecated and no longer available in current nibabel releases.
mask_data = np.asanyarray(m_img.dataobj)
# Boolean mask that is True wherever the mask image is non-zero
mask = mask_data != 0

In [5]:
# Get the phenotype data: read the CSV selected by `pheno_path` into a DataFrame
pheno = pd.read_csv(pheno_path)

# Scale iteration

In [6]:
# Number of rows (subjects) in the phenotype table
n_cases = pheno.shape[0]
# Despite its name, the 'rand' column holds the sequential row positions 0..n_cases-1
pheno['rand'] = np.arange(n_cases)

In [7]:
# Build one GLM design matrix per covariate of interest.
# glm_dict maps covariate name -> (design matrix, boolean row mask into `pheno`)
glm_dict = dict()

for cov in cov_list:
    # Keep only the subjects that have a value for this covariate
    cov_index = pd.notnull(pheno[cov])
    cov_pheno = pheno[cov_index]
    # Generate the model matrix
    factors = [cov, 'SEX', 'AGE_AT_SCAN', 'FD_scrubbed']
    # Make dummy variables for the site factor. Cast to float so the design
    # matrix stays purely numeric (newer pandas returns boolean dummies).
    site_factor = pd.get_dummies(cov_pheno['SITE_ID']).astype(float)
    # Turn the first site into the intercept: the remaining site columns then
    # encode offsets relative to that first site
    site_factor = site_factor.rename(columns={site_factor.columns[0]: 'INTERCEPT'})
    site_factor['INTERCEPT'] = 1.0
    # Get the other variables. `.ix` no longer exists in pandas; `.loc` does the
    # same label-based selection, and the explicit copy keeps the assignments
    # below from writing into a slice of `pheno`.
    other_factors = cov_pheno.loc[:, factors].copy()
    # Demean age
    other_factors['AGE_AT_SCAN'] = other_factors['AGE_AT_SCAN'] - np.mean(other_factors['AGE_AT_SCAN'].values)
    # Demean the covariate
    other_factors[cov] = other_factors[cov] - np.mean(other_factors[cov].values)
    # Put them back together: site columns first, then the demeaned factors
    glm_pheno = pd.concat([site_factor, other_factors], axis=1)
    glm_dict[cov] = (glm_pheno, cov_index)

In [8]:
def par_track(args):
    """Run one permutation of the covariate and count FDR-significant tests.

    Parameters
    ----------
    args : tuple
        ``(seed, glm_pheno, cov, scale, subtypes, cov_weight)`` where

        * ``seed`` seeds numpy's global random state for this permutation,
        * ``glm_pheno`` is the design matrix (a DataFrame with a ``cov`` column),
        * ``cov`` is the name of the column that is permuted and tested,
        * ``scale`` is the number of networks to loop over,
        * ``subtypes`` is the number of subtypes per network,
        * ``cov_weight`` is indexed as ``cov_weight[net_id, s_id, :]`` and
          supplies the dependent variable of each regression.

    Returns
    -------
    The number of (network, subtype) tests whose p-value for ``cov`` passes
    Benjamini-Hochberg FDR correction at alpha = 0.05.
    """
    (seed, glm_pheno, cov,
     scale, subtypes, cov_weight) = args
    np.random.seed(seed)
    # Randomize the covariate on a private copy of the design matrix, so the
    # result depends only on `seed` and never on what earlier calls did to a
    # design matrix they share with this one.
    perm_pheno = glm_pheno.copy()
    shuffled = perm_pheno[cov].values.copy()
    np.random.shuffle(shuffled)
    perm_pheno[cov] = shuffled

    # One p-value per (network, subtype) pair, stored network-major
    pvec = np.zeros(int(scale) * int(subtypes))
    for net_id in range(scale):
        # Loop through the subtypes
        for s_id in range(subtypes):
            fit = sm.OLS(cov_weight[net_id, s_id, :], perm_pheno).fit()
            pvec[net_id * subtypes + s_id] = fit.pvalues[cov]
    # Now look at the mask of p-values passing FDR Correction
    pcorr_vec = smi.multipletests(pvec, alpha=0.05, method='fdr_bh')[0]
    # count the results passing the threshold
    n_res = np.sum(pcorr_vec)
    return n_res

In [9]:
# Prepare storage: findings[scale][cov] will hold, for every permutation, the
# number of tests that survived FDR correction
findings = dict()
for scale in scales:
    findings[scale] = dict()
    for cov in cov_list:
        findings[scale][cov] = list()

# Number of subtypes extracted per network
subtypes = 5

timer = bb.tools.Counter(np.prod((len(scales), len(cov_list))))
# A single worker pool serves every scale/covariate combination and is always
# shut down at the end. Creating a new pool per covariate without closing it
# leaves its worker processes behind.
pool = Pool(processes=6)
try:
    for scale in scales:
        # Scale stuff
        netstack_path = os.path.join(in_path, 'netstack_dmn_{}_{}_scale_{:03d}.npy'.format(mtp, name, scale))
        netraw_path = os.path.join(in_path, 'netstack_raw_{}_{}_scale_{:03d}.npy'.format(mtp, name, scale))
        corrmat_path = os.path.join(in_path, 'correlation_matrix_{}_{}_scale_{:03d}.npy'.format(mtp, name, scale))
        prior_path = '/data1/cambridge/template/template_cambridge_basc_multiscale_sym_scale{:03d}.nii.gz'.format(scale)

        # Get the prior (loaded once). The array is read through `dataobj`
        # because `get_data()` is no longer available in current nibabel.
        p_img = nib.load(prior_path)
        prior = p_img
        prior_data = np.asanyarray(p_img.dataobj)

        # Turn the priors into an image: volume k keeps the label k+1 where the
        # prior equals k+1 and is zero everywhere else
        prior_temp = np.zeros((prior_data.shape + (scale,)))
        for label_id in range(scale):
            prior_temp[..., label_id] = np.where(prior_data == label_id + 1, label_id + 1, 0)
        # `.affine` / `.header` replace the removed get_affine() / get_header()
        prior_img = nib.Nifti1Image(prior_temp, affine=m_img.affine, header=m_img.header)

        # Load the serialized netstack
        netstack = np.load(netstack_path)
        corr_mat = np.load(corrmat_path)

        # Make the grand average
        gdavg = np.zeros(mask.shape + (scale,))

        # The netstack is indexed as (network, voxel, subject). The network
        # count gets its own name so the loop variable `scale`, which keys
        # `findings`, is never overwritten.
        n_net = netstack.shape[0]
        n_vox = netstack.shape[1]
        n_sub = netstack.shape[2]

        link_store = np.zeros((n_sub-1, 4, n_net))
        part_store = np.zeros((n_net, n_sub))
        sbt_store = np.zeros((n_net, subtypes, n_vox))
        weight_store = np.zeros((n_net, subtypes, n_sub))

        # Iterate through the networks
        for net_id in range(n_net):
            # Compute linkage with Ward's criterion
            link_mat = scl.hierarchy.linkage(corr_mat[net_id, ...], method='ward')
            link_store[..., net_id] = link_mat
            # Partition the linkage to get a given number of subtypes
            part_sub = scl.hierarchy.fcluster(link_mat, subtypes, criterion='maxclust')
            part_store[net_id, :] = part_sub

            sub_stack = np.zeros((n_vox, subtypes))
            for s_id in range(subtypes):
                # The boolean subject selection moves the selected subjects to
                # the first axis, so the mean over axis 0 averages the members
                # of this subtype and leaves one value per voxel
                sbt = np.mean(netstack[net_id, :, part_sub == s_id+1], 0)
                sub_stack[:, s_id] = sbt
                sbt_store[net_id, s_id, :] = sbt
                # Compute the weights: correlation of every subject's map with
                # the subtype map
                weight_store[net_id, s_id, :] = np.array(
                    [np.corrcoef(sbt, netstack[net_id, :, x])[0, 1] for x in range(n_sub)])

        # prepare the parallel run
        for cov in cov_list:
            timer.tic()
            (glm_pheno, cov_index) = glm_dict[cov]
            # Keep only the subjects that have a value for this covariate
            cov_weight = weight_store[..., cov_index.values]
            # One task per permutation; the iteration number doubles as the seed
            arg_list = [(n_it, glm_pheno, cov, n_net, subtypes, cov_weight)
                        for n_it in np.arange(n_iter)]

            # Run parallel
            results = pool.map(par_track, arg_list)
            findings[scale][cov] = np.array(results)
            timer.toc()
            timer.progress()
finally:
    # Always release the worker processes, even if an iteration failed
    pool.close()
    pool.join()

 100.0 % done 0.00 seconds to go. One step takes 12.27783 and we ran for 340.95 s so far

In [10]:
# Make a report: one row per scale, one column per covariate. Each entry is the
# fraction of permutations in which at least one test survived FDR correction.
headers = ['scale'] + cov_list
n_scales = len(scales)
n_covs = len(cov_list)

data = np.zeros((n_scales, n_covs+1))
data[:, 0] = np.array(scales)
# `row_id` (not `sc_id`) so the scale selection list from the configuration
# cell is not overwritten by this loop
for row_id, scale in enumerate(scales):
    # The builtin float replaces the `np.float` alias, which no longer exists
    # in current NumPy
    data[row_id, 1:] = np.array([float(np.sum(findings[scale][cov] != 0))/n_iter for cov in cov_list])

report = pd.DataFrame(data, columns=headers)

In [11]:
# Display the report table built in the previous cell
report

Unnamed: 0,scale,EYE_STATUS_AT_SCAN,ADOS_SOCOM_SEV,DX_GROUP,VIQ
0,7,0.04,0.055,0.038,0.048
1,12,0.041,0.044,0.036,0.045
2,20,0.049,0.044,0.034,0.038
3,36,0.044,0.038,0.043,0.034


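The empirical false discovery rate under the permuted null (the fraction of permutations in which at least one test survives FDR correction) is very close to the selected q-value of 0.05. This suggests that the analysis does not create spurious significant results in the absence of true signal.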