In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, FastICA
from sklearn.model_selection import LeaveOneOut, KFold, LeaveOneOut, train_test_split
from sklearn.linear_model import LinearRegression, ElasticNetCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, roc_auc_score
import scipy as sp
from scipy.stats import pearsonr
pd.options.mode.chained_assignment = None
from IPython.core.debugger import set_trace
import os
import matplotlib.pyplot as plt
import itertools
import nibabel as nib
import seaborn as sns

%matplotlib inline

In [None]:
# Helper functions: utilities called by the main analysis functions below,
# plus helpers for reading and handling data.

def match_dfs_by_ind(df_list, behav, to_compare=[]):
    """Keep only the subjects (index labels) that appear in every input.

    Parameters
    ----------
    df_list : list of pandas objects
        Feature tables indexed by subject.
    behav : pandas object
        Behavioral table indexed by subject.
    to_compare : pandas object, optional
        Additional table whose index also constrains the shared subject set.
        It is ignored when empty (the default).

    Returns
    -------
    tuple
        ``(filtered_df_list, filtered_behav)``; every returned object keeps
        only the rows whose index label is present in all supplied indices.
        Row order within each object is left as it was.
    """
    index_pool = [list(frame.index) for frame in df_list]
    index_pool.append(behav.index)
    if len(to_compare) > 0:
        index_pool.append(to_compare.index)

    # Fold the intersection over every collected index.
    shared = set(index_pool[0])
    for labels in index_pool:
        shared = shared.intersection(labels)
    shared = list(shared)

    matched_frames = [frame[frame.index.isin(shared)] for frame in df_list]
    matched_behav = behav[behav.index.isin(shared)]
    return matched_frames, matched_behav
    
def read_features(to_use, subjlist='test_subjlist', to_scale=True):
    """Load a feature matrix from disk and label its rows with subject IDs.

    Parameters
    ----------
    to_use : str
        Base name of the comma-separated feature file ``data/<to_use>.csv``.
    subjlist : str
        Base name of the text file ``data/<subjlist>.txt`` holding one subject
        ID per line; the IDs become the row index, in file order.
    to_scale : bool
        When true, every column is standardized with ``StandardScaler``
        before being returned.

    Returns
    -------
    pandas.DataFrame
        Subjects x features table (scaled or raw, depending on ``to_scale``).
    """
    raw_matrix = np.genfromtxt(f'data/{to_use}.csv', delimiter=',')

    with open(f'data/{subjlist}.txt', 'r') as handle:
        subject_ids = [row.rstrip('\n') for row in handle]

    # one row per subject, labelled by the IDs read above
    feature_frame = pd.DataFrame(data=raw_matrix, index=subject_ids)
    if not to_scale:
        return feature_frame

    standardized = StandardScaler().fit_transform(feature_frame)
    return pd.DataFrame(data=standardized, index=feature_frame.index)

def decompose(data, num_comps, transformer='pca'):
    """Fit a decomposition and return its leading component maps.

    Parameters
    ----------
    data : array-like
        Subjects x vertices matrix to decompose.
    num_comps : int
        Number of leading components to keep.
    transformer : {'pca', 'ica'}
        ``'pca'`` fits ``PCA()``; ``'ica'`` fits ``FastICA(max_iter=500)``.

    Returns
    -------
    numpy.ndarray
        Vertices x components matrix: the transposed ``components_`` of the
        fitted model, truncated to its first ``num_comps`` columns.

    Raises
    ------
    ValueError
        If ``transformer`` is not one of the supported names. Previously this
        surfaced as an ``UnboundLocalError`` on the ``fit`` call.
    """
    if transformer == 'pca':
        trans = PCA()
    elif transformer == 'ica':
        trans = FastICA(max_iter=500)
    else:
        raise ValueError(
            f"unknown transformer {transformer!r}; expected 'pca' or 'ica'"
        )
    trans.fit(data)
    # the full model is fitted; only the first num_comps columns are returned
    return trans.components_.T[:, :num_comps]

def demean_by_train(train, test):
    """Center both sets using the column means of the training set only.

    The test set is shifted by the training means (not its own), so no
    information from the test data enters the centering.

    Returns
    -------
    tuple
        ``(train_demeaned, test_demeaned)``.
    """
    reference_mean = train.mean(axis=0)
    return train - reference_mean, test - reference_mean

def shuffle_copy(to_shuffle):
    """Return a shuffled NumPy copy of a pandas object, for permutation tests.

    Parameters
    ----------
    to_shuffle : pandas.DataFrame or pandas.Series
        Data to permute; it is left untouched.

    Returns
    -------
    numpy.ndarray
        The values of ``to_shuffle`` (not a DataFrame) with their order
        permuted along the first axis by ``np.random.shuffle``. For 2-D data
        whole rows are moved as units.
    """
    # Take an explicit writable copy: `.values` can be a read-only view when
    # pandas Copy-on-Write is active, which makes the in-place shuffle fail.
    shuffled = np.array(to_shuffle.values, copy=True)
    np.random.shuffle(shuffled)
    return shuffled


def corr_analysis(features, y, ff_num):
    """Select the features most strongly correlated with ``y`` (CPM-style).

    Parameters
    ----------
    features : numpy.ndarray
        Samples x features matrix (indexed as ``features[:, j]``).
    y : array-like
        Target values, one per sample.
    ff_num : int
        Number of features to keep. Values larger than the number of
        available features select every feature.

    Returns
    -------
    numpy.ndarray of bool
        Mask over the feature columns; ``True`` where the absolute Pearson
        correlation with ``y`` reaches the ``ff_num``-th largest value. Ties
        at the threshold are all kept, so more than ``ff_num`` entries may be
        ``True``.
    """
    feat_num = features.shape[1]
    abs_corrs = np.abs(
        [sp.stats.pearsonr(features[:, feat], y)[0] for feat in range(feat_num)]
    )
    # Clamp the request: without this, ff_num > feat_num produced a negative
    # index that wrapped around and silently selected too few features.
    n_keep = min(ff_num, feat_num)
    threshold = np.sort(abs_corrs)[feat_num - n_keep]
    return abs_corrs >= threshold

def save_masked_maps(data, filename):
    """Write ``data`` into the template mask and save it as a CIFTI dtseries.

    The template ``data/Smask.dtseries.nii`` is loaded, every position where
    it equals 1 is overwritten with ``data``, and the result is saved to
    ``<filename>.dtseries.nii`` using the template's header.

    Parameters
    ----------
    data : array-like
        Values assigned to the positions where the template equals 1.
        NOTE(review): presumably one value per in-mask vertex - confirm.
    filename : str
        Output path without the ``.dtseries.nii`` suffix.
    """
    template_img = nib.load('data/Smask.dtseries.nii')
    canvas = np.asanyarray(template_img.dataobj)

    # fill the in-mask positions; everything else keeps the template value
    inside = canvas == 1
    canvas[inside] = data

    out_img = nib.cifti2.cifti2.Cifti2Image(canvas, template_img.header)
    nib.save(out_img, f'{filename}.dtseries.nii')
    
def get_task_feat_num(mask, num_comps):
    """Count the selected features inside each task's block of components.

    ``mask`` is read as consecutive chunks of ``num_comps`` entries, one
    chunk per task; any trailing partial chunk is ignored.

    Returns
    -------
    list
        Sum of the mask over each chunk, in task order.
    """
    task_count = int(len(mask) / num_comps)
    return [
        mask[task * num_comps: (task + 1) * num_comps].sum()
        for task in range(task_count)
    ]

# Two main functions follow: one for a single input dataset and one for multiple datasets.

In [None]:
def do_bbs_single(features, score_df, reg_type, k=10, num_comps=75, score='g_efa', save_maps=False, return_stats=True):
    """Cross-validated prediction of a score from a single feature table.

    For every fold, PCA components are fitted on the training subjects only,
    each subject's expression of those components is obtained by projecting
    through the pseudo-inverse of the component matrix, and a regression
    model is fitted on the training expressions and applied to the test ones.

    The function is self-contained: it uses only its arguments and no longer
    depends on notebook globals (``dfs_list``, ``permute_num``,
    ``return_stats``).

    Parameters
    ----------
    features : pandas.DataFrame
        Subjects x vertices table.
    score_df : pandas.DataFrame
        Table holding the target column ``score``. Rows are taken by
        position, so they are assumed to be in the same subject order as
        ``features``.
    reg_type : {'glm', 'elnet'}
        ``'glm'`` uses ``LinearRegression``; ``'elnet'`` uses ``ElasticNetCV``.
    k : int
        Number of shuffled K-fold splits (fixed ``random_state=42``).
    num_comps : int
        Number of PCA components kept per fold.
    score : str
        Name of the target column in ``score_df``.
    save_maps : bool
        When true, also accumulate the beta-weighted component maps over
        folds into a consensus map.
    return_stats : bool
        When true (default), per-fold statistics and their summary are
        returned alongside the predictions.

    Returns
    -------
    tuple or numpy.ndarray
        With ``return_stats``: ``(predicted, stats, summary)``, plus
        ``consensus`` as a fourth item when ``save_maps`` is set.
        Without ``return_stats``: ``predicted``, or ``(predicted, consensus)``
        when ``save_maps`` is set.

    Raises
    ------
    ValueError
        If ``reg_type`` is not a supported name.
    """
    if reg_type not in ('glm', 'elnet'):
        raise ValueError(f"unknown reg_type {reg_type!r}; expected 'glm' or 'elnet'")

    stats = {'r': [], 'mse': []}

    kfold = KFold(n_splits=k, random_state=42, shuffle=True)
    predicted = np.zeros(features.shape[0])
    consensus = np.zeros(features.shape[1])

    for fold, (train_index, test_index) in enumerate(kfold.split(features)):
        print(f'fold {fold+1} out of {k}')

        X_train, X_test = features.iloc[train_index, :], features.iloc[test_index, :]
        Y_train, Y_test = score_df[score].iloc[train_index], score_df[score].iloc[test_index]

        # PCA-reduced matrix fitted on the training subjects only,
        # in the shape of vertices x components
        reduced_train = decompose(X_train, num_comps, transformer='pca')

        # expression scores of every component for each subject; the
        # pseudo-inverse is shared by the train and test projections
        projector = np.linalg.pinv(reduced_train)
        train_features = np.matmul(projector, X_train.values.T).T
        test_features = np.matmul(projector, X_test.values.T).T

        # fit the model and predict the held-out subjects
        if reg_type == 'glm':
            model = LinearRegression()
        else:
            model = ElasticNetCV(l1_ratio=[0.9, 0.95, 0.99], n_alphas=50)

        model.fit(train_features, Y_train)
        predicted[test_index] = model.predict(test_features)

        if return_stats:
            stats['r'].append(sp.stats.pearsonr(predicted[test_index], Y_test)[0])
            stats['mse'].append(mean_squared_error(Y_test, predicted[test_index]))

        if save_maps:
            # weight each component by its beta and sum over components to
            # get one map for this fold, then add it to the running total
            consensus += np.sum(reduced_train * model.coef_, axis=1)

    if not return_stats:
        # previously nothing was returned here, which discarded the predictions
        return (predicted, consensus) if save_maps else predicted

    summary = pd.DataFrame(stats).describe().loc[['mean', 'std'], :]
    if save_maps:
        return predicted, stats, summary, consensus
    return predicted, stats, summary

In [None]:
def do_bbs_multi_data_select(dfs_list, task_names, score_df, reg_type, k=10, num_comps=75, score='g_efa', permute_num=False, ff_num = 75, l1_ratio=0.01, save_maps=False):
    """Cross-validated prediction from several feature tables with feature selection.

    For every fold and every table in ``dfs_list``, PCA components are fitted
    on the training subjects and each subject's expression scores are
    computed through the pseudo-inverse of the component matrix. The scores
    of all tables are concatenated, the ``ff_num`` features most correlated
    with the training target are kept (see ``corr_analysis``), and a
    regression model is fitted on them.

    Parameters
    ----------
    dfs_list : list of pandas.DataFrame
        One subjects x vertices table per task. Rows are taken by position,
        so all tables and ``score_df`` are assumed to share the same subject
        order. When ``save_maps`` is set, all tables must have the same
        number of columns as the first one.
    task_names : list of str
        Column labels for the returned per-task feature counts, one per table.
    score_df : pandas.DataFrame
        Table holding the target column ``score``.
    reg_type : {'glm', 'elnet'}
        ``'glm'`` uses ``LinearRegression``; ``'elnet'`` uses ``ElasticNetCV``.
    k : int
        Number of shuffled K-fold splits (fixed ``random_state=42``).
    num_comps : int
        Number of PCA components kept per table and fold.
    score : str
        Name of the target column in ``score_df``.
    permute_num :
        Accepted for interface compatibility; not used by this function.
    ff_num : int
        Number of features kept by the correlation-based selection.
    l1_ratio : float or list of float
        Passed to ``ElasticNetCV`` when ``reg_type == 'elnet'``.
    save_maps : bool
        When true, also accumulate the beta-weighted component maps of the
        selected features over folds into a consensus map.

    Returns
    -------
    tuple
        ``(predicted, stats, summary, task_comps_per_fold_df)``, plus
        ``consensus`` as a fifth item when ``save_maps`` is set.

    Raises
    ------
    ValueError
        If ``reg_type`` is not a supported name.
    """
    if reg_type not in ('glm', 'elnet'):
        raise ValueError(f"unknown reg_type {reg_type!r}; expected 'glm' or 'elnet'")

    stats = {'r': [], 'mse': []}

    kfold = KFold(n_splits=k, random_state=42, shuffle=True)
    feature_example = dfs_list[0]
    n_vertices = feature_example.shape[1]
    n_total_comps = num_comps * len(dfs_list)
    predicted = np.zeros(feature_example.shape[0])

    task_comps_per_fold = []
    consensus = np.zeros(n_vertices)

    for fold, (train_index, test_index) in enumerate(kfold.split(feature_example)):
        print(f'fold {fold+1} out of {k}')
        train_features = np.zeros([len(train_index), n_total_comps])
        test_features = np.zeros([len(test_index), n_total_comps])
        # define this fold's Y
        Y_train, Y_test = score_df[score].iloc[train_index], score_df[score].iloc[test_index]

        # Sized from the first table: the original read `features.shape` here,
        # before `features` was assigned, which raised UnboundLocalError.
        # Only allocated when the maps are actually needed.
        fold_comps = np.zeros((n_vertices, n_total_comps)) if save_maps else None

        for i, features in enumerate(dfs_list):
            X_train, X_test = features.iloc[train_index, :], features.iloc[test_index, :]

            # PCA-reduced matrix, in the shape of vertices x components
            reduced_train = decompose(X_train, num_comps, transformer='pca')

            # expression scores of every component for each subject; the
            # pseudo-inverse is shared by the train and test projections
            projector = np.linalg.pinv(reduced_train)
            start = i * num_comps
            end = start + num_comps
            train_features[:, start:end] = np.matmul(projector, X_train.values.T).T
            test_features[:, start:end] = np.matmul(projector, X_test.values.T).T

            # keep the components used for feature extraction
            if save_maps:
                fold_comps[:, start:end] = reduced_train

        # correlation analysis on the training data to select features
        mask = corr_analysis(train_features, Y_train, ff_num)
        task_comps_per_fold.append(get_task_feat_num(mask, num_comps))

        # reduce features with the mask produced by the correlation analysis
        train_features = train_features[:, mask]
        test_features = test_features[:, mask]

        # fit the model and predict the held-out subjects
        if reg_type == 'glm':
            model = LinearRegression()
        else:
            model = ElasticNetCV(l1_ratio=l1_ratio, n_alphas=50, tol=0.001, max_iter=5000)

        model.fit(train_features, Y_train)
        betas = model.coef_

        predicted[test_index] = model.predict(test_features)

        stats['r'].append(sp.stats.pearsonr(predicted[test_index], Y_test)[0])
        stats['mse'].append(mean_squared_error(Y_test, predicted[test_index]))

        if save_maps:
            # keep only the selected components, weight each by its beta and
            # sum over components to get a single map for this fold
            masked_comps = fold_comps[:, mask]
            consensus += np.sum(masked_comps * betas, axis=1)

    summary = pd.DataFrame(stats).describe().loc[['mean', 'std'], :]

    task_comps_per_fold_df = pd.DataFrame(data=task_comps_per_fold, columns=task_names)
    if save_maps:
        return predicted, stats, summary, task_comps_per_fold_df, consensus
    return predicted, stats, summary, task_comps_per_fold_df

# Test whether the prediction accuracies obtained from two inputs differ significantly.

In [None]:
def compare_predictions(A,B,hcp_df,score,k=30, num_comps=75, to_compare=None, seed=None):
    """Compare how well two feature tables predict a score.

    Over ``k`` random train/test splits (33% test), each table is reduced
    with PCA fitted on the training subjects, a linear regression is fitted
    on the resulting expression scores, and the Pearson correlation between
    predicted and observed test scores is recorded. Both tables use the same
    split seed within an iteration. The two sets of ``k`` correlations are
    then compared with a Mann-Whitney U test.

    Parameters
    ----------
    A, B : pandas.DataFrame
        Subjects x vertices tables to compare.
    hcp_df : pandas.DataFrame
        Behavioral table holding the column ``score``.
    score : str
        Name of the target column.
    k : int
        Number of random splits.
    num_comps : int
        Number of PCA components kept per split.
    to_compare : pandas object, optional
        Extra table whose index further restricts the subjects used. When
        omitted, the notebook-level variable ``emot_facshp_orig_z`` is used
        if it exists (the previous hard-wired behavior); otherwise no extra
        restriction is applied instead of failing with ``NameError``.
    seed : int, optional
        Seeds the generator that draws the per-iteration split seeds, making
        the comparison reproducible. When omitted, the global NumPy random
        state is used as before.

    Returns
    -------
    float
        p-value of the Mann-Whitney U test between the two sets of
        correlations.
    """
    if to_compare is None:
        # legacy fallback: the original read this notebook global directly
        to_compare = globals().get('emot_facshp_orig_z', [])
    dfs, behav = match_dfs_by_ind([A, B], hcp_df, to_compare=to_compare)

    rng = np.random if seed is None else np.random.RandomState(seed)

    score_dict = {0: [], 1: []}
    for i in range(k):
        print(i)
        rand_seed = rng.randint(0, 1000)
        for which in (0, 1):
            X_train, X_test, y_train, y_test = train_test_split(
                dfs[which], behav[score], test_size=0.33, random_state=rand_seed)
            reduced_train = decompose(X_train, num_comps, transformer='pca')

            # expression scores of every component for each subject
            projector = np.linalg.pinv(reduced_train)
            train_features = np.matmul(projector, X_train.values.T).T
            test_features = np.matmul(projector, X_test.values.T).T

            model = LinearRegression()
            model.fit(train_features, y_train)
            predicted = model.predict(test_features)
            score_dict[which].append(pearsonr(predicted, y_test)[0])

    # the U statistic itself is not needed, only the p-value
    _, p = sp.stats.mannwhitneyu(score_dict[0], score_dict[1])
    return p
        

# get data of mean activity in DMN and FPN

In [None]:
# Maps a readable contrast label to the contrast identifier used in the data paths.
# NOTE(review): 'TOM-Radnom' and 'Reweard' look like misspellings of
# 'TOM-Random' and 'Reward'; the keys are kept verbatim because other cells
# may look them up or display them as-is.
contrasts = {
    '2bk>0bk': 'WM_11_s4',
    '2bk': 'WM_09_s4',
    '0bk': 'WM_10_s4',
    'Math-Story': 'Lang_03_s4',
    'Random': 'Soc_01_s4',
    'TOM': 'Soc_02_s4',
    'TOM-Radnom': 'Soc_06_s4',
    'Rel': 'Rel_02_s4',
    'Match': 'Rel_01_s4',
    'Rel-Match': 'Rel_04_s4',
    'Reweard': 'Gamb_02_s4',
    'Punish': 'Gamb_01_s4',
    'Punish-Reward': 'Gamb_03_s4',
    'Faces-Shapes': 'Em_03_s4',
}

In [None]:
# Load the parcellation and build one boolean vertex mask per network
# (labels 6 and 7 are used for FPN and DMN respectively).
yeo_parc = np.asanyarray(
    nib.load('/Volumes/HCP/HCP_WB_Tutorial_1.0/yeo_masked.dtseries.nii').dataobj
)
fpn_num = 6
dmn_num = 7
fpn_mask = (yeo_parc == fpn_num).reshape(-1)
dmn_mask = (yeo_parc == dmn_num).reshape(-1)

data_dir = '/Volumes/HCP/Predicted_data_100'

# one entry per contrast, in the order of `contrast_names`
orig_fpn_mean, orig_dmn_mean = [], []
pred_fpn_mean, pred_dmn_mean = [], []
contrast_description = list(contrasts.keys())
contrast_names = list(contrasts.values())

for con in contrast_names:
    print(con)
    test_dir = f'{data_dir}/{con}/all_test_data'

    # original maps: grand mean over all rows and in-network vertices
    all_orig_path = f'{test_dir}/all_test_data_orig_z.dtseries.nii'
    all_orig = np.asanyarray(nib.load(all_orig_path).dataobj)
    orig_fpn_mean.append(np.mean(all_orig[:, fpn_mask]))
    orig_dmn_mean.append(np.mean(all_orig[:, dmn_mask]))

    # predicted maps: same summary
    all_pred_path = f'{test_dir}/all_test_data_pred_z_cleaned_cb.dtseries.nii'
    all_pred = np.asanyarray(nib.load(all_pred_path).dataobj)
    pred_fpn_mean.append(np.mean(all_pred[:, fpn_mask]))
    pred_dmn_mean.append(np.mean(all_pred[:, dmn_mask]))
