In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import LeaveOneOut, KFold, LeaveOneOut, train_test_split, GroupShuffleSplit
from sklearn.linear_model import LinearRegression, ElasticNetCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import scipy as sp
from scipy.stats import pearsonr
pd.options.mode.chained_assignment = None
from IPython.core.debugger import set_trace
import os
import matplotlib.pyplot as plt
import itertools
import nibabel as nib
import seaborn as sns

import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from BBS import bbs, bbs_comparisons

%matplotlib inline

In [None]:
# a function that helps to read in data into a dataframe with subjects as index
def read_features(path, subjlist, to_scale=True):
    """Load a comma-separated feature matrix into a DataFrame indexed by subject.

    Parameters
    ----------
    path : str
        CSV file holding the numeric feature matrix (parsed with
        ``np.genfromtxt``); its rows are labelled with the entries of
        ``subjlist`` in file order.
    subjlist : str
        Text file with one subject identifier per line.
    to_scale : bool, default True
        When True, every column is passed through ``StandardScaler`` before
        being returned; when False the raw values are returned.

    Returns
    -------
    pandas.DataFrame
        Feature matrix indexed by subject identifier. No subjects are
        filtered out here; matching against behavioural data is left to the
        caller.
    """
    values = np.genfromtxt(path, delimiter=',')
    with open(subjlist, 'r') as handle:
        subject_ids = [row.rstrip('\n') for row in handle]

    raw_features = pd.DataFrame(data=values, index=subject_ids)
    if not to_scale:
        return raw_features

    # standardise each feature column, keeping the subject index
    standardized = StandardScaler().fit_transform(raw_features)
    return pd.DataFrame(data=standardized, index=raw_features.index)
    
def save_masked_maps(data, filename):
    """Write `data` into the mask template and save it as a CIFTI dtseries file.

    The template ``../BBS/misc/Smask.dtseries.nii`` is loaded, every element
    of it that equals 1 is overwritten with the values in `data`, and the
    result is saved to ``<filename>.dtseries.nii`` using the template header.

    Parameters
    ----------
    data : array-like
        Values to place at the template positions equal to 1.
        NOTE(review): presumably one value per in-mask element - confirm.
    filename : str
        Output path without the ``.dtseries.nii`` suffix.
    """
    template = nib.load('../BBS/misc/Smask.dtseries.nii')
    volume = np.asanyarray(template.dataobj)

    # locate the in-mask positions first, then fill them with the supplied values
    in_mask = volume == 1
    volume[in_mask] = data

    image = nib.cifti2.cifti2.Cifti2Image(volume, template.header)
    nib.save(image, f'{filename}.dtseries.nii')

In [None]:
# read hcp's behavioral data

# NOTE(review): machine-specific absolute path, kept unchanged so existing runs
# still work; point this at your local copy of the unrestricted export.
WM_BEHAVIORAL_CSV = '/Users/crazyjoe/Downloads/unrestricted_shachargal_7_7_2020_7_22_11.csv'

g_efa_cv = pd.read_csv('g_efa_cv.csv')
g_efa_cv = g_efa_cv['x']

hcp_df = pd.read_csv('hcp_dataframe_with_g.csv')
hcp_df['Subject'] = hcp_df['Subject'].apply(str)
hcp_df = hcp_df.set_index('Subject')
hcp_df = hcp_df[['PMAT24_A_CR', 'ReadEng_Unadj', 'Gender', 'G', 'g_efa', 'g_cfa', 'NEOFAC_O', 'NEOFAC_C', 'NEOFAC_E', 'NEOFAC_A', 'NEOFAC_N']]
# positional assignment: relies on g_efa_cv.csv having the same row order as
# hcp_dataframe_with_g.csv (pandas raises if the lengths differ)
hcp_df['g_efa_cv'] = g_efa_cv.values
hcp_df = hcp_df.dropna()
print(hcp_df.shape)

# add family groups
restricted_df = pd.read_csv('RESTRICTED_HCP.csv')
restricted_df['Subject'] = restricted_df['Subject'].apply(str)
restricted_df = restricted_df.set_index('Subject')
restricted_df = restricted_df.loc[hcp_df.index.values,:]
# encode each Family_ID as its position among the sorted unique IDs; one call to
# np.unique gives the same codes as searching the unique list once per subject
_, groups = np.unique(restricted_df['Family_ID'].to_numpy(), return_inverse=True)
groups = np.array(groups, dtype=int)
hcp_df['Family_group'] = groups

# working-memory task accuracy
hcp_df_wm = pd.read_csv(WM_BEHAVIORAL_CSV)
hcp_df_wm['Subject'] = hcp_df_wm['Subject'].apply(str)
hcp_df_wm = hcp_df_wm.set_index('Subject')
hcp_df_wm = hcp_df_wm[['WM_Task_Acc', 'WM_Task_2bk_Acc']]
hcp_df_wm = hcp_df_wm.dropna()
# reindex (rather than label-list indexing) so subjects absent from hcp_df get
# NaN instead of raising KeyError on current pandas; they are dropped just below
hcp_df_wm['Family_group'] = hcp_df['Family_group'].reindex(hcp_df_wm.index)
hcp_df_wm = hcp_df_wm.dropna()


In [None]:
# some important variables for later

# text file with one subject id per line; used as the row index by read_features
subjlist = '/Volumes/HCP/FE_100_noRelatives/all_subjects.txt'
# NOTE(review): not referenced by the cells visible here, which read from the relative 'data/' folder
data_dir = '/Volumes/homes/Shachar/python_projects/bbs_prediction/data'

# contrast maps used one at a time in the single-map predictions
task_contrasts = [
    'WM_09_s4', 'WM_10_s4', 'WM_11_s4',
    'Em_03_s4',
    'Gamb_01_s4', 'Gamb_02_s4',
    'Lang_03_s4',
    'Rel_01_s4', 'Rel_02_s4', 'Rel_04_s4',
    'Soc_01_s4', 'Soc_02_s4', 'Soc_06_s4',
]

# estimators handed to the BBS models: plain least squares for single-map runs,
# cross-validated elastic net for multi-map runs
glm = LinearRegression()
elnet = ElasticNetCV(l1_ratio=0.01, n_alphas=50, tol=0.001, max_iter=5000)

# sets of contrast maps that are combined in the multi-map predictions
combinations = [
    # pairs
    ['WM_09_s4', 'Em_03_s4'],
    ['WM_09_s4', 'Rel_02_s4'],
    ['WM_09_s4', 'Lang_03_s4'],
    ['WM_09_s4', 'Soc_01_s4'],
    ['Lang_03_s4', 'Rel_02_s4'],
    ['Rel_02_s4', 'Em_03_s4'],
    ['WM_09_s4', 'Gamb_01_s4'],
    # triplets
    ['WM_09_s4', 'Lang_03_s4', 'Soc_01_s4'],
    ['WM_09_s4', 'Lang_03_s4', 'Em_03_s4'],
    ['WM_09_s4', 'Lang_03_s4', 'Rel_02_s4'],
    ['Rel_02_s4', 'Em_03_s4', 'Lang_03_s4'],
    # four maps
    ['WM_09_s4', 'Lang_03_s4', 'Em_03_s4', 'Gamb_01_s4'],
    ['WM_09_s4', 'Lang_03_s4', 'Em_03_s4', 'Soc_01_s4'],
    # six maps
    ['WM_09_s4', 'Lang_03_s4', 'Em_03_s4', 'Soc_01_s4', 'Gamb_01_s4', 'Rel_02_s4'],
]

In [None]:
# predict using single map data

# Create the output folders up front: each model runs a 5000-permutation test
# before anything is written, so a missing folder would otherwise only surface
# after that work has already been done.
for out_dir in ('bbs_models', 'consensus_maps', 'predicted_values'):
    os.makedirs(out_dir, exist_ok=True)

for score in ['g_efa_cv', 'PMAT24_A_CR', 'ReadEng_Unadj', 'NEOFAC_O']:
    # one results table (and one CSV) per behavioural score
    single_scan_results=[]
    for con in task_contrasts:
        print(con)
        # 'orig' / 'pred' select the two feature files available per contrast
        for dtype in ['orig', 'pred']:
            print(dtype)
            data = read_features(f'data/{con}_z_masked_{dtype}.csv', subjlist=subjlist)
            # keep only subjects present in both the features and hcp_df
            dfs, behav = bbs.match_dfs_by_ind([data], hcp_df)
            bbs_model = bbs.BBSPredictSingle(data=dfs[0], target=behav[score], num_components=75,
                                      folds=10, model=glm, groups=behav['Family_group'])
            bbs_model.predict()
            bbs_model.build_contribution_map()
            p_val = bbs_model.permutation_test(5000)
            # summarise the per-fold statistics stored on the model
            r_mean, r_std = bbs_model.stats['r'].mean(), bbs_model.stats['r'].std()
            mse_mean, mse_std = bbs_model.stats['mse'].mean(), bbs_model.stats['mse'].std()
            single_scan_results.append([con, dtype, r_mean, r_std, mse_mean, mse_std, p_val]) 
            print(bbs_model.summary)
            print(p_val)
            # persist the fitted model, its contribution map and the predicted values
            bbs.to_pickle(bbs_model, f'bbs_models/{con}_{dtype}_{score}_10folds.pickle')
            save_masked_maps(bbs_model.contribution_map, f'consensus_maps/{con}_{dtype}_{score}_10folds')
            np.savetxt(f'predicted_values/{con}_{dtype}_{score}.csv',bbs_model.predicted, delimiter=',')
    single_scan_df = pd.DataFrame(data = single_scan_results,
                columns = ['input','type','r_mean','r_std','mse_mean','mse_std', 'p'])
    single_scan_df.to_csv(f'{score}_single_scan_results.csv')

In [None]:
# predict using multi-map data

# Loop-invariant settings, hoisted out of the loops.
final_num_comps = 300  # component budget shared equally between the maps of a combination
ff_num = 160           # passed to BBSpredictMulti as final_feature_number

# Create the output folders up front: each model runs a 5000-permutation test
# before anything is written, so a missing folder would otherwise only surface
# after that work has already been done.
for out_dir in ('bbs_models', 'consensus_maps', 'predicted_values'):
    os.makedirs(out_dir, exist_ok=True)

for score in ['g_efa_cv', 'PMAT24_A_CR', 'ReadEng_Unadj', 'NEOFAC_O']:
    # one results table (and one CSV) per behavioural score
    multi_scan_results=[]
    for comb in combinations:
        # components per map, so the total stays at (or just under) final_num_comps
        num_comps = final_num_comps // len(comb)
        input_name = '+'.join(comb)
        print(input_name)
        # 'orig' / 'pred' select the two feature files available per contrast
        for dtype in ['orig', 'pred']:
            print(dtype)
            dfs = [read_features(f'data/{con}_z_masked_{dtype}.csv', subjlist=subjlist) for con in comb]
            # keep only subjects present in every feature set and in hcp_df
            dfs, behav = bbs.match_dfs_by_ind(dfs, hcp_df)
            bbs_model = bbs.BBSpredictMulti(data=dfs, target=behav[score], num_components=num_comps, 
                                            final_feature_number=ff_num, folds=10, model=elnet, groups=behav['Family_group'])
            bbs_model.predict()
            bbs_model.build_contribution_map()
            p_val = bbs_model.permutation_test(5000)
            # summarise the per-fold statistics stored on the model
            r_mean, r_std = bbs_model.stats['r'].mean(), bbs_model.stats['r'].std()
            mse_mean, mse_std = bbs_model.stats['mse'].mean(), bbs_model.stats['mse'].std()
            multi_scan_results.append([input_name, dtype, r_mean, r_std, mse_mean, mse_std, p_val]) 
            print(bbs_model.summary)
            # persist the fitted model, its contribution map and the predicted values
            bbs.to_pickle(bbs_model, f'bbs_models/{input_name}_{dtype}_{score}_10folds.pickle')
            save_masked_maps(bbs_model.contribution_map, f'consensus_maps/{input_name}_{dtype}_{score}_10folds')
            np.savetxt(f'predicted_values/{input_name}_{dtype}_{score}.csv',bbs_model.predicted, delimiter=',')
    multi_scan_df = pd.DataFrame(data = multi_scan_results,
                columns = ['input','type','r_mean','r_std','mse_mean','mse_std', 'p'])
    multi_scan_df.to_csv(f'{score}_multi_scan_results.csv')

# predict using connectome and alff

In [None]:
def _predict_resting_state(feature_csv, score):
    """Run a single-input BBS prediction of `score` from one feature file.

    Reads `feature_csv` with read_features, matches its subjects against
    hcp_df, runs BBSPredictSingle (75 components, 10 folds, `glm`, grouped by
    'Family_group'), prints the model summary and runs a 5000-permutation test.

    Returns
    -------
    tuple
        ``(bbs_model, [r_mean, r_std, mse_mean, mse_std, p_val])`` where the
        means / stds are taken over ``bbs_model.stats``.
    """
    data = read_features(feature_csv, subjlist=subjlist)
    dfs, behav = bbs.match_dfs_by_ind([data], hcp_df)
    bbs_model = bbs.BBSPredictSingle(data=dfs[0], target=behav[score], num_components=75,
                                     folds=10, model=glm, groups=behav['Family_group'])
    bbs_model.predict()
    print(bbs_model.summary)
    p_val = bbs_model.permutation_test(5000)
    # statistics are read after the permutation test, as in the original cell
    r_mean, r_std = bbs_model.stats['r'].mean(), bbs_model.stats['r'].std()
    mse_mean, mse_std = bbs_model.stats['mse'].mean(), bbs_model.stats['mse'].std()
    return bbs_model, [r_mean, r_std, mse_mean, mse_std, p_val]


# make sure the folder for the saved predictions exists before the long runs start
os.makedirs('predicted_values', exist_ok=True)

for score in ['g_efa_cv','PMAT24_A_CR', 'ReadEng_Unadj', 'NEOFAC_O']:
    print(score)
    rs_rows = []

    print('alff')
    _, alff_stats = _predict_resting_state('data/alff.csv', score)
    # the original cell referenced an undefined name `pval` here, which raised
    # NameError; the p-value now comes from the helper's return value
    rs_rows.append(['alff', '-'] + alff_stats)

    print('connectome')
    conn_model, conn_stats = _predict_resting_state('data/rs_conn.csv', score)
    # only the connectome predictions are written to disk
    np.savetxt(f'predicted_values/rs_conn_{score}.csv',conn_model.predicted, delimiter=',')
    rs_rows.append(['rs_conn', '-'] + conn_stats)

    rs_results = pd.DataFrame(data = rs_rows,
                 columns = ['input','type','r_mean','r_std','mse_mean','mse_std', 'p'])
    rs_results.to_csv(f'{score}_rs_results.csv')

# compare all predictions to connectome

In [None]:
# resting-state connectome features: the reference input every single map is compared against
rs_conn = read_features('data/rs_conn.csv', subjlist=subjlist)

single_data_compare_scores = []
for con in ('WM_09_s4', 'Soc_01_s4', 'Gamb_01_s4', 'Em_03_s4', 'Rel_02_s4', 'Lang_03_s4'):
    print(con)
    for dtype in ('orig', 'pred'):
        print(dtype)
        data = read_features(f'data/{con}_z_masked_{dtype}.csv', subjlist=subjlist)
        # the third returned value is not used here
        comparison = bbs_comparisons.compare_predictions_single(
            data, rs_conn, hcp_df, 'g_efa_cv', k=1000, train_size=0.9)
        s, p, _ = comparison
        single_data_compare_scores.append([con, dtype, s, p])
        print(p)

# one row per (contrast, data type) with the comparison statistic and its p-value
single_data_compare_scores_df = pd.DataFrame(
    single_data_compare_scores,
    columns=['input', 'type', 's', 'p'],
)
single_data_compare_scores_df.to_csv('single_scan_comprison_stats_family_1000_g_efa_cv_10split.csv')

In [None]:
# resting-state connectome features: the reference input every combination is compared against
rs_conn = read_features('data/rs_conn.csv', subjlist=subjlist)

multi_data_compare_scores = []
for comb in combinations:
    input_name = '+'.join(comb)
    print(input_name)
    for dtype in ('orig', 'pred'):
        # one feature frame per contrast map in the combination
        data = [
            read_features(f'data/{con}_z_masked_{dtype}.csv', subjlist=subjlist)
            for con in comb
        ]
        comparison = bbs_comparisons.compare_predictions_multi(
            data, rs_conn, hcp_df, 'g_efa_cv', 0.9, k=1000)
        s, p = comparison
        print(p)
        multi_data_compare_scores.append([input_name, dtype, s, p])

# one row per (combination, data type) with the comparison statistic and its p-value
multi_data_compare_scores_df = pd.DataFrame(
    multi_data_compare_scores,
    columns=['input', 'type', 's', 'p'],
)
multi_data_compare_scores_df.to_csv('multi_scan_comprison_stats_family_1000_g_efa_cv_10split.csv')

# compare prediction ratios and diffs (in-out of scanner)

In [None]:
# load both versions ('orig' then 'pred') of the WM_09_s4 feature file
wm_2bk_orig_z, wm_2bk_pred_z = (
    read_features(f'data/WM_09_s4_z_masked_{version}.csv', subjlist=subjlist)
    for version in ('orig', 'pred')
)

In [None]:
# collect, per behavioural score, the statistic and p-value of the ratio and
# difference comparisons against 'WM_Task_2bk_Acc'
comapre_ratios_results, comapre_diffs_results = [], []
for score in ('g_efa_cv', 'ReadEng_Unadj', 'PMAT24_A_CR'):
    ratios, diffs = bbs_comparisons.compare_prediction_ratios(
        wm_2bk_orig_z, wm_2bk_pred_z, hcp_df, hcp_df_wm, score, 'WM_Task_2bk_Acc')
    for rows, outcome in ((comapre_ratios_results, ratios), (comapre_diffs_results, diffs)):
        rows.append([score, outcome['s'], outcome['p']])

result_columns = ['score', 'statistic', 'p']

comapre_ratios_results_df = pd.DataFrame(comapre_ratios_results, columns=result_columns)
comapre_ratios_results_df.to_csv('comapre_ratios_results_1000iter_family.csv')

comapre_diffs_results_df = pd.DataFrame(comapre_diffs_results, columns=result_columns)
comapre_diffs_results_df.to_csv('comapre_diffs_results_1000iter_family.csv')

In [None]:
# predict each score from each input and tabulate mean / std of the per-fold r and mse
scores = ['g_efa_cv', 'ReadEng_Unadj', 'PMAT24_A_CR', 'WM_Task_2bk_Acc']
inputs = {'wm_orig': 'WM_09_s4_z_masked_orig.csv', 'wm_pred': 'WM_09_s4_z_masked_pred.csv', 'rs_conn': 'rs_conn.csv'}
results = []
for d, feature_file in inputs.items():
    print(d)
    # the features depend only on the input, so read the file once per input
    # instead of once per (input, score) pair
    data = read_features(f'data/{feature_file}', subjlist=subjlist)
    for score in scores:
        print(score)
        # the working-memory accuracy score lives in its own behavioural table
        behav_source = hcp_df_wm if score == 'WM_Task_2bk_Acc' else hcp_df
        dfs, behav = bbs.match_dfs_by_ind([data], behav_source)
        bbs_model = bbs.BBSPredictSingle(data=dfs[0], target=behav[score], num_components=75,
                                  folds=10, model=glm, groups=behav['Family_group'])
        bbs_model.predict()
        results.append([d, score, np.mean(bbs_model.stats['r']), np.std(bbs_model.stats['r']), np.mean(bbs_model.stats['mse']), np.std(bbs_model.stats['mse'])])
wm_results_df = pd.DataFrame(results, columns=['input', 'score', 'r_mean', 'r_std', 'mse_mean', 'mse_std'])
# index=False is the documented way to omit the row index (the original passed None)
wm_results_df.to_csv('wm_prediction_results_g_wm2bk_1.csv', index=False)