In [1]:
#from time import time
import sklearn
print(sklearn.__version__)

import pandas as pd
import numpy as np

# ROI table; in this cell only its 'names' column is used.
cat_vbm = pd.read_csv("data/ROI_catROI_neuromorphometrics_Vgm.csv", sep=",")

# Connectivity table: column '0' is set to the 'names' values of cat_vbm and
# then relabelled 'names'.
# NOTE(review): the two files are paired row by row (by index) here -- confirm
# that both list the subjects in the same order.
fc = (
    pd.read_csv("data/outc01_fc_power264.csv", sep=",")
    .assign(**{'0': cat_vbm['names']})
    .rename(columns={'0': 'names'})
)



0.19.1


In [2]:
subject_behav = pd.read_csv("data/pnc_cognitive_response.csv", delimiter = ",")

# subject_behav['SUBJID'] = subject_behav['SUBJID'].apply(str)
# fc['names'] = fc['names'].apply(str)

# Keep only the subjects that have both behavioural and brain data.
subject_behav_merge = subject_behav.merge(fc[['names']], left_on = 'SUBJID', right_on = 'names', how = 'inner')

# Build the brain table *from* the behavioural one:
#   * an inner merge follows the row order of its left table, so starting from
#     subject_behav_merge guarantees that row i is the same subject in both tables;
#   * the merge key 'SUBJID' is dropped again, otherwise it would stay in fc_merge
#     and be used as a brain feature by the models.
fc_merge = (
    subject_behav_merge[['SUBJID']]
    .merge(fc, left_on = 'SUBJID', right_on = 'names', how = 'inner')
    .drop(columns = 'SUBJID')
)

# The models pair the two tables row by row, so fail loudly if they do not line up
# (e.g. because of duplicated subject IDs).
assert len(fc_merge) == len(subject_behav_merge) and \
    (fc_merge['names'].values == subject_behav_merge['SUBJID'].values).all(), \
    "behavioural and brain tables are not aligned"

# remove samples with Med_rating higher than 1:
# subject_behav_merge["Med_Rating"] = subject_behav_merge["Med_Rating"].fillna(0)
# subject_behav_merge["Med_Rating"] = pd.to_numeric(subject_behav_merge["Med_Rating"],  errors='coerce')
# subject_behav_merge = subject_behav_merge.loc[subject_behav_merge["Med_Rating"]<2]

# Accuracy scores, grouped by cognitive domain (the order inside and across
# domains defines the order of cog_label_acc).
_ACC_BY_DOMAIN = [
    ("executive control", ["pcet_acc2", "pcpt_t_tp", "lnb_tp2"]),
    ("episodic memory task", ["pwmt_kiwrd_tot", "pfmt_ifac_tot", "volt_svt"]),
    ("complex cognitive task", ["pvrt_cr", "pmat_cr", "plot_tc"]),
    ("social cognition task", ["peit_cr", "pedt_a", "padt_a"]),
]

# Speed scores, grouped the same way, plus the sensorimotor speed tasks.
_SPEED_BY_DOMAIN = [
    ("executive control", ["pcet_rtcr", "pcpt_t_tprt", "lnb_rtc2"]),
    ("episodic memory task", ["pwmt_kiwrd_rtc", "pfmt_ifac_rtc", "volt_svtcrt"]),
    ("complex cognitive task", ["pvrt_rtcr", "pmat_rtcr", "plot_tcrt"]),
    ("social cognition task", ["peit_crt", "pedt_t", "padt_t"]),
    ("sensorimotor speed task", ["mp_mp2rtcr", "tap_tot"]),
]

# Flatten the per-domain groups into plain lists of column names.
cog_label_acc = [score for _domain, scores in _ACC_BY_DOMAIN for score in scores]
cog_label_speed = [score for _domain, scores in _SPEED_BY_DOMAIN for score in scores]

# All scores to predict: accuracy first, then speed.
cog_label = cog_label_acc + cog_label_speed

# Reduce the behavioural table to the subject ID, the two covariates and the
# cognitive scores listed in cog_label; the last line displays the resulting shape.
keep_columns = ['SUBJID', 'Sex', 'age_at_cnb'] + cog_label
subject_behav_merge = subject_behav_merge[keep_columns]
subject_behav_merge.shape

(839, 29)

In [3]:
# Print a short demographic summary of the matched sample.
print("participants:")
print("number of subjects: %d" % subject_behav_merge.shape[0])

age_at_cnb = subject_behav_merge["age_at_cnb"]
print("mean and std of age: %.2f %.2f" % (age_at_cnb.mean(), age_at_cnb.std()))
# report the range as "min - max" (%d drops any fractional part of the age)
print("range of age: %d - %d" % (age_at_cnb.min(), age_at_cnb.max()))

# number of rows per sex code; a code that never occurs counts as 0
sex_counts = subject_behav_merge["Sex"].value_counts()
print("number of females: %d" % sex_counts.get('F', 0))
print("number of males: %d" % sex_counts.get('M', 0))



participants:
number of subjects: 839
meam  and std of age: 14.37 3.37
range of age: 21 - 8
number of females: 451
number of males: 388


In [4]:
# check that behav and brain data are matched, i.e. row i is the same subject in both tables (the difference column below should be all zeros):
#pd.concat([subject_behav_merge['SUBJID'], fc_merge['names'], subject_behav_merge['SUBJID'] - fc_merge['names']], axis = 1)

In [5]:
from sklearn.model_selection import KFold
from sklearn import preprocessing
# from sklearn.model_selection import StratifiedKFold

# Outer cross-validation shared by all models: nfold shuffled folds, with a
# fixed random_state so the same splits are produced on every run.
nfold, seed = 10, 111
kf = KFold(random_state=seed, shuffle=True, n_splits=nfold)
# kf = StratifiedKFold(n_splits=nfold, shuffle = True, random_state=seed)

def run_model(model, behav_data, brain_feature, kf, y_label_list, fit_method=0):
    """Evaluate ``model`` on every behavioural score with an outer cross-validation.

    For each label in ``y_label_list`` the subjects whose score is NaN are
    dropped, the remaining rows are split with ``kf``, the features are
    standardised with the mean/sd of the training fold only, and the model is
    fitted on the training fold and scored on the held-out fold.

    Parameters
    ----------
    model : estimator
        Object with ``fit(X, y)`` and ``predict(X)``. With ``fit_method=1`` the
        fitted object must also expose ``best_estimator_``, ``best_score_`` and
        ``best_params_`` (e.g. a grid-search wrapper).
    behav_data : pandas.DataFrame
        Must contain the columns ``'SUBJID'``, ``'Sex'`` (``'F'``/``'M'``) and
        one numeric column per entry of ``y_label_list``.
    brain_feature : pandas.DataFrame
        A ``'names'`` column (dropped) plus the feature columns; missing
        feature values are replaced by 0. It is combined with ``behav_data``
        by index, so both frames must describe the same subjects in the same
        rows.
    kf : cross-validator
        Object with ``n_splits`` and ``split(X)`` yielding (train, test) indices.
    y_label_list : list of str
        Names of the columns of ``behav_data`` to predict.
    fit_method : int, optional
        0: predict with ``model`` itself; 1: predict with
        ``model.best_estimator_`` and print the best score/parameters.

    Returns
    -------
    result_table : pandas.DataFrame
        One row per label; MultiIndex columns (metric, 'cv<k>') with the
        metrics 'Pearson r', 'MAE' and 'rmse' for every fold.
    plot_data : pandas.DataFrame
        Long-format table of the held-out predictions with the columns
        'task', 'SUBJID', 'CV', 'y' and 'y_pred'.

    Raises
    ------
    ValueError
        If ``fit_method`` is not 0 or 1, or if the two input frames do not
        combine into one row per row of ``behav_data``.
    """
    if fit_method not in (0, 1):
        raise ValueError("fit_method must be 0 or 1, got %r" % (fit_method,))

    sex_ind = behav_data['Sex'].replace({'F': 0, 'M': 1})

    brain_feature_x = brain_feature.drop(columns = 'names').fillna(0)
    # sex is used as the first predictor, followed by the brain features
    X = pd.concat([sex_ind.to_frame(), brain_feature_x], axis = 1).values.astype(float)
    if X.shape[0] != len(behav_data):
        # concat aligns on the index; differing indexes would add rows silently
        raise ValueError("behav_data and brain_feature must have matching indexes")
    subj_id = behav_data['SUBJID'].values.flatten()

    # create empty dataframe to save results:
    column_index = pd.MultiIndex.from_product([['Pearson r', 'MAE', 'rmse'],
                                               ['cv' + str(k) for k in range(1, kf.n_splits+1, 1)]])
    result_table = pd.DataFrame(index = y_label_list, columns = column_index)

    plot_columns = ['task', 'SUBJID', 'CV', 'y', 'y_pred']
    # per-fold prediction tables; concatenated once at the end instead of
    # growing a DataFrame inside the loop
    fold_frames = []

    for y_label in y_label_list:
        print('work on behav task: %s' % y_label)
        y = behav_data[y_label].values.flatten()

        # remove NaNs:
        has_score = ~np.isnan(y)
        y = y[has_score]
        subj_id_ilabel = subj_id[has_score]
        X_ilabel = X[has_score, :]

        for i, (train_index, test_index) in enumerate(kf.split(X_ilabel), start = 1):
            print('run_model on CV: %d' % i)

            X_train, X_test = X_ilabel[train_index], X_ilabel[test_index]
            y_train, y_test = y[train_index], y[test_index]
            subj_id_test = subj_id_ilabel[test_index]

            # normalize X_train and X_test based on mean and sd of X_train.
            scaler = preprocessing.StandardScaler().fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

            # re-seed numpy's global RNG before every fit
            np.random.seed(7)

            fit_result = model.fit(X_train, y_train)
            if fit_method == 0:
                y_prediction = model.predict(X_test)
            else:
                y_prediction = model.best_estimator_.predict(X_test)
                print("Best: %f using %s" % (fit_result.best_score_, fit_result.best_params_))

            # single .loc assignment per cell (no chained indexing)
            fold = 'cv' + str(i)
            error = y_test - y_prediction
            result_table.loc[y_label, ('Pearson r', fold)] = np.corrcoef(y_test, y_prediction)[0,1]
            result_table.loc[y_label, ('rmse', fold)] = np.sqrt(np.mean(np.square(error)))
            result_table.loc[y_label, ('MAE', fold)] = np.mean(np.abs(error))

            fold_frames.append(pd.DataFrame({'task': y_label,
                                             'SUBJID': subj_id_test,
                                             'CV': i,
                                             'y': y_test,
                                             'y_pred': y_prediction},
                                            columns = plot_columns))

    if fold_frames:
        plot_data = pd.concat(fold_frames, axis = 0, ignore_index = True)
    else:
        plot_data = pd.DataFrame(columns = plot_columns)

    return result_table, plot_data


In [None]:
###############################################ridge:#########################################
from sklearn.linear_model import RidgeCV
import numpy as np

# ridge regression: RidgeCV picks the penalty among 10 candidates spaced
# evenly on a log10 scale between 1e-3 and 1e3, using cv = nfold.
alpha = 10 ** np.linspace(-3, 3, num=10)
# print(alpha)
ridge_cv = RidgeCV(cv=nfold, alphas=alpha)

# evaluate on every cognitive score with the outer CV defined by kf
result_table_ridge, scatter_data_ridge = run_model(
    ridge_cv,
    subject_behav_merge,
    fc_merge,
    kf,
    y_label_list=cog_label,
    fit_method=0,
)


work on behav task: %s pcet_acc2
run_model on CV: 1
run_model on CV: 2
run_model on CV: 3
run_model on CV: 4
run_model on CV: 5
run_model on CV: 6
run_model on CV: 7
run_model on CV: 8
run_model on CV: 9
run_model on CV: 10
work on behav task: %s pcpt_t_tp
run_model on CV: 1
run_model on CV: 2
run_model on CV: 3
run_model on CV: 4
run_model on CV: 5
run_model on CV: 6
run_model on CV: 7
run_model on CV: 8
run_model on CV: 9
run_model on CV: 10
work on behav task: %s lnb_tp2
run_model on CV: 1
run_model on CV: 2
run_model on CV: 3
run_model on CV: 4
run_model on CV: 5
run_model on CV: 6
run_model on CV: 7
run_model on CV: 8
run_model on CV: 9
run_model on CV: 10
work on behav task: %s pwmt_kiwrd_tot
run_model on CV: 1
run_model on CV: 2
run_model on CV: 3
run_model on CV: 4
run_model on CV: 5
run_model on CV: 6
run_model on CV: 7
run_model on CV: 8
run_model on CV: 9
run_model on CV: 10
work on behav task: %s pfmt_ifac_tot
run_model on CV: 1
run_model on CV: 2
run_model on CV: 3
run_mod

In [None]:
import os

# to_csv does not create missing folders: make sure the output folder exists so
# the results of the long CV run are not lost on a failed write.
os.makedirs("behav_prediction_result", exist_ok=True)

# the metric table is written with its index, the prediction table without
result_table_ridge.to_csv("behav_prediction_result/result_table_ridge.csv", sep = ",")
scatter_data_ridge.to_csv("behav_prediction_result/scatter_data_ridge.csv", sep = ",", index = False)


In [None]:
############################################### SVR: #########################################
from sklearn.svm import SVR
import numpy as np

# GridSearchCV has to be in scope for this cell; imported here so the cell runs on its own.
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 20 values of C spaced evenly on a log10 scale between
# 1e-1 and 1e2, crossed with 20 evenly spaced values of epsilon in [0.001, 1].
C = np.power(10, np.linspace(-1,2, num = 20))
epsilon=np.linspace(.001, 1, num = 20)

param_grid_svr = dict(C = C, epsilon = epsilon)
# linear model takes very long time to run.
#grid_svr = GridSearchCV(estimator=SVR(kernel='linear'), param_grid=param_grid_svr, n_jobs=3, refit = True)
# use default kernel rbf:
grid_svr = GridSearchCV(estimator=SVR(), param_grid=param_grid_svr, n_jobs=3, refit = True)

# Same data and labels as the ridge model. run_model requires y_label_list, and
# fit_method = 1 makes it predict with the refitted best estimator of the grid search.
result_table_svr, scatter_data_svr = run_model(grid_svr, subject_behav_merge, fc_merge,
                                               kf, y_label_list = cog_label, fit_method = 1)



In [None]:
import os

# to_csv does not create missing folders: make sure the output folder exists so
# the results of the long grid search are not lost on a failed write.
# NOTE(review): the ridge results are written to "behav_prediction_result/" while
# these go to "result/" -- confirm that the different folder is intended.
os.makedirs("result", exist_ok=True)

# the metric table is written with its index, the prediction table without
result_table_svr.to_csv("result/result_table_svr.csv", sep = ",")
scatter_data_svr.to_csv("result/scatter_data_svr.csv", sep = ",", index = False)

In [None]:
############################################### GPR: #########################################
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.gaussian_process.kernels import ConstantKernel as CK

# GridSearchCV has to be in scope for this cell; imported here so the cell runs on its own.
from sklearn.model_selection import GridSearchCV

# Alternative kernels that were tried (kept for reference):
# gpr_kernel = 1.0 * RBF(length_scale=10, length_scale_bounds=(1e-2, 1e3)) \
#                     + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))

# gpr_kernel2 = 1.0 * RBF(length_scale=100, length_scale_bounds=(1e-2, 1e3)) \
#                     + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))

# gpr_kernel3 = 1.0 * RBF(length_scale=1000, length_scale_bounds=(1e-2, 1e3)) \
#                     + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))

# Kernel in use: constant * RBF plus a white-noise term.
gpr_kernel4 = CK(2.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2)) \
                    + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))


# gpr_kernel4 = 1.0 * RBF(length_scale=100, length_scale_bounds=(1e-2, 1e3))

# 10 candidate values of alpha spaced evenly on a log10 scale between 1e-2 and 1e2
gpr_alpha = np.power(10, np.linspace(-2, 2, num = 10))

#gpr_alpha = np.linspace(0,.1, num = 3)

#param_grid_gpr = dict(alpha = gpr_alpha, kernel=[gpr_kernel, gpr_kernel2, gpr_kernel3, gpr_kernel4])
param_grid_gpr = dict(alpha = gpr_alpha, kernel=[gpr_kernel4])

grid_gpr = GridSearchCV(estimator=GaussianProcessRegressor(), \
                                param_grid=param_grid_gpr, n_jobs=3, refit = True)

# Same data and labels as the ridge model. run_model requires y_label_list, and
# fit_method = 1 makes it predict with the refitted best estimator of the grid search.
result_table_gpr, scatter_data_gpr = run_model(grid_gpr, subject_behav_merge, fc_merge,
                                               kf, y_label_list = cog_label, fit_method = 1)

In [None]:
import os

# to_csv does not create missing folders: make sure the output folder exists so
# the results of the long grid search are not lost on a failed write.
# NOTE(review): the ridge results are written to "behav_prediction_result/" while
# these go to "result/" -- confirm that the different folder is intended.
os.makedirs("result", exist_ok=True)

# the metric table is written with its index, the prediction table without
result_table_gpr.to_csv("result/result_table_gpr.csv", sep = ",")
scatter_data_gpr.to_csv("result/scatter_data_gpr.csv", sep = ",", index = False)