In [128]:
# predict adni MEM and EF score based on brain imaging features:
## 1. harmonic entropy
## 2. network metrics

import pandas as pd
import numpy as np
import os
import re
import glob

from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score

# Directory the per-frequency entropy CSVs are read from.
input_dir = 'adni_out04_power_histogram_and_entropy'
# Directory the prediction tables and scatter-plot data are written to.
output_dir = 'adni_out05_composite_score_prediction_entropy'

# exist_ok=True keeps this cell idempotent on re-runs and removes the
# check-then-create race of `if not os.path.exists(...): os.mkdir(...)`.
os.makedirs(output_dir, exist_ok=True)


In [129]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.svm import SVR

# ---- outer cross-validation splitter used to score the models ----
# (original note says the n-fold CV is run "on hc data" -- TODO confirm cohort)
nfold = 3
seed = 123
kf = KFold(n_splits=nfold, shuffle=True, random_state=seed)

# ---- ridge regression: penalty chosen by an inner 3-fold CV ----
# candidate penalties are n_alphas log-spaced values from 10**0 to 10**4
n_alphas = 100
alphas = np.logspace(start=0, stop=4, num=n_alphas)
ridge_cv = RidgeCV(alphas=alphas, cv=3)

# ---- support-vector regression: exhaustive grid search, inner 3-fold CV ----
param_grid = {
    'C': [.01, 0.1, 1, 10],
    'gamma': [5, 1, 0.1, 0.01, 0.001, 0.0001],
    'epsilon': [.025, .05, .1, .25, .5],
    'kernel': ['rbf', 'sigmoid'],
}
svr_grid = GridSearchCV(
    estimator=SVR(),
    param_grid=param_grid,
    refit=True,
    verbose=True,
    n_jobs=3,
    cv=3,
)

In [130]:
# ---------------------------------------------------------------------------
# Cross-validated ridge prediction of each response, one model per
# frequency-band entropy file.
# ---------------------------------------------------------------------------

# One CSV per frequency band; sorted so row `freq<k>` of the result tables
# always refers to the k-th file in lexical order.
data_files = sorted(glob.glob(input_dir + '/adni_out04_entropy_freq_*.csv'))

input_subject_info = 'adni_out02_test_subject_info/subject_info_merge2.csv'
subject_info = pd.read_csv(input_subject_info, index_col = 0)

# Result layout: rows = frequency bands, columns = (metric, fold).
metrics = ['Pearson r', 'R square', 'MAE', 'rmse']
column_index = pd.MultiIndex.from_product([metrics,
                                           ['cv' + str(k) for k in range(1, kf.n_splits + 1)]])

row_index = ['freq' + str(k) for k in range(1, len(data_files) + 1)]
result_table = pd.DataFrame(index = row_index, columns = column_index)

response = ['ADNI_MEM', 'ADNI_EF']
# One independent metrics table per response variable.
result_table_all = {resp: result_table.copy() for resp in response}

# Seed applied to numpy's global RNG before each fold's model fits. Kept under
# its own name so the `seed` used to build `kf` is not overwritten.
model_seed = 7

for i_freq, f in enumerate(data_files):

    print(f)
    freq_label = row_index[i_freq]
    # Band identifier parsed from the file name; informational only.
    freq = re.search(r'(.*)_freq_(.*)\.csv', f).group(2)
    data = pd.read_csv(f, index_col = 0)

    # Keep column 0 plus columns 6.. of the entropy file and join them onto the
    # subject table. NOTE(review): presumably column 0 is `subject_id` and
    # columns 1-5 are metadata that is not needed here -- confirm.
    feature_cols = [0] + list(range(6, data.shape[1]))
    data = subject_info.merge(data.iloc[:, feature_cols], on = 'subject_id')
    data = data.dropna(subset = response)
    subjid = data['subject_id'].values

    # Design matrix: sex (0/1), age, then every merged column from position 8 on.
    # NOTE(review): assumes the imaging features start at column 8 of the merged
    # frame -- verify against the layout of subject_info.
    X = pd.concat([data['PTGENDER'].map({'Male': 1, 'Female': 0}),
                   data['AGE'],
                   data.iloc[:, 8:]], axis = 1).values

    fold_frames = []
    for i, (train_index, test_index) in enumerate(kf.split(X), start = 1):
        print('run_model on CV: %d' % i)
        cv_label = 'cv' + str(i)

        # Fancy indexing returns copies, so the in-place scaling below never
        # modifies X itself.
        X_train, X_test = X[train_index, :], X[test_index, :]

        # Standardise with the training-fold mean/sd only; column 0 (sex) is
        # left as a 0/1 indicator.
        scaler = StandardScaler().fit(X_train[:, 1:])
        X_train[:, 1:] = scaler.transform(X_train[:, 1:])
        X_test[:, 1:] = scaler.transform(X_test[:, 1:])
        subjid_test = subjid[test_index]

        np.random.seed(model_seed)

        # Columns are collected in a dict and turned into a frame once per fold.
        fold_columns = {'SUBJID': subjid_test, 'CV': i}
        for resp in response:

            # The target is selected per response; it must be defined here,
            # before any train/test slicing of y.
            y = data[resp].values
            y_train = y[train_index]
            y_test = y[test_index]

            # To evaluate the SVR grid search instead, fit `svr_grid` here and
            # predict from it (and write to the *_svr_* file names).
            ridge_cv.fit(X_train, y_train)
            y_prediction = ridge_cv.predict(X_test)
            print(ridge_cv.alpha_)

            residual = y_test - y_prediction

            # Single-step .loc writes: chained `table[col][pos] = value` may write
            # to a temporary copy and relies on positional fallback indexing.
            result_table = result_table_all[resp]
            result_table.loc[freq_label, ('R square', cv_label)] = r2_score(y_test, y_prediction)
            result_table.loc[freq_label, ('Pearson r', cv_label)] = np.corrcoef(y_test, y_prediction)[0, 1]
            result_table.loc[freq_label, ('rmse', cv_label)] = np.sqrt(np.mean(np.square(residual)))
            result_table.loc[freq_label, ('MAE', cv_label)] = np.mean(np.abs(residual))

            fold_columns[resp] = y_test
            fold_columns[resp + '_pred'] = y_prediction
            fold_columns.setdefault('Sex', X_test[:, 0])

        fold_frames.append(pd.DataFrame(fold_columns))

    # Observed vs. predicted values of every held-out subject, written once per
    # frequency after all folds have run.
    # NOTE: the file suffix is the 0-based file position (freq0, freq1, ...),
    # whereas the result-table rows are labelled 1-based (freq1, freq2, ...).
    plot_data = pd.concat(fold_frames)
    plot_data.to_csv(output_dir + "/scatter_data_ridge_freq" + str(i_freq) + ".csv")


print('finished')

adni_out04_power_histogram_and_entropy/adni_out04_entropy_freq_00.csv
run_model on CV: 1
319.92671377973846
1176.811952434999
run_model on CV: 2
890.2150854450392
613.5907273413176
run_model on CV: 3
977.0099572992257
3593.8136638046294
adni_out04_power_histogram_and_entropy/adni_out04_entropy_freq_01.csv
run_model on CV: 1
1417.4741629268062
2718.588242732943
run_model on CV: 2
2477.0763559917114
3944.20605943766
run_model on CV: 3
4328.7612810830615
6280.29144183426
adni_out04_power_histogram_and_entropy/adni_out04_entropy_freq_02.csv
run_model on CV: 1
1707.352647470692
3944.20605943766
run_model on CV: 2
1176.811952434999
4328.7612810830615
run_model on CV: 3
1555.6761439304723
559.0810182512229
adni_out04_power_histogram_and_entropy/adni_out04_entropy_freq_03.csv
run_model on CV: 1
1417.4741629268062
613.5907273413176
run_model on CV: 2
3593.8136638046294
4328.7612810830615
run_model on CV: 3
1417.4741629268062
1417.4741629268062
adni_out04_power_histogram_and_entropy/adni_out04_e

In [131]:
# Display the per-fold metrics table.
# NOTE: the training loop rebinds `result_table` to result_table_all[resp] for
# each response, so after the loop this name refers to the table of the last
# entry in `response` ('ADNI_EF'). The 'ADNI_MEM' results are available as
# result_table_all['ADNI_MEM'].
result_table

Unnamed: 0_level_0,Pearson r,Pearson r,Pearson r,R square,R square,R square,MAE,MAE,MAE,rmse,rmse,rmse
Unnamed: 0_level_1,cv1,cv2,cv3,cv1,cv2,cv3,cv1,cv2,cv3,cv1,cv2,cv3
freq1,0.208716,0.156462,0.253884,0.028898,-0.011839,0.04032,0.9413,0.979631,0.987678,1.227219,1.246306,1.229027
freq2,0.202154,0.155887,0.227514,0.027332,0.012994,0.026269,0.950947,0.974493,1.002759,1.228208,1.230918,1.237991
freq3,0.289105,0.189124,0.177228,0.047614,0.031916,0.001122,0.934537,0.966234,1.030271,1.215335,1.219062,1.253876
freq4,0.218736,0.19332,0.213748,0.031647,0.035045,0.029426,0.931104,0.971153,1.007916,1.225481,1.21709,1.235983
freq5,0.199306,0.240576,0.261721,-0.015419,0.056111,0.051519,0.979134,0.954585,0.999852,1.254909,1.203731,1.221835
freq6,0.261078,0.181549,0.148526,0.052497,0.025514,-0.042604,0.94476,0.977365,1.01868,1.212215,1.223086,1.281026
freq7,0.24978,0.156159,0.116274,0.040888,-2.5e-05,-0.014303,0.941813,0.991613,1.034972,1.219619,1.239009,1.26352
freq8,0.345412,0.22446,0.229889,0.092581,0.041394,0.02287,0.905611,0.965884,1.013522,1.186297,1.213079,1.24015
freq9,0.143219,0.167021,0.189835,-0.012732,0.020571,0.013905,0.972046,0.979986,1.00086,1.253248,1.226183,1.245826
freq10,0.3095,0.24655,0.254428,0.076831,0.060034,0.038577,0.918444,0.94952,0.9924,1.196548,1.201227,1.230142


In [134]:
def summary_result(result_table):
    """Summarise per-fold cross-validation metrics.

    Parameters
    ----------
    result_table : pandas.DataFrame
        One row per frequency band. The columns are a two-level MultiIndex
        whose top level contains 'Pearson r', 'R square', 'MAE' and 'rmse'
        and whose second level identifies the CV fold.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        result_table_summary
            Same rows as ``result_table``; columns are (metric, 'mean') and
            (metric, 'std'), computed across the folds of each metric.
        result_accuracy_plot
            Long-format frame with columns 'frequency', 'cv' and 'value'
            covering the 'Pearson r' and 'R square' columns: one row per
            (column, frequency) pair, ordered column by column.
    """
    metrics = ['Pearson r', 'R square', 'MAE', 'rmse']
    column_index = pd.MultiIndex.from_product([metrics, ['mean', 'std']])
    result_table_summary = pd.DataFrame(index = result_table.index, columns = column_index)

    for metric in metrics:
        # A table filled cell by cell is object dtype; cast so that the
        # reductions operate on real floats.
        per_fold = result_table[metric].astype(float)
        result_table_summary[metric, 'mean'] = per_fold.mean(axis = 1)
        result_table_summary[metric, 'std'] = per_fold.std(axis = 1)

    # The long table is assembled explicitly rather than with
    # pd.melt(..., id_vars=['frequency']): matching a bare string id against
    # two-level columns is not dependable across pandas versions.
    # NOTE(review): 'cv' carries the top-level column label ('Pearson r' /
    # 'R square'), which is intended to mirror what the former single-name
    # melt(var_name="cv") call produced -- confirm against the plotting code.
    accuracy = result_table[['Pearson r', 'R square']]
    n_rows, n_cols = accuracy.shape
    result_accuracy_plot = pd.DataFrame({
        'frequency': list(accuracy.index) * n_cols,
        'cv': list(accuracy.columns.get_level_values(0).repeat(n_rows)),
        'value': accuracy.to_numpy().ravel(order = 'F'),
    })

    return result_table_summary, result_accuracy_plot

# Summarise the metrics across folds. `result_table` is the table the training
# loop bound last (the final entry of `response`, 'ADNI_EF'); pass
# result_table_all['ADNI_MEM'] to summarise the other response.
result_table_summary, result_accuracy_plot = summary_result(result_table)

In [135]:
# Persist the cross-fold summary of the ridge run, then show it as the cell output.
# (For an SVR run the target file is output_dir + '/result_table_svr.csv'.)
result_table_summary.to_csv('%s/result_table_ridge.csv' % output_dir)
result_table_summary


Unnamed: 0_level_0,Pearson r,Pearson r,R square,R square,MAE,MAE,rmse,rmse
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std
freq1,0.206354,0.048754,0.019126,0.027418,0.969536,0.024782,1.234184,0.010537
freq2,0.195185,0.036319,0.022198,0.007989,0.976066,0.025942,1.232372,0.005051
freq3,0.218486,0.061447,0.026884,0.023651,0.977014,0.048769,1.229424,0.021258
freq4,0.208601,0.013467,0.032039,0.00283,0.970058,0.038418,1.226184,0.009466
freq5,0.233868,0.031744,0.030737,0.040038,0.977857,0.02266,1.226825,0.025951
freq6,0.197051,0.057856,0.011802,0.049011,0.980269,0.037045,1.238776,0.036991
freq7,0.174071,0.068532,0.008854,0.028647,0.989466,0.046616,1.240716,0.022
freq8,0.266587,0.068318,0.052282,0.036108,0.961672,0.054079,1.213175,0.026927
freq9,0.166692,0.02331,0.007248,0.017622,0.984297,0.014883,1.241752,0.013984
freq10,0.270159,0.034297,0.058481,0.019174,0.953455,0.037135,1.209306,0.018196
