In [2]:
## run LASSO on the entropy of orignal bold signals.

import pandas as pd
import numpy as np
import os
import re
import glob

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score

input_dir = 'adni_out04_bold_entropy/'
output_dir = 'adni_out05_lasso_bold_entropy/'

## subject info:
input_subject_info = 'adni_out02_test_subject_info/subject_info_merge.csv'
subject_info = pd.read_csv(input_subject_info, index_col = 0)
subject_info.sort_values(by = ['subject_id', 'ses_id'], inplace = True)
subject_info


Unnamed: 0,subject_id,ses_id,AGE,PTGENDER,DX_bl,DX
0,002_S_0295,2012-05-10_15_42_37.0,84.8,Male,CN,CN
1,002_S_0413,2017-06-21_13_23_38.0,76.3,Female,CN,CN
2,002_S_0685,2011-07-08_07_04_27.0,89.6,Female,CN,CN
3,002_S_0729,2012-08-07_07_21_09.0,65.1,Female,LMCI,LMCI
4,002_S_1155,2017-04-24_13_21_32.0,57.8,Male,LMCI,LMCI
...,...,...,...,...,...,...
324,941_S_4100,2017-10-27_11_05_51.0,78.5,Female,CN,CN
325,941_S_4187,2017-06-05_14_05_55.0,62.0,Male,LMCI,LMCI
326,941_S_4292,2017-08-21_15_05_09.0,70.9,Male,CN,CN
327,941_S_4365,2017-08-28_14_06_46.0,80.3,Male,CN,CN


In [39]:
# data_files = glob.glob(input_dir + '/adni_out04_entropy_freq_*.csv')
metrics = ['adni_out04_bold_entropy']
data_files = [input_dir + m + '.csv' for m in metrics ]
data_files.sort()

all_result = []
print_group_size = True
# comparision = [('CN', 'AD'), ('CN', 'EMCI'), ('CN', 'LMCI'), ('CN', 'SMC')]
comparision = [('CN', 'AD'), ('CN', 'EMCI'), ('CN', 'LMCI')]
result_accuracy = {comp: [] for comp in comparision}
result_coefs = {comp: [] for comp in comparision}

for f in data_files:
    
    print(f)
    data = pd.read_csv(f, index_col = 0)
    data = subject_info.merge(data, on = ['subject_id', 'ses_id', 'AGE', 'PTGENDER', 'DX_bl', 'DX'], how = 'inner')
    data.dropna(subset=['DX'], inplace = True)
    data.drop(columns = ['ses_id', 'DX_bl'], inplace = True)
    # break
    
    if print_group_size:
        print(data.groupby(['PTGENDER', 'DX']).size())
        print_group_size = False
    
    X = pd.concat([data['AGE'], data['PTGENDER'].map({'Male': 1, 'Female': 0}), data.iloc[:, 4:]], axis = 1).values
    y = data['DX'].map({'CN': 0, 'EMCI': 1, 'LMCI': 1, 'AD': 1, 'SMC': 1}).values
    
    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    C_values = np.logspace(-1, 2, 10)
    
    for comp in comparision:
        # select data with ad/lmci and cn:    
        idx_ad_cn = data['DX'].isin(comp)
        # build lassoCV that tune parameters with inner cv:
        reg = LogisticRegressionCV(cv=5, random_state=0, Cs=C_values, n_jobs = 6,
                                   penalty = 'l1', solver='liblinear', 
                                   refit = False).fit(X[idx_ad_cn,:], y[idx_ad_cn])
        # reg = LassoCV(cv=5, random_state=0, normalize = True, tol = .001).fit(X[idx_ad_cn,:], y[idx_ad_cn])
        # run outer cv:
        cv_result = cross_val_score(reg, X[idx_ad_cn,:], y[idx_ad_cn], cv=5)
        print('cross validaton result, mean %3f, std: %3f' % (cv_result.mean(), cv_result.std()))
        result_accuracy[comp]=list(cv_result)
        result_coefs[comp]=list(reg.coef_.reshape(-1))
        
    #     break
    # break
    
print('finished')


adni_out04_bold_entropy/adni_out04_bold_entropy.csv
PTGENDER  DX  
Female    AD      17
          CN      50
          EMCI    43
          LMCI    28
          SMC     23
Male      AD      16
          CN      44
          EMCI    42
          LMCI    31
          SMC     17
dtype: int64
cross validaton result, mean 0.708923, std: 0.028706
cross validaton result, mean 0.558730, std: 0.076841
cross validaton result, mean 0.653333, std: 0.020135
finished


In [41]:
## save results:
## save all accuracy results to one file (as we only have 1 metric for each comparision)
## udpates on May-10-2022.

if not os.path.exists(output_dir):
    os.mkdir(output_dir)

res = pd.DataFrame(result_accuracy)
res.index = ['cv' + str(i) for i in range(1, 6)]
res.columns = ['CV vs. AD', 'CN vs. EMCI', 'CN vs. LMCI']

res.to_csv(output_dir + '/lasso_accuracy.csv')


res = pd.DataFrame(result_coefs)
res.index = ['Age', 'Sex'] + ['roi' + str(i) for i in range(1, 265)]
res.columns = ['CV vs. AD', 'CN vs. EMCI', 'CN vs. LMCI']

res.to_csv(output_dir + '/lasso_coefs.csv')


In [42]:
res
# result_accuracy

Unnamed: 0,CV vs. AD,CN vs. EMCI,CN vs. LMCI
Age,-0.092447,-0.442705,-0.568348
Sex,0.029048,0.000000,0.000000
roi1,-0.262912,0.000000,0.000000
roi2,0.100663,-0.004461,0.000000
roi3,0.000000,0.000000,0.000000
...,...,...,...
roi260,0.000000,0.000000,0.000000
roi261,0.000000,0.000000,0.000000
roi262,-0.039560,0.000000,0.000000
roi263,0.000000,0.000000,0.559939


In [38]:
metrics

['adni_out04_bold_entropy']