In [95]:
## run LASSO on the harmonic features (entropy of the spectrum power histogram)
import pandas as pd
import numpy as np
import os
import re
import glob

from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LassoCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score

input_dir = 'adni_out03_network_analysis'
output_dir = 'adni_out05_lasso_network_metrics'


In [176]:
data_files = glob.glob(input_dir + '/adni_out04_entropy_freq_*.csv')
data_files.sort()

all_result = []
print_group_size = True
comparision = [('CN', 'AD'), ('CN', 'EMCI'), ('CN', 'LMCI'), ('CN', 'SMC')]
result_accuracy = {comp: [] for comp in comparision}
result_coefs = {comp: [] for comp in comparision}

for f in data_files:
    
    print(f)
    freq = re.search('(.*)_freq_(.*).csv', f).group(2)
    data = pd.read_csv(f, index_col = 0)
    data.dropna(subset=['DX'], inplace = True)
    
    if print_group_size:
        print(data.groupby(['PTGENDER', 'DX']).size())
        print(data[['PTGENDER', 'DX', 'AGE']].groupby(['PTGENDER', 'DX']).mean())
        print_group_size = False
    
    X = pd.concat([data['AGE'], data['PTGENDER'].map({'Male': 1, 'Female': 0}), data.iloc[:, 6:]], axis = 1).values
    y = data['DX'].map({'CN': 0, 'EMCI': 1, 'LMCI': 1, 'AD': 1, 'SMC': 1}).values
    
    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    
    for comp in comparision:
        # select data with ad/lmci and cn:    
        idx_ad_cn = data['DX'].isin(comp)
        # build lassoCV that tune parameters with inner cv:
        reg = LogisticRegressionCV(cv=5, random_state=0, penalty = 'l1', solver='liblinear').fit(X[idx_ad_cn,:], y[idx_ad_cn])
        # reg = LassoCV(cv=5, random_state=0, normalize = True, tol = .001).fit(X[idx_ad_cn,:], y[idx_ad_cn])
        # run outer cv:
        cv_result = cross_val_score(reg, X[idx_ad_cn,:], y[idx_ad_cn], cv=5)
        print('cross validaton result, mean %3f, std: %3f' % (cv_result.mean(), cv_result.std()))
        result_accuracy[comp].append(cv_result)
        result_coefs[comp].append(reg.coef_.reshape(-1))
        
    #     break
    # break

adni_out04_power_histogram_and_entropy/adni_out04_entropy_freq_00.csv
PTGENDER  DX  
Female    AD      17
          CN      50
          EMCI    43
          LMCI    28
          SMC     23
Male      AD      16
          CN      44
          EMCI    42
          LMCI    31
          SMC     17
dtype: int64
                     AGE
PTGENDER DX             
Female   AD    71.252941
         CN    72.484000
         EMCI  69.409302
         LMCI  70.664286
         SMC   71.560870
Male     AD    73.806250
         CN    75.161364
         EMCI  71.495238
         LMCI  71.306452
         SMC   71.494118
cross validaton result, mean 0.732308, std: 0.029239
cross validaton result, mean 0.598095, std: 0.042398
cross validaton result, mean 0.587742, std: 0.032569
cross validaton result, mean 0.701425, std: 0.004558
adni_out04_power_histogram_and_entropy/adni_out04_entropy_freq_01.csv
cross validaton result, mean 0.740308, std: 0.016553
cross validaton result, mean 0.507619, std: 0.071414
cros

In [177]:
## save results:
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

for comp in comparision:
    res = pd.DataFrame(result_accuracy[comp])
    res.columns = ['cv' + str(i) for i in range(1, 6)]
    res.index = ['freq' + str(i) for i in range(1, 11)]
    
    res.to_csv(output_dir + '/lasso_accuracy_' + comp[0] + '_' + comp[1] + '.csv')
    
    res = pd.DataFrame(result_coefs[comp])
    res.columns = ['Age', 'Sex'] + ['roi' + str(i) for i in range(1, 265)]
    res.index = ['freq' + str(i) for i in range(1, 11)]
    
    res.to_csv(output_dir + '/lasso_coefs_brain_only' + comp[0] + '_' + comp[1] + '.csv')
