In [1]:
## run LASSO on the harmonic features (entropy of the spectrum power histogram)
import pandas as pd
import numpy as np
import os
import re
import glob

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
## this only returns the accuracy. we use all the data to estimate a single estimator to extract the coefs (with is averaged across the inner cv)
from sklearn.model_selection import cross_val_score 
## to do: we can use cross validate to return estimator for each outer cv.
from sklearn.model_selection import cross_validate 

input_dir = 'adni_out04_power_histogram_and_entropy'
output_dir = 'adni_out05_lasso'
cv_outer = 5


In [5]:
data_files = glob.glob(input_dir + '/adni_out04_entropy_freq_*.csv')
data_files.sort()

all_result = []
print_group_size = True

comparision = [('CN', 'AD'), ('CN', 'EMCI'), ('CN', 'LMCI'), ('CN', 'SMC')]
result_accuracy = {comp: [] for comp in comparision}
result_coefs = {comp: [] for comp in comparision}

for f in data_files:
    
    print(f)
    freq = re.search('(.*)_freq_(.*).csv', f).group(2)
    data = pd.read_csv(f, index_col = 0)
    data.dropna(subset=['DX'], inplace = True)
    
    if print_group_size:
        print(data.groupby(['PTGENDER', 'DX']).size())
        print_group_size = False
    
    X = pd.concat([data['AGE'], 
                   data['PTGENDER'].map({'Male': 1, 'Female': 0}), 
                   data.iloc[:, 6:]], axis = 1).values
    y = data['DX'].map({'CN': 0, 'EMCI': 1, 'LMCI': 1, 'AD': 1, 'SMC': 1}).values
    
    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    C_values = np.logspace(-2, 3, 10)
    
    for comp in comparision:
        # select data with ad/lmci and cn:    
        idx_ad_cn = data['DX'].isin(comp)
        # build lassoCV that tune parameters with inner cv:
        # reg = LogisticRegressionCV(cv = 5, random_state=0, Cs = C_values, n_jobs = 5,
        #                            penalty = 'l1', solver='liblinear').fit(X[idx_ad_cn,:], y[idx_ad_cn])
        
        reg = LogisticRegressionCV(cv = 5, random_state=0, Cs = C_values, n_jobs = 5,
                                   penalty = 'l1', solver='liblinear')
        # reg = LassoCV(cv=5, random_state=0, normalize = True, tol = .001).fit(X[idx_ad_cn,:], y[idx_ad_cn])
        # run outer cv:
        cv_result = cross_val_score(reg, X[idx_ad_cn,:], y[idx_ad_cn], cv=cv_outer)
        print('cross validaton result, mean %3f, std: %3f' % (cv_result.mean(), cv_result.std()))
        result_accuracy[comp].append(cv_result)
        
        reg = LogisticRegressionCV(cv = 5, random_state=0, Cs = C_values, n_jobs = 5,
                                   penalty = 'l1', solver='liblinear').fit(X[idx_ad_cn,:], y[idx_ad_cn])
        result_coefs[comp].append(reg.coef_.reshape(-1))
        
    #     break
    # break

print('finished')


adni_out04_power_histogram_and_entropy/adni_out04_entropy_freq_00.csv
PTGENDER  DX  
Female    AD      17
          CN      50
          EMCI    43
          LMCI    28
          SMC     23
Male      AD      16
          CN      44
          EMCI    42
          LMCI    31
          SMC     17
dtype: int64
cross validaton result, mean 0.724308, std: 0.044134
cross validaton result, mean 0.525238, std: 0.048516
cross validaton result, mean 0.626882, std: 0.063337
cross validaton result, mean 0.701425, std: 0.004558
adni_out04_power_histogram_and_entropy/adni_out04_entropy_freq_01.csv
cross validaton result, mean 0.716615, std: 0.045663
cross validaton result, mean 0.620159, std: 0.040947
cross validaton result, mean 0.627097, std: 0.046719
cross validaton result, mean 0.679202, std: 0.043529
adni_out04_power_histogram_and_entropy/adni_out04_entropy_freq_02.csv
cross validaton result, mean 0.787385, std: 0.039312
cross validaton result, mean 0.503016, std: 0.028657
cross validaton result

In [9]:
## save results:
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

for comp in comparision:
    res = pd.DataFrame(result_accuracy[comp])
    res.columns = ['cv' + str(i) for i in range(1, cv_outer+1)]
    res.index = ['freq' + str(i) for i in range(1, 11)]
    
    res.to_csv(output_dir + '/lasso_accuracy_' + comp[0] + '_' + comp[1] + '.csv')
    
    res = pd.DataFrame(result_coefs[comp])
    res = res.transpose()
    res.index = ['Age', 'Sex'] + [i for i in range(1, 265)]
    res.columns = ['freq' + str(i) for i in range(1, 11)]
    
    res.to_csv(output_dir + '/lasso_coefs_' + comp[0] + '_' + comp[1] + '.csv')
