In [2]:
## run LASSO on the harmonic features (entropy of the spectrum power histogram)
import pandas as pd
import numpy as np
import os
import re
import glob

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
## this only returns the accuracy. we use all the data to estimate a single estimator to extract the coefs (with is averaged across the inner cv)
from sklearn.model_selection import cross_val_score 
## to do: we can use cross validate to return estimator for each outer cv.
from sklearn.model_selection import cross_validate 


input_dir = 'ocd_out04_power_histogram_and_entropy'
output_dir = 'ocd_out05_lasso'
cv_outer = 5


In [3]:
data_files = glob.glob(input_dir + '/out04_entropy_freq_*.csv')
data_files.sort()

all_result = []
print_group_size = True

comparision = [('hc', 'ocd')]
result_accuracy = {comp: [] for comp in comparision}
result_coefs = {comp: [] for comp in comparision}

for f in data_files:
    
    print(f)
    freq = re.search('(.*)_freq_(.*).csv', f).group(2)
    data = pd.read_csv(f, index_col = 0)
    data.dropna(subset=['group'], inplace = True)
    # break
    
    if print_group_size:
        print(data.groupby(['group']).size())
        print_group_size = False
    
    X = data.iloc[:, 1:].values
    y = data['group'].map({'hc': 0, 'ocd': 1}).values
    
    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    C_values = np.logspace(-3, 3, 30)
    
    for comp in comparision:
        # select data with ad/lmci and cn:    
        idx_ad_cn = data['group'].isin(comp)
        # build lassoCV that tune parameters with inner cv:
        cv_model = LogisticRegressionCV(cv = 5, random_state=0, Cs = C_values, n_jobs = 5,
                                   penalty = 'l1', solver = 'liblinear')
                
        ## run outer cv:
        cv_result = cross_val_score(cv_model, X[idx_ad_cn,:], y[idx_ad_cn], cv = cv_outer)
        
        print('cross validaton result, mean %3f, std: %3f' % (cv_result.mean(), cv_result.std()))
        result_accuracy[comp].append(cv_result)
        
        cv_model = LogisticRegressionCV(cv = 5, random_state = 0, Cs = C_values, n_jobs = 5,
                                   penalty = 'l1', solver = 'liblinear').fit(X[idx_ad_cn,:], y[idx_ad_cn])
        result_coefs[comp].append(cv_model.coef_.reshape(-1))
        
    #     break
    # break
print('finished!')

ocd_out04_power_histogram_and_entropy/out04_entropy_freq_00.csv
group
hc     45
ocd    35
dtype: int64
cross validaton result, mean 0.612500, std: 0.046771
ocd_out04_power_histogram_and_entropy/out04_entropy_freq_01.csv
cross validaton result, mean 0.562500, std: 0.039528
ocd_out04_power_histogram_and_entropy/out04_entropy_freq_02.csv
cross validaton result, mean 0.475000, std: 0.140312
ocd_out04_power_histogram_and_entropy/out04_entropy_freq_03.csv
cross validaton result, mean 0.612500, std: 0.061237
ocd_out04_power_histogram_and_entropy/out04_entropy_freq_04.csv
cross validaton result, mean 0.562500, std: 0.079057
ocd_out04_power_histogram_and_entropy/out04_entropy_freq_05.csv
cross validaton result, mean 0.637500, std: 0.091856
ocd_out04_power_histogram_and_entropy/out04_entropy_freq_06.csv
cross validaton result, mean 0.487500, std: 0.133463
ocd_out04_power_histogram_and_entropy/out04_entropy_freq_07.csv
cross validaton result, mean 0.587500, std: 0.084779
ocd_out04_power_histogram

In [4]:
## save results:
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

for comp in comparision:
    res = pd.DataFrame(result_accuracy[comp])
    res.columns = ['cv' + str(i) for i in range(1, cv_outer+1)]
    res.index = ['freq' + str(i) for i in range(1, 11)]
    
    res.to_csv(output_dir + '/lasso_accuracy_' + comp[0] + '_' + comp[1] + '.csv')
    
    res = pd.DataFrame(result_coefs[comp])
    res.columns = ['roi' + str(i) for i in range(1, 117)]
    res.index = ['freq' + str(i) for i in range(1, 11)]
    res = res.transpose()
    res.to_csv(output_dir + '/lasso_coefs_' + comp[0] + '_' + comp[1] + '.csv')
