In [46]:
## run LASSO on the harmonic features (entropy of the spectrum power histogram)
import pandas as pd
import numpy as np
import os
import re
import glob

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
## this only returns the accuracy. we use all the data to estimate a single estimator to extract the coefs (with is averaged across the inner cv)
from sklearn.model_selection import cross_val_score 
## to do: we can use cross validate to return estimator for each outer cv.
from sklearn.model_selection import cross_validate 

input_dir = 'hcp_out04_power_histogram_and_entropy_0back'
input_dir2 = 'hcp_out04_power_histogram_and_entropy_2back'

output_dir = 'out05_lasso'
cv_outer = 5


In [41]:
data_files = glob.glob(input_dir + '/out04_entropy__freq_*.csv')
data_files.sort()

data_files2 = glob.glob(input_dir2 + '/out04_entropy__freq_*.csv')
data_files2.sort()

result_accuracy = []
result_coefs = []
print_group_size = True

for f1, f2 in zip(data_files, data_files2):
    
    print(f1)
    print(f2)
    
    freq = re.search('(.*)_freq_(.*).csv', f1).group(2)
    data = pd.read_csv(f1, index_col = None, header = None)
    data2= pd.read_csv(f2, index_col = None, header = None)
    
    num1 = data.shape[0]
    num2 = data2.shape[0]
    
    if print_group_size:
        print(num1)
        print(num2)
    
    X = pd.concat([data, data2], axis = 0, ignore_index = False).values
    y = np.hstack((np.ones((num1)), np.zeros((num2))))
    
    # break
    
    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    C_values = np.logspace(-2, 3, 10)
    

    # build lassoCV that tune parameters with inner cv:
    # reg = LogisticRegressionCV(cv = 5, random_state=0, Cs = C_values, n_jobs = 5,
    #                            penalty = 'l1', solver='liblinear').fit(X[idx_ad_cn,:], y[idx_ad_cn])

    reg = LogisticRegressionCV(cv = 5, random_state=0, Cs = C_values, n_jobs = 5,
                               penalty = 'l1', solver = 'liblinear')

    # reg = LassoCV(cv=5, random_state=0, normalize = True, tol = .001).fit(X[idx_ad_cn,:], y[idx_ad_cn])
    # run outer cv:
    cv_result = cross_val_score(reg, X, y, cv = cv_outer)
    print('cross validaton result, mean %3f, std: %3f' % (cv_result.mean(), cv_result.std()))
    result_accuracy.append(cv_result)

    reg = LogisticRegressionCV(cv = 5, random_state=0, Cs = C_values, n_jobs = 5,
                               penalty = 'l1', solver = 'liblinear').fit(X, y)
    result_coefs.append(reg.coef_.reshape(-1))
        
    #     break
    # break

print('finished')


hcp_out04_power_histogram_and_entropy_0back/out04_entropy__freq_00.csv
hcp_out04_power_histogram_and_entropy_2back/out04_entropy__freq_00.csv
959
959
cross validaton result, mean 0.573000, std: 0.019325
hcp_out04_power_histogram_and_entropy_0back/out04_entropy__freq_01.csv
hcp_out04_power_histogram_and_entropy_2back/out04_entropy__freq_01.csv
959
959
cross validaton result, mean 0.554217, std: 0.013773
hcp_out04_power_histogram_and_entropy_0back/out04_entropy__freq_02.csv
hcp_out04_power_histogram_and_entropy_2back/out04_entropy__freq_02.csv
959
959
cross validaton result, mean 0.583430, std: 0.022788
hcp_out04_power_histogram_and_entropy_0back/out04_entropy__freq_03.csv
hcp_out04_power_histogram_and_entropy_2back/out04_entropy__freq_03.csv
959
959
cross validaton result, mean 0.562568, std: 0.014028
hcp_out04_power_histogram_and_entropy_0back/out04_entropy__freq_04.csv
hcp_out04_power_histogram_and_entropy_2back/out04_entropy__freq_04.csv
959
959
cross validaton result, mean 0.555811,

In [43]:
result_accuracy

[array([0.58333333, 0.55729167, 0.56510417, 0.60574413, 0.5535248 ]),
 array([0.54166667, 0.58072917, 0.55208333, 0.55091384, 0.54569191]),
 array([0.61458333, 0.54947917, 0.56770833, 0.59791123, 0.58746736]),
 array([0.5703125 , 0.54166667, 0.5703125 , 0.55091384, 0.57963446]),
 array([0.56770833, 0.50520833, 0.54947917, 0.57963446, 0.5770235 ]),
 array([0.5546875 , 0.5390625 , 0.578125  , 0.60574413, 0.59007833]),
 array([0.578125  , 0.5703125 , 0.54427083, 0.55613577, 0.57180157]),
 array([0.61197917, 0.57552083, 0.625     , 0.61096606, 0.60574413]),
 array([0.59895833, 0.58333333, 0.609375  , 0.60574413, 0.62924282]),
 array([0.54166667, 0.52604167, 0.5234375 , 0.5926893 , 0.61096606])]

In [47]:
## save results:
if not os.path.exists(output_dir):
    os.mkdir(output_dir)


res = pd.DataFrame(result_accuracy)
res.columns = ['cv' + str(i) for i in range(1, cv_outer+1)]
res.index = ['freq' + str(i) for i in range(1, 11)]

res.to_csv(output_dir + '/lasso_accuracy_0back_2back.csv')

res = pd.DataFrame(result_coefs)
res = res.transpose()
res.index = [i for i in range(1, 269)]
res.columns = ['freq' + str(i) for i in range(1, 11)]

res.to_csv(output_dir + '/lasso_coefs_0back_2back.csv')
