In [1]:
## run LASSO on the harmonic features (entropy of the spectrum power histogram)
import pandas as pd
import numpy as np
import os
import re
import glob

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV

## this only returns the accuracy. we use all the data to estimate a single estimator to extract the coefs 
# (which is averaged across the inner cv)
from sklearn.model_selection import cross_val_score 

## to do: we can use cross validate to return estimator for each outer cv.
from sklearn.model_selection import cross_validate 

from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import AdaBoostClassifier

input_dir  = 'hcp_out04_power_histogram_and_entropy_0back'
input_dir2 = 'hcp_out04_power_histogram_and_entropy_2back'

# output_dir = 'hcp_out05_randomforest'
output_dir = 'hcp_out05_adaboost'
cv_outer   = 5


In [2]:
data_files = glob.glob(input_dir + '/out04_entropy__freq_*.csv')
data_files.sort()

data_files2 = glob.glob(input_dir2 + '/out04_entropy__freq_*.csv')
data_files2.sort()

result_accuracy = []
result_coefs = []
print_group_size = True

for f1, f2 in zip(data_files, data_files2):
    
    print(f1)
    print(f2)
    
    freq = re.search('(.*)_freq_(.*).csv', f1).group(2)
    data = pd.read_csv(f1, index_col = None, header = None)
    data2= pd.read_csv(f2, index_col = None, header = None)
    
    num1 = data.shape[0]
    num2 = data2.shape[0]
    
    if print_group_size:
        print(num1)
        print(num2)
    
    X = pd.concat([data, data2], axis = 0, ignore_index = False).values
    y = np.hstack((np.ones((num1)), np.zeros((num2))))
    
    # break
    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    C_values = np.logspace(-2, 3, 10)
    
    clf = AdaBoostClassifier(n_estimators=100, random_state = 0)
    param_grid = { 
        'n_estimators': [20, 50, 100, 200]
    }
    
    cv_model = GridSearchCV(estimator=clf, param_grid=param_grid, cv= 5, n_jobs = 2, refit = True, return_train_score = True)
    
    # run this to selected tuning parameters, and permutation importance:
    cv_model.fit(X, y)
    #print(cv_model.cv_results_)
    
    ## run outer cv to get prediction accuracy:
    cv_result = cross_val_score(cv_model, X, y, cv = cv_outer)
    
    print('cross validaton result, mean %3f, std: %3f' % (cv_result.mean(), cv_result.std()))
    result_accuracy.append(cv_result)
    # cv_model.fit(X, y)
    perm_importance = permutation_importance(cv_model, X, y)
    result_coefs.append(perm_importance['importances_mean'])
    
    # break
    
print('finished!')

hcp_out04_power_histogram_and_entropy_0back/out04_entropy__freq_00.csv
hcp_out04_power_histogram_and_entropy_2back/out04_entropy__freq_00.csv
959
959
cross validaton result, mean 0.557350, std: 0.019898
hcp_out04_power_histogram_and_entropy_0back/out04_entropy__freq_01.csv
hcp_out04_power_histogram_and_entropy_2back/out04_entropy__freq_01.csv
959
959
cross validaton result, mean 0.534412, std: 0.020676
hcp_out04_power_histogram_and_entropy_0back/out04_entropy__freq_02.csv
hcp_out04_power_histogram_and_entropy_2back/out04_entropy__freq_02.csv
959
959
cross validaton result, mean 0.563596, std: 0.020628
hcp_out04_power_histogram_and_entropy_0back/out04_entropy__freq_03.csv
hcp_out04_power_histogram_and_entropy_2back/out04_entropy__freq_03.csv
959
959
cross validaton result, mean 0.535460, std: 0.009329
hcp_out04_power_histogram_and_entropy_0back/out04_entropy__freq_04.csv
hcp_out04_power_histogram_and_entropy_2back/out04_entropy__freq_04.csv
959
959
cross validaton result, mean 0.519274,

In [3]:
cv_model.cv_result_

AttributeError: 'GridSearchCV' object has no attribute 'cv_result_'

In [None]:
## save results:
if not os.path.exists(output_dir):
    os.mkdir(output_dir)


res = pd.DataFrame(result_accuracy)
res.columns = ['cv' + str(i) for i in range(1, cv_outer+1)]
res.index = ['freq' + str(i) for i in range(1, 11)]

res.to_csv(output_dir + '/ada_accuracy_.csv')

res = pd.DataFrame(result_coefs)
res.columns = ['roi' + str(i) for i in range(1, 269)]
res.index = ['freq' + str(i) for i in range(1, 11)]
res = res.transpose()
res.to_csv(output_dir + '/ada_coefs_.csv')


In [None]:
perm_importance