In [43]:
## run LASSO on the harmonic features (entropy of the spectrum power histogram)
import pandas as pd
import numpy as np
import os
import re
import glob

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV

## this only returns the accuracy. we use all the data to estimate a single estimator to extract the coefs 
# (which is averaged across the inner cv)
from sklearn.model_selection import cross_val_score 

## to do: we can use cross validate to return estimator for each outer cv.
from sklearn.model_selection import cross_validate 

from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import AdaBoostClassifier

input_dir  = 'hcp_out04_power_histogram_and_entropy_0back'
input_dir2 = 'hcp_out04_power_histogram_and_entropy_2back'

output_dir = 'hcp_out05_randomforest'
cv_outer   = 5


In [48]:
data_files = glob.glob(input_dir + '/out04_entropy__freq_*.csv')
data_files.sort()

data_files2 = glob.glob(input_dir2 + '/out04_entropy__freq_*.csv')
data_files2.sort()

result_accuracy = []
result_coefs = []
print_info = True

for f1, f2 in zip(data_files, data_files2):
    
    freq = re.search('(.*)_freq_(.*).csv', f1).group(2)
    data = pd.read_csv(f1, index_col = None, header = None)
    data2= pd.read_csv(f2, index_col = None, header = None)
    
    num1 = data.shape[0]
    num2 = data2.shape[0]
    
    if print_info:
        print(f1)
        print(f2)
        
        print(num1)
        print(num2)
        print_info = False
    
    X = pd.concat([data, data2], axis = 0, ignore_index = False).values
    y = np.hstack((np.ones((num1)), np.zeros((num2))))
    
    # break
    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    C_values = np.logspace(-2, 3, 10)

    ## random forest:
    rfc = RandomForestClassifier(random_state=0)
 
    # param_grid = { 
    #     'n_estimators': [50, 100],
    #     'max_features': ['sqrt'],
    #     'max_depth' : [2, 4],
    #     'criterion' : ['gini']
    # }

    param_grid = { 
        'n_estimators': [20, 50, 100],
        'max_features': [5, 10, 20, 50], #['sqrt', 'log2'],
        'max_depth' : [2,3,4],
        'criterion' : ['gini', 'entropy']
    }

    cv_model = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5, n_jobs = 2, refit = True, return_train_score = True)
    cv_model.fit(X, y)

    ## run outer cv to get prediction accuracy:
    # cv_result = cross_val_score(cv_model.best_estimator_, X, y, cv = cv_outer)
    cv_result = cross_validate(cv_model.best_estimator_, X, y, cv = cv_outer)
    
    print('mean train score: %3f' % cv_model.cv_results_['mean_train_score'].mean())
    print('cross validaton result, mean %3f, std: %3f' % (cv_result['test_score'].mean(), cv_result['test_score'].std()))
    result_accuracy.append(cv_result['test_score'])

    perm_importance = permutation_importance(cv_model, X, y)
    result_coefs.append(perm_importance['importances_mean'])
    
    # break
    
print('finished!')

hcp_out04_power_histogram_and_entropy_0back/out04_entropy__freq_00.csv
hcp_out04_power_histogram_and_entropy_2back/out04_entropy__freq_00.csv
959
959
mean train score: 0.776474
cross validaton result, mean 0.586547, std: 0.011769
mean train score: 0.787044
cross validaton result, mean 0.561543, std: 0.027646
mean train score: 0.772699
cross validaton result, mean 0.596985, std: 0.014810
mean train score: 0.783159
cross validaton result, mean 0.588120, std: 0.016229
mean train score: 0.764486
cross validaton result, mean 0.561534, std: 0.020149
mean train score: 0.792028
cross validaton result, mean 0.561018, std: 0.016662
mean train score: 0.774366
cross validaton result, mean 0.592279, std: 0.023272
mean train score: 0.764981
cross validaton result, mean 0.598023, std: 0.013596
mean train score: 0.693751
cross validaton result, mean 0.597004, std: 0.025240
mean train score: 0.783009
cross validaton result, mean 0.560481, std: 0.010827
finished!


In [None]:
## save results:
if not os.path.exists(output_dir):
    os.mkdir(output_dir)


res = pd.DataFrame(result_accuracy)
res.columns = ['cv' + str(i) for i in range(1, cv_outer+1)]
res.index = ['freq' + str(i) for i in range(1, 11)]

res.to_csv(output_dir + '/rf_accuracy_.csv')

res = pd.DataFrame(result_coefs)
res.columns = ['roi' + str(i) for i in range(1, 269)]
res.index = ['freq' + str(i) for i in range(1, 11)]
res = res.transpose()
res.to_csv(output_dir + '/rf_coefs_.csv')


In [None]:
perm_importance