In [27]:
## run LASSO on network metrics

import pandas as pd
import numpy as np
import os
import re
import glob

from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV

input_dir  = 'hcp_out03_network_analysis_0back/'
input_dir2 = 'hcp_out03_network_analysis_2back/'

output_dir = 'hcp_out05_randomforest'
cv_outer   = 5


In [28]:

metrics = ['betweenness', 'clustering', 'degree', 'eigenvector', 'closeness']
data_files = [input_dir + m + '.csv' for m in metrics]
data_files.sort()

metrics = ['betweenness', 'clustering', 'degree', 'eigenvector', 'closeness']
data_files2 = [input_dir2 + m + '.csv' for m in metrics]
data_files2.sort()

all_result = []
print_group_size = True
result_accuracy = []
result_coefs = []

for f1,f2 in zip(data_files, data_files2):
    
    data = pd.read_csv(f1, index_col = 0)
    data.drop(columns = 'subject_id', inplace = True)
    data2= pd.read_csv(f2, index_col= 0)
    data2.drop(columns = 'subject_id', inplace = True)
    # break
    num1 = data.shape[0]
    num2 = data2.shape[0]
    
    if print_group_size:
        
        print(num1)
        print(num2)
        print_group_size = False
    print(f1)
    print(f2)
    
    X = pd.concat([data, data2], axis = 0).values
    y = np.hstack((np.ones((num1)), np.zeros((num2))))
    
    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    C_values = np.logspace(-3, 3, 30)

    ## random forest:
    rfc = RandomForestClassifier(random_state=0)

    # param_grid = { 
    #     'n_estimators': [50, 100],
    #     'max_features': ['sqrt'],
    #     'max_depth' : [2, 4],
    #     'criterion' : ['gini']
    # }

    param_grid = { 
        'n_estimators': [20, 50, 100],
        'max_features': ['sqrt', 'log2'],
        'max_depth' : [2, 3, 4],
        'criterion' : ['gini', 'entropy']
    }

    cv_model = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5, n_jobs = 2)
    ## run outer cv:
    cv_result = cross_val_score(cv_model, X, y, cv = cv_outer)

    print('cross validaton result, mean %3f, std: %3f' % (cv_result.mean(), cv_result.std()))
    result_accuracy.append(cv_result)
    cv_model.fit(X, y)

    perm_importance = permutation_importance(cv_model, X, y)
    result_coefs.append(perm_importance['importances_mean'])
        
    #     break
    # break
print('finished!')

959
959
hcp_out03_network_analysis_0back/betweenness.csv
hcp_out03_network_analysis_2back/betweenness.csv
cross validaton result, mean 0.677789, std: 0.002655
hcp_out03_network_analysis_0back/closeness.csv
hcp_out03_network_analysis_2back/closeness.csv
cross validaton result, mean 0.724180, std: 0.026628
hcp_out03_network_analysis_0back/clustering.csv
hcp_out03_network_analysis_2back/clustering.csv
cross validaton result, mean 0.651741, std: 0.023105
hcp_out03_network_analysis_0back/degree.csv
hcp_out03_network_analysis_2back/degree.csv
cross validaton result, mean 0.731481, std: 0.027032
hcp_out03_network_analysis_0back/eigenvector.csv
hcp_out03_network_analysis_2back/eigenvector.csv
cross validaton result, mean 0.740346, std: 0.030662
finished!


In [29]:
## save results:
if not os.path.exists(output_dir):
    os.mkdir(output_dir)


res = pd.DataFrame(result_accuracy)
res.columns = ['cv' + str(i) for i in range(1, 6)]
res.index = metrics

res.to_csv(output_dir + '/rf_network_metrics_accuracy_' + comp[0] + '_' + comp[1] + '.csv')

res = pd.DataFrame(result_coefs)
res.columns = ['roi' + str(i) for i in range(1, 269)]
res.index = metrics
res = res.transpose()

res.to_csv(output_dir + '/rf_network_metrics_coefs_' + comp[0] + '_' + comp[1] + '.csv')


NameError: name 'comp' is not defined

In [None]:
result_accuracy