In [1]:
## run LASSO on network metrics

import pandas as pd
import numpy as np
import os
import re
import glob

from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV

input_dir = 'ocd_out03_network_analysis/'
output_dir = 'ocd_out05_randomforest_network_metrics/'
cv_outer = 5


In [10]:
# data_files = glob.glob(input_dir + '/adni_out04_entropy_freq_*.csv')
metrics = ['betweenness', 'clustering', 'degree', 'eigenvector', 'closeness']
data_files = [input_dir + m + '.csv' for m in metrics ]
data_files.sort()

all_result = []
print_group_size = True
comparision = [('hc', 'ocd')]
result_accuracy = {comp: [] for comp in comparision}
result_coefs = {comp: [] for comp in comparision}

for f in data_files:
    
    print(f)
    data = pd.read_csv(f, index_col = 0)    
    data.dropna(subset=['group'], inplace = True)
    # break
    
    if print_group_size:
        print(data.groupby(['group']).size())
        print_group_size = False
    
    X = data.iloc[:, 1:].values
    y = data['group'].map({'hc': 0, 'ocd': 1}).values
    
    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    C_values = np.logspace(-3, 3, 30)
    
    for comp in comparision:
        # select data with ad/lmci and cn:    
        idx_ad_cn = data['group'].isin(comp)
        
        ## random forest:
        rfc = RandomForestClassifier(random_state=0)
        
        # param_grid = { 
        #     'n_estimators': [50, 100],
        #     'max_features': ['sqrt'],
        #     'max_depth' : [2, 4],
        #     'criterion' : ['gini']
        # }
        
        param_grid = { 
            'n_estimators': [20, 50, 100],
            'max_features': ['sqrt', 'log2'],
            'max_depth' : [2,3,4],
            'criterion' : ['gini', 'entropy']
        }
        
        cv_model = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5, n_jobs = 7)
        ## run outer cv:
        cv_result = cross_val_score(cv_model, X[idx_ad_cn,:], y[idx_ad_cn], cv = cv_outer)
        
        print('cross validaton result, mean %3f, std: %3f' % (cv_result.mean(), cv_result.std()))
        result_accuracy[comp].append(cv_result)
        cv_model.fit(X[idx_ad_cn,:], y[idx_ad_cn])
        
        perm_importance = permutation_importance(cv_model, X[idx_ad_cn,:], y[idx_ad_cn])
        result_coefs[comp].append(perm_importance['importances_mean'])
        
    #     break
    # break
print('finished!')

ocd_out03_network_analysis/betweenness.csv
group
hc     45
ocd    35
dtype: int64
cross validaton result, mean 0.662500, std: 0.134629
ocd_out03_network_analysis/closeness.csv
cross validaton result, mean 0.675000, std: 0.127475
ocd_out03_network_analysis/clustering.csv
cross validaton result, mean 0.675000, std: 0.133463
ocd_out03_network_analysis/degree.csv
cross validaton result, mean 0.687500, std: 0.147902
ocd_out03_network_analysis/eigenvector.csv
cross validaton result, mean 0.662500, std: 0.063738
finished!


In [3]:
data

Unnamed: 0,group,0,1,2,3,4,5,6,7,8,...,106,107,108,109,110,111,112,113,114,115
0,hc,0.096049,0.094835,0.088075,0.091569,0.095156,0.095474,0.091869,0.092195,0.092384,...,0.052900,0.093793,0.094949,0.097449,0.095242,0.096199,0.097449,0.096049,0.089791,0.088982
1,hc,0.097915,0.086813,0.103128,0.099016,0.101604,0.101785,0.087893,0.057575,0.098464,...,0.082012,0.060616,0.030917,0.063738,0.099837,0.086875,0.101509,0.083009,0.062089,0.020955
2,hc,0.097773,0.096354,0.098291,0.097773,0.096296,0.098291,0.097773,0.097773,0.094639,...,0.096351,0.094021,0.056310,0.084169,0.097769,0.098523,0.098523,0.097781,0.083287,0.025290
3,hc,0.093587,0.094888,0.095087,0.095607,0.094063,0.094657,0.095607,0.095607,0.094063,...,0.095687,0.094797,0.058119,0.095607,0.094657,0.094278,0.094969,0.094969,0.048092,0.009046
4,hc,0.094690,0.091483,0.095822,0.096505,0.096486,0.093438,0.096154,0.096836,0.096486,...,0.093834,0.084634,0.038758,0.090152,0.095822,0.091391,0.090270,0.095027,0.094627,0.036665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,ocd,0.106734,0.099571,0.109001,0.116107,0.089768,0.063952,0.083344,0.087498,0.100205,...,0.082964,0.088825,0.042535,0.106190,0.117428,0.105310,0.114884,0.089361,0.066948,0.024959
76,ocd,0.114590,0.114907,0.113478,0.116134,0.062223,0.061522,0.071936,0.086169,0.081829,...,0.103318,0.085201,0.039407,0.082467,0.109863,0.104563,0.083225,0.084791,0.038959,0.013715
77,ocd,0.098771,0.099820,0.104738,0.098699,0.098556,0.101240,0.078122,0.078840,0.092480,...,0.038071,0.076421,0.066139,0.082664,0.105398,0.108267,0.096230,0.085420,0.042972,0.049988
78,ocd,0.101941,0.103518,0.099976,0.103002,0.077730,0.087243,0.093291,0.100520,0.093491,...,0.091486,0.077893,0.092441,0.090554,0.100928,0.101851,0.100669,0.101802,0.094650,0.052988


In [4]:
## save results:
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

for comp in comparision:
    res = pd.DataFrame(result_accuracy[comp])
    res.columns = ['cv' + str(i) for i in range(1, 6)]
    res.index = metrics
    
    res.to_csv(output_dir + '/lasso_accuracy_' + comp[0] + '_' + comp[1] + '.csv')
    
    res = pd.DataFrame(result_coefs[comp])
    res.columns = ['roi' + str(i) for i in range(1, 117)]
    res.index = metrics
    res = res.transpose()
    
    res.to_csv(output_dir + '/lasso_coefs_' + comp[0] + '_' + comp[1] + '.csv')


ValueError: Length mismatch: Expected axis has 3 elements, new values have 116 elements

In [9]:
result_accuracy[comp]

[array([0.75  , 0.5   , 0.5   , 0.75  , 0.8125]),
 array([0.625, 0.625, 0.5  , 0.75 , 0.875]),
 array([0.75  , 0.5625, 0.5   , 0.6875, 0.875 ]),
 array([0.625 , 0.625 , 0.5   , 0.75  , 0.9375]),
 array([0.5625, 0.6875, 0.625 , 0.75  , 0.6875])]