In [9]:
## Run L1-penalised (LASSO-style) logistic regression on network metrics

import pandas as pd
import numpy as np
import os
import re
import glob

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score

# Folder holding one CSV per network metric (<metric>.csv); rows read from
# here are labelled as class 1 in the classification below.
# Presumably the 0-back task condition, judging by the folder name -- confirm.
input_dir  = 'hcp_out03_network_analysis_0back/'
# Folder with the matching per-metric CSVs for the second group (class 0).
# Presumably the 2-back task condition -- confirm.
input_dir2 = 'hcp_out03_network_analysis_2back/'

# Folder that receives the accuracy and coefficient CSVs written by this notebook.
output_dir = 'hcp_out05_lasso_network_metrics/'

## subject info:


In [10]:
# Network metrics to analyse, one CSV per metric in each input folder.
# The list is kept SORTED on purpose: results are appended to the lists below
# in the order the files are processed, and the rows are later labelled with
# `metrics`. Previously the file lists were sorted while `metrics` was not,
# so saved rows were attached to the wrong metric names.
metrics = sorted(['betweenness', 'clustering', 'degree', 'eigenvector', 'closeness'])
data_files = [input_dir + m + '.csv' for m in metrics]
data_files2 = [input_dir2 + m + '.csv' for m in metrics]

all_result = []          # not used in the visible cells; kept for compatibility
print_group_size = True  # print the two group sizes for the first metric only
result_accuracy = []     # one array of outer-CV fold accuracies per metric
result_coefs = []        # one flattened coefficient vector per metric

# Candidate inverse regularisation strengths for the inner CV (loop-invariant).
C_values = np.logspace(-1, 2, 10)

for f1, f2 in zip(data_files, data_files2):

    print(f1)
    data = pd.read_csv(f1, index_col=0)
    data2 = pd.read_csv(f2, index_col=0)
    num1 = data.shape[0]
    num2 = data2.shape[0]

    if print_group_size:
        print(num1)
        print(num2)
        print_group_size = False

    # Stack both groups row-wise; rows from `input_dir` get label 1 and rows
    # from `input_dir2` get label 0.
    X = pd.concat([data, data2], axis=0).values
    y = np.hstack((np.ones(num1), np.zeros(num2)))

    # NOTE(review): the scaler is fitted on ALL rows before cross-validation,
    # so the held-out folds contribute to the scaling statistics. Wrapping the
    # scaler and the classifier in a Pipeline would avoid this leakage; left
    # unchanged here so the reported numbers stay comparable.
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # L1-penalised logistic regression whose C is tuned by an inner 5-fold CV.
    # Per the scikit-learn docs, with refit=False the reported coefficients are
    # averaged over the inner folds at their best C instead of refitting on
    # all data.
    reg = LogisticRegressionCV(cv=5, random_state=0, Cs=C_values, n_jobs=6,
                               penalty='l1', solver='liblinear',
                               refit=False).fit(X, y)
    # The .fit(X, y) above is only needed for the coefficient report
    # (reg.coef_); cross_val_score clones and refits the estimator itself.

    # Outer 5-fold CV: accuracy of the tuned model on held-out folds.
    # NOTE(review): both groups have 959 rows in the recorded run; if they are
    # the same subjects, a subject-grouped split may be more appropriate --
    # confirm.
    cv_result = cross_val_score(reg, X, y, cv=5)
    print('cross validaton result, mean %3f, std: %3f' % (cv_result.mean(), cv_result.std()))
    result_accuracy.append(cv_result)
    result_coefs.append(reg.coef_.reshape(-1))

print('finished')


hcp_out03_network_analysis_0back/betweenness.csv
959
959
cross validaton result, mean 0.725766, std: 0.009815
hcp_out03_network_analysis_0back/closeness.csv
cross validaton result, mean 0.784689, std: 0.024057
hcp_out03_network_analysis_0back/clustering.csv
cross validaton result, mean 0.713272, std: 0.024997
hcp_out03_network_analysis_0back/degree.csv
cross validaton result, mean 0.789383, std: 0.025981
hcp_out03_network_analysis_0back/eigenvector.csv
cross validaton result, mean 0.789903, std: 0.026417
finished


In [14]:
## save results:
# Create the output folder (including missing parents); no error if it exists.
os.makedirs(output_dir, exist_ok=True)

# Row labels must follow the order in which the metrics were processed.
# The per-metric files are iterated in sorted path order, so the labels are
# sorted as well; using the unsorted `metrics` list attached results to the
# wrong metric names.
row_labels = sorted(metrics)

# Outer-CV accuracies: one row per metric, one column per fold.
res = pd.DataFrame(result_accuracy)
res.columns = ['cv' + str(i) for i in range(1, res.shape[1] + 1)]
res.index = row_labels  # raises ValueError if the number of rows does not match

res.to_csv(os.path.join(output_dir, 'lasso_accuracy.csv'))

# Model coefficients: one row per metric, one column per feature.
# The column count is taken from the data instead of a hard-coded 269.
res = pd.DataFrame(result_coefs)
res.columns = ['roi' + str(i) for i in range(1, res.shape[1] + 1)]
res.index = row_labels

res.to_csv(os.path.join(output_dir, 'lasso_coefs.csv'))


In [13]:
# Display the most recently built results frame.
# NOTE(review): the captured output below has default integer row/column
# labels, so it appears to predate the labelling done in the save cell;
# re-run the notebook top to bottom to refresh it.
res

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,259,260,261,262,263,264,265,266,267,268
0,0.000528,0.007546,0.0,-0.004095,0.0,0.0,-0.00519,0.190783,0.15221,0.078381,...,0.034893,-0.002938,0.029204,-0.023024,-0.04113,-0.015454,0.0,0.032714,-0.024512,-0.030697
1,0.007451,-0.007978,0.002087,-0.20645,-0.002478,-0.121003,-0.059927,0.278804,-0.023949,0.22858,...,0.055402,0.038664,0.007425,-0.011055,0.0,0.004568,0.0,0.007969,0.0,-0.013065
2,0.015782,-0.033359,0.014451,-0.215419,-0.025694,-0.140509,0.026181,0.05249,-0.141371,0.121601,...,-0.113948,0.057136,-0.204887,0.012777,0.203662,0.041456,-0.020762,-0.153129,0.037519,-0.062587
3,0.00705,-0.006586,0.000618,-0.198531,0.0,-0.104031,-0.05052,0.2591,-0.016703,0.211326,...,0.047591,0.027337,0.007868,0.0,0.0,0.004946,0.0,0.004784,0.0,-0.009839
4,0.000484,-0.001462,0.0,-0.260215,0.0,-0.056568,-0.031372,0.236874,-0.013492,0.169946,...,0.011312,0.00938,0.002977,-0.000444,0.0,0.01407,0.0,0.00128,0.0,-0.026661
