In [18]:
## plot histogram across time points, compute entropy, 
## and plot histogram of entropy across subjects.
## xin Apr 11 2022.

import glob
import pandas as pd
import numpy as np
import re
import os
import matplotlib.pyplot as plt

## hcp data:
# input_harmonics_wavelets = 'hcp_out02_harmonics_100'
# input_time_signal = '/home/xin/Downloads/FullData_Oct26/Scan1/'
# output_dir = 'hcp_out03_spectrum_power'

## adni data:
input_dir = 'ocd_out03_spectrum_power_hc'
input_dir2 = 'ocd_out03_spectrum_power_ocd'

output_dir = 'ocd_out04_power_histogram_and_entropy'
outfile_name_header = '/out04_entropy'


if not os.path.isdir(output_dir):
    os.mkdir(output_dir)
    
spectrum_files = glob.glob(input_dir + '/spectrum_subj*_roi_*.csv')
spectrum_files.sort()

spectrum_files2 = glob.glob(input_dir2 + '/spectrum_subj*_roi_*.csv')
spectrum_files2.sort()


In [19]:
len(spectrum_files)

5220

In [20]:
num_hc = len(spectrum_files)//116
num_ocd = len(spectrum_files2)//116

subject_info = pd.DataFrame({'group': ['hc'] * num_hc + ['ocd'] * num_ocd})
subject_info

spectrum_files += spectrum_files2

In [21]:
subject_info

Unnamed: 0,group
0,hc
1,hc
2,hc
3,hc
4,hc
...,...
75,ocd
76,ocd
77,ocd
78,ocd


In [22]:
from math import log, e

def entropy2(labels, base=None):
    """ Computes entropy of label distribution. """

    n_labels = len(labels)

    if n_labels <= 1:
        return 0

    value,counts = np.unique(labels, return_counts=True)
    probs = counts / n_labels
    n_classes = np.count_nonzero(probs)

    if n_classes <= 1:
        return 0

    ent = 0.

    # Compute entropy
    base = e if base is None else base
    for i in probs:
        ent -= i * log(i, base)

    return ent

nan_col = []


num_roi = 116
num_subjects = len(spectrum_files)//num_roi
entropy_list = [[None] * num_roi for _ in range(num_subjects)]

subj_id = -1
for file in spectrum_files:
    ## each file is a matrix (subjects by brain region)
    ## the values are inner product of harmonics and time course signal.
    
    print(file)
    #subj_id = re.search('(.*)spectrum_subj(.*)_roi_(.*).csv', file).group(2)
    roi = re.search('(.*)spectrum_subj(.*)_roi_(.*).csv', file).group(3)
    if int(roi)==0:
        subj_id += 1
    
    # get OCD diagnosis:
    diagnosis = subject_info.iloc[int(subj_id),:]['group']
    
    spectrum = pd.read_csv(file, header = None, sep = ',')
    num_roi = spectrum.shape[1]
    
    result = []
    for col in spectrum:
        
        if spectrum[col].isna().any():
            
            print('nan column found for: ', col)
            #spectrum[col][spectrum[col].isna()] = 0
            nan_col.append(col)
            result.append(np.nan)
            #entropy = np.nan
            continue
 
        # compute entropy:
        bins = np.linspace(min(spectrum[col]), max(spectrum[col]), num = 50)
        discretized_signal = np.digitize(spectrum[col], bins)
        entropy = entropy2(discretized_signal)
        result.append(entropy)
        
        # histogram:
        ## plot figure on every n frequency, subject and brain region :
        if col % 3 == 0 and int(subj_id) % 10 == 0 and int(roi) % 20 == 0:
            plt.hist(spectrum[col], bins = 30)
            figure_name = file.replace('.csv', '_freq_{:03}'.format(col) + diagnosis + '.png')
            
            # fix a bug the ocd subjects are not saved in correct file.
            if diagnosis == 'hc':
                figure_name = figure_name.replace(input_dir, output_dir)
            else:
                figure_name = figure_name.replace(input_dir2, output_dir)
                
            plt.savefig(figure_name)
            plt.clf()
    
        # break
    entropy_list[int(subj_id)][int(roi)] = result
    # outfile_name = outfile_name_header + '_subj_' + subj_id + '_roi_' + roi + '.csv'
    # np.savetxt(output_dir + outfile_name, np.array(result), delimiter = ',')
    
    # break
    
print('finished!')

ocd_out03_spectrum_power_hc/spectrum_subj000_roi_000.csv
ocd_out03_spectrum_power_hc/spectrum_subj000_roi_001.csv
ocd_out03_spectrum_power_hc/spectrum_subj000_roi_002.csv
ocd_out03_spectrum_power_hc/spectrum_subj000_roi_003.csv
ocd_out03_spectrum_power_hc/spectrum_subj000_roi_004.csv
ocd_out03_spectrum_power_hc/spectrum_subj000_roi_005.csv
ocd_out03_spectrum_power_hc/spectrum_subj000_roi_006.csv
ocd_out03_spectrum_power_hc/spectrum_subj000_roi_007.csv
ocd_out03_spectrum_power_hc/spectrum_subj000_roi_008.csv
ocd_out03_spectrum_power_hc/spectrum_subj000_roi_009.csv
ocd_out03_spectrum_power_hc/spectrum_subj000_roi_010.csv
ocd_out03_spectrum_power_hc/spectrum_subj000_roi_011.csv
ocd_out03_spectrum_power_hc/spectrum_subj000_roi_012.csv
ocd_out03_spectrum_power_hc/spectrum_subj000_roi_013.csv
ocd_out03_spectrum_power_hc/spectrum_subj000_roi_014.csv
ocd_out03_spectrum_power_hc/spectrum_subj000_roi_015.csv
ocd_out03_spectrum_power_hc/spectrum_subj000_roi_016.csv
ocd_out03_spectrum_power_hc/spe

<Figure size 432x288 with 0 Axes>

In [16]:
## save result as subject by roi .csv files.

num_subject = len(entropy_list)
num_roi = len(entropy_list[0])
num_freq = len(entropy_list[0][0]) 

for i in range(num_freq):
    res = [[np.nan] * num_roi for _ in range(num_subject)]
    for subj in range(num_subject):
        for roi in range(num_roi):
            if entropy_list[subj][roi] and entropy_list[subj][roi][i]:
                res[subj][roi] = entropy_list[subj][roi][i]
                df = pd.DataFrame(data = res)
                df = pd.concat([subject_info, df], axis = 1)
                
    outfile_name = outfile_name_header + '_freq_{:02}'.format(i) + '.csv'
    # np.savetxt(output_dir + outfile_name, np.array(df), delimiter = ',')
    df.to_csv(output_dir + outfile_name)
    
print('finished!')  

finished!
