In [1]:
###################### Importing Packages #########################################
import os

import numpy as np
import pickle, random, time
from sklearn import svm
from itertools import chain, combinations
from copy import deepcopy
from sklearn.decomposition import PCA
import pandas as pd
import nibabel as nib

from datautility import *

from scipy.stats import kurtosis, moment, skew, entropy
from tqdm import tqdm
from scipy.io import savemat, loadmat

%matplotlib inline
%load_ext autoreload
%autoreload 2

## Read and Save prefrontal mask
* compress nii to mat
* separate left prefrontal and right prefrontal area

In [None]:
# Split the bilateral prefrontal mask into two hemisphere masks and store each
# one as a compressed .mat file under the key 'vol'.
prefrontal_file = './data/masks/Bilateral_prefrontalWM.nii'
prefrontal_array = nib.load(prefrontal_file).get_fdata().astype(np.float32)

# Slices 0..91 along the first axis are copied into the "r" mask and slices
# 92..end into the "l" mask; everything else in each output stays zero.
# NOTE(review): the split position assumes the midline of the volume sits at
# index 91/92 of the first axis -- confirm against the mask's actual grid.
split = 92
prefrontal_l = np.zeros(prefrontal_array.shape)
prefrontal_r = np.zeros(prefrontal_array.shape)
prefrontal_l[split:] = prefrontal_array[split:]
prefrontal_r[:split] = prefrontal_array[:split]

prefrontal_l_dict = {'vol': prefrontal_l}
prefrontal_r_dict = {'vol': prefrontal_r}

savemat('./data/masks/L_Pref.mat', prefrontal_l_dict, do_compression=True)
savemat('./data/masks/R_Pref.mat', prefrontal_r_dict, do_compression=True)

## Defining Global Variables
* 12 metric channels are available: [ad, ak, awf, eas_De_par, eas_De_perp, eas_tort, FA, ias_Da, md, mk, rd, rk]; 8 of them are used: [ak, awf, eas_De_par, eas_De_perp, FA, ias_Da, md, mk]
* 7 region masks: [1_L_thal, 2_R_thal, CC_Body_mask, CC_Genu_mask, CC_Splenium_mask, L_Pref, R_Pref]
* 67 positive and 50 negative indices among the 117 new subjects
* 27 positive and 22 negative indices among the 49 old subjects

In [2]:
###################### Defining global variables ##################################
# All 12 available metrics, kept for reference:
# metric = ['ad', 'ak', 'awf', 'eas_De_par', 'eas_De_perp', 'eas_tort', 'FA', 'ias_Da', 'md', 'mk', 'rd', 'rk']

# The 8 metrics actually used for feature extraction.
metric = ['ak', 'awf', 'eas_De_par', 'eas_De_perp', 'FA', 'ias_Da', 'md', 'mk']

# 7 region masks in total.
mask_name = ['1_L_thal','2_R_thal','CC_Body_mask','CC_Genu_mask','CC_Splenium_mask', 'L_Pref', 'R_Pref']

# Positive subjects are numbered 1..73. np.delete removes entries by
# *position*, so position 8 drops subject number 9, position 15 drops
# subject number 16, and so on.
# NOTE(review): confirm the list holds 0-based positions, not subject numbers.
positive_idx = np.delete(np.arange(1, 74), [8, 15, 16, 17, 29, 33])
assert len(positive_idx) == 67, 'expected 67 positive subjects, got {}'.format(len(positive_idx))

# Negative subjects are numbered 1..53, with three positions removed.
negative_idx = np.delete(np.arange(1, 54), [14, 15, 16])
assert len(negative_idx) == 50, 'expected 50 negative subjects, got {}'.format(len(negative_idx))

# Spreadsheet of the old cohort; its row index supplies the subject keys.
OLD_PATH = './data/old_65subj_stats JFR.xlsx'

old_data = pd.read_excel(OLD_PATH, header=0, index_col=0, sheet_name=0)
old_idx = list(old_data.index)

# Image numbers 0..64 with 16 positions removed, leaving 49 old subjects.
old_index = np.delete(
    np.arange(65),
    [1, 4, 6, 8, 10, 12, 14, 16, 21, 23, 26, 29, 33, 35, 37, 43],
)

# Summary statistics computed per (metric, mask) pair, with the short label
# used in the column names. The order here fixes the order of the features.
# NOTE(review): scipy.stats.entropy treats its input as an unnormalized
# probability vector; confirm that is the intended "entropy" feature when it
# is applied to raw voxel values.
_STAT_TABLE = [
    ('mean', np.mean),
    ('std', np.std),
    ('skew', skew),
    ('kurt', kurtosis),
    ('etrp', entropy),
]
stats = [func for _, func in _STAT_TABLE]
stats_str = [label for label, _ in _STAT_TABLE]

## Generating sample image and binary region masks
* using 7 region masks: [1_L_thal, 2_R_thal, CC_Body_mask, CC_Genu_mask, CC_Splenium_mask, L_Pref, R_Pref]

In [3]:
# Load every region listed in mask_name and binarize it:
# non-zero voxels become 1.0, all others 0.0 (float32).
mask = (get_mask(mask_name) != 0).astype(np.float32)

## Save the masks into a single nii file
* Run this cell only the first time; do not re-run it afterwards!
* Writes the 7 masks into one .nii file for visualization

In [None]:
# Merge all region masks into one label volume: each binary mask is scaled by
# its 1-based position in `mask` and the results are summed voxel-wise, so a
# voxel covered by several masks receives the sum of their labels.
mask_one = np.zeros([182, 218, 182])

for label, region in enumerate(mask, start=1):
    mask_one += label * region

# NOTE(review): this sample image is loaded but not used anywhere in this
# cell -- confirm whether the call is still needed.
data = get_image(1, metric, positive=True)

savenii(mask_one, '7_mask')

## Generating 67 positive subjects among 117 new subjects
* each subject has a feature vector of shape (280,) = 8 metrics x 7 masks x 5 statistics

In [4]:
# Build one flat feature vector per positive subject.

data_positive = []
key_positive = []

for subject in tqdm(positive_idx):

    volumes = get_image(subject, metric, positive=True)
    features = np.zeros([len(metric), len(mask), len(stats)])

    for metric_pos, volume in enumerate(volumes):
        for region_pos, region in enumerate(mask):
            # Apply the mask, then keep only the voxels that are non-zero.
            voxels = (volume * region).ravel()
            voxels = voxels[voxels != 0]
            # One value per statistic, stored in the order given by `stats`.
            features[metric_pos, region_pos, :] = [stat(voxels) for stat in stats]

    # C-order flattening: metric varies slowest, statistic fastest.
    data_positive.append(features.ravel())
    key_positive.append('TBI-' + '{:03}'.format(subject))

print('loading {} positive subjects'.format(len(data_positive)))

100%|██████████████████████████████████████████████████████████████████████████████████| 67/67 [04:49<00:00,  4.40s/it]


loading 67 positive subjects


## Generating 50 negative subjects among 117 new subjects
* each subject has a feature vector of shape (280,) = 8 metrics x 7 masks x 5 statistics

In [5]:
# Build one flat feature vector per negative subject.

data_negative = []
key_negative = []

for subject in tqdm(negative_idx):

    volumes = get_image(subject, metric, positive=False)
    features = np.zeros([len(metric), len(mask), len(stats)])

    for metric_pos, volume in enumerate(volumes):
        for region_pos, region in enumerate(mask):
            # Restrict the volume to the region and drop the zero voxels.
            voxels = (volume * region).ravel()
            voxels = voxels[voxels != 0]
            # Fill the statistics axis in the order given by `stats`.
            features[metric_pos, region_pos, :] = [stat(voxels) for stat in stats]

    # C-order flattening: metric varies slowest, statistic fastest.
    data_negative.append(features.ravel())
    key_negative.append('TBN-' + '{:03}'.format(subject))

print('loading {} negative subjects'.format(len(data_negative)))

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [03:55<00:00,  4.60s/it]


loading 50 negative subjects


## Generating 49 subjects among 67 old images
* The other 18 images are repeat (second-time) MRI scans; they are ignored for this dataset

In [6]:
# Build one flat feature vector per old-cohort subject.
data_old = []
key_old = []

# old_idx (subject keys from the spreadsheet) and old_index (image numbers)
# are paired by position.
for position, subject_key in enumerate(tqdm(old_idx)):
    volumes = get_old_image(old_index[position], metric)
    features = np.zeros([len(metric), len(mask), len(stats)])

    for metric_pos, volume in enumerate(volumes):
        for region_pos, region in enumerate(mask):
            # Masked voxels only, with zeros discarded.
            voxels = (volume * region).ravel()
            voxels = voxels[voxels != 0]
            # One value per statistic, in the order given by `stats`.
            features[metric_pos, region_pos, :] = [stat(voxels) for stat in stats]

    # C-order flattening: metric varies slowest, statistic fastest.
    data_old.append(features.ravel())
    key_old.append(subject_key)

print('loading {} old subjects'.format(len(data_old)))

100%|██████████████████████████████████████████████████████████████████████████████████| 49/49 [03:59<00:00,  4.67s/it]


loading 49 old subjects


## Binding data
* Sequence: old subjects -> positive new subjects -> negative new subjects
* Generate column names as metric-mask-statistic (e.g. ak-1_L_thal-mean), matching the flattening order of the feature vectors

In [7]:
# Stack the three cohorts in a fixed order: old, new positive, new negative.
# Feature vectors and keys are combined in the same order, so data_all[i]
# belongs to key_all[i].
data_all = [*data_old, *data_positive, *data_negative]
key_all = [*key_old, *key_positive, *key_negative]

print(len(data_all), data_all[0].shape)

166 (280,)


In [8]:
import itertools

# Column names mirror the C-order flattening of the (metric, mask, statistic)
# feature array: "<metric>-<mask>-<stat>", with the statistic varying fastest.
pre_column = ['-'.join(pair) for pair in itertools.product(mask_name, stats_str)]
index_column = ['-'.join(pair) for pair in itertools.product(metric, pre_column)]

In [9]:
# Assemble the feature table: one row per subject key, one column per feature.
data_dict = pd.DataFrame(data=data_all, index=key_all, columns=index_column)
print(data_dict.index, len(key_all), len(index_column))
####################### Save dataframe to Excel #############################################

# An existing export is never overwritten; the file is written only if absent.
SAVE_PATH = './data/stats_data_gather.xlsx'
if os.path.isfile(SAVE_PATH):
    print('file {} exists'.format(SAVE_PATH))
else:
    data_dict.to_excel(SAVE_PATH)

Index(['HT102', 'HT103', 'HT105', 'HT106', 'HT107', 'HT109', 'HT111', 'HT112',
       'HT113', 'HT114',
       ...
       'TBN-044', 'TBN-045', 'TBN-046', 'TBN-047', 'TBN-048', 'TBN-049',
       'TBN-050', 'TBN-051', 'TBN-052', 'TBN-053'],
      dtype='object', length=166) 166 280


In [10]:
# Label spreadsheet; the sheet's second column (position 1) becomes the row index.
LABEL_PATH = './data/merged_np.xlsx'
label_data = pd.read_excel(LABEL_PATH, sheet_name=0, index_col=1)
print(label_data.index, label_data.columns)

# Read back the feature table from disk, indexed by its first column.
# NOTE(review): the export cell skips writing when the file already exists,
# so this copy may be older than the in-memory data_dict -- confirm.
DATA_PATH = './data/stats_data_gather.xlsx'
stats_data = pd.read_excel(DATA_PATH, sheet_name=0, index_col=0)
print(stats_data.index, stats_data.columns)

Index(['HT102', 'HT103', 'HT104', 'HT105', 'HT106', 'HT107', 'HT108', 'HT109',
       'HT110', 'HT111',
       ...
       'TBN-043', 'TBN-044', 'TBN-045', 'TBN-046', 'TBN-047', 'TBN-049',
       'TBN-050', 'TBN-051', 'TBN-052', 'TBN-053'],
      dtype='object', name='Subject', length=172) Index(['Unnamed: 0', 'T1 Letter Number', 'T2 Letter Number',
       'T3 Letter Number', 'Digit Span Forward T1', 'Digit Span Forward T2',
       'Digit Span Forward T3', 'Digit Span Backward T1',
       'Digit Span Backward T2', 'Digit Span Backward T3', 'Trailmaking B T1',
       'Trailmaking B T2', 'Trailmaking B T3', 'Trailmaking A T1',
       'Trailmaking A T2', 'Trailmaking A T3', 'CVLT T1', 'CVLT T2', 'CVLT T3',
       'RCFT Immediate T1', 'RCFT Immediate T2', 'RCFT Immediate T3',
       'RCFT Delayed T1', 'RCFT Delayed T2', 'RCFT Delayed T3', 'DKEFS T1',
       'DKEFS T2', 'DKEFS T3', 'Stroop T1', 'Stroop T2', 'Stroop T3',
       'SDMT T1', 'SDMT T2', 'SDMT T3'],
      dtype='object')
Index(['H

In [11]:
# Column-wise join of features and labels; join='inner' keeps only the
# subjects whose key is present in both tables.
all_data = pd.concat((data_dict, label_data), join='inner', axis=1)
print(all_data)

         ak-1_L_thal-mean  ak-1_L_thal-std  ak-1_L_thal-skew  \
HT102            0.605042         0.128671         -0.345832   
HT103            0.649086         0.133304         -0.238727   
HT105            0.661849         0.170823         -0.002355   
HT106            0.667329         0.128948         -0.183180   
HT107            0.693921         0.155623          0.443730   
HT109            0.664475         0.132775         -0.378491   
HT111            0.595104         0.156826         -0.451598   
HT112            0.667250         0.145615         -0.094611   
HT113            0.653951         0.155048          0.010153   
HT114            0.681961         0.168137         -0.194994   
HT115            0.625602         0.172083         -0.129125   
HT116            0.642646         0.159711         -0.107693   
HT117            0.618530         0.144971          0.263937   
HT118            0.630059         0.141668         -0.006345   
HT119            0.654763         0.1769

[154 rows x 314 columns]


In [12]:
# Split the joined table back into feature columns and label columns.
# data_dict was the first frame passed to the concat that built all_data, so
# its columns come first and its width is the split point. Deriving it here
# (instead of hard-coding 280) keeps the split correct when the metric, mask
# or statistic lists change.
n_feature_cols = data_dict.shape[1]

reduce_data = all_data.iloc[:, :n_feature_cols]
print(reduce_data.columns, reduce_data.index)

reduce_label = all_data.iloc[:, n_feature_cols:]
print(reduce_label.columns, reduce_label.index)

Index(['ak-1_L_thal-mean', 'ak-1_L_thal-std', 'ak-1_L_thal-skew',
       'ak-1_L_thal-kurt', 'ak-1_L_thal-etrp', 'ak-2_R_thal-mean',
       'ak-2_R_thal-std', 'ak-2_R_thal-skew', 'ak-2_R_thal-kurt',
       'ak-2_R_thal-etrp',
       ...
       'mk-L_Pref-mean', 'mk-L_Pref-std', 'mk-L_Pref-skew', 'mk-L_Pref-kurt',
       'mk-L_Pref-etrp', 'mk-R_Pref-mean', 'mk-R_Pref-std', 'mk-R_Pref-skew',
       'mk-R_Pref-kurt', 'mk-R_Pref-etrp'],
      dtype='object', length=280) Index(['HT102', 'HT103', 'HT105', 'HT106', 'HT107', 'HT109', 'HT111', 'HT112',
       'HT113', 'HT114',
       ...
       'TBN-043', 'TBN-044', 'TBN-045', 'TBN-046', 'TBN-047', 'TBN-049',
       'TBN-050', 'TBN-051', 'TBN-052', 'TBN-053'],
      dtype='object', length=154)
Index(['Unnamed: 0', 'T1 Letter Number', 'T2 Letter Number',
       'T3 Letter Number', 'Digit Span Forward T1', 'Digit Span Forward T2',
       'Digit Span Forward T3', 'Digit Span Backward T1',
       'Digit Span Backward T2', 'Digit Span Backward T3', 

In [13]:
# Export the feature table and then the label table; a file that already
# exists on disk is left untouched and only reported.
for frame, path, created_msg in (
    (reduce_data, 'data_154.xlsx', 'new data file generated'),
    (reduce_label, 'label_154.xlsx', 'new label file generated'),
):
    if os.path.isfile(path):
        print('file exist, check the disk')
    else:
        frame.to_excel(path)
        print(created_msg)

new data file generated
file exist, check the disk
