In [27]:
###################### Importing Packages #########################################
import os

import numpy as np
import pickle, random, time
from sklearn import svm
from itertools import chain, combinations
from copy import deepcopy
from sklearn.decomposition import PCA
import pandas as pd
import nibabel as nib

from datautility import *

from scipy.stats import kurtosis, moment
from tqdm import tqdm
from scipy.io import savemat

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
prefrontal_file = './data/masks/Bilateral_prefrontalWM.nii'
prefrontal_array = ((nib.load(prefrontal_file)).get_fdata()).astype(np.float32)

## Defining Global Variable
* 12 channels of metrics to use: [ad, ak, awf, eas_De_par, eas_De_perp, eas_tort, FA, ias_Da, md, mk, rd, rk]
* 5 region mask: [1_L_thal, 2_R_thal, CC_Body_mask, CC_Genu_mask, CC_Splenium_mask]
* 67 positive index and 50 negative index in 117 new subjects 
* 27 positive index and 22 negative index in 49 old subjects

In [1]:
###################### Importing Packages #########################################
metric = ['ad', 'ak', 'awf', 'eas_De_par', 'eas_De_perp', 'eas_tort', 'FA', 'ias_Da', 'md', 'mk', 'rd', 'rk']
# 12 metrics in total
mask_name = ['1_L_thal','2_R_thal','CC_Body_mask','CC_Genu_mask','CC_Splenium_mask']
# 5 Regions in total
stats = ['mean', 'std', '2m', '3m', '4m', 'kut']

positive_idx = np.arange(73) + 1
positive_idx = np.delete(positive_idx, [8, 15, 16, 17, 29, 33])

# should be 67 postive in total

negative_idx = np.arange(53) + 1
negative_idx = np.delete(negative_idx, [14, 15, 16])

# should be 50 negative in total

OLD_PATH = './data/old_65subj_stats JFR.xlsx'

old_data = pd.read_excel(OLD_PATH, header=0, index_col=0, sheet_name=0)
old_idx = list(old_data.index)

NameError: name 'np' is not defined

## Generating sample image and clown mask
* using 5 region mask: [1_L_thal, 2_R_thal, CC_Body_mask, CC_Genu_mask, CC_Splenium_mask]

In [30]:
mask = get_mask(mask_name)
mask = (mask != 0).astype(np.float32)

In [None]:
mask_one = np.zeros([182, 218, 182])

for i, m in enumerate(mask):
    mask_one += (i+1)*m 

mask_one = mask_one + 6*prefrontal_array

data = get_image(1, metric, positive=True)

# savenii(data[0], 'ad_TBI001')
# savenii(mask_one, '6_mask')

## Generating 67 positive subjects among 117 new subjects
* each subject has a feature vector of shape (360,)

In [32]:
# Generating positive data

data_positive = []
key_positive = []

for p in tqdm(positive_idx):

    data = get_image(p, metric, positive=True)
    temp = np.zeros([len(metric), len(mask), 6])

    for i, m in enumerate(data): # for metric in data
        for j, k in enumerate(mask): # for mask in mask
            mask_data = (m*k).reshape(-1)
            mask_data = mask_data[mask_data != 0]

            temp[i,j,0] = np.mean(mask_data)
            temp[i,j,1] = np.std(mask_data)
            temp[i,j,2] = moment(mask_data, moment=2)
            temp[i,j,3] = moment(mask_data, moment=3)
            temp[i,j,4] = moment(mask_data, moment=4)
            temp[i,j,5] = kurtosis(mask_data)
    temp = temp.reshape(-1) # a C style flattening, statics * mask * metric
    data_positive.append(temp)
    
    key = 'TBI-' + '{:03}'.format(p)
    key_positive.append(key)
    
print('loading {} positive subjects'.format(len(data_positive)))

100%|██████████████████████████████████████████████████████████████████████████████████| 67/67 [09:56<00:00,  9.23s/it]


loading 67 positive subjects


## Generating 50 positive subjects among 117 new subjects
* each subject has a feature vector of shape (360,)

In [33]:
data_negative = []
key_negative = []

for n in tqdm(negative_idx):

    data = get_image(n, metric, positive=False)
    temp = np.zeros([len(metric), len(mask), 6])

    for i, m in enumerate(data): # for metric in data
        for j, k in enumerate(mask): # for mask in mask
            mask_data = (m*k).reshape(-1)
            mask_data = mask_data[mask_data != 0]

            temp[i,j,0] = np.mean(mask_data)
            temp[i,j,1] = np.std(mask_data)
            temp[i,j,2] = moment(mask_data, moment=2)
            temp[i,j,3] = moment(mask_data, moment=3)
            temp[i,j,4] = moment(mask_data, moment=4)
            temp[i,j,5] = kurtosis(mask_data)
    temp = temp.reshape(-1) # a C style flattening, statics * mask * metric
    data_negative.append(temp)
    key = 'TBN-' + '{:03}'.format(n)
    key_negative.append(key)
    
print('loading {} negative subjects'.format(len(data_negative)))

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [07:13<00:00, 10.41s/it]


loading 50 negative subjects


## Generating 49 subjects among 67 old images
* Other 18 subjects are take MRI second time, ignore them for this dataset

In [42]:
data_old = []
key_old = []

for o, key in enumerate(tqdm(old_idx)):
    data = get_old_image(o, metric)
    temp = np.zeros([len(metric), len(mask), 6])

    for i, m in enumerate(data): # for metric in data
        for j, k in enumerate(mask): # for mask in mask
            mask_data = (m*k).reshape(-1)
            mask_data = mask_data[mask_data != 0]

            temp[i,j,0] = np.mean(mask_data)
            temp[i,j,1] = np.std(mask_data)
            temp[i,j,2] = moment(mask_data, moment=2)
            temp[i,j,3] = moment(mask_data, moment=3)
            temp[i,j,4] = moment(mask_data, moment=4)
            temp[i,j,5] = kurtosis(mask_data)
    temp = temp.reshape(-1) # a C style flattening, statics * mask * metric
    data_old.append(temp)
    key_old.append(key)
    
print('loading {} old subjects'.format(len(data_old)))



  0%|                                                                                           | 0/49 [00:00<?, ?it/s]

  2%|█▋                                                                                 | 1/49 [00:07<05:50,  7.29s/it]

  4%|███▍                                                                               | 2/49 [00:14<05:40,  7.25s/it]

  6%|█████                                                                              | 3/49 [00:21<05:34,  7.28s/it]

  8%|██████▊                                                                            | 4/49 [00:28<05:26,  7.25s/it]

 10%|████████▍                                                                          | 5/49 [00:36<05:16,  7.19s/it]

 12%|██████████▏                                                                        | 6/49 [00:43<05:10,  7.23s/it]

 14%|███████████▊                                                                       | 7/49 [00:50<05:03,  7.23s/it]

 16%|█████████████▌           

loading 49 old subjects


## Binding data
* Sequence: old subjects -> positive new subjects -> negative new subjects
* Generate column names as statistic * mask * metric

In [43]:
data_all = []
key_all = []

data_all = data_old + data_positive + data_negative
key_all = key_old + key_positive + key_negative

print(len(data_all))

166


In [44]:
import itertools

pre_column = ['-'.join(i) for i in list(itertools.product(mask_name, metric))]
index_column = ['-'.join(i) for i in list(itertools.product(stats, pre_column))]


In [57]:
data_dict = pd.DataFrame(data=data_all, index=key_all, columns=index_column)
print(data_dict.index, len(key_all), len(index_column))
####################### Save dataframe to Excel #############################################

SAVE_PATH = './data/stats_data_gather.xlsx'
if not os.path.isfile(SAVE_PATH):
    data_dict.to_excel(SAVE_PATH)
    pass
else:
    print('file {} exists'.format(SAVE_PATH))

Index(['HT102', 'HT103', 'HT105', 'HT106', 'HT107', 'HT109', 'HT111', 'HT112',
       'HT113', 'HT114',
       ...
       'TBN-044', 'TBN-045', 'TBN-046', 'TBN-047', 'TBN-048', 'TBN-049',
       'TBN-050', 'TBN-051', 'TBN-052', 'TBN-053'],
      dtype='object', length=166) 166 360


ValueError: Cannot convert 'HT102' to Excel

In [48]:
LABEL_PATH = './data/merged_np.xlsx'
label_data = pd.read_excel(LABEL_PATH, index_col=1, sheet_name=0)
print(label_data.index, label_data.columns)

# DATA_PATH = './data/stats_data_gather.xlsx'
# stats_data = pd.read_excel(DATA_PATH, index_col=0, sheet_name=0)
# print(stats_data.index, stats_data.columns)

Index(['HT102', 'HT103', 'HT104', 'HT105', 'HT106', 'HT107', 'HT108', 'HT109',
       'HT110', 'HT111',
       ...
       'TBN-043', 'TBN-044', 'TBN-045', 'TBN-046', 'TBN-047', 'TBN-049',
       'TBN-050', 'TBN-051', 'TBN-052', 'TBN-053'],
      dtype='object', name='Subject', length=172) Index(['Unnamed: 0', 'T1 Letter Number', 'T2 Letter Number',
       'T3 Letter Number', 'Digit Span Forward T1', 'Digit Span Forward T2',
       'Digit Span Forward T3', 'Digit Span Backward T1',
       'Digit Span Backward T2', 'Digit Span Backward T3', 'Trailmaking B T1',
       'Trailmaking B T2', 'Trailmaking B T3', 'Trailmaking A T1',
       'Trailmaking A T2', 'Trailmaking A T3', 'CVLT T1', 'CVLT T2', 'CVLT T3',
       'RCFT Immediate T1', 'RCFT Immediate T2', 'RCFT Immediate T3',
       'RCFT Delayed T1', 'RCFT Delayed T2', 'RCFT Delayed T3', 'DKEFS T1',
       'DKEFS T2', 'DKEFS T3', 'Stroop T1', 'Stroop T2', 'Stroop T3',
       'SDMT T1', 'SDMT T2', 'SDMT T3'],
      dtype='object')


In [51]:
all_data = pd.concat([data_dict, label_data], axis=1, join='inner')
print(all_data)

         mean-1_L_thal-ad  mean-1_L_thal-ak  mean-1_L_thal-awf  \
HT102            1.146115          0.187892           0.035304   
HT103            1.191020          0.269101           0.072415   
HT105            1.291676          0.362108           0.131123   
HT106            1.206369          0.291270           0.084838   
HT107            1.149255          0.184552           0.034059   
HT109            1.186361          0.219010           0.047965   
HT111            1.391104          0.571680           0.326818   
HT112            1.155803          0.183605           0.033711   
HT113            1.230235          0.443477           0.196672   
HT114            1.163863          0.245453           0.060247   
HT115            1.600985          0.965549           0.932285   
HT116            1.301578          0.498871           0.248872   
HT117            1.204352          0.329715           0.108712   
HT118            1.243044          0.458843           0.210537   
HT119     

[154 rows x 394 columns]


In [52]:
reduce_data = all_data.iloc[:,0:360]
print(reduce_data.columns, reduce_data.index)

reduce_label = all_data.iloc[:,360:]
print(reduce_label.columns, reduce_label.index)

Index(['mean-1_L_thal-ad', 'mean-1_L_thal-ak', 'mean-1_L_thal-awf',
       'mean-1_L_thal-eas_De_par', 'mean-1_L_thal-eas_De_perp',
       'mean-1_L_thal-eas_tort', 'mean-1_L_thal-FA', 'mean-1_L_thal-ias_Da',
       'mean-1_L_thal-md', 'mean-1_L_thal-mk',
       ...
       'kut-CC_Splenium_mask-awf', 'kut-CC_Splenium_mask-eas_De_par',
       'kut-CC_Splenium_mask-eas_De_perp', 'kut-CC_Splenium_mask-eas_tort',
       'kut-CC_Splenium_mask-FA', 'kut-CC_Splenium_mask-ias_Da',
       'kut-CC_Splenium_mask-md', 'kut-CC_Splenium_mask-mk',
       'kut-CC_Splenium_mask-rd', 'kut-CC_Splenium_mask-rk'],
      dtype='object', length=360) Index(['HT102', 'HT103', 'HT105', 'HT106', 'HT107', 'HT109', 'HT111', 'HT112',
       'HT113', 'HT114',
       ...
       'TBN-043', 'TBN-044', 'TBN-045', 'TBN-046', 'TBN-047', 'TBN-049',
       'TBN-050', 'TBN-051', 'TBN-052', 'TBN-053'],
      dtype='object', length=154)
Index(['Unnamed: 0', 'T1 Letter Number', 'T2 Letter Number',
       'T3 Letter Number', 'Di

In [54]:
if not os.path.isfile('data_154.xlsx'):
    reduce_data.to_excel('data_154.xlsx')
    print('new data file generated')
else:
    print('file exist, check the disk')
    
if not os.path.isfile('label_154.xlsx'):
    reduce_label.to_excel('label_154.xlsx')
    print('new data file generated')
else:
    print('file exist, check the disk')

file exist, check the disk
file exist, check the disk
