In [1]:
import numpy as np
import h5py as h5
import pandas as pd
import re

combined_path = '/projects/jp/adni-autoencoder/combined.h5'
cortical_path = '/projects/nikhil/ADNI_prediction/input_datasets/CT/scans_AAL.csv'
clinical_path = '/projects/francisco/data/ADNI/ADNI_Merge_filter.csv'
fuse_path = '/projects/nikhil/miccai/input_data_comb/data_t300_adcn.h5'
fused_segmentations_path = '/projects/francisco/data/ADNI/ordered_fused_ad_cn_mci_{}.h5'

# ID patterns
id_participant = re.compile(r"""
 (?<=ADNI_)      # Match the first string after ADNI_
 (.*?)          # Lazy quantifier so it only grabs the first immediate match.
 (?=_MR)        # End at the _MR
""", re.VERBOSE)

id_image = re.compile('(?<=S)\d+_(.*?)(?=_)')

In [2]:
# Concatenate all the filenames
filenames = set([])
combined = h5.File(combined_path, 'r')

for split in ['train', 'valid', 'test']:
    filenames = filenames.union(combined['l_{}_files'.format(split)])
print len(filenames)

IOError: Unable to open file (Unable to open file: name = '/projects/jp/adni-autoencoder/combined.h5', errno = 2, error message = 'no such file or directory', flags = 0, o_flags = 0)

In [70]:
# part > img id lookup
participants = {}
for f in filenames:
    try:
        id = re.search(id_participant, f).group(0)
        img = re.search(id_image, f).group(1)
        participants[id] = img
    except:
        print f
print '{} unique mappings found'.format(len(participants.items()))

698 unique mappings found


In [71]:
# img id > part lookup
images = {}
for f in filenames:
    try:
        id = re.search(id_participant, f).group(0)
        img = re.search(id_image, f).group(1)
        images[img] = id
    except:
        print f
print '{} unique mappings found'.format(len(images.items()))

698 unique mappings found


In [72]:
from pandas import Series

# Load clinical and cortical datasets
clinical = pd.read_csv(clinical_path)
cortical = pd.read_csv(cortical_path)

# Filter ADNI1 Baseline subjects 
baseline_adni_1 = clinical[(clinical.ORIGPROT =='ADNI1') & (clinical.COLPROT=='ADNI1') & (clinical.VISCODE == 'bl')]

# Filter for subjects whom we have CT measurements for
baseline_adni_1 = baseline_adni_1.loc[baseline_adni_1['PTID'].isin(participants)]
img_id_col = [participants[id] for id in baseline_adni_1.PTID if id in participants.keys()]

# Add image id to clinical table:
baseline_adni_1.insert(2,'IID', img_id_col)

In [73]:
# Rename the cortical variables
new_cols = ['CT_{}'.format(col) for col in cortical.columns]
cortical.columns = new_cols


In [74]:
from pandas import merge

# Rename ID to IID in cortical df
cortical.rename(columns={'CT_ID':'IID'}, inplace=True)

# Merge these suckas:
merged = merge(baseline_adni_1, cortical, on=['IID'])

# Save
merged.to_csv('/projects/francisco/data/ADNI/ct_clinical.csv')

In [75]:
print merged.columns

Index([u'RID', u'PTID', u'IID', u'VISCODE', u'COLPROT', u'ORIGPROT', u'DX_bl',
       u'AGE', u'PTGENDER', u'ADAS11', u'ADAS13', u'MMSE', u'ADAS11_bl',
       u'ADAS13_bl', u'MMSE_bl', u'CT_REC.L', u'CT_OLF.L', u'CT_ORBsup.L',
       u'CT_ORBsupmed.L', u'CT_ORBmid.L', u'CT_ORBinf.L', u'CT_SFGdor.L',
       u'CT_MFG.L', u'CT_IFGoperc.L', u'CT_IFGtriang.L', u'CT_SFGmed.L',
       u'CT_SMA.L', u'CT_PCL.L', u'CT_PreCG.L', u'CT_ROL.L', u'CT_PoCG.L',
       u'CT_SPG.L', u'CT_IPL.L', u'CT_SMG.L', u'CT_ANG.L', u'CT_PCUN.L',
       u'CT_SOG.L', u'CT_MOG.L', u'CT_IOG.L', u'CT_CAL.L', u'CT_CUN.L',
       u'CT_LING.L', u'CT_FFG.L', u'CT_HES.L', u'CT_STG.L', u'CT_MTG.L',
       u'CT_ITG.L', u'CT_TPOsup.L', u'CT_TPOmid.L', u'CT_ACG.L', u'CT_DCG.L',
       u'CT_PCG.L', u'CT_REC.R', u'CT_OLF.R', u'CT_ORBsup.R',
       u'CT_ORBsupmed.R', u'CT_ORBmid.R', u'CT_ORBinf.R', u'CT_SFGdor.R',
       u'CT_MFG.R', u'CT_IFGoperc.R', u'CT_IFGtriang.R', u'CT_SFGmed.R',
       u'CT_SMA.R', u'CT_PCL.R', u'CT_PreCG.R'

In [76]:
import collections
import scipy.stats as stats
import h5py as h5

input_data = h5.File(fuse_path, 'r')
fuse_ids = {}

# Find the ids for each of the fused splits:
for split in ['train', 'test', 'valid']:
    files = input_data['l_{}_files'.format(split)][:]
    subject_idx=[]

    #find volume indices for each unique subject
    seen = set([])
    for i, f in enumerate(files):
        subject_id = re.search(id_participant, f).group(0)
        if subject_id not in seen:
            subject_idx.append(subject_id)
            seen.add(subject_id)

    # Reduce to unique set of ids:
    fuse_ids[split] = subject_idx
    
total = np.sum([len(ids) for ids in fuse_ids.values()])
print '{} fused ids found in total'.format(total)
input_data.close()

698 fused ids found in total


In [77]:
index = list(fuse_ids['train']) + list(fuse_ids['valid']) + list(fuse_ids['test'])
segmatrix = np.array([],)
splits = []
for split in ['train', 'test', 'valid']:
    print split
    fdata = h5.File(fused_segmentations_path.format(split), 'r')
    r = fdata['r_hc_features_fused'][:]
    l = fdata['l_hc_features_fused'][:]
    l_vol = np.sum(l,axis=1).reshape(-1,1)
    r_vol = np.sum(r,axis=1).reshape(-1,1)
    lr = np.concatenate([l_vol, r_vol,l,r],axis=1)
    segmatrix = np.vstack([segmatrix, lr]) if segmatrix.size else lr
    # Add a column to keep track of which data split each participant is in:
    splits.extend([split for x in range(lr.shape[0])])

hc_col_names = ['L_HC_VOL', 'R_HC_VOL']
hc_col_names.extend(['L_HC_{}'.format(x) for x in range(l.shape[1])])
hc_col_names.extend(['R_HC_{}'.format(x) for x in range(r.shape[1])])

segmentations = pd.DataFrame(segmatrix, columns=hc_col_names)
segmentations.insert(0,'PTID', index)
segmentations.insert(1,'SPLIT', splits)


train
test
valid


In [82]:
print segmentations.columns
print np.sum(segmentations['L_HC_VOL'])
print np.sum(segmentations['R_HC_VOL'])

Index([u'PTID', u'SPLIT', u'L_HC_VOL', u'R_HC_VOL', u'L_HC_0', u'L_HC_1',
       u'L_HC_2', u'L_HC_3', u'L_HC_4', u'L_HC_5', 
       ...
       u'R_HC_10509', u'R_HC_10510', u'R_HC_10511', u'R_HC_10512',
       u'R_HC_10513', u'R_HC_10514', u'R_HC_10515', u'R_HC_10516',
       u'R_HC_10517', u'R_HC_10518'],
      dtype='object', length=21950)
1550702.0
1527313.0


In [79]:
# Merge these to the other guy
# Merge these suckas:        bsel = SelectKBest(f_regression, k=k)

combined = merge(merged, segmentations, on=['PTID'], sort=False)

# Move the SPLIT column to the somewhere near the front:
split_col = combined['SPLIT']
combined.drop(labels=['SPLIT'], axis=1, inplace = True)
combined.insert(3, 'SPLIT', split_col)

# Save
combined.to_pickle('/projects/francisco/data/ADNI/master_fused.pkl')
combined.to_csv('/projects/francisco/data/ADNI/master_fused.csv')

In [80]:
useful_vars = '^DX_bl|^ADAS|^MMSE|^CT_|^L_HC|^R_HC'

# Extract all the useful features and save to training, validation, and test files
for split in ['train', 'test', 'valid']:
    current_split = combined[(combined.SPLIT ==  split)]
    useful = current_split.filter(regex=useful_vars)
    c = useful['DX_bl']
    c[c=='AD'] = 0
    c[c=='CN'] = 1
    c[c=='LMCI'] = 2
    useful.to_pickle('/projects/francisco/data/ADNI/cli_ct_seg_fused_{}.pkl'.format(split))
    print useful.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(490, 22029)
(110, 22029)
(98, 22029)


In [81]:
print combined.shape
print combined.columns
print combined.loc[0]

(698, 22038)
Index([u'RID', u'PTID', u'IID', u'SPLIT', u'VISCODE', u'COLPROT', u'ORIGPROT',
       u'DX_bl', u'AGE', u'PTGENDER', 
       ...
       u'R_HC_10509', u'R_HC_10510', u'R_HC_10511', u'R_HC_10512',
       u'R_HC_10513', u'R_HC_10514', u'R_HC_10515', u'R_HC_10516',
       u'R_HC_10517', u'R_HC_10518'],
      dtype='object', length=22038)
RID                      295
PTID              002_S_0295
IID                  I118671
SPLIT                  train
VISCODE                   bl
COLPROT                ADNI1
ORIGPROT               ADNI1
DX_bl                     CN
AGE                     84.8
PTGENDER                Male
ADAS11                     3
ADAS13                     4
MMSE                      28
ADAS11_bl                  3
ADAS13_bl                  4
MMSE_bl                   28
CT_REC.L            3.307916
CT_OLF.L            3.399054
CT_ORBsup.L          3.49189
CT_ORBsupmed.L      3.596402
CT_ORBmid.L         3.564972
CT_ORBinf.L         3.547402
CT_SFGdor.L 

In [7]:
import pandas as pd
from IPython.display import display

combined = pd.read_pickle('/projects/francisco/data/ADNI/master_fused.pkl')
display(combined)

In [17]:
UID = combined['PTID'] + '_' + combined['IID']
combined.insert(0, 'UID', UID)

0      002_S_0295_I118671
1      002_S_0559_I118676
2      002_S_0619_I118678
3      002_S_0685_I118680
4      002_S_0729_I118682
5      002_S_0782_I118669
6      002_S_0816_I118984
7      002_S_0938_I118685
8      002_S_0954_I118688
9      002_S_0955_I118689
10      002_S_1018_I40817
11      002_S_1155_I40845
12      002_S_1261_I62377
13      002_S_1268_I64037
14      002_S_1280_I60056
15      003_S_0907_I52781
16      003_S_0908_I62589
17      003_S_0981_I52776
18      003_S_1021_I73506
19      003_S_1059_I52816
20      003_S_1074_I53395
21      003_S_1122_I52799
22      005_S_0221_I72128
23      005_S_0222_I54686
24      005_S_0324_I32891
25      005_S_0448_I32875
26      005_S_0546_I32681
27      005_S_0553_I32644
28      005_S_0572_I32653
29      005_S_0602_I32672
              ...        
668     141_S_0696_I82738
669     141_S_0697_I91235
670     141_S_0717_I98888
671     141_S_0726_I94824
672     141_S_0767_I47306
673     141_S_0790_I91253
674     141_S_0810_I47314
675     141_

In [19]:
display(combined)
# Save
combined.to_pickle('/projects/francisco/data/ADNI/master_fused.pkl')
combined.to_csv('/projects/francisco/data/ADNI/master_fused.csv')

Unnamed: 0,UID,RID,PTID,IID,SPLIT,VISCODE,COLPROT,ORIGPROT,DX_bl,AGE,...,R_HC_10509,R_HC_10510,R_HC_10511,R_HC_10512,R_HC_10513,R_HC_10514,R_HC_10515,R_HC_10516,R_HC_10517,R_HC_10518
0,002_S_0295_I118671,295,002_S_0295,I118671,train,bl,ADNI1,ADNI1,CN,84.8,...,0,0,0,0,0,0,0,0,0,0
1,002_S_0559_I118676,559,002_S_0559,I118676,train,bl,ADNI1,ADNI1,CN,79.3,...,0,0,0,0,0,0,0,0,0,0
2,002_S_0619_I118678,619,002_S_0619,I118678,test,bl,ADNI1,ADNI1,AD,77.5,...,0,0,0,0,0,0,0,0,0,0
3,002_S_0685_I118680,685,002_S_0685,I118680,valid,bl,ADNI1,ADNI1,CN,89.6,...,0,0,0,0,0,0,0,0,0,0
4,002_S_0729_I118682,729,002_S_0729,I118682,train,bl,ADNI1,ADNI1,LMCI,65.1,...,0,0,0,0,0,0,0,0,0,0
5,002_S_0782_I118669,782,002_S_0782,I118669,train,bl,ADNI1,ADNI1,LMCI,81.6,...,0,0,0,0,0,0,0,0,0,0
6,002_S_0816_I118984,816,002_S_0816,I118984,test,bl,ADNI1,ADNI1,AD,70.8,...,0,0,0,0,0,0,0,0,0,0
7,002_S_0938_I118685,938,002_S_0938,I118685,test,bl,ADNI1,ADNI1,AD,82.2,...,0,0,0,0,0,0,0,0,0,0
8,002_S_0954_I118688,954,002_S_0954,I118688,train,bl,ADNI1,ADNI1,LMCI,69.3,...,0,0,0,0,0,0,0,0,0,0
9,002_S_0955_I118689,955,002_S_0955,I118689,train,bl,ADNI1,ADNI1,AD,78.2,...,0,0,0,0,0,0,0,0,0,0
