In [110]:
import numpy as np
import h5py as h5
import pandas as pd
import re

# Input locations used by the cells below.
combined_path = '/projects/jp/adni-autoencoder/combined.h5'
cortical_path = '/projects/nikhil/ADNI_prediction/input_datasets/CT/scans_AAL.csv'
clinical_path = '/projects/francisco/data/ADNI/ADNI_Merge_filter.csv'
fuse_path = '/projects/nikhil/miccai/input_data_comb/data_t300_adcn.h5'
# Template: '{}' is filled with a split name via str.format.
fused_segmentations_path = '/projects/nikhil/miccai/input_data_comb/ad_mci_cn_{}.h5'

# ID patterns
# Participant id: the text between the 'ADNI_' prefix and the first '_MR'.
id_participant = re.compile(r"""
 (?<=ADNI_)      # Match the first string after ADNI_
 (.*?)          # Lazy quantifier so it only grabs the first immediate match.
 (?=_MR)        # End at the _MR
""", re.VERBOSE)

# Image id: after an 'S<digits>_' token, group 1 captures the text up to the
# next underscore. Raw string so that '\d' reaches the regex engine as written
# instead of relying on Python passing an unknown string escape through.
id_image = re.compile(r'(?<=S)\d+_(.*?)(?=_)')

In [87]:
# Concatenate all the filenames
# Gather the distinct entries of the 'l_<split>_files' datasets across the
# three splits of the combined HDF5 file.
filenames = set()

# The context manager closes the HDF5 handle once the names are copied out;
# the original left it open for the rest of the session.
with h5.File(combined_path, 'r') as combined:
    for split in ['train', 'valid', 'test']:
        filenames.update(combined['l_{}_files'.format(split)])

# Single-argument print(...) prints the same text under Python 2 and 3.
print(len(filenames))

49752


In [30]:
# part > img id lookup
# Maps participant id -> image id for every file name that matches both ID
# patterns. Assignment overwrites, so a participant seen with several image
# ids keeps only the last one visited; `filenames` is a set, so which one that
# is depends on set iteration order.
participants = {}
for f in filenames:
    participant_match = id_participant.search(f)
    image_match = id_image.search(f)
    if participant_match is None or image_match is None:
        # Report names that do not follow the expected pattern and skip them.
        # Checking the match explicitly (rather than a bare `except:`) lets
        # unrelated errors surface instead of being swallowed.
        print(f)
        continue
    participants[participant_match.group(0)] = image_match.group(1)

print('{} unique mappings found'.format(len(participants)))

698 unique mappings found


In [88]:
# img id > part lookup
# Maps image id -> participant id for every file name that matches both ID
# patterns. Assignment overwrites, so an image id seen with several
# participant ids keeps only the last one visited.
images = {}
for f in filenames:
    participant_match = id_participant.search(f)
    image_match = id_image.search(f)
    if participant_match is None or image_match is None:
        # Report names that do not follow the expected pattern and skip them.
        # Checking the match explicitly (rather than a bare `except:`) lets
        # unrelated errors surface instead of being swallowed.
        print(f)
        continue
    images[image_match.group(1)] = participant_match.group(0)

print('{} unique mappings found'.format(len(images)))

698 unique mappings found


In [77]:
from pandas import Series

# Load clinical and cortical datasets
clinical = pd.read_csv(clinical_path)
cortical = pd.read_csv(cortical_path)

# Filter ADNI1 Baseline subjects
is_adni1_baseline = (
    (clinical.ORIGPROT == 'ADNI1')
    & (clinical.COLPROT == 'ADNI1')
    & (clinical.VISCODE == 'bl')
)
baseline_adni_1 = clinical[is_adni1_baseline]

# Keep only the subjects that have an entry in the participant -> image id
# lookup (`participants`).
baseline_adni_1 = baseline_adni_1.loc[baseline_adni_1['PTID'].isin(participants)]

# Every remaining PTID is a key of `participants`, so a direct lookup is
# enough; the loop variable no longer shadows the builtin `id`.
img_id_col = [participants[ptid] for ptid in baseline_adni_1['PTID']]

# Add image id to clinical table:
baseline_adni_1.insert(2, 'IID', img_id_col)

In [83]:
from pandas import merge

# Rename cortical's 'ID' column to 'IID' (in place) so it carries the same
# name as the key column inserted into baseline_adni_1.
cortical.rename(columns={'ID': 'IID'}, inplace=True)

# Join the clinical rows to the cortical rows on the shared 'IID' key.
merged = baseline_adni_1.merge(cortical, on=['IID'])

# Write the joined table to disk.
ct_clinical_csv = '/projects/francisco/data/ADNI/ct_clinical.csv'
merged.to_csv(ct_clinical_csv)

In [97]:
import collections
import scipy.stats as stats
import h5py as h5

# For each split of the fused input file, collect the set of participant ids
# parsed from the names stored in its 'l_<split>_files' dataset.
fuse_ids = {}

# The context manager closes the HDF5 handle even if a file name fails to
# parse part-way through.
with h5.File(fuse_path, 'r') as input_data:
    # Find the ids for each of the fused splits:
    for split in ['train', 'test', 'valid']:
        files = input_data['l_{}_files'.format(split)][:]

        # Reduce to unique set of ids. A name that does not match
        # id_participant makes search() return None, so .group raises
        # AttributeError (same as the original loop).
        fuse_ids[split] = set(id_participant.search(f).group(0) for f in files)

# Per-split counts are summed, so an id present in two splits counts twice.
total = sum(len(ids) for ids in fuse_ids.values())
print('{} fused ids found in total'.format(total))

698 fused ids found in total


In [118]:
# Build the segmentation table: one 'PTID' column followed by the left and
# right fused features stacked over all splits.
#
# Ids and feature rows are collected inside the same loop so both follow the
# same split order. The original built `index` as train/valid/test while
# stacking rows as train/test/valid, which attached PTIDs to the wrong rows.
#
# NOTE(review): fuse_ids[split] is a set, so the order of ids *within* a split
# is whatever set iteration yields; this still assumes that order matches the
# row order of the feature datasets -- confirm against the code that wrote them.
index = []
feature_blocks = []
for split in ['train', 'test', 'valid']:
    # Read both arrays into memory, then release the HDF5 handle.
    with h5.File(fused_segmentations_path.format(split), 'r') as fdata:
        r = fdata['r_hc_features_fused'][:]
        # NOTE(review): the original read 'r_hc_features_fused' for both sides,
        # duplicating the right-side features. The left dataset name is
        # inferred from the 'r_' naming -- confirm it exists in the file.
        l = fdata['l_hc_features_fused'][:]
    lr = np.concatenate([l, r], axis=1)

    split_ids = list(fuse_ids[split])
    if len(split_ids) != lr.shape[0]:
        # Fail here with the offending split named, rather than later at
        # insert() with only a total length mismatch.
        raise ValueError('{} ids but {} feature rows for split {!r}'.format(
            len(split_ids), lr.shape[0], split))

    index.extend(split_ids)
    feature_blocks.append(lr)

# Stack once instead of growing the matrix inside the loop.
segmatrix = np.vstack(feature_blocks)

segmentations = pd.DataFrame(segmatrix)
segmentations.insert(0, 'PTID', index)



In [120]:
# Join the segmentation features onto the clinical + cortical table using the
# shared 'PTID' column.
combined = merged.merge(segmentations, on=['PTID'])

# Write the joined table to disk.
ct_clinical_seg_csv = '/projects/francisco/data/ADNI/ct_clinical_seg.csv'
combined.to_csv(ct_clinical_seg_csv)

In [121]:
# Show the column labels of the final merged table.
print(combined.columns)

Index([     u'RID',     u'PTID',      u'IID',  u'VISCODE',  u'COLPROT',
       u'ORIGPROT',    u'DX_bl',      u'AGE', u'PTGENDER',   u'ADAS11', 
       ...
             21028,       21029,       21030,       21031,       21032,
             21033,       21034,       21035,       21036,       21037],
      dtype='object', length=21127)
