In [1]:
# Basic Imports
import numpy as np
import h5py as h5
from sklearn.externals import joblib
import collections
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from sklearn.cross_validation import KFold
import pickle
import re
import os
import os.path

In [11]:
# Use the top 40,962 vertices to match low-res subjects; see
# http://www.bic.mni.mcgill.ca/ServicesSoftware/StatisticalAnalysesUsingSurfstatMatlab
AAL_style = True
JDV_style = False
# Subjects:
baseline_dir = '/projects/nikhil/ADNI_prediction/input_datasets/CT/civet_out/'

# Regexes for parsing the PTID and image id (IID) out of ADNI names.
# Raw strings keep `\d` from being treated as a (deprecated) string escape.
ptid_re = re.compile(r'\d*(_S_)\d*')
# `\d+` instead of `\d*`: with `*` the first hit on an "ADNI_..." name is the
# empty string right after the "I" of "ADNI", never the trailing "I<digits>".
iid_re = re.compile(r'(?<=I)\d+')

In [12]:
# Read atlas files
# Number of vertices kept per hemisphere (to match low-res subjects).
N_VERTICES_PER_HEMI = 40962

if AAL_style:
    atlas_file_L = '/projects/nikhil/ADNI_prediction/input_datasets/CT/AAL/AAL_atlas_left.txt'
    atlas_file_R = '/projects/nikhil/ADNI_prediction/input_datasets/CT/AAL/AAL_atlas_right.txt'
    with open(atlas_file_L) as f:
        atlas_data_highRes_L = f.readlines()

    with open(atlas_file_R) as f:
        atlas_data_highRes_R = f.readlines()

    # Left hemisphere labels first, then right.
    atlas_data_list = (atlas_data_highRes_L[:N_VERTICES_PER_HEMI]
                       + atlas_data_highRes_R[:N_VERTICES_PER_HEMI])
    atlas_data = np.array(atlas_data_list, dtype=int)

elif JDV_style:
    # NOTE(review): atlas_file_L / atlas_file_R are never assigned in this
    # branch, so on a fresh kernel it raises NameError. The JDV atlas paths
    # need to be defined here before this branch can be used.
    # sep=r'\s+' is the documented equivalent of delim_whitespace=True.
    atlas_data_L = pd.read_csv(atlas_file_L, header=3, sep=r'\s+')
    atlas_data_R = pd.read_csv(atlas_file_R, header=3, sep=r'\s+')

    atlas_data_list = list(atlas_data_L['v0']) + list(atlas_data_R['v0'])
    atlas_data = np.array(atlas_data_list, dtype=int)

else:
    atlas_file_L = '/projects/nikhil/ADNI_prediction/input_datasets/CT/left-labels_C375.txt'
    # Use the same atlas for L & R to preserve symmetry of the ROIs
    atlas_file_R = '/projects/nikhil/ADNI_prediction/input_datasets/CT/left-labels_C375.txt'

    atlas_data_L = np.genfromtxt(atlas_file_L, dtype=int)
    # Adding a big enough offset to separate L vs R ROIs
    atlas_data_R = 1000 + np.genfromtxt(atlas_file_R, dtype=int)

    atlas_data = np.hstack((atlas_data_L[:N_VERTICES_PER_HEMI],
                            atlas_data_R[:N_VERTICES_PER_HEMI]))
    atlas_data_list = list(atlas_data)

# np.unique returns the distinct labels in sorted order, so the ROI order is
# reproducible across runs (iterating a set() gives no such guarantee).
unique_roi = np.unique(atlas_data)


print('atlas data (number of L + R vertices) {}'.format(len(atlas_data)))
print('# of unique ROIs {}'.format(len(unique_roi)))

atlas data (number of L + R vertices) 81924
# of unique ROIs 79


In [None]:
# Show the distinct ROI labels found in the atlas.
print(unique_roi)

In [7]:
# Grab left and right CT data for a given subject
def get_baseline_data(baseline_dir):
    """Placeholder: only prints a notice; `baseline_dir` is not read."""
    notice = "nothing to do yet.. need to find mean thickness values from civet"
    print(notice)
        
    
def _read_hemisphere_pair(subject_file_L, subject_file_R):
    """Load a left/right pair of per-vertex text files into one float array.

    Returns a dict with two keys:
      'subject_data' -- 1-D float array (left-file values followed by
                        right-file values), or 0 when a file is missing
      'success'      -- True when both files exist, False otherwise
    """
    if not (os.path.isfile(subject_file_L) and os.path.isfile(subject_file_R)):
        return {'subject_data': 0, 'success': False}

    with open(subject_file_L) as f:
        values_L = f.readlines()

    with open(subject_file_R) as f:
        values_R = f.readlines()

    # One value per line; numpy parses the text lines as floats.
    subject_data = np.array(values_L + values_R, dtype=float)
    return {'subject_data': subject_data, 'success': True}


def get_legacy_ADNI1_SubjectData(baseline_dir, sub_id):
    """Load left+right data for one subject from the flat legacy layout.

    Files are expected at
    <baseline_dir>/ADNI_<sub_id>_native_rms_rsl_tlink_20mm_{left,right}.txt.
    Paths are built with os.path.join, so `baseline_dir` works with or
    without a trailing separator.

    Returns {'subject_data': array-or-0, 'success': bool}.
    """
    stem = 'ADNI_' + sub_id
    subject_file_L = os.path.join(baseline_dir, stem + '_native_rms_rsl_tlink_20mm_left.txt')
    subject_file_R = os.path.join(baseline_dir, stem + '_native_rms_rsl_tlink_20mm_right.txt')
    return _read_hemisphere_pair(subject_file_L, subject_file_R)


def get_ADNI_SubjectData(baseline_dir, sub_id, prefix):
    """Load left+right data for one subject from the per-subject layout.

    Files are expected at
    <baseline_dir>/<sub_id>/thickness/<prefix>_<sub_id>_native_rms_rsl_tlink_28.28mm_{left,right}.txt.
    Paths are built with os.path.join, so `baseline_dir` works with or
    without a trailing separator.

    Returns {'subject_data': array-or-0, 'success': bool}.
    """
    thickness_dir = os.path.join(baseline_dir, sub_id, 'thickness')
    stem = '{}_{}'.format(prefix, sub_id)
    subject_file_L = os.path.join(thickness_dir, stem + '_native_rms_rsl_tlink_28.28mm_left.txt')
    subject_file_R = os.path.join(thickness_dir, stem + '_native_rms_rsl_tlink_28.28mm_right.txt')
    return _read_hemisphere_pair(subject_file_L, subject_file_R)

# Create dictionary with roi_id:[thickness values]
def get_ROI_CT_dict(unique_roi, subject_data, atlas=None):
    """Group a subject's per-vertex values by atlas ROI.

    Parameters
    ----------
    unique_roi : iterable
        ROI labels to extract.
    subject_data : numpy.ndarray
        Per-vertex values; indexed with a boolean mask computed from the
        atlas, so it must line up with the atlas vertex-for-vertex.
    atlas : array-like, optional
        Per-vertex ROI labels. Defaults to the notebook-level `atlas_data`,
        which is what the function always used before this parameter existed.

    Returns
    -------
    collections.defaultdict(list)
        Maps each ROI label to a one-element list holding the array of
        values at the vertices carrying that label.
    """
    if atlas is None:
        atlas = atlas_data
    # asarray so that `atlas == roi` is an element-wise mask even for a list.
    atlas = np.asarray(atlas)

    roi_CT_dict = collections.defaultdict(list)
    for roi in unique_roi:
        roi_mask = atlas == roi
        roi_CT_dict[roi].append(subject_data[roi_mask])

    return roi_CT_dict

def save_dictionary(_dict,save_path):
    """Pickle `_dict` to the file at `save_path` (opened in binary mode).

    The `with` block closes the file even when `pickle.dump` raises, so a
    failed dump no longer leaks the open handle.
    """
    with open(save_path, 'wb') as f:
        pickle.dump(_dict, f)

In [None]:
# Number of ROI entries in the first per-subject dict stored for `ptid`.
# NOTE(review): `subject_ROI_CT_dict` and `ptid` are only assigned by the
# import loops further down the notebook, so this cell fails on a fresh
# top-to-bottom run.
len(subject_ROI_CT_dict[ptid][0])

In [13]:
# ADNI CT imports
prefix = 'ADNI2_BL'
subject_ROI_CT_dict_filename = baseline_dir + 'adni2/ADNI2_subject_ROI_CT_dict.pkl'
subject_dirs_path = baseline_dir + 'adni2/output/'
subject_dirs = os.listdir(subject_dirs_path)
# Dictionary of dictionary --> subject:{roi:CT_vals}
subject_ROI_CT_dict = collections.defaultdict(list)
subs_missing_data = []
for sub_dir in subject_dirs:
    if sub_dir.split("_")[0] != 'ADNI':
        print('Not an ADNI dir: {}'.format(sub_dir))
        continue

    # An "ADNI_*" entry without a "<digits>_S_<digits>" PTID used to abort the
    # whole loop with AttributeError on `.group`; report it and move on.
    ptid_match = ptid_re.search(sub_dir)
    if ptid_match is None:
        print('Could not parse PTID from dir: {}'.format(sub_dir))
        continue
    ptid = ptid_match.group(0).strip()

    result = get_ADNI_SubjectData(subject_dirs_path, sub_dir, prefix)

    # Subjects whose left/right files are missing are only logged.
    if not result['success']:
        subs_missing_data.append(ptid)
        continue

    single_ROI_CT_dict = get_ROI_CT_dict(unique_roi, result['subject_data'])
    if len(single_ROI_CT_dict) != len(unique_roi):
        print("something is wrong")
        print(sub_dir)
    # The dict is stored even when the ROI count looks wrong (as before).
    subject_ROI_CT_dict[ptid].append(single_ROI_CT_dict)

print('missing data for subjects: {}'.format(len(subs_missing_data)))

Not an ADNI dir: civet_adni2_QC.tar.gz
Not an ADNI dir: References.txt
Not an ADNI dir: QC
missing data for subjects: 1


In [None]:
# Keep a log of the subjects with missing data, one id per line.
# NOTE(review): this path is under adni1/ while the ADNI import cell reads
# from adni2/ -- confirm the destination is intended.
sub_missing_data_file = '/projects/nikhil/ADNI_prediction/input_datasets/CT/civet_out/adni1/bad_subs'
with open(sub_missing_data_file, 'w') as f:
    f.writelines(s + '\n' for s in subs_missing_data)

print(len(subs_missing_data))


In [None]:
# ADNI-1 CT imports (legacy)
# Collect the subject index: the second "_"-separated token of each file name
baseline_dir = '/projects/nikhil/ADNI_prediction/input_datasets/CT/ADNI1_1.5T_CIVET_1.1.12/thickness/'
subject_files = os.listdir(baseline_dir)
subject_idx = [file_name.split('_')[1] for file_name in subject_files]
    

In [None]:
# ADNI-1 CT imports (legacy)
# Generate dictionary
# Dictionary of dictionary --> subject:{roi:CT_vals}
subject_ROI_CT_dict = collections.defaultdict(list)
subs_missing_data = []
# set(): each subject index appears once even though it occurs in several file names
for sub_id in set(subject_idx):
    # The loader is named get_legacy_ADNI1_SubjectData; the previous call to
    # get_ADNI1_SubjectData referenced a function this notebook never defines.
    result = get_legacy_ADNI1_SubjectData(baseline_dir, sub_id)
    # check if subject data exists
    if result['success']:
        single_ROI_CT_dict = get_ROI_CT_dict(unique_roi, result['subject_data'])
        subject_ROI_CT_dict[sub_id].append(single_ROI_CT_dict)
    else:
        subs_missing_data.append(sub_id)

In [None]:
# Check if all ADNI-1 subjects have an ROI-CT dataset associated with them
master_dataframe = '/projects/francisco/data/ADNI/master_fused.pkl'
data = pd.read_pickle(master_dataframe)
id_image = re.compile('(?<=I)\d*')
expected_roi_count = len(unique_roi)  # 688 for spectral clustering, 79 for AAL
ADNI1_subs_with_CT_data = []
for uid in data.UID:
    img = ptid_re.search(uid).group(0)
    # Keep only ids that have a stored dict with the full set of ROIs.
    if img in subject_ROI_CT_dict and len(subject_ROI_CT_dict[img][0]) == expected_roi_count:
        ADNI1_subs_with_CT_data.append(img)


print('{} {}'.format(len(ADNI1_subs_with_CT_data), len(subject_ROI_CT_dict)))

In [None]:
# Check the minimum number of vertices per ROI (=min sampling bound = 132 for ADNI1 baseline)
no_of_vertices = [len(single_ROI_CT_dict[int(key)][0])
                  for key in list(single_ROI_CT_dict.keys())]

print(np.min(no_of_vertices))

In [None]:
# Echo the whole subject -> [{roi: [CT values]}] mapping.
# NOTE(review): this renders every stored per-ROI array; inspecting a single
# subject entry is probably enough.
subject_ROI_CT_dict

In [None]:
# Distribution of the vertices per ROI
from collections import Counter
%matplotlib inline
plt.rcParams['figure.figsize'] = (15, 10)
plt.style.use('ggplot')
plt.subplot(2,1,1)
plt.hist(Counter(atlas_data).values(),bins=100)
plt.show()




In [14]:
# (number of ROIs in the most recently built per-subject dict,
#  number of subjects collected)
len(single_ROI_CT_dict), len(subject_ROI_CT_dict)

(79, 763)

In [15]:
# Save dictionary to the disk
cohort = 'ADNI2'
atlas = 'AAL'
# File name encodes the cohort and the atlas; it is written under baseline_dir.
save_CT_dict_path = baseline_dir + '_'.join([cohort, 'subject_ROI_CT_dict', atlas]) + '.pkl'
save_dictionary(subject_ROI_CT_dict, save_CT_dict_path)
# To reload: sub_CT_dict = pickle.load(open(save_CT_dict_path, "rb"))

In [1]:

# NOTE(review): this cell needs `subject_ROI_CT_dict` and `atlas_data` from
# earlier cells (the recorded run hit NameError), and the reference subject
# key '40817' is hard-coded -- confirm it exists in the loaded dictionary.
# list(...) guarantees a real list whatever type .keys() returns.
ordered_roi_list = list(subject_ROI_CT_dict['40817'][0].keys())

# Vertex count per ROI, computed before filtering, so ignored labels stay in
# this dict.
roi_vert_count = {}
for roi in ordered_roi_list:
    roi_vert_count[roi] = np.sum(atlas_data==roi)
ignore_roi_list = [-1]

# Filter rather than list.remove(): remove() raises ValueError when an ignored
# label is absent from the atlas in use.
ordered_roi_list = [roi for roi in ordered_roi_list if roi not in ignore_roi_list]

print(ordered_roi_list)

# Report ROIs with fewer vertices than this threshold.
MIN_VERTICES_PER_ROI = 66
# .items() iterates key/value pairs on both Python 2 and Python 3.
for key, val in roi_vert_count.items():
    if int(val) < MIN_VERTICES_PER_ROI:
        print(key)

NameError: name 'subject_ROI_CT_dict' is not defined

In [28]:
# Directory holding the ADNI2 subject-id list files named here.
baseline_dir = '/projects/nikhil/ADNI_prediction/input_datasets/CT/civet_out/adni2/'
# File names (appended to baseline_dir when opened) of the three id lists.
adni2_subs = 'adni2_subject_list'
adni2_bpipe_out_ids = 'adni2_ids_clean' # same as the ids fed into CIVET (i.e. after bpipe QC)
adni2_civet_out_ids = 'civet_out_ids_1'

In [29]:
# Read the three id lists (one id per line) in a fixed order:
# all subjects, bpipe output ids, CIVET output ids.
id_lists = []
for list_name in (adni2_subs, adni2_bpipe_out_ids, adni2_civet_out_ids):
    with open(baseline_dir + list_name) as f:
        id_lists.append(f.read().splitlines())
s1, s2, s3 = id_lists

print('{} {} {}'.format(len(s1), len(s2), len(s3)))

780 764 763


In [30]:
# Preview the first three ids of each list.
for id_list in (s1, s2, s3):
    print(id_list[:3])


['ADNI_002_S_4171_MR_MT1__N3m_Br_20110816094410627_S118013_I250649', 'ADNI_002_S_4213_MR_MT1__N3m_Br_20110910135704514_S121168_I255409', 'ADNI_002_S_4219_MR_MT1__N3m_Br_20110928093601592_S122143_I258694']
['ADNI_002_S_4171_MR_MT1__N3m_Br_20110816094410627_S118013_I250649', 'ADNI_002_S_4213_MR_MT1__N3m_Br_20110910135704514_S121168_I255409', 'ADNI_002_S_4219_MR_MT1__N3m_Br_20110928093601592_S122143_I258694']
['ADNI_002_S_4171_MR_MT1__N3m_Br_20110816094410627_S118013_I250649', 'ADNI_002_S_4213_MR_MT1__N3m_Br_20110910135704514_S121168_I255409', 'ADNI_002_S_4219_MR_MT1__N3m_Br_20110928093601592_S122143_I258694']


In [31]:
# Per-subject processing summary lines: "<subject> <bpipe status> <civet status>".
# Sets give constant-time membership tests; the id lists were previously
# rescanned for every subject.
bpipe_pass_ids = set(s2)
civet_pass_ids = set(s3)

civet_proc_summary = []
for sub in s1:
    passed_bpipe = sub in bpipe_pass_ids
    # CIVET only counts as passed when the subject also passed bpipe.
    passed_civet = passed_bpipe and sub in civet_pass_ids

    bpipe_str = 'pass' if passed_bpipe else 'fail'
    civet_str = 'pass' if passed_civet else 'fail'

    sub_proc_str = '{} {} {}'.format(sub,bpipe_str,civet_str)
    civet_proc_summary.append(sub_proc_str)

In [32]:
# Write one summary line per subject.
# NOTE(review): the rows are space-separated even though the file is named
# .csv -- confirm what downstream readers expect before changing either.
out_file = 'adni2_proc_summary.csv'
# `with` closes the file even if a write fails part-way through.
with open(baseline_dir + out_file, "w") as f:
    for item in civet_proc_summary:
        f.write("%s\n" % item)
