In [1]:
# Basic Imports
import numpy as np
import h5py as h5
from sklearn.externals import joblib
import collections
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from sklearn.cross_validation import KFold
import pickle
import re
import os
import os.path

In [2]:
# Atlas label files (ROI indices), one file per hemisphere.

# Only the first 40,962 entries per hemisphere are used, to match the low-res
# subject surfaces: http://www.bic.mni.mcgill.ca/ServicesSoftware/StatisticalAnalysesUsingSurfstatMatlab
# Alternative AAL atlas (disabled):
#atlas_file_L = '/projects/nikhil/ADNI_prediction/input_datasets/CT/AAL/AAL_atlas_left.txt'
#atlas_file_R = '/projects/nikhil/ADNI_prediction/input_datasets/CT/AAL/AAL_atlas_right.txt'

atlas_file_L = '/projects/nikhil/ADNI_prediction/input_datasets/CT/left-labels_C375.txt'
# The right hemisphere deliberately reuses the left label file to preserve
# the symmetry of the ROIs.
atlas_file_R = atlas_file_L

# Atlas file-format switches; both are off here.
AAL_style = JDV_style = False

# Subjects:
#baseline_dir = '/projects/nikhil/ADNI_prediction/input_datasets/CT/civet12_adni2_m00/output/'

# Per-subject CT value files (test examples, disabled):
#subject_file_L = '/projects/nikhil/ADNI_prediction/input_datasets/CT/AAL/test_subject_L.txt'
#subject_file_R = '/projects/nikhil/ADNI_prediction/input_datasets/CT/AAL/test_subject_R.txt'

In [11]:
# Read atlas files.
# Builds `atlas_data` (1-D integer array of ROI labels, left-hemisphere
# entries first, then right) and `unique_roi` (its distinct labels).

# Number of atlas entries kept per hemisphere.
N_VERTICES_PER_HEMI = 40962
# Offset added to right-hemisphere labels in the default branch so that left
# and right ROIs stay distinct even when both come from the same label file.
RIGHT_ROI_OFFSET = 1000

if AAL_style:
    # Plain text, one integer label per line, separate files per hemisphere.
    with open(atlas_file_L) as f:
        atlas_data_highRes_L = f.readlines()

    with open(atlas_file_R) as f:
        atlas_data_highRes_R = f.readlines()

    atlas_data_list = (atlas_data_highRes_L[:N_VERTICES_PER_HEMI]
                       + atlas_data_highRes_R[:N_VERTICES_PER_HEMI])
    atlas_data = np.array(atlas_data_list, dtype=int)

elif JDV_style:
    # Whitespace-delimited table whose header is on row 3; labels are in
    # column 'v0'. sep=r'\s+' is the non-deprecated spelling of
    # delim_whitespace=True.
    atlas_data_L = pd.read_csv(atlas_file_L, header=3, sep=r'\s+')
    atlas_data_R = pd.read_csv(atlas_file_R, header=3, sep=r'\s+')

    atlas_data_list = list(atlas_data_L['v0']) + list(atlas_data_R['v0'])
    atlas_data = np.array(atlas_data_list, dtype=int)

else:
    atlas_data_L = np.genfromtxt(atlas_file_L, dtype=int)
    # Shift the right-hemisphere labels to separate L vs R ROIs.
    atlas_data_R = RIGHT_ROI_OFFSET + np.genfromtxt(atlas_file_R, dtype=int)

    # The offset is only useful if the shifted labels do not collide with
    # any left-hemisphere label; fail loudly instead of silently merging ROIs.
    assert not (set(atlas_data_L[:N_VERTICES_PER_HEMI])
                & set(atlas_data_R[:N_VERTICES_PER_HEMI])), \
        'RIGHT_ROI_OFFSET is too small: left and right ROI labels overlap'

    atlas_data = np.hstack((atlas_data_L[:N_VERTICES_PER_HEMI],
                            atlas_data_R[:N_VERTICES_PER_HEMI]))
    atlas_data_list = list(atlas_data)

# np.unique returns the distinct labels in sorted order, which makes the ROI
# order reproducible (iterating over a set gives an arbitrary order).
unique_roi = np.unique(atlas_data)


print(len(atlas_data))
print(len(unique_roi))

81924
688


In [15]:
# Distinct entries in the left atlas as loaded (not truncated) versus
# distinct labels in the combined left+right `atlas_data`.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(len(set(atlas_data_L)))
print(len(set(atlas_data)))

345
688


In [6]:
# Grab left and right CT data for a given subject
def _read_hemisphere_pair(subject_file_L, subject_file_R):
    """Read the left and right value files of one subject and concatenate them.

    Each file is read line by line and every line is converted to a float.

    Returns a dict with two keys:
      'subject_data' -- 1-D float array (left values followed by right
                        values), or the integer 0 when a file is missing.
      'success'      -- True when both files exist, False otherwise.
    """
    if not (os.path.isfile(subject_file_L) and os.path.isfile(subject_file_R)):
        # Callers test 'success'; 0 is kept as the placeholder payload.
        return {'subject_data': 0, 'success': False}

    with open(subject_file_L) as f:
        values_L = f.readlines()

    with open(subject_file_R) as f:
        values_R = f.readlines()

    subject_data = np.array(values_L + values_R, dtype=float)
    return {'subject_data': subject_data, 'success': True}


def get_ADNI1_SubjectData(baseline_dir, sub_id):
    """Load the ADNI-1 left/right data of subject `sub_id`.

    Files are looked up directly inside `baseline_dir` as
    ``ADNI_<sub_id>_native_rms_rsl_tlink_20mm_{left,right}.txt``.
    `baseline_dir` may be given with or without a trailing separator.

    Returns ``{'subject_data': ..., 'success': ...}``; 'success' is False
    (and 'subject_data' is 0) when either file is missing.
    """
    stem = 'ADNI_' + sub_id
    subject_file_L = os.path.join(baseline_dir, stem + '_native_rms_rsl_tlink_20mm_left.txt')
    subject_file_R = os.path.join(baseline_dir, stem + '_native_rms_rsl_tlink_20mm_right.txt')
    return _read_hemisphere_pair(subject_file_L, subject_file_R)


def get_ADNI2_SubjectData(baseline_dir, sub_id):
    """Load the ADNI-2 left/right data of subject `sub_id`.

    Files are looked up in ``<baseline_dir>/<sub_id>/thickness/`` as
    ``ADNI_<sub_id>_native_rms_rsl_tlink_28.28mm_{left,right}.txt``.
    `baseline_dir` may be given with or without a trailing separator.

    Returns ``{'subject_data': ..., 'success': ...}``; 'success' is False
    (and 'subject_data' is 0) when either file is missing.
    """
    thickness_dir = os.path.join(baseline_dir, sub_id, 'thickness')
    stem = 'ADNI_' + sub_id
    subject_file_L = os.path.join(thickness_dir, stem + '_native_rms_rsl_tlink_28.28mm_left.txt')
    subject_file_R = os.path.join(thickness_dir, stem + '_native_rms_rsl_tlink_28.28mm_right.txt')
    return _read_hemisphere_pair(subject_file_L, subject_file_R)

# Create dictionary with roi_id:[thickness values]
def get_ROI_CT_dict(unique_roi, subject_data, atlas=None):
    """Group one subject's values by ROI label.

    Parameters
    ----------
    unique_roi : iterable
        ROI labels to extract.
    subject_data : array-like
        Subject values, positionally aligned with `atlas`.
    atlas : array-like, optional
        ROI label for every position of `subject_data`. When omitted, the
        notebook-level `atlas_data` array is used.

    Returns
    -------
    collections.defaultdict(list)
        Maps each label of `unique_roi` to a one-element list holding the
        array of `subject_data` values whose atlas label equals it.

    Raises
    ------
    ValueError
        If `subject_data` and the atlas differ in length; a boolean mask of
        the wrong length would otherwise select the wrong values or fail
        with an unrelated indexing error.
    """
    if atlas is None:
        atlas = atlas_data  # notebook-level global built when the atlas is read
    atlas = np.asarray(atlas)
    subject_data = np.asarray(subject_data)

    if len(subject_data) != len(atlas):
        raise ValueError('subject_data has %d entries but the atlas has %d'
                         % (len(subject_data), len(atlas)))

    roi_CT_dict = collections.defaultdict(list)
    for roi in unique_roi:
        roi_idx = atlas == roi
        roi_CT_dict[roi].append(subject_data[roi_idx])

    return roi_CT_dict

def save_dictionary(_dict, save_path):
    """Pickle `_dict` to the file at `save_path`.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. The `with` block closes the file even if
    `pickle.dump` raises.
    """
    with open(save_path, 'wb') as f:
        pickle.dump(_dict, f)

In [None]:
# ADNI-2 CT imports
# Root folder of the ADNI-2 data; every entry in it is treated as a subject id.
# NOTE(review): this path is taken from the commented-out `baseline_dir` in the
# configuration cell; the only live `baseline_dir` assignment points at the
# ADNI-1 folder and is made further down -- confirm this is the intended root.
adni2_baseline_dir = '/projects/nikhil/ADNI_prediction/input_datasets/CT/civet12_adni2_m00/output/'
# NOTE(review): this target path is defined but never written to in this cell.
subject_ROI_CT_dict_filename = '/projects/nikhil/ADNI_prediction/input_datasets/CT/civet12_adni2_m00/ADNI2_subject_ROI_CT_dict.pkl'
subject_dirs = os.listdir(adni2_baseline_dir)
#Dictionary of dictionary --> subject:{roi:CT_vals}
subject_ROI_CT_dict = collections.defaultdict(list)
subs_missing_data  = []
for sub_id in subject_dirs:
    result = get_ADNI2_SubjectData(adni2_baseline_dir,sub_id)

    # check if subject data exists
    if result['success']:
        single_ROI_CT_dict = get_ROI_CT_dict(unique_roi,result['subject_data'])
        subject_ROI_CT_dict[sub_id].append(single_ROI_CT_dict)
    else:
        # one or both hemisphere files were not found for this entry
        subs_missing_data.append(sub_id)

In [None]:
# Keep a log of the subjects with missing data, one id per line.
sub_missing_data_file = '/projects/nikhil/ADNI_prediction/input_datasets/CT/civet12_adni2_m00/bad_subs'
with open(sub_missing_data_file, 'w') as log_file:
    log_file.writelines(missing_id + '\n' for missing_id in subs_missing_data)

In [7]:
# ADNI-1 CT imports
# Collect the subject id carried by every file name in the thickness folder.
baseline_dir = '/projects/nikhil/ADNI_prediction/input_datasets/CT/ADNI1_1.5T_CIVET_1.1.12/thickness/'
subject_files = os.listdir(baseline_dir)
# The id is the second '_'-separated token of the file name
# (the loader builds names as 'ADNI_<id>_native_...').
subject_idx = [file_name.split('_')[1] for file_name in subject_files]
    

In [8]:
# Generate dictionary
# Nested mapping --> subject:[{roi:CT_vals}]
subject_ROI_CT_dict = collections.defaultdict(list)
subs_missing_data = []
# set(...) collapses the repeated ids (each subject id appears once per file).
for sub_id in set(subject_idx):
    result = get_ADNI1_SubjectData(baseline_dir, sub_id)
    if not result['success']:
        # subject data does not exist: remember the id and move on
        subs_missing_data.append(sub_id)
        continue
    single_ROI_CT_dict = get_ROI_CT_dict(unique_roi, result['subject_data'])
    subject_ROI_CT_dict[sub_id].append(single_ROI_CT_dict)

In [12]:
# Check if all ADNI-1 subjects have an ROI-CT dataset associated with them
master_dataframe = '/projects/francisco/data/ADNI/master_fused.pkl'
data = pd.read_pickle(master_dataframe)
# Image id = the run of digits that directly follows an 'I' in the UID.
# Raw string avoids the invalid-escape warning; '\d+' (instead of '\d*')
# prevents an empty match at an 'I' that is not followed by digits.
id_image = re.compile(r'(?<=I)\d+')
# Every complete subject entry holds one key per ROI label of the atlas.
n_expected_roi = len(unique_roi)
ADNI1_subs_with_CT_data = []
for uid in data.UID:
    id_match = id_image.search(uid)
    if id_match is None:
        # UID carries no image id; nothing to look up
        continue
    img = id_match.group(0)
    # Test membership first: subject_ROI_CT_dict is a defaultdict, so indexing
    # an unknown id would insert an empty entry and then fail on [0].
    if img in subject_ROI_CT_dict and len(subject_ROI_CT_dict[img][0]) == n_expected_roi:
        ADNI1_subs_with_CT_data.append(img)

print('%d %d' % (len(ADNI1_subs_with_CT_data), len(subject_ROI_CT_dict)))

698 3585


In [13]:
# Check the minimum number of vertices per ROI (= lower bound for sampling).
# `single_ROI_CT_dict` is the ROI dictionary of the last subject processed by
# the loop that built it.
# NOTE(review): an earlier version of this comment quoted 132 for the ADNI1
# baseline, while the recorded output of this cell is 68 -- confirm which
# figure applies to the atlas in use.
no_of_vertices = [len(single_ROI_CT_dict[int(key)][0])
                  for key in single_ROI_CT_dict.keys()]

print(np.asarray(no_of_vertices).min())

68


In [None]:
# Number of distinct left labels plus number of distinct right labels.
# In the JDV_style branch the atlases are DataFrames with the labels in column
# 'v0'; in the default branch atlas_data_L / atlas_data_R are label arrays.
# NOTE(review): the AAL_style branch defines neither name, so this cell cannot
# run in that configuration.
# collections.Counter is used because the bare `Counter` name is only imported
# in a later cell.
labels_L = atlas_data_L['v0'] if JDV_style else atlas_data_L
labels_R = atlas_data_R['v0'] if JDV_style else atlas_data_R
len(collections.Counter(labels_L)) + len(collections.Counter(labels_R))
#print np.sort(Counter(atlas_data).values())

In [None]:
from collections import Counter
%matplotlib inline
plt.rcParams['figure.figsize'] = (15, 10)
plt.style.use('ggplot')
plt.subplot(2,1,1)
plt.hist(Counter(atlas_data).values(),bins=50)
plt.show()




In [None]:
# (ROI entries in the last-built single-subject dictionary,
#  subject entries in the nested dictionary)
(len(single_ROI_CT_dict), len(subject_ROI_CT_dict))

In [14]:
# Persist the nested subject --> ROI --> CT dictionary as a pickle.
save_CT_dict_path = '/projects/nikhil/ADNI_prediction/input_datasets/CT/subject_roi_ct_data/ADNI1_subject_ROI_CT_dict_C688.pkl'
save_dictionary(_dict=subject_ROI_CT_dict, save_path=save_CT_dict_path)
# To load it back:
#sub_CT_dict = pickle.load( open(save_CT_dict_path, "rb" ) )

In [None]:

# ROI labels in the key order of one reference subject's ROI dictionary.
# list(...) gives a mutable copy; .keys() is only a list on Python 2.
# NOTE(review): '40817' is a hard-coded subject id; if it is absent the
# defaultdict yields an empty list and [0] raises IndexError.
ordered_roi_list = list(subject_ROI_CT_dict['40817'][0].keys())

# Atlas entries per ROI, counted before the ignored labels are dropped
# (so the ignored labels are still present in this mapping).
roi_vert_count = {}
for roi in ordered_roi_list:
    roi_vert_count[roi] = np.sum(atlas_data==roi)
# NOTE(review): in the default atlas branch the right-hemisphere labels are
# shifted by 1000, so a left label of -1 would appear as 999 on the right and
# is not covered by this list -- confirm whether it should be ignored too.
ignore_roi_list = [-1]

for roi in ignore_roi_list:
    # only drop labels that are present; list.remove raises on a missing one
    if roi in ordered_roi_list:
        ordered_roi_list.remove(roi)

print(ordered_roi_list)
#print (ordered_roi_list)
# Report the ROIs that have fewer than 66 atlas entries.
for key, val in roi_vert_count.items():
    if int(val) < 66:
        print(key)

In [None]:
# Length of the ROI list after the ignored labels were removed, minus one.
# NOTE(review): the reason for subtracting one is not evident here -- confirm.
len(ordered_roi_list) - 1