In [None]:
# Basic Imports
import numpy as np
import h5py as h5
from sklearn.externals import joblib
import collections
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from sklearn.cross_validation import KFold
import pickle
import re
import os
import os.path

In [None]:
# Atlas files: one ROI label per surface vertex, one file per hemisphere.

# Alternative AAL atlas (kept for reference; set AAL = True when using it).
# For AAL only the top 40,962 entries are used, to match low-res subjects:
# http://www.bic.mni.mcgill.ca/ServicesSoftware/StatisticalAnalysesUsingSurfstatMatlab
#atlas_file_L = '/projects/nikhil/ADNI_prediction/input_datasets/CT/AAL/AAL_atlas_left.txt'
#atlas_file_R = '/projects/nikhil/ADNI_prediction/input_datasets/CT/AAL/AAL_atlas_right.txt'

# Atlas currently in use (JDV_Atlas directory, left / right hemisphere).
atlas_file_L = '/projects/nikhil/ADNI_prediction/input_datasets/CT/JDV_Atlas/ctx_roi_1000-dil_civ.L.1D'
atlas_file_R = '/projects/nikhil/ADNI_prediction/input_datasets/CT/JDV_Atlas/ctx_roi_1000-dil_civ.R.1D'

# Selects how the atlas files are parsed by the atlas-reading cell:
# True -> plain text lines (AAL files), False -> whitespace-delimited table.
AAL = False

# Subjects: directory listed to discover the ADNI-2 subject folders.
baseline_dir = '/projects/nikhil/ADNI_prediction/input_datasets/CT/civet12_adni2_m00/output/'

# Example per-subject CT (cortical thickness) value files.
# NOTE(review): these two names are not read by any cell visible in this
# notebook section -- confirm whether they are still needed.
subject_file_L = '/projects/nikhil/ADNI_prediction/input_datasets/CT/AAL/test_subject_L.txt'
subject_file_R = '/projects/nikhil/ADNI_prediction/input_datasets/CT/AAL/test_subject_R.txt'

In [None]:
# Read atlas files: build one label vector (left hemisphere followed by right)
# holding the ROI id of every vertex, plus the set of distinct ROI ids.

# Number of vertices per hemisphere kept from the high-res AAL atlas so that it
# matches the low-res subject surfaces.
N_VERTICES_LOW_RES = 40962

if AAL:
    # Plain-text atlas: one ROI label per line.
    with open(atlas_file_L) as f:
        atlas_data_highRes_L = f.readlines()

    with open(atlas_file_R) as f:
        atlas_data_highRes_R = f.readlines()

    atlas_data_list = (atlas_data_highRes_L[:N_VERTICES_LOW_RES]
                       + atlas_data_highRes_R[:N_VERTICES_LOW_RES])

else:
    # Whitespace-delimited table whose header sits on row 3; labels are in 'v0'.
    # sep=r'\s+' is the documented equivalent of delim_whitespace=True, which
    # newer pandas releases deprecate.
    atlas_data_L = pd.read_csv(atlas_file_L, header=3, sep=r'\s+')
    atlas_data_R = pd.read_csv(atlas_file_R, header=3, sep=r'\s+')

    atlas_data_list = list(atlas_data_L['v0']) + list(atlas_data_R['v0'])

atlas_data = np.array(atlas_data_list, dtype=int)

# Derive the distinct ROI ids from the *integer* labels. De-duplicating the raw
# values first (as text lines in the AAL branch) could leave two entries that
# cast to the same integer, e.g. '1\n' and '1', and every duplicate would make
# get_ROI_CT_dict append that ROI twice. np.unique also yields a stable,
# sorted ROI order.
unique_roi = np.unique(atlas_data)


In [None]:
# Grab left and right CT data for a given subject
def _load_subject_CT(subject_file_L, subject_file_R):
    """Load and concatenate the left/right thickness files of one subject.

    Shared implementation behind get_ADNI1_SubjectData and
    get_ADNI2_SubjectData, which differ only in how the file paths are built.

    Parameters
    ----------
    subject_file_L, subject_file_R : str
        Paths of the left and right hemisphere text files (one value per line).

    Returns
    -------
    dict
        ``{'subject_data': <1-D float array, left values then right values>,
        'success': True}`` when both files exist, otherwise
        ``{'subject_data': 0, 'success': False}``.
    """
    if not (os.path.isfile(subject_file_L) and os.path.isfile(subject_file_R)):
        # Both hemispheres are required; callers test 'success' before
        # touching 'subject_data'.
        return {'subject_data': 0, 'success': False}

    with open(subject_file_L) as f:
        subject_data_list_L = f.readlines()

    with open(subject_file_R) as f:
        subject_data_list_R = f.readlines()

    subject_data = np.array(subject_data_list_L + subject_data_list_R, dtype=float)
    return {'subject_data': subject_data, 'success': True}


def get_ADNI1_SubjectData(baseline_dir,sub_id):
    """Grab left and right CT data of an ADNI-1 subject.

    Files are expected directly inside ``baseline_dir`` as
    ``ADNI_<sub_id>_native_rms_rsl_tlink_20mm_{left,right}.txt``.
    ``baseline_dir`` is concatenated as-is, so it must end with a path
    separator.

    Returns the dict described in ``_load_subject_CT``.
    """
    prefix = baseline_dir + 'ADNI_' + sub_id
    return _load_subject_CT(prefix + '_native_rms_rsl_tlink_20mm_left.txt',
                            prefix + '_native_rms_rsl_tlink_20mm_right.txt')


def get_ADNI2_SubjectData(baseline_dir,sub_id):
    """Grab left and right CT data of an ADNI-2 subject.

    Files are expected under ``baseline_dir/<sub_id>/thickness/`` as
    ``ADNI_<sub_id>_native_rms_rsl_tlink_28.28mm_{left,right}.txt``.
    ``baseline_dir`` is concatenated as-is, so it must end with a path
    separator.

    Returns the dict described in ``_load_subject_CT``.
    """
    prefix = baseline_dir + sub_id + '/thickness/ADNI_' + sub_id
    return _load_subject_CT(prefix + '_native_rms_rsl_tlink_28.28mm_left.txt',
                            prefix + '_native_rms_rsl_tlink_28.28mm_right.txt')

# Create dictionary with roi_id:[thickness values]
def get_ROI_CT_dict(unique_roi, subject_data, atlas=None):
    """Group a subject's per-vertex thickness values by ROI.

    Parameters
    ----------
    unique_roi : iterable of int
        ROI ids to extract.
    subject_data : numpy array
        One thickness value per vertex, in the same vertex order as the atlas.
    atlas : numpy array, optional
        ROI id of every vertex. Defaults to the notebook-level ``atlas_data``
        so existing two-argument calls keep working; passing it explicitly
        removes the dependency on that global.

    Returns
    -------
    collections.defaultdict(list)
        ``{roi_id: [array of the thickness values of the vertices in that ROI]}``.
        Each value is a one-element list; downstream cells read it with ``[0]``.
    """
    if atlas is None:
        atlas = atlas_data

    roi_CT_dict = collections.defaultdict(list)
    for roi in unique_roi:
        # Boolean mask selecting the vertices labelled with this ROI.
        roi_idx = atlas == roi
        roi_CT_dict[roi].append(subject_data[roi_idx])

    return roi_CT_dict

def save_dictionary(_dict,save_path):
    """Pickle ``_dict`` to the file at ``save_path`` (overwriting it).

    The file is opened in a ``with`` block so the handle is closed even when
    ``pickle.dump`` raises (e.g. on an unpicklable value).
    """
    with open(save_path, 'wb') as f:
        pickle.dump(_dict, f)

In [None]:
# ADNI-2 CT imports
subject_ROI_CT_dict_filename = '/projects/nikhil/ADNI_prediction/input_datasets/CT/civet12_adni2_m00/ADNI2_subject_ROI_CT_dict.pkl'
subject_dirs = os.listdir(baseline_dir)

# Nested mapping --> subject:[{roi:CT_vals}]
subject_ROI_CT_dict = collections.defaultdict(list)
# Subjects for which the left and/or right thickness file was not found
subs_missing_data = []

for sub_id in subject_dirs:
    result = get_ADNI2_SubjectData(baseline_dir, sub_id)

    if not result['success']:
        # no usable data for this subject: remember it and move on
        subs_missing_data.append(sub_id)
        continue

    single_ROI_CT_dict = get_ROI_CT_dict(unique_roi, result['subject_data'])
    subject_ROI_CT_dict[sub_id].append(single_ROI_CT_dict)

In [None]:
# Write the ids of the subjects with missing data to a log file, one per line.
sub_missing_data_file = '/projects/nikhil/ADNI_prediction/input_datasets/CT/civet12_adni2_m00/bad_subs'
with open(sub_missing_data_file, 'w') as f:
    f.writelines(s + '\n' for s in subs_missing_data)

In [None]:
# ADNI-1 CT imports
# List the thickness directory and take, from every file name, the token that
# follows the first underscore as the subject index (duplicates are kept here).
baseline_dir = '/projects/nikhil/ADNI_prediction/input_datasets/CT/ADNI1_1.5T_CIVET_1.1.12/thickness/'
subject_files = os.listdir(baseline_dir)
subject_idx = [sub.split('_')[1] for sub in subject_files]
    

In [None]:
# Generate dictionary
# Nested mapping --> subject:[{roi:CT_vals}]
subject_ROI_CT_dict = collections.defaultdict(list)
# Subjects for which the left and/or right thickness file was not found
subs_missing_data = []

# subject_idx holds one entry per file, so de-duplicate before looping
for sub_id in set(subject_idx):
    result = get_ADNI1_SubjectData(baseline_dir, sub_id)

    if not result['success']:
        # no usable data for this subject: remember it and move on
        subs_missing_data.append(sub_id)
        continue

    single_ROI_CT_dict = get_ROI_CT_dict(unique_roi, result['subject_data'])
    subject_ROI_CT_dict[sub_id].append(single_ROI_CT_dict)

In [None]:
# Check if all ADNI-1 subjects have an ROI-CT dataset associated with them
master_dataframe = '/projects/francisco/data/ADNI/master_fused.pkl'
data = pd.read_pickle(master_dataframe)
# Image id = run of digits located right after an 'I' in the UID
# (raw string so that \d is not treated as a string escape).
id_image = re.compile(r'(?<=I)\d*')
# Number of ROI entries a complete subject dict must hold.
n_expected_rois = 903 #79 for AAL

ADNI1_subs_with_CT_data = []
# UIDs with no (or an incomplete) ROI-CT entry, kept for inspection.
ADNI1_subs_without_CT_data = []
for uid in data.UID:
    match = re.search(id_image, uid)
    img = match.group(0) if match else None
    # subject_ROI_CT_dict is a defaultdict(list): indexing it with an unknown
    # id would insert an empty entry (inflating its length) and then fail on
    # [0]. .get() looks the id up without inserting anything.
    roi_dicts = subject_ROI_CT_dict.get(img)
    if roi_dicts and len(roi_dicts[0]) == n_expected_rois:
        ADNI1_subs_with_CT_data.append(img)
    else:
        ADNI1_subs_without_CT_data.append(uid)

# Single pre-formatted argument: prints "<n_with_data> <n_subjects>" under both
# the print statement and the print function.
print('{} {}'.format(len(ADNI1_subs_with_CT_data), len(subject_ROI_CT_dict)))

In [None]:
# Check the minimum number of vertices per ROI (=min sampling bound = 132 for ADNI1 baseline)
# Uses single_ROI_CT_dict, i.e. the ROI dict of the last subject processed by
# the dictionary-building loop.
no_of_vertices = [len(single_ROI_CT_dict[int(key)][0])
                  for key in single_ROI_CT_dict.keys()]

print(np.asarray(no_of_vertices).min())

In [None]:
# Number of distinct ROI labels in the left atlas plus the number in the right
# atlas. Counter is reached through the `collections` module imported at the
# top of the notebook: the bare name `Counter` is only imported by a later
# cell, so using it here fails on a fresh kernel.
len(collections.Counter(atlas_data_L['v0'])) + len(collections.Counter(atlas_data_R['v0']))

In [None]:
from collections import Counter

# Use the ggplot look for the figures drawn from here on.
plt.style.use('ggplot')

# Histogram (50 bins) of how many vertices carry each ROI label in atlas_data,
# drawn in the top panel of a 2x1 grid.
vertices_per_roi = Counter(atlas_data).values()
plt.subplot(2, 1, 1)
plt.hist(vertices_per_roi, bins=50)
plt.show()




In [None]:
# Sanity check: (number of ROI keys in the most recently built per-subject
# dict, number of subjects held in subject_ROI_CT_dict).
len(single_ROI_CT_dict), len(subject_ROI_CT_dict)

In [None]:
# Location of the pickled ADNI-1 subject -> ROI -> CT dictionary (JDV atlas).
save_CT_dict_path = '/projects/nikhil/ADNI_prediction/input_datasets/CT/subject_roi_ct_data/ADNI1_subject_ROI_CT_dict_JDV.pkl'
# Uncomment to (re)write the pickle from the dictionary built above.
#save_dictionary(subject_ROI_CT_dict,save_CT_dict_path)

# Load through a with-block so the file handle is closed deterministically.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open(save_CT_dict_path, "rb") as f:
    sub_CT_dict = pickle.load(f)

In [None]:
# K-fold split definition; only the validation UIDs are used here.
kf_file = "/projects/nikhil/ADNI_prediction/input_datasets/cli_ct_train_valid_KFold_UIDs.pkl"
# with-block closes the file handle deterministically.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open(kf_file, "rb") as f:
    kf = pickle.load(f)
val_fold_list= kf['valid_UIDs']

# Fix one ROI ordering for all subjects, taken from the ROI dict of image
# '40817'. list() freezes it as a concrete list, so it is not a live view of
# that dict on interpreters where .keys() returns a view.
ordered_roi_list = list(sub_CT_dict['40817'][0].keys())

# Image id = run of digits located right after an 'I' in the UID
# (raw string so that \d is not treated as a string escape).
iid_re = re.compile(r'(?<=I)\d*')

# For every UID of the first validation fold, append the subject's mean CT
# value of each ROI to sub_CT_sampx_dict[roi].
# NOTE(review): sub_cScores_dict, sub_CT_sampx_dict and min_sampx are not
# defined in any cell visible in this section -- presumably they are created
# elsewhere; confirm before running this cell on a fresh kernel.
for uid in val_fold_list[0]:
    uid = uid.strip()
    # Image id extracted with iid_re; .group() raises AttributeError when the
    # pattern does not match the UID.
    iid = re.search(iid_re, uid).group(0).strip()    
    # First element stored for this image id: the {roi: [CT values]} dict.
    sub_CT_all_rois = sub_CT_dict[iid][0]           
    # NOTE(review): sub_CScore is assigned but not used in the visible code.
    sub_CScore = sub_cScores_dict[uid]        
    
    for roi in ordered_roi_list:              
        # np.squeeze drops the length-1 dimensions of the stored ROI entry.
        sub_CT_roi = np.squeeze(sub_CT_all_rois[roi])
        
        # Mean CT values per ROI
        sub_CT_sampx_dict[roi].append(np.mean(sub_CT_roi))
                           
        # NOTE(review): this matrix assembly sits inside the per-ROI loop, so
        # it is redone for every ROI of every subject, and its loop variable
        # reuses the name `roi` of the enclosing loop. It looks like it was
        # meant to run once after the loops -- confirm the intended
        # indentation.
        sub_CT_sampx = np.zeros((min_sampx, len(ordered_roi_list)))
        for col, roi in enumerate(ordered_roi_list):
            # Column `col` receives the values accumulated so far for this ROI.
            sub_CT_sampx[:,col] = np.asarray(sub_CT_sampx_dict[roi],dtype=float)
            
