In [1]:
# Basic Imports
import numpy as np
import h5py as h5
#from sklearn.externals import joblib
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from sklearn.cross_validation import KFold
import pickle
import re
import random
import collections

In [2]:
#datasets

#input data
master_dataframe = '/projects/nikhil/ADNI_prediction/input_datasets/master_fused.pkl'
train_val_file = '/projects/nikhil/ADNI_prediction/input_datasets/cli_ct_seg_fused_train_plus_val.pkl'
test_file = '/projects/nikhil/ADNI_prediction/input_datasets/cli_ct_seg_fused_test.pkl'

#Candidate label dictionaries
sub_HC_vol_left_file = '/projects/nikhil/ADNI_prediction/input_datasets/HC/subject_HC_vol_dictionary_train_val_left.pkl'
sub_HC_vol_right_file = '/projects/nikhil/ADNI_prediction/input_datasets/HC/subject_HC_vol_dictionary_train_val_right.pkl'
sub_CT_file = '/projects/nikhil/ADNI_prediction/input_datasets/CT/subject_roi_ct_data/ADNI1_subject_ROI_CT_dict.pkl'
#k-fold indices (from a saved file)
kf_file = "/projects/nikhil/ADNI_prediction/input_datasets/cli_ct_train_valid_KFold_idx.pkl"

#save hdf_file for inflated sets
out_file = '/projects/nikhil/ADNI_prediction/input_datasets/HC_CT_inflated_CV_subsets.h5'


In [3]:
#Grab CV data with specific feature columes (independent vars) and specific clinical scale (dependent var)
def load_CV_data(sub_HC_L_dict, sub_HC_R_dict, sub_CT_dict, in_file, clinical_scale, kf_file, hdf_file):    
    # Grab subject_IDs from sub_HC / sub_CT dictionaries
    # Grab clinical score for each subject from master_csv table (infile). 
    # Filter out NANs
    # Loop through K-folds (kf_file) and append candidate labels + CT values
    # Note: UID = PTID + IID (036_S_0976_I65091)
    
    subject_ids = sub_HC_L_dict.keys()
     # Pick any UID to generate roi_list common across all subjects to stay consistents while tranforming dictionary into array
    ordered_roi_list = sub_CT_dict['40817'][0].keys()
    # ignore the "0" idx along with the 4 missing rois from the mean CT value csv
    ignore_roi_list = [0,29,30,39,40]
    for roi in ignore_roi_list:
        ordered_roi_list.remove(roi)
    
    csv_data = pd.read_pickle(in_file)
    clinical_scores = csv_data[csv_data.PTID.isin(subject_ids)][['UID',clinical_scale]]
    #remove NANs
    clinical_scores = clinical_scores[np.isfinite(clinical_scores[clinical_scale])]
    sub_clinical_scores_dict = dict(zip(clinical_scores['UID'],clinical_scores[clinical_scale]))  
    subject_uids = sub_clinical_scores_dict.keys()
    
    # K-folds
    kf = pickle.load( open(kf_file, "rb" ) )
    X_train = []
    X_valid = []
    y_train = []
    y_valid = []
    fold = 0 
    for train, valid in kf:      
        fold+=1
        print 'Staring fold # {}'.format(fold)
        print 'Starting train subset'
        for t, tr in enumerate(train):            
            uid = subject_uids[t]
            result = inflate_subject_samples(uid, sub_HC_L_dict, sub_HC_R_dict, sub_CT_dict, ordered_roi_list, sub_clinical_scores_dict)
            sub_X = result['sub_X']
            sub_y = result['sub_y']
            if t == 0:                
                X_train_stack = sub_X
                y_train_stack = sub_y
            else:
                X_train_stack = np.vstack((X_train_stack,sub_X))
                y_train_stack = np.concatenate((y_train_stack,sub_y))
        
        print 'Ending train subset'
        print 'Starting valid subset'
        for v, val in enumerate(valid):
            uid = subject_uids[val]
            result = inflate_subject_samples(uid, sub_HC_L_dict, sub_HC_R_dict, sub_CT_dict, ordered_roi_list, sub_clinical_scores_dict)
            sub_X = result['sub_X']
            sub_y = result['sub_y']
            if v == 0:                
                X_valid_stack = sub_X
                y_valid_stack = sub_y
            else:
                X_valid_stack = np.vstack((X_valid_stack,sub_X))
                y_valid_stack = np.concatenate((y_valid_stack,sub_y))     
        
        print 'Ending valid subset'
        input_data = h5.File(hdf_file, 'a')
        input_data.create_dataset('Fold_{}_train_X'.format(fold),data=X_train_stack)    
        input_data.create_dataset('Fold_{}_train_y'.format(fold),data=y_train_stack)    
        input_data.create_dataset('Fold_{}_valid_X'.format(fold),data=X_valid_stack)    
        input_data.create_dataset('Fold_{}_valid_y'.format(fold),data=y_valid_stack)    
        input_data.close()
        
        
        #X_train.append(X_train_stack)
        #X_valid.append(X_valid_stack)
        #y_train.append(y_train_stack)
        #y_valid.append(y_valid_stack)
        
    print 'All folds done!'
    
    
    # Return train and validation lists comprising all folds as well as unsplit data
    #return {'X_train':X_train,'X_valid':X_valid,'y_train':y_train,'y_valid':y_valid}


def inflate_subject_samples(uid, sub_HC_L_dict, sub_HC_R_dict, sub_CT_dict, ordered_roi_list, sub_cScores_dict):
    #UID = PTID + IID (PTID:[HC_vols], IID:{ROI:CT})
    uid = uid.strip()
    #print 'uid: {}'.format(uid)
    ptid_re = re.compile('\d*(_S_)\d*')
    iid_re = re.compile('(?<=I)\d*')
    ptid = re.search(ptid_re, uid).group(0).strip()
    iid = re.search(iid_re, uid).group(0).strip()
    missing_data = False
    min_CT_sampx = 132
    
    if ptid in sub_HC_L_dict.keys():
        sub_HC_L = np.asarray(sub_HC_L_dict[ptid][0])
    else: 
        print "missing HC_L entry for: {}".format(uid) 
        missing_data = True
    
    if ptid in sub_HC_R_dict.keys():
        sub_HC_R = np.asarray(sub_HC_R_dict[ptid][0])
    else: 
        print "missing HC_R entry for: {}".format(uid)
        missing_data = True
        
    if iid in sub_CT_dict.keys():
        sub_CT_all_rois = sub_CT_dict[iid][0]              
    else: 
        print "missing CT entry for: {}".format(uid) 
        missing_data = True
        
    if not missing_data:  
        sub_CScore = sub_cScores_dict[uid]        
        min_sampx = np.min([len(sub_HC_L),len(sub_HC_R),min_CT_sampx])
        #print min_sampx
        
        #select samples 
        sub_HC_L_sampx = random.sample(sub_HC_L, min_sampx)
        sub_HC_R_sampx = random.sample(sub_HC_R, min_sampx)
        
        #Draw equal number of samples per roi
        sub_CT_sampx_dict = collections.OrderedDict()
        for roi in ordered_roi_list:
            #print roi
            if roi not in sub_CT_sampx_dict:
                sub_CT_sampx_dict[roi]=[]
                
            sub_CT_roi = np.squeeze(sub_CT_all_rois[roi])
            #print sub_CT_roi.shape
            if len(sub_CT_roi) >= min_CT_sampx:
                sub_CT_sampx_dict[roi].append(random.sample(sub_CT_roi, min_sampx))            
                
        #print 'sub_CT_sampx_dict: {}'.format(len(sub_CT_sampx_dict))
        sub_CT_sampx = np.zeros((min_sampx, len(ordered_roi_list)))
        for col, roi in enumerate(ordered_roi_list):
            sub_CT_sampx[:,col] = np.asarray(sub_CT_sampx_dict[roi],dtype=float)
            
        
        #print 'sub_CT_sampx.shape :{}'.format(sub_CT_sampx.shape)
        
        sub_X = np.hstack((sub_HC_L_sampx,sub_HC_R_sampx,sub_CT_sampx))
        sub_y = np.tile(sub_CScore, min_sampx)
        
    else:
        sub_X = []
        sub_y = []
    
    return {'sub_X': sub_X, 'sub_y':sub_y}

#Load test data: Need to change this and collect Test data per subject with similar features length
def load_test_data(in_file, feature_cols, clinical_scale):

    data = pd.read_pickle(in_file)
    data_trunc = data[clinical_scale + feature_cols]
    # remove nans 
    data_trunc = data_trunc[np.isfinite(data_trunc[clinical_scale[0]])]
    X = np.asarray(data_trunc[feature_cols],dtype=float)
    y = np.asarray(data_trunc[clinical_scale[0]],dtype=float)
    return {'X':X, 'y':y}

In [4]:
kf = pickle.load( open(kf_file, "rb" ) )
master_csv = pickle.load( open(master_dataframe, "rb" ) )
#train_val_data = pickle.load( open(train_val_file, "rb" ) )
#test_val_data = pickle.load( open(test_file, "rb" ) )

sub_HC_L_dict = pickle.load( open(sub_HC_vol_left_file, "rb" ) )
sub_HC_R_dict = pickle.load( open(sub_HC_vol_right_file, "rb" ) )
sub_CT_dict = pickle.load( open(sub_CT_file, "rb" ) )

In [None]:
# trial run..
print len(master_csv),len(kf.idxs),len(train_val_data),len(sub_HC_vol_left_dict),len(sub_HC_vol_right_dict),len(sub_CT_dict)
uid  =' 002_S_0559_I118676'
clinical_scale = 'ADAS13'
subject_ids = sub_HC_L_dict.keys()
clinical_scores = master_csv[csv_data.PTID.isin(subject_ids)][['UID',clinical_scale]]
#remove NANs
clinical_scores = clinical_scores[np.isfinite(clinical_scores[clinical_scale])]
sub_clinical_scores_dict = dict(zip(clinical_scores['UID'],clinical_scores[clinical_scale]))  
ordered_roi_list = sub_CT_dict['40817'][0].keys()
result = inflate_subject_samples(uid, sub_HC_L_dict, sub_HC_R_dict, sub_CT_dict, ordered_roi_list, sub_clinical_scores_dict)
print result

In [7]:
clinical_scale = 'ADAS13'
test = load_CV_data(sub_HC_L_dict, sub_HC_R_dict, sub_CT_dict, master_dataframe, clinical_scale, kf_file, out_file)


Staring fold # 1
Starting train subset
Ending train subset
Starting valid subset
Ending valid subset
Staring fold # 2
Starting train subset
Ending train subset
Starting valid subset
Ending valid subset
Staring fold # 3
Starting train subset
Ending train subset
Starting valid subset
Ending valid subset
Staring fold # 4
Starting train subset
Ending train subset
Starting valid subset
Ending valid subset
Staring fold # 5
Starting train subset
Ending train subset
Starting valid subset
Ending valid subset
Staring fold # 6
Starting train subset
Ending train subset
Starting valid subset
Ending valid subset
Staring fold # 7
Starting train subset
Ending train subset
Starting valid subset
Ending valid subset
Staring fold # 8
Starting train subset
Ending train subset
Starting valid subset
Ending valid subset
Staring fold # 9
Starting train subset
Ending train subset
Starting valid subset
Ending valid subset
Staring fold # 10
Starting train subset
Ending train subset
Starting valid subset
Ending va

In [None]:
master_csv[master_csv.PTID=='114_S_0166']

In [4]:
def generateInnerFold(in_file, OF_id, fold_name_prefix, n_innerFolds, out_file_prefix):
    # Load data
    X_name = fold_name_prefix + '_X'
    y_name = fold_name_prefix + '_y'
    input_data = h5.File(in_file, 'r')
    X_raw = input_data[X_name][:]
    y = input_data[y_name][:]
    input_data.close()

    print X_raw.shape
    #Remove ROIs from ignore list
    L_HC_offset = 11427
    R_HC_offset = 10519
        
    ignore_list_CT_idx = list(L_HC_offset + R_HC_offset + np.array([0,29,30,37,38]))
    #X = np.delete(X_raw, np.s_[ignore_list_CT_idx], 1)
    X = X_raw
    print X.shape

    #Sample / Shuffle Data
    sampx = len(y)
    kf = KFold(sampx, n_folds=n_innerFolds,shuffle=True)

    #Save Data
    split_HC_CT = True #Split the HC and CT data layer to allow partitioned model
    k=0
    for train, valid in kf:
        k+=1
        out_file_train = out_file_prefix + 'train_InnerFold_{}_partition_ROI_74.h5'.format(k)
        out_file_valid = out_file_prefix + 'valid_InnerFold_{}_partition_ROI_74.h5'.format(k)
        #Save Train
        output_data = h5.File(out_file_train, 'a')
        X_all = X[train]
        if split_HC_CT:
            output_data.create_dataset('Fold_{}_X_L_HC'.format(OF_id),data=X_all[:,:L_HC_offset])
            output_data.create_dataset('Fold_{}_X_R_HC'.format(OF_id),data=X_all[:,L_HC_offset:L_HC_offset+R_HC_offset])
            output_data.create_dataset('Fold_{}_X_R_CT'.format(OF_id),data=X_all[:,L_HC_offset+R_HC_offset:])
        else:
            output_data.create_dataset('Fold_{}_X'.format(OF_id),data=X[train])

        output_data.create_dataset('Fold_{}_y'.format(OF_id),data=y[train])
        output_data.close()

        # Save valid
        output_data = h5.File(out_file_valid, 'a')
        X_all = X[valid]
        if split_HC_CT:
            output_data.create_dataset('Fold_{}_X_L_HC'.format(OF_id),data=X_all[:,:L_HC_offset])
            output_data.create_dataset('Fold_{}_X_R_HC'.format(OF_id),data=X_all[:,L_HC_offset:L_HC_offset+R_HC_offset])
            output_data.create_dataset('Fold_{}_X_R_CT'.format(OF_id),data=X_all[:,L_HC_offset+R_HC_offset:])
        else:
            output_data.create_dataset('Fold_{}_X'.format(OF_id),data=X[valid])

        output_data.create_dataset('Fold_{}_y'.format(OF_id),data=y[valid])
        output_data.close()


In [8]:
outer_CV_fold_file = '/projects/nikhil/ADNI_prediction/input_datasets/inflated_datasets/HC_CT_inflated_CV_subsets.h5'
single_CV_fold_dir = '/projects/nikhil/ADNI_prediction/input_datasets/inflated_datasets/'

n_innerFolds = 5

outer_folds = np.arange(1,11,1)
for of in outer_folds:
    fold_name_prefix = 'Fold_{}_train'.format(str(of))
    single_CV_fold_file = '{}HC_CT_inflated_CV_OuterFold_{}_'.format(single_CV_fold_dir,str(of))
    generateInnerFold(outer_CV_fold_file, str(of), fold_name_prefix, n_innerFolds, single_CV_fold_file)


(36456, 22020)
(36456, 22020)
(36572, 22020)
(36572, 22020)
(36572, 22020)
(36572, 22020)
(36572, 22020)
(36572, 22020)
(36572, 22020)
(36572, 22020)
(36572, 22020)
(36572, 22020)
(36572, 22020)
(36572, 22020)
(36572, 22020)
(36572, 22020)
(36572, 22020)
(36572, 22020)
(36572, 22020)
(36572, 22020)


In [None]:
outer_CV_fold_file = '/projects/nikhil/ADNI_prediction/input_datasets/inflated_datasets/HC_CT_inflated_CV_subsets.h5'
X_name = fold_name_prefix + '_X'
input_data = h5.File(outer_CV_fold_file, 'r')
X = np.array(input_data[X_name][:])
print X.shape
#Remove ROIs from ignore list
HC_offset = 11427 + 10519
ignore_list_CT_idx = list(HC_offset + np.array([0,29,30,37,38]))
np.delete(X, np.s_[ignore_list_CT_idx], 1)
print X.shape

In [None]:
# Restructure OuterFold dataset h5 into two separate train and valid h5 files with identical dataset name
single_CV_fold_dir = '/projects/nikhil/ADNI_prediction/input_datasets/'

out_file_train = single_CV_fold_dir + 'HC_CT_inflated_CV_OuterFolds_train.h5'
out_file_valid = single_CV_fold_dir + 'HC_CT_inflated_CV_OuterFolds_valid.h5'

input_data = h5.File(single_CV_fold_dir + 'HC_CT_inflated_CV_subsets.h5', 'r')
out_data_train = h5.File(out_file_train, 'a')
out_data_valid = h5.File(out_file_valid, 'a')
                         
for OF_id in np.arange(1,11,1):
    dataset_train_X = 'Fold_{}_train_X'.format(OF_id)
    dataset_train_y = 'Fold_{}_train_y'.format(OF_id)
    dataset_valid_X = 'Fold_{}_valid_X'.format(OF_id)
    dataset_valid_y = 'Fold_{}_valid_y'.format(OF_id)
                     
    data_train_X = input_data[dataset_train_X][:]
    data_train_y = input_data[dataset_train_y][:]
    data_valid_X = input_data[dataset_valid_X][:]
    data_valid_y = input_data[dataset_valid_y][:]
                
    out_data_train.create_dataset('Fold_{}_X'.format(OF_id),data=data_train_X)    
    out_data_train.create_dataset('Fold_{}_y'.format(OF_id),data=data_train_y)    
    out_data_valid.create_dataset('Fold_{}_X'.format(OF_id),data=data_valid_X)    
    out_data_valid.create_dataset('Fold_{}_y'.format(OF_id),data=data_valid_y)    
    
input_data.close()
out_data_train.close()
out_data_valid.close()

In [None]:
sub_CT_dict = pickle.load( open(sub_CT_file, "rb" ) )

In [None]:
ordered_roi_list = sub_CT_dict['40817'][0].keys()
print 'total number of ROIs {}'.format(len(ordered_roi_list))
ignore_roi_list = [0,29,30,39,40]

for roi in ignore_roi_list:
    print 'Ignore ROI index {}'.format(ordered_roi_list.index(roi))


#for roi in ignore_roi_list:
#    ordered_roi_list.remove(roi)
#print ""
#print ordered_roi_list



In [None]:
arr = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])
print arr.shape
l = [1,2]
arr_trunc = np.delete(arr, np.s_[l], 1)
print arr_trunc.shape

In [None]:
list(10 + np.array([0,29,30,37,38]))