In [1]:
# Basic Imports
import numpy as np
import h5py as h5
#from sklearn.externals import joblib
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from sklearn.cross_validation import KFold
import pickle
import re
import random
import collections
import ipyparallel as ipp
from functools import partial
import collections
import tables as tb

In [2]:
#datasets

#input data (pickled tables)
master_dataframe = ('/projects/nikhil/ADNI_prediction/input_datasets/'
                    'master_fused.pkl')
train_val_file = ('/projects/nikhil/ADNI_prediction/input_datasets/'
                  'cli_ct_seg_fused_train_plus_val.pkl')
test_file = ('/projects/nikhil/ADNI_prediction/input_datasets/'
             'cli_ct_seg_fused_test.pkl')

#Candidate label dictionaries (left / right), train+validation subjects
sub_HC_vol_left_file = ('/projects/nikhil/ADNI_prediction/input_datasets/HC/'
                        'subject_HC_vol_dictionary_train_val_left.pkl')
sub_HC_vol_right_file = ('/projects/nikhil/ADNI_prediction/input_datasets/HC/'
                         'subject_HC_vol_dictionary_train_val_right.pkl')

#Per-subject ROI -> CT values dictionary
sub_CT_file = ('/projects/nikhil/ADNI_prediction/input_datasets/CT/subject_roi_ct_data/'
               'ADNI1_subject_ROI_CT_dict_JDV.pkl')

#k-fold membership (from a saved file); the UID-based file replaced the index-based one
#kf_file = "/projects/nikhil/ADNI_prediction/input_datasets/cli_ct_train_valid_KFold_idx.pkl"
kf_file = ("/projects/nikhil/ADNI_prediction/input_datasets/"
           "cli_ct_train_valid_KFold_UIDs.pkl")

#hdf file that receives the per-fold datasets
out_file = ('/projects/nikhil/ADNI_prediction/input_datasets/'
            'HC_CT_fused_CV_subsets_JDV.h5')


In [3]:
#Grab CV data with specific feature columns (independent vars) and specific clinical scale (dependent var)
def load_CV_data(sub_HC_L_dict, sub_HC_R_dict, sub_CT_dict, in_file, clinical_scale, kf_file, hdf_file,
                 drawSamples=False, computeTrainingFolds=False, computeValidFolds=True, runParallel=False):
    """Build per-fold feature/target arrays and store them in an HDF5 file.

    Steps:
      1. Read the pickled table `in_file` and keep rows whose `clinical_scale`
         value is finite; map UID -> score, where UID = PTID + '_' + IID
         (e.g. 036_S_0976_I65091).
      2. Read the fold membership from `kf_file` (keys 'train_UIDs' and
         'valid_UIDs', each a list of per-fold UID lists).
      3. For every UID of a fold call `inflate_subject_samples` and stack the
         per-subject results.
      4. Write `Fold_<k>_train_X/_y` and/or `Fold_<k>_valid_X/_y` into
         `hdf_file` (opened in append mode).

    Parameters
    ----------
    sub_HC_L_dict, sub_HC_R_dict, sub_CT_dict : dict
        Passed through to `inflate_subject_samples`. `sub_CT_dict` must hold
        the reference key '40817', whose ROI keys define the column order.
    in_file : str
        Pickled table with columns PTID, IID and `clinical_scale`.
    clinical_scale : str
        Column used as the dependent variable.
    kf_file : str
        Pickle holding the fold UID lists.
    hdf_file : str
        Output HDF5 path. `hdf_file + '.pkl'` receives the per-fold
        {uid: 1} mappings when training folds are computed.
    drawSamples : bool
        Forwarded to `inflate_subject_samples`; False collects one point
        estimate per subject.
    computeTrainingFolds, computeValidFolds : bool
        Select which subsets are generated.
    runParallel : bool
        If True, map `inflate_fold` over the training folds on an ipyparallel
        cluster and return the list of results; nothing is written to disk.

    Returns
    -------
    list or None
        The ipyparallel map result when `runParallel` is True, otherwise None.

    Notes
    -----
    * Subjects for which `inflate_subject_samples` reports missing data (empty
      `sub_X`) are skipped instead of being stacked as empty entries.
    * Datasets are created with `create_dataset`, so re-running against a file
      that already holds the same dataset names fails inside h5py.
    """
    csv_data = pd.read_pickle(in_file)

    # Pick any UID to generate an roi list common across all subjects, so the
    # dictionary -> array conversion uses one consistent column order.
    # NOTE(review): '40817' is a hard-coded reference image id.
    ordered_roi_list = list(sub_CT_dict['40817'][0].keys())

    # Keep only rows with a finite clinical score. The explicit copy makes the
    # UID column assignment act on an independent frame rather than on a slice.
    clinical_scores = csv_data[['PTID','IID',clinical_scale]]
    clinical_scores = clinical_scores[np.isfinite(clinical_scores[clinical_scale])].copy()
    clinical_scores['UID'] = clinical_scores['PTID'] + '_' + clinical_scores['IID']
    sub_clinical_scores_dict = dict(zip(clinical_scores['UID'],clinical_scores[clinical_scale]))

    # K-folds
    # NOTE(review): pickle.load executes arbitrary code; only use trusted files.
    with open(kf_file, "rb") as kf_handle:
        kf = pickle.load(kf_handle)
    train_fold_list = kf['train_UIDs']
    val_fold_list = kf['valid_UIDs']

    if runParallel: #parallelized version:
        rc = ipp.Client()
        dview = rc[:]
        print(dview)
        # The engines need both functions in their namespace.
        dview.push(dict(inflate_fold = inflate_fold))
        dview.push(dict(inflate_subject_samples = inflate_subject_samples))
        mapfunc = partial(inflate_fold, sub_HC_L_dict=sub_HC_L_dict, sub_HC_R_dict=sub_HC_R_dict, sub_CT_dict=sub_CT_dict,
                  ordered_roi_list=ordered_roi_list, sub_clinical_scores_dict=sub_clinical_scores_dict)
        return dview.map_sync(mapfunc, train_fold_list)

    subject_args = (sub_HC_L_dict, sub_HC_R_dict, sub_CT_dict, ordered_roi_list,
                    sub_clinical_scores_dict, drawSamples)
    uid_sampx_dict_list = []

    if computeTrainingFolds:
        for fold, train in enumerate(train_fold_list, 1):
            print('Staring fold # {}'.format(fold))
            print('Starting train subset')
            X_train_stack, y_train_stack, kept_uids = _collect_fold_arrays(train, *subject_args)
            _write_fold_datasets(hdf_file, fold, 'train', X_train_stack, y_train_stack)
            print('Ending train subset')
            # One entry per subject that actually contributed rows to the fold.
            uid_sampx_dict_list.append(collections.OrderedDict((uid, 1) for uid in kept_uids))

    if computeValidFolds:
        # This is optional - validation by default should be on "fused features"
        for fold, valid in enumerate(val_fold_list, 1):
            print('Starting valid subset')
            X_valid_stack, y_valid_stack, _ = _collect_fold_arrays(valid, *subject_args)
            print('Ending valid subset')
            _write_fold_datasets(hdf_file, fold, 'valid', X_valid_stack, y_valid_stack)

    # Save uid--> sampx list of dictionaries per fold. Only written when the
    # training folds were computed, so a validation-only run does not replace
    # an existing mapping with an empty list.
    if computeTrainingFolds:
        with open(hdf_file+'.pkl', 'wb') as f:
            pickle.dump(uid_sampx_dict_list, f)

    print('All folds done!')


def _collect_fold_arrays(fold_uids, sub_HC_L_dict, sub_HC_R_dict, sub_CT_dict, ordered_roi_list,
                         sub_clinical_scores_dict, drawSamples):
    """Stack the per-subject features/targets of one fold; returns (X, y, kept_uids)."""
    X_parts = []
    y_parts = []
    kept_uids = []
    for uid in fold_uids:
        result = inflate_subject_samples(uid, sub_HC_L_dict, sub_HC_R_dict, sub_CT_dict,
                                         ordered_roi_list, sub_clinical_scores_dict, drawSamples)
        sub_X = result['sub_X']
        if len(sub_X) == 0:
            # inflate_subject_samples returns empty lists (after printing which
            # entry is missing) when a subject lacks HC or CT data.
            continue
        X_parts.append(sub_X)
        y_parts.append(result['sub_y'])
        kept_uids.append(uid)

    if drawSamples:
        # Several rows per subject: concatenate along the sample axis.
        X_stack = np.vstack(X_parts)
        y_stack = np.concatenate(y_parts)
    else:
        # One point estimate per subject.
        X_stack = np.squeeze(np.array(X_parts))
        y_stack = np.array(y_parts)
    return X_stack, y_stack, kept_uids


def _write_fold_datasets(hdf_file, fold, subset, X, y):
    """Create Fold_<fold>_<subset>_X and _y in `hdf_file`; the file is closed even on error."""
    with h5.File(hdf_file, 'a') as input_data:
        input_data.create_dataset('Fold_{}_{}_X'.format(fold, subset),data=X)
        input_data.create_dataset('Fold_{}_{}_y'.format(fold, subset),data=y)


def inflate_subject_samples(uid, sub_HC_L_dict, sub_HC_R_dict, sub_CT_dict, ordered_roi_list, sub_cScores_dict, drawSamples):
    """Build the feature rows and target value(s) for a single subject.

    Parameters
    ----------
    uid : str
        UID = PTID + IID (e.g. 036_S_0976_I65091). Surrounding whitespace is
        stripped. The PTID is the part matching '<digits>_S_<digits>' and the
        IID is the run of digits following the first 'I'.
    sub_HC_L_dict, sub_HC_R_dict : dict
        PTID -> sequence whose first element holds the left / right HC
        candidate data.
    sub_CT_dict : dict
        IID -> sequence whose first element maps ROI -> CT values.
    ordered_roi_list : sequence
        ROI keys in the column order to use for the CT features.
    sub_cScores_dict : dict
        UID -> clinical score.
    drawSamples : bool
        True  : draw `min_sampx` random rows/values per modality, where
                `min_sampx` = min(#left candidates, #right candidates, 14).
        False : one point estimate (mode of the HC candidates, mean CT per ROI).

    Returns
    -------
    dict
        {'sub_X': ..., 'sub_y': ...}. Both are empty lists when the subject is
        missing from any of the three input dictionaries. Otherwise `sub_X` is
        the horizontal stack [left HC | right HC | CT] and `sub_y` is the score
        (repeated `min_sampx` times when `drawSamples` is True).

    Raises
    ------
    ValueError
        If `uid` cannot be parsed, or (when sampling) an ROI holds fewer CT
        values than the required minimum.
    KeyError
        If the subject has data but no entry in `sub_cScores_dict`.
    """
    # Imports stay function-local because this function is pushed to
    # ipyparallel engines (see dview.push in load_CV_data); only the modules
    # actually used are imported so an engine does not need the rest.
    import collections
    import random
    import re

    import numpy as np
    from scipy import stats

    #UID = PTID + IID (PTID:[HC_vols], IID:{ROI:CT})
    uid = uid.strip()
    ptid_match = re.search(r'\d*(_S_)\d*', uid)
    iid_match = re.search(r'(?<=I)\d*', uid)
    if ptid_match is None or iid_match is None:
        raise ValueError('cannot parse PTID / IID from uid: {!r}'.format(uid))
    ptid = ptid_match.group(0).strip()
    iid = iid_match.group(0).strip()

    #min_CT_sampx = 132
    min_CT_sampx = 14
    MC = False  # True: average random CT subsets instead of drawing raw values

    # Report every missing modality, not just the first one.
    missing_data = False
    if ptid in sub_HC_L_dict:
        sub_HC_L = np.asarray(sub_HC_L_dict[ptid][0])
    else:
        print("missing HC_L entry for: {}".format(uid))
        missing_data = True

    if ptid in sub_HC_R_dict:
        sub_HC_R = np.asarray(sub_HC_R_dict[ptid][0])
    else:
        print("missing HC_R entry for: {}".format(uid))
        missing_data = True

    if iid in sub_CT_dict:
        sub_CT_all_rois = sub_CT_dict[iid][0]
    else:
        print("missing CT entry for: {}".format(uid))
        missing_data = True

    if missing_data:
        return {'sub_X': [], 'sub_y': []}

    sub_CScore = sub_cScores_dict[uid]
    sub_CT_sampx_dict = collections.OrderedDict()

    if drawSamples:
        min_sampx = int(min(len(sub_HC_L), len(sub_HC_R), min_CT_sampx))

        #select samples (list() gives random.sample a plain sequence of rows)
        sub_HC_L_sampx = random.sample(list(sub_HC_L), min_sampx)
        sub_HC_R_sampx = random.sample(list(sub_HC_R), min_sampx)

        #Draw equal number of samples per roi
        for roi in ordered_roi_list:
            sub_CT_roi = np.squeeze(sub_CT_all_rois[roi])
            if len(sub_CT_roi) < min_CT_sampx:
                print("Wrong value for the min_CT_sampx")
                # Fail here with context instead of at the array conversion below.
                raise ValueError('ROI {!r} of {} has {} CT values, fewer than min_CT_sampx={}'.format(
                    roi, uid, len(sub_CT_roi), min_CT_sampx))

            roi_pool = list(sub_CT_roi)
            #Do you want averaged out samples or true thickness samples
            if MC:
                MC_mult = int(0.5*len(sub_CT_roi)) #Pool for averaged out samples
                #Average out individual samples
                sub_CT_sampx_dict[roi] = [np.mean(random.sample(roi_pool, MC_mult))
                                          for _ in range(min_sampx)]
            else:
                # Draw true samples
                sub_CT_sampx_dict[roi] = random.sample(roi_pool, min_sampx)

        #Clinical Score
        sub_y = np.tile(sub_CScore, min_sampx)

    # Or just collect point estimates (fused labels + mean thickness values)
    else:
        min_sampx = 1
        # stats.mode returns the mode with or without a leading length-1 axis
        # depending on the SciPy version; atleast_2d gives one row either way
        # so the hstack below always sees 2-D inputs.
        sub_HC_L_sampx = np.atleast_2d(stats.mode(sub_HC_L)[0])
        sub_HC_R_sampx = np.atleast_2d(stats.mode(sub_HC_R)[0])

        for roi in ordered_roi_list:
            sub_CT_sampx_dict[roi] = np.mean(np.squeeze(sub_CT_all_rois[roi]))
        #Clinical Score
        sub_y = sub_CScore

    # Convert samples or a mean vector to a numpy array (one column per ROI)
    sub_CT_sampx = np.zeros((min_sampx, len(ordered_roi_list)))
    for col, roi in enumerate(ordered_roi_list):
        sub_CT_sampx[:,col] = np.asarray(sub_CT_sampx_dict[roi],dtype=float)

    sub_X = np.hstack((sub_HC_L_sampx,sub_HC_R_sampx,sub_CT_sampx))

    return {'sub_X': sub_X, 'sub_y':sub_y}

#If you want to inflate only the training subset (this is used for the parallel implementation)

def inflate_fold(train,sub_HC_L_dict, sub_HC_R_dict, sub_CT_dict, ordered_roi_list, sub_clinical_scores_dict, drawSamples=True):
    """Stack the features/targets of every subject in one training fold.

    Parameters
    ----------
    train : iterable of str
        UIDs belonging to the fold.
    sub_HC_L_dict, sub_HC_R_dict, sub_CT_dict, ordered_roi_list, sub_clinical_scores_dict
        Forwarded unchanged to `inflate_subject_samples`.
    drawSamples : bool, optional
        Forwarded to `inflate_subject_samples` (default True).

    Returns
    -------
    dict
        {'X': 2-D array of stacked feature rows, 'y': 1-D array of targets}.

    Raises
    ------
    ValueError
        If no subject in the fold has complete data.
    """
    # Function-local import: this function is pushed to ipyparallel engines.
    import numpy as np

    print('Starting train subset')
    X_parts = []
    y_parts = []
    for uid in train:
        # Use the fold's own UID; the previous lookup indexed an unrelated
        # global list by position.
        result = inflate_subject_samples(uid, sub_HC_L_dict, sub_HC_R_dict, sub_CT_dict,
                                         ordered_roi_list, sub_clinical_scores_dict, drawSamples)
        sub_X = result['sub_X']
        if len(sub_X) == 0:
            # Subject lacks HC or CT data (already reported by the callee).
            continue
        X_parts.append(sub_X)
        # atleast_1d lets a scalar point-estimate score be concatenated too.
        y_parts.append(np.atleast_1d(result['sub_y']))

    if not X_parts:
        raise ValueError('no subject in the fold has complete data')

    X_train_stack = np.vstack(X_parts)
    y_train_stack = np.concatenate(y_parts)

    print('Ending train subset')
    return {'X': X_train_stack, 'y': y_train_stack}

In [None]:
# Scratch experiment: compare the mean of the full draw with means of random
# subsamples of different sizes.
# NOTE(review): no seed is set, so the printed values differ on every run.
tmp = np.random.randn(100)
print np.mean(tmp)
sampx1=[]
sampx2=[]
# Two subsample means, each over 75 of the 100 values (drawn without replacement).
for i in np.arange(2):
    sampx1.append(np.mean(random.sample(tmp, 75)))

# Fifty subsample means, each over only 2 values.
for i in np.arange(50):
    sampx2.append(np.mean(random.sample(tmp, 2)))

print (np.array(sampx1))
print np.mean(np.array(sampx2))

# Bare expression: int() truncates toward zero, so this evaluates to 10.
int(10.7)

In [4]:
#kf = pickle.load( open(kf_file, "rb" ) )
#master_csv = pickle.load( open(master_dataframe, "rb" ) )
#train_val_data = pickle.load( open(train_val_file, "rb" ) )
#test_val_data = pickle.load( open(test_file, "rb" ) )

# Load the subject dictionaries; each file is closed as soon as it has been read.
# NOTE(review): pickle.load executes arbitrary code; only use trusted files.
with open(sub_HC_vol_left_file, "rb") as pkl_handle:
    sub_HC_L_dict = pickle.load(pkl_handle)
with open(sub_HC_vol_right_file, "rb") as pkl_handle:
    sub_HC_R_dict = pickle.load(pkl_handle)

with open(sub_CT_file, "rb") as pkl_handle:
    sub_CT_dict = pickle.load(pkl_handle)
#train_val_data = pickle.load( open(train_val_file, "rb" ) )

In [None]:
# ROI keys of one reference image, minus the ROI ids that should be ignored.
# list(...) makes an explicit, mutable copy so .remove() never depends on what
# dict.keys() happens to return.
ordered_roi_list = list(sub_CT_dict['40817'][0].keys())
ignore_roi_list = [0,29,30,39,40]
for roi in ignore_roi_list:
    # .remove raises ValueError if an ignored id is not present.
    ordered_roi_list.remove(roi)

print(ordered_roi_list)

In [59]:
# Generate the fold datasets for the chosen clinical scale.
clinical_scale = 'ADAS13'
CV_inflated_data = load_CV_data(
    sub_HC_L_dict=sub_HC_L_dict,
    sub_HC_R_dict=sub_HC_R_dict,
    sub_CT_dict=sub_CT_dict,
    in_file=train_val_file,
    clinical_scale=clinical_scale,
    kf_file=kf_file,
    hdf_file=out_file,
)


Starting valid subset
Ending valid subset
Starting valid subset
Ending valid subset
Starting valid subset
Ending valid subset
Starting valid subset
Ending valid subset
Starting valid subset
Ending valid subset
Starting valid subset
Ending valid subset
Starting valid subset
Ending valid subset
Starting valid subset
Ending valid subset
Starting valid subset
Ending valid subset
Starting valid subset
Ending valid subset
All folds done!


In [56]:
# Inspect the saved fold membership; the file is closed right after reading.
with open(kf_file, "rb") as kf_handle:
    kf = pickle.load(kf_handle)
train_fold_list= kf['train_UIDs']
val_fold_list= kf['valid_UIDs']

print(val_fold_list[0])

['002_S_0729_I118682', '002_S_0955_I118689', '005_S_0221_I72128', '005_S_0610_I32667', '005_S_0814_I74591', '007_S_1339_I56319', '012_S_0803_I118716', '014_S_0519_I39647', '014_S_0520_I39660', '021_S_0642_I33452', '022_S_0096_I59456', '022_S_0750_I59552', '027_S_1385_I47574', '029_S_0845_I64867', '031_S_0321_I65383', '031_S_1209_I67441', '032_S_0677_I119102', '032_S_0718_I119105', '033_S_0567_I119126', '033_S_0724_I119128', '036_S_0760_I38652', '037_S_0182_I65134', '062_S_0535_I50426', '062_S_0768_I50506', '067_S_0059_I119188', '067_S_0076_I119190', '067_S_0110_I119194', '067_S_0336_I119202', '068_S_0473_I140334', '072_S_1380_I119226', '073_S_0909_I119235', '094_S_1015_I40763', '098_S_0160_I65739', '098_S_0172_I65757', '098_S_0288_I323256', '098_S_0884_I56026', '099_S_0040_I34607', '100_S_0006_I33025', '100_S_0035_I33074', '100_S_0069_I33105', '100_S_0892_I66120', '100_S_1286_I66144', '116_S_0370_I59777', '116_S_1249_I59647', '128_S_0528_I119401', '128_S_0740_I119406', '129_S_1246_I697

In [None]:
print len(sub_HC_L_dict.keys()), len(sub_HC_R_dict.keys()), len(sub_CT_dict)
clinical_scale = 'ADAS13'
subject_ids = sub_HC_L_dict.keys()
csv_data = pd.read_pickle(master_dataframe)
clinical_scores = csv_data[csv_data.PTID.isin(subject_ids)][['UID',clinical_scale]]
#remove NANs
clinical_scores = clinical_scores[np.isfinite(clinical_scores[clinical_scale])]
sub_clinical_scores_dict = dict(zip(clinical_scores['UID'],clinical_scores[clinical_scale]))  
subject_uids = sub_clinical_scores_dict.keys()

print len(subject_uids)

In [None]:
# Per-subject, per-ROI summaries of the CT values: mean, standard deviation and
# one randomly drawn value.
# `train_val_data` is not defined by any active cell (its pickle loads are
# commented out), so read it here the same way load_CV_data reads its table.
train_val_data = pd.read_pickle(train_val_file)
sub_id_list = list(train_val_data.IID)
iid_re = re.compile(r'(?<=I)\d*')  # digits following the 'I' of an image id
ordered_roi_list = sub_CT_dict['40817'][0].keys()
sub_roi_mat_mean = np.zeros((len(sub_id_list), len(ordered_roi_list)))
sub_roi_mat_std = np.zeros((len(sub_id_list), len(ordered_roi_list)))
sub_roi_mat_sampx = np.zeros((len(sub_id_list), len(ordered_roi_list)))

for s,sub in enumerate(sub_id_list):
    iid = re.search(iid_re, sub).group(0).strip()
    sub_rois = sub_CT_dict[iid][0]  # ROI -> CT values of this image
    for r,roi in enumerate(ordered_roi_list):
        sub_roi_vals = np.squeeze(sub_rois[roi])
        sub_roi_mat_mean[s,r] = np.mean(sub_roi_vals)
        sub_roi_mat_std[s,r] = np.std(sub_roi_vals)
        # list() gives random.sample a plain sequence to draw from.
        sub_roi_mat_sampx[s,r] = random.sample(list(sub_roi_vals), 1)[0]

In [None]:
# One histogram per ROI of (single random CT value - mean CT value) across subjects.
plt.figure()
for r,roi in enumerate(ordered_roi_list):
    plt.subplot(8,10,r+1)  # 8x10 grid holds at most 80 ROIs
    # The mean matrix is named sub_roi_mat_mean; `sub_roi_mat` was never defined.
    plt.hist(sub_roi_mat_sampx[:,r]-sub_roi_mat_mean[:,r])
plt.show()

In [6]:
def generateInnerFold(OF_id, fold_name_prefix, out_file_prefix, in_file, n_innerFolds,
                      random_state=None, L_HC_offset=11427, R_HC_offset=10519):
    """Split one outer-fold dataset into shuffled inner folds and save them.

    Reads `<fold_name_prefix>_X` and `<fold_name_prefix>_y` from `in_file`,
    partitions the rows with a shuffled K-fold, and for each inner fold k
    appends to
        out_file_prefix + 'train_InnerFold_<k>_partition_ROI_74.h5'
        out_file_prefix + 'valid_InnerFold_<k>_partition_ROI_74.h5'
    the datasets Fold_<OF_id>_X_L_HC, Fold_<OF_id>_X_R_HC, Fold_<OF_id>_X_R_CT
    and Fold_<OF_id>_y.

    Parameters
    ----------
    OF_id : int or str
        Outer fold id, used in the dataset names.
    fold_name_prefix : str
        Dataset name prefix inside `in_file` (e.g. 'Fold_1_train').
    out_file_prefix : str
        Path prefix of the output files.
    in_file : str
        HDF5 file holding the outer-fold datasets.
    n_innerFolds : int
        Number of inner folds.
    random_state : optional
        Passed to KFold; the default None keeps the shuffle unseeded.
    L_HC_offset, R_HC_offset : int
        Number of leading left-HC columns and of the right-HC columns that
        follow them; the remaining columns are written as the CT block.

    Notes
    -----
    Output files are opened in append mode and datasets are created with
    `create_dataset`, so re-running against existing files with the same
    dataset names fails inside h5py.
    """
    # Function-local imports: this function is pushed to ipyparallel engines.
    import h5py as h5
    from sklearn.cross_validation import KFold

    # Load data
    with h5.File(in_file, 'r') as input_data:
        X_raw = input_data[fold_name_prefix + '_X'][:]
        y = input_data[fold_name_prefix + '_y'][:]

    print(X_raw.shape)
    # ROI columns are currently kept as-is (removal of ignored ROIs is disabled):
    #ignore_list_CT_idx = list(L_HC_offset + R_HC_offset + np.array([0,29,30,37,38]))
    #X = np.delete(X_raw, np.s_[ignore_list_CT_idx], 1)
    X = X_raw
    print(X.shape)

    #Sample / Shuffle Data
    kf = KFold(len(y), n_folds=n_innerFolds, shuffle=True, random_state=random_state)

    split_HC_CT = True #Split the HC and CT data layer to allow partitioned model
    HC_end = L_HC_offset + R_HC_offset

    def _save_subset(path, rows):
        # Nested on purpose: a module-level helper would not exist on the engines.
        X_rows = X[rows]
        with h5.File(path, 'a') as output_data:
            if split_HC_CT:
                output_data.create_dataset('Fold_{}_X_L_HC'.format(OF_id),data=X_rows[:,:L_HC_offset])
                output_data.create_dataset('Fold_{}_X_R_HC'.format(OF_id),data=X_rows[:,L_HC_offset:HC_end])
                # 'X_R_CT' is the established dataset name for the CT block.
                output_data.create_dataset('Fold_{}_X_R_CT'.format(OF_id),data=X_rows[:,HC_end:])
            else:
                output_data.create_dataset('Fold_{}_X'.format(OF_id),data=X_rows)
            output_data.create_dataset('Fold_{}_y'.format(OF_id),data=y[rows])

    #Save Data
    for k, (train, valid) in enumerate(kf, 1):
        _save_subset(out_file_prefix + 'train_InnerFold_{}_partition_ROI_74.h5'.format(k), train)
        _save_subset(out_file_prefix + 'valid_InnerFold_{}_partition_ROI_74.h5'.format(k), valid)


In [8]:
# Generate the inner folds for each of the 10 outer training folds.
outer_CV_fold_file = '/projects/nikhil/ADNI_prediction/input_datasets/inflated_datasets_jdv/HC_CT_inflated_CV_subsets_JDV.h5'
single_CV_fold_dir = '/projects/nikhil/ADNI_prediction/input_datasets/inflated_datasets_jdv/'

n_innerFolds = 5

outer_folds = np.arange(1,11,1)
fold_name_prefix_list = []
single_CV_fold_file_list = []
for of in outer_folds:
    fold_name_prefix_list.append('Fold_{}_train'.format(str(of)))
    single_CV_fold_file_list.append('{}HC_CT_inflated_CV_OuterFold_{}_'.format(single_CV_fold_dir,str(of)))

runParallel = True
if runParallel: #parallelized version:
    rc = ipp.Client()
    #rc.block = False
    dview = rc[:]
    print(dview)
    dview.push(dict(generateInnerFold = generateInnerFold))
    mapfunc = partial(generateInnerFold, in_file=outer_CV_fold_file, n_innerFolds=n_innerFolds)

    # The three sequences are consumed in lockstep as
    # (OF_id, fold_name_prefix, out_file_prefix).
    parallel_result = dview.map_sync(mapfunc, outer_folds, fold_name_prefix_list, single_CV_fold_file_list)
else: #serial fallback, same arguments as the parallel map
    for of, fold_name_prefix, single_CV_fold_file in zip(outer_folds, fold_name_prefix_list, single_CV_fold_file_list):
        generateInnerFold(of, fold_name_prefix, single_CV_fold_file,
                          in_file=outer_CV_fold_file, n_innerFolds=n_innerFolds)

<DirectView [0, 1, 2, 3, 4]>


In [10]:
# Use this data for computing subject wise performance during outerloop cross-validation + held-out testset
baseline_dir = '/projects/nikhil/ADNI_prediction/input_datasets/'
in_file = 'HC_CT_fused_CV_subsets_JDV.h5'

CV = True
L_HC_offset=11427
R_HC_offset=10519

subset = 'valid'
# NOTE(review): CV_partition_file is never used below; the partitioned
# datasets are appended to the input file itself. Their names carry no subset
# tag, so running this cell for a second subset would collide with the first.
# Confirm whether the output was meant to go to CV_partition_file.
CV_partition_file = 'HC_CT_fused_CV_OuterFolds_{}_partition.h5'.format(subset)
if CV:
    # The context manager closes the file even if a dataset cannot be created.
    with h5.File(baseline_dir + in_file,'a') as CV_fused_data:
        for f in np.arange(10):
            X = CV_fused_data['Fold_{}_{}_X'.format(f+1,subset)]
            y = CV_fused_data['Fold_{}_{}_y'.format(f+1,subset)]
            # Column layout: [left HC | right HC | CT]
            CV_fused_data.create_dataset('Fold_{}_X_L_HC'.format(f+1),data=X[:,:L_HC_offset])
            CV_fused_data.create_dataset('Fold_{}_X_R_HC'.format(f+1),data=X[:,L_HC_offset:L_HC_offset+R_HC_offset])
            CV_fused_data.create_dataset('Fold_{}_X_R_CT'.format(f+1),data=X[:,L_HC_offset+R_HC_offset:])#Typo : R_CT
            CV_fused_data.create_dataset('Fold_{}_y'.format(f+1),data=y)

In [None]:
# Restructure OuterFold dataset h5 into two separate train and valid h5 files with identical dataset name
single_CV_fold_dir = '/projects/nikhil/ADNI_prediction/input_datasets/'

out_file_train = single_CV_fold_dir + 'HC_CT_inflated_CV_OuterFolds_train.h5'
out_file_valid = single_CV_fold_dir + 'HC_CT_inflated_CV_OuterFolds_valid.h5'

# All three files are closed when the block exits, including on error.
with h5.File(single_CV_fold_dir + 'HC_CT_inflated_CV_subsets.h5', 'r') as input_data:
    with h5.File(out_file_train, 'a') as out_data_train:
        with h5.File(out_file_valid, 'a') as out_data_valid:
            for OF_id in np.arange(1,11,1):
                dataset_train_X = 'Fold_{}_train_X'.format(OF_id)
                dataset_train_y = 'Fold_{}_train_y'.format(OF_id)
                dataset_valid_X = 'Fold_{}_valid_X'.format(OF_id)
                dataset_valid_y = 'Fold_{}_valid_y'.format(OF_id)

                # [:] reads each dataset fully into memory before it is copied.
                data_train_X = input_data[dataset_train_X][:]
                data_train_y = input_data[dataset_train_y][:]
                data_valid_X = input_data[dataset_valid_X][:]
                data_valid_y = input_data[dataset_valid_y][:]

                # The subset tag moves from the dataset name to the file name.
                out_data_train.create_dataset('Fold_{}_X'.format(OF_id),data=data_train_X)
                out_data_train.create_dataset('Fold_{}_y'.format(OF_id),data=data_train_y)
                out_data_valid.create_dataset('Fold_{}_X'.format(OF_id),data=data_valid_X)
                out_data_valid.create_dataset('Fold_{}_y'.format(OF_id),data=data_valid_y)

In [None]:
# Reload the ROI -> CT dictionary; the file is closed right after reading.
with open(sub_CT_file, "rb") as pkl_handle:
    sub_CT_dict = pickle.load(pkl_handle)

In [None]:
#Generate map between AAL atlas ROI name and feature column index
AAL_roi_map_file = '/projects/nikhil/ADNI_prediction/input_datasets/CT/AAL_ROI_IDx'
AAL_roi_Name_featID_file = '/projects/nikhil/ADNI_prediction/input_datasets/CT/AAL_ROI_Name_featIDx.pkl'

# list(...) makes an explicit, mutable copy of the ROI keys.
ordered_roi_list = list(sub_CT_dict['40817'][0].keys())

ignore_roi_list = [0,29,30,39,40]
for roi in ignore_roi_list:
    ordered_roi_list.remove(roi)

# ROI id -> position of that ROI in the feature columns
ordered_roi_idx_dict = {}
for i,idx in enumerate(ordered_roi_list):
    ordered_roi_idx_dict[idx]=i

print(ordered_roi_idx_dict)
# The atlas table must be read before it is used below; sep=r'\s+' is the
# documented equivalent of delim_whitespace=True.
data = pd.read_csv(AAL_roi_map_file, sep=r'\s+')
print(data.columns)

data['feature_id'] = data['Ind'].map(ordered_roi_idx_dict)
print(data)
# NOTE(review): atlas rows whose 'Ind' is not in ordered_roi_idx_dict get a NaN
# feature_id and end up as NaN keys below; enable the filter to drop them.
#data = data[~np.isnan(data['feature_id'])]
#print data

roi_name_featIDx_Dict = data.set_index('feature_id')['Name'].to_dict()
od = collections.OrderedDict(sorted(roi_name_featIDx_Dict.items())) #order by feature column index
print(od)

#f = open(AAL_roi_Name_featID_file, 'wb')
#pickle.dump(od, f)
#f.close()

#print 'total number of ROIs {}'.format(len(ordered_roi_list))
#ignore_roi_list = [0,29,30,39,40]

#for roi in ignore_roi_list:
#    print 'Ignore ROI index {}'.format(ordered_roi_list.index(roi))




In [None]:
# Scratch check: np.delete along axis 1 drops the listed columns.
arr = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])
print(arr.shape)
drop_cols = [1,2]
# np.s_[some_list] just returns the list, so the index list is passed directly.
arr_trunc = np.delete(arr, drop_cols, axis=1)
print(arr_trunc.shape)

In [None]:
#Create inflated HC total vol dataset (no voxel wise features)
def load_data(data_path, input_node):
    """Return the full contents of node '/<input_node>' from the PyTables file `data_path`.

    The file is opened read-only and is closed even if the node lookup or the
    read raises.
    """
    data = tb.open_file(data_path, 'r')
    try:
        return data.get_node('/' + input_node)[:]
    finally:
        data.close()


baseline_dir= '/projects/nikhil/ADNI_prediction/input_datasets/'
# NOTE(review): in_file / out_file rebind names that other cells also use
# (out_file is the HDF5 target passed to load_CV_data); re-running those cells
# after this one picks up the values below.
in_file = 'HC_CT_inflated_CV_subsets_MC.h5'
out_file = 'total_HC_vol_CT_inflated_CV_subsets_ROI_74_MC.h5'

L_HC_offset=11427
R_HC_offset=10519

ignore_cols = False
for lid in np.arange(1,11,1):
    print('Starting Fold {}'.format(lid))
    out_train_X_raw = load_data(baseline_dir + in_file,'Fold_{}_train_X'.format(lid))
    out_train_y = load_data(baseline_dir + in_file,'Fold_{}_train_y'.format(lid))

    #out_valid_X_raw = load_data(baseline_dir + in_file,'Fold_{}_valid_X'.format(lid))
    #out_valid_y = load_data(baseline_dir + in_file,'Fold_{}_valid_y'.format(lid))

    #if you want to remove some CT columns (74 conundrum)
    if ignore_cols:
        ignore_list_CT_idx = list(L_HC_offset + R_HC_offset + np.array([0,29,30,37,38]))
        out_train_X = np.delete(out_train_X_raw, np.s_[ignore_list_CT_idx], 1)
        #out_valid_X = np.delete(out_valid_X_raw, np.s_[ignore_list_CT_idx], 1)
    else:
        out_train_X = out_train_X_raw
        #out_valid_X = out_valid_X_raw

    # The context manager closes the output file even if a dataset cannot be created.
    with h5.File(baseline_dir + out_file,'a') as out_data:
        #Train: each HC block is collapsed to one row-wise sum, CT columns are kept
        out_data.create_dataset('Fold_{}_train_X_L_HC'.format(lid),data=np.sum(out_train_X[:,:L_HC_offset],axis=1))
        out_data.create_dataset('Fold_{}_train_X_R_HC'.format(lid),data=np.sum(out_train_X[:,L_HC_offset:L_HC_offset+R_HC_offset],axis=1))
        out_data.create_dataset('Fold_{}_train_X_CT'.format(lid),data=out_train_X[:,L_HC_offset+R_HC_offset:])
        out_data.create_dataset('Fold_{}_train_y'.format(lid),data=out_train_y)
        #Valid
        #out_data.create_dataset('Fold_{}_valid_X_L_HC'.format(lid),data=np.sum(out_valid_X[:,:L_HC_offset],axis=1))
        #out_data.create_dataset('Fold_{}_valid_X_R_HC'.format(lid),data=np.sum(out_valid_X[:,L_HC_offset:L_HC_offset+R_HC_offset],axis=1))
        #out_data.create_dataset('Fold_{}_valid_X_CT'.format(lid),data=out_valid_X[:,L_HC_offset+R_HC_offset:])
        #out_data.create_dataset('Fold_{}_valid_y'.format(lid),data=out_valid_y)
