In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import math
import os
import time

import load_data
import file_structure
def label_conversion(numpy_array):
    labels = numpy_array.tolist()
    labels_str = list()

    for i in range(len(labels)):

        if numpy_array[i] == 0:
            labels_str.append('non-codel')
        else:
            labels_str.append('codel')

    return labels_str

In [18]:
study = 'texture'
mode = study
(origdir,basedir,imagedir,normdir,splitdir,out_dir,model_dir) = file_structure.file_dirs(mode)

study_list = ['texture','TDA','CNN']
for mode in study_list:
    samples = np.load(os.path.join(splitdir,'slices_%s_features.npy') % mode)
    feature_names = np.load(os.path.join(splitdir,'%s_feature_names.npy' % mode))
    label_id = np.expand_dims(np.load(os.path.join(splitdir,'label_id.npy')),1)
    label_1p19q = np.load(os.path.join(splitdir,'label_1p19q.npy'))
    label_1p19q = label_1p19q.astype(np.float32).reshape(-1,1)
    label_1p19q_str = label_conversion(label_1p19q)
    features = np.hstack((label_id,samples))
    # print(feature_names)
    new_fnames  =np.hstack((['pid'],feature_names))
    features_df = pd.DataFrame(data=features, columns=new_fnames, index=label_1p19q_str)
    features_df.to_csv(os.path.join(out_dir,'full_ft_%s.csv' % mode))

In [17]:
features = np.hstack((label_id,samples))
# print(feature_names)
new_fnames  =np.hstack((['pid'],feature_names))
features_df = pd.DataFrame(data=features, columns=new_fnames, index=label_1p19q_str)
print(features_df)

             pid  firstorder_10Percentile_FLAIR  \
non-codel    0.0                      -0.374346   
non-codel    0.0                      -0.236240   
non-codel    1.0                       0.230607   
non-codel    1.0                       0.200568   
non-codel    2.0                       0.558672   
non-codel    2.0                       0.851001   
non-codel    3.0                      -0.364248   
non-codel    3.0                      -0.278173   
non-codel    4.0                      -0.043816   
non-codel    4.0                      -0.394673   
non-codel    5.0                      -0.369207   
non-codel    5.0                      -0.257116   
non-codel    6.0                      -0.457251   
non-codel    6.0                      -0.340307   
non-codel    7.0                       0.104104   
non-codel    7.0                       0.023965   
non-codel    8.0                      -1.078178   
non-codel    8.0                      -2.324999   
non-codel    9.0               

In [26]:
# modals
modal_list = ['FLAIR','T1', 'T1post','T2']
study_list = ['texture','TDA','CNN','cmb']
ICC_filt = 0
if ICC_filt == 1:
    suffix = '_icc2'
else: 
    suffix = ''

for study in study_list:
    (origdir,basedir,imagedir,normdir,splitdir,out_dir,model_dir) = file_structure.file_dirs(study)
    # load feature names and corr+rfe filtered feature names
    #if study == 'TDA':
    #    feature_names = np.load(data_dir + 'ph_feature_names.npy')
    #    corr_rfe_selected_list = np.load(os.path.join(out_dir,'corr_rfe_ph_features_09.npy')).item()
    #elif study == 'texture':
    #    feature_names = np.load(data_dir + 'rfecv_tex_features_selected.npy')
    #    corr_rfe_selected_list = np.load(os.path.join(out_dir, 'rfe_tex_features_selected.npy')).item()
    corr_rfe_selected_list = np.load(os.path.join(out_dir,'rfe_%s_features_selected%s.npy' % (study.lower()[0:3],suffix))).item()
    random_state_list = [2,4,6,8,10,12,14,16,18,20] 

    # For timing purpose
    time_list = []

    for fold in range(10):
        print('%s fold number: %d' % (study, fold))
        #t0 = time.time() python 2 vs python 3
        t0 = time.time()

        random_state = random_state_list[fold]

        fold_dir = os.path.join(splitdir,'fold_' + str(fold))

        #study_saved_model = data_dir_fold + 'saved_models/' + study + '_saved_model/'

        # load data
        (train_samples, train_features_pca, train_1p19q, train_age, 
         train_KPS, train_gender, train_id, train_fnames)= load_data.load_data('train',fold,study,ICC_filt)
        (val_samples,val_features_pca, val_1p19q, val_age, 
         val_KPS, val_gender, val_id, val_fnames)= load_data.load_data('val',fold,study,ICC_filt)
        
        feature_names = train_fnames
        
        
        # conver 1,0 into string: 'codel' and 'non-codel'
        train_1p19q_str = label_conversion(train_1p19q)
        val_1p19q_str = label_conversion(val_1p19q)

        # Create clinical variable df
        clinical_names = ['age', 'KPS', 'gender']
        train_clinical_df = pd.DataFrame(data = np.vstack((train_age, train_KPS, train_gender)).transpose(), 
                                            columns = clinical_names, index=train_1p19q_str)
        val_clinical_df = pd.DataFrame(data = np.vstack((val_age, val_KPS, val_gender)).transpose(), 
                                            columns = clinical_names, index=val_1p19q_str)

        for train_features, val_features, rfe_pca_flag in [(train_features_pca, val_features_pca,'pca'),
                                                           (train_clinical_df, val_clinical_df,'clinical'),
                                                           (train_samples, val_samples,'rfe')
                                                          ]:

            if rfe_pca_flag == 'rfe': # original features (i.e. non PCA features). This part needs to be filtered with corr_rfe selection
                # create the pandas dataframes
                train_features_df = pd.DataFrame(data=train_features, columns=feature_names, index=train_1p19q_str)
                val_features_df = pd.DataFrame(data=val_features, columns=feature_names, index=val_1p19q_str)

                # Remove Flatness and LeastAxis since they are all 0s for every sample
                drop_features = ['shape_Flatness_', 'shape_LeastAxis_']
                drop_features = [elemt + modal for modal in modal_list for elemt in drop_features]

                train_features_df = train_features_df.drop(drop_features, axis=1, errors = 'ignore')
                val_features_df = val_features_df.drop(drop_features, axis=1, errors = 'ignore')

                # obtain feature names to select from corr + RFE
                corr_rfe_selected = corr_rfe_selected_list[fold]

                # only the corr rfe selected features
                train_features_df = train_features_df[corr_rfe_selected]
                val_features_df = val_features_df[corr_rfe_selected]

            elif rfe_pca_flag == 'pca': #PCA features
                # create the pandas dataframes
                train_features_df = pd.DataFrame(data=train_features, index=train_1p19q_str)
                val_features_df = pd.DataFrame(data=val_features, index=val_1p19q_str)
            elif rfe_pca_flag == 'clinical': #PCA features
                # create the pandas dataframes
                if study != 'cmb':
                    continue
                train_features_df = pd.DataFrame(data=train_features, index=train_1p19q_str)
                val_features_df = pd.DataFrame(data=val_features, index=val_1p19q_str)

            # combine train and val for CV purpose
            train_val_features_df = pd.concat([train_features_df, val_features_df], axis=0)
            train_val_clinical_df = pd.concat([train_clinical_df, val_clinical_df], axis=0)
            train_val_1p19q = np.append(train_1p19q, val_1p19q)
            train_val_1p19q_str = train_1p19q_str + val_1p19q_str

            # combine the membership as well for CV purpose
            #train_val_group_membership = train_group_membership + val_group_membership
            train_val_id = np.append(train_id, val_id)

            # features + clinical 
            train_val_all_df = pd.concat([train_val_features_df, train_val_clinical_df], axis=1)
            train_val_all_df = train_val_all_df.assign(pid = train_val_id)
            # save to excel
            output_name = ('training'+'_'+study + '_' + rfe_pca_flag + '_' + str(fold) +suffix + '.csv')
            if not(os.path.exists(os.path.join(out_dir,'switchbox'))):
                os.makedirs(os.path.join(out_dir,'switchbox'))
            train_val_features_df.to_csv(os.path.join(out_dir,'switchbox',output_name))
    

texture fold number: 0
texture fold number: 1
texture fold number: 2
texture fold number: 3
texture fold number: 4
texture fold number: 5
texture fold number: 6
texture fold number: 7
texture fold number: 8
texture fold number: 9
TDA fold number: 0
TDA fold number: 1
TDA fold number: 2
TDA fold number: 3
TDA fold number: 4
TDA fold number: 5
TDA fold number: 6
TDA fold number: 7
TDA fold number: 8
TDA fold number: 9
CNN fold number: 0
CNN fold number: 1
CNN fold number: 2
CNN fold number: 3
CNN fold number: 4
CNN fold number: 5
CNN fold number: 6
CNN fold number: 7
CNN fold number: 8
CNN fold number: 9
cmb fold number: 0
cmb fold number: 1
cmb fold number: 2
cmb fold number: 3
cmb fold number: 4
cmb fold number: 5
cmb fold number: 6
cmb fold number: 7
cmb fold number: 8
cmb fold number: 9


In [27]:
study_list = ['texture','TDA','CNN','cmb']
ICC_filt = 0
if ICC_filt == 1:
    suffix = '_icc2'
else: 
    suffix = ''

for study in study_list:

    (origdir,basedir,imagedir,normdir,splitdir,out_dir,model_dir) = file_structure.file_dirs(study)
    corr_rfe_selected_list = np.load(os.path.join(out_dir,'rfe_%s_features_selected%s.npy' % (study.lower()[0:3],suffix))).item()
    random_state_list = [2,4,6,8,10,12,14,16,18,20] 


    res_arr = []
    for fold in range(10):
        result = {}
        print('%s fold number: %d' % (study, fold))
        #(train_samples, train_pca , train_1p19q, train_age, train_KPS, train_gender, train_id, train_fnames)= load_data.load_data('train',fold)
        #(val_samples, val_pca , val_1p19q, val_age, val_KPS, val_gender, val_id, val_fnames)= load_data.load_data('val',fold)
        (test_samples, test_pca , test_1p19q, test_age,
         test_KPS, test_gender, test_id, feature_names)= load_data.load_data('test',fold, study,ICC_filt)
        
        # add PID to DF
        test_samples = np.hstack((np.expand_dims(test_id,1),test_samples))
        feature_names  =np.hstack((['pid'],feature_names))
        
        
        
        fold_dir = os.path.join(splitdir,'fold_' + str(fold))

        # convert 1,0 into string: 'codel' and 'non-codel'
        test_1p19q_str = label_conversion(test_1p19q)

        # create the pandas dataframes
        data_test_imaging = pd.DataFrame(data=test_samples, columns=feature_names, index=test_1p19q_str)

        clinical_names = ['age', 'KPS', 'gender']
        test_clinical_df = pd.DataFrame(data = np.vstack((test_age, test_KPS, test_gender)).transpose(),
                                        columns = clinical_names, index=test_1p19q_str)
        test_clin_by_pt_df = test_clinical_df.assign(pid = pd.Series(test_id).values)
        test_clin_by_pt_df = test_clin_by_pt_df.groupby('pid').mean()
        #data_test_age = pd.DataFrame(data = np.hstack((test_age)), columns = ['age'], index=test_1p19q_str)


#         for test_features, rfe_pca_flag in [(test_clin_by_pt_df,'clin_ppt')]:
        for test_features, rfe_pca_flag in [(test_clinical_df,'clinical'),
                                            (test_samples,'rfe'),
                                            (test_pca,'pca')]:

            if rfe_pca_flag == 'rfe': # original features (i.e. non PCA features). This part needs to be filtered with corr_rfe selection
                # create the pandas dataframes
                test_features_df = pd.DataFrame(data=test_features, columns=feature_names, index=test_1p19q_str)

                # Remove Flatness and LeastAxis since they are all 0s for every sample
                drop_features = ['shape_Flatness_', 'shape_LeastAxis_']
                drop_features = [elemt + modal for modal in modal_list for elemt in drop_features]

                test_features_df = test_features_df.drop(drop_features, axis=1 , errors = 'ignore')

                # obtain feature names to select from corr + RFE
                corr_rfe_selected = corr_rfe_selected_list[fold]

                # only the corr rfe selected features
                test_features_df = test_features_df[corr_rfe_selected]

            elif rfe_pca_flag == 'pca': #PCA features
                # create the pandas dataframes            
                test_features_df = pd.DataFrame(data=test_features, index=test_1p19q_str)

            elif rfe_pca_flag == 'clinical': #PCA features
                if study != 'cmb':
                    continue
                # create the pandas dataframes            
                test_features_df = pd.DataFrame(data=test_features, index=test_1p19q_str)
            
            output_name = ('testing'+'_'+study+'_' + rfe_pca_flag + '_' + str(fold) +suffix + '.csv')
            
            test_features_df = test_features_df.assign(pid = test_id)
            if not(os.path.exists(os.path.join(out_dir,'switchbox'))):
                os.makedirs(os.path.join(out_dir,'switchbox'))
            test_features_df.to_csv(os.path.join(out_dir,'switchbox',output_name))

texture fold number: 0
texture fold number: 1
texture fold number: 2
texture fold number: 3
texture fold number: 4
texture fold number: 5
texture fold number: 6
texture fold number: 7
texture fold number: 8
texture fold number: 9
TDA fold number: 0
TDA fold number: 1
TDA fold number: 2
TDA fold number: 3
TDA fold number: 4
TDA fold number: 5
TDA fold number: 6
TDA fold number: 7
TDA fold number: 8
TDA fold number: 9
CNN fold number: 0
CNN fold number: 1
CNN fold number: 2
CNN fold number: 3
CNN fold number: 4
CNN fold number: 5
CNN fold number: 6
CNN fold number: 7
CNN fold number: 8
CNN fold number: 9
cmb fold number: 0
cmb fold number: 1
cmb fold number: 2
cmb fold number: 3
cmb fold number: 4
cmb fold number: 5
cmb fold number: 6
cmb fold number: 7
cmb fold number: 8
cmb fold number: 9
