## Data Composition

1. Texture features: 851 × 2 (left and right)
2. Deep features: 128 × 2 (left and right)
3. Clinical features: 3

Feature-set versions:

1. version 1: Texture only
2. version 2: Deep only
3. version 3: Texture + clinical
4. version 4: Deep + clinical
5. version 5: Texture + deep + clinical

In [9]:
import pandas as pd
import numpy as np
from glob import glob
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

import seaborn as sns
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, multilabel_confusion_matrix

from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from feature_classifier_utils import ensemble_voting, GridSVM, Lasso_feature_selection
from feature_oversampling_utils import random_oversampling
from feature_performance_utils import load_texture_feature,\
                                        load_clinical_data,\
                                        save_confusion_matrix,\
                                        Validation, \
                                        normal_porosis,\
                                        save_roc_curve, \
                                        dummy_labelize_swk, \
                                        binarize_threshold_swk

from random import shuffle

from sklearn.preprocessing import StandardScaler

from tqdm import notebook

def ad_dlr_dataloader():
    """Assemble the per-subject feature matrix, binary labels and subject ids.

    One row is built for every key of the clinical-data pickle by
    concatenating, in this order: left texture features, right texture
    features, left deep features, right deep features, clinical features.

    Returns
    -------
    whole_feature : np.ndarray
        One row per subject, columns in the order described above.
    whole_label : np.ndarray
        Binary label per subject: 0 when the stored label is 0 or 1,
        otherwise 1.
    whole_subjects : list
        Subject keys, in the same order as the rows of ``whole_feature``.

    Raises
    ------
    IndexError
        If a subject has no row in a texture CSV or no matching
        deep-feature file.
    KeyError
        If a subject key is missing from the label dictionary.
    """
    texture_features = ['../Data/ad_radiomics/texture_feature/left_mask.csv', '../Data/ad_radiomics/texture_feature/right_mask.csv']
    # The CSV's first (unnamed) column is exposed as "file" so rows can be
    # looked up by "<subject>_L" / "<subject>_R".
    rename_file = lambda x : x.rename(columns={"Unnamed: 0" : "file"})

    left_texture = rename_file(pd.read_csv(texture_features[0]))
    right_texture = rename_file(pd.read_csv(texture_features[1]))

    deep_features = glob(os.path.join('../Data/ad_radiomics/deep_feature/', '*.npy'))
    label_file = '../Data/ad_radiomics/label_dict.pickle'
    clinical_file = '../Data/ad_radiomics/clinical_data.pickle'

    # NOTE(review): pickle.load can execute arbitrary code; only use this
    # loader with pickles produced by this project.
    with open(label_file, 'rb') as file: label_dict = pickle.load(file)
    with open(clinical_file, 'rb') as file: clinical_dict = pickle.load(file)

    def load_first_deep_feature(token):
        # Load only the first file whose path contains ``token`` instead of
        # loading every match and discarding all but the first.
        # NOTE(review): this is a substring match on the whole path, so a key
        # that is a suffix of another key could pick the wrong file -- confirm
        # against the actual file naming scheme.
        for path in deep_features:
            if token in path:
                return list(np.load(path))
        raise IndexError("no deep feature file matching '{}'".format(token))

    whole_feature = []
    whole_label = []
    whole_subjects = []

    for key, clinical_value in notebook.tqdm_notebook(clinical_dict.items()):

        # First matching row, skipping the leading "file" column.
        left_texture_feature = list(left_texture.loc[left_texture['file'] == key+'_L'].values[0, 1:])
        right_texture_feature = list(right_texture.loc[right_texture['file'] == key+'_R'].values[0, 1:])

        # Deep-feature files are matched with the "_0" (left) / "_1" (right) suffix.
        left_deep_feature = load_first_deep_feature(key+'_0')
        right_deep_feature = load_first_deep_feature(key+'_1')

        # Collapse the stored label to binary: {0, 1} -> 0, anything else -> 1.
        label = 0 if label_dict[key] in [0, 1] else 1

        whole_feature.append([*left_texture_feature,
                              *right_texture_feature,
                              *left_deep_feature,
                              *right_deep_feature,
                              *clinical_value])

        whole_label.append(label)
        whole_subjects.append(key)

    whole_feature = np.array(whole_feature)
    whole_label = np.array(whole_label)

    return whole_feature, whole_label, whole_subjects

In [19]:
def result2csv(results, filename = './result.csv'):
    """Write the per-version mean and standard deviation of the metrics to CSV.

    Parameters
    ----------
    results : dict
        Maps a version name to a list of runs, each run being a sequence of
        four metric values. A version with no runs is written as NaN rows
        instead of aborting the export.
    filename : str
        Destination CSV path.

    The CSV is indexed by (type, metric) and has the columns ``mean`` and
    ``std``, both computed across the runs of each version.
    """
    # NOTE(review): the labels are assigned positionally to the four values of
    # each run; confirm they match the order returned by Validation().
    columns = ['accuracy', 'auroc', 'precision', 'f1']

    means = []
    stds = []
    for value in results.values():
        runs = np.asarray(value, dtype=float)
        if runs.size == 0:
            # np.mean([], axis=0) is a scalar NaN, which cannot be passed to
            # extend(); emit one NaN per metric so the table stays aligned.
            means.extend([np.nan] * len(columns))
            stds.extend([np.nan] * len(columns))
        else:
            means.extend(np.mean(runs, axis=0))
            stds.extend(np.std(runs, axis=0))

    # Version-major, metric-minor ordering, matching how means/stds were filled.
    index = pd.MultiIndex.from_product([list(results.keys()), columns],
                                       names=['type', 'metric'])
    pd.DataFrame(np.array([means, stds]).T, index=index, columns=['mean', 'std']).to_csv(filename)
    
# Per-version accumulator of test metrics; DLR() appends one entry per run.
reporting_results = {'v{}'.format(number): [] for number in range(1, 6)}
    
# Feature groups used by each experiment version, as (CLINIC, DEEP, TEXTURE).
_DLR_VERSION_FLAGS = {
    'v1': (False, False, True),   # texture only
    'v2': (False, True, False),   # deep only
    'v3': (True, False, True),    # texture + clinical
    'v4': (True, True, False),    # deep + clinical
    'v5': (True, True, True),     # texture + deep + clinical
}


def DLR(version, TR=0.04, RANDOM_NUMBER=42, data=None, test_split=False):
    """Train and evaluate the ensemble classifier for one feature-set version.

    Pipeline: 70/30 train/test split -> standard scaling (fit on the training
    split only) -> LASSO feature selection on the non-clinical columns ->
    random oversampling of the training split -> ensemble voting classifier ->
    validation on the test split. The first four values returned by
    ``Validation`` are appended to the module-level
    ``reporting_results[version]``.

    Parameters
    ----------
    version : str
        One of 'v1' .. 'v5'; selects which feature groups ``X`` contains.
    TR : float
        Threshold passed to ``Lasso_feature_selection`` for versions v1-v4.
        NOTE(review): v5 ignores ``TR`` and uses fixed thresholds (0.06 for
        deep, 0.04 for texture) -- confirm this is intended.
    RANDOM_NUMBER : int
        Seed used for the split, the oversampling and the classifier search.
    data : sequence of (X, Y) or None
        Feature matrix and labels. When None, ``<version>_X.npy`` and
        ``<version>_Y.npy`` are loaded from the module-level ``save_folder``,
        which must be defined elsewhere in the notebook.
    test_split : bool
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    dict
        Keys ``model``, ``version``, ``scaler``, ``sfm`` and
        ``random_number``. ``sfm`` is a single selector for v1-v4 and the
        list ``[texture_sfm, deep_sfm]`` for v5.

    Raises
    ------
    ValueError
        If ``version`` is unknown, or if ``data`` is None and the saved
        arrays cannot be loaded.
    """
    # Fail fast on an unknown version instead of hitting an unbound-variable
    # error half-way through the pipeline.
    if version not in _DLR_VERSION_FLAGS:
        raise ValueError("unknown version {!r}; expected one of {}".format(
            version, sorted(_DLR_VERSION_FLAGS)))
    CLINIC, DEEP, TEXTURE = _DLR_VERSION_FLAGS[version]

    # Column layout of X: [... texture ..., ... deep ..., clinical].
    N_CLINICAL = 3
    N_DEEP = 128 * 2
    N_TAIL = N_DEEP + N_CLINICAL  # deep + clinical columns at the end (259)

    # Data Loading
    if data is None:
        try:
            X = np.load(os.path.join(save_folder, '{}_X.npy'.format(version)))
            Y = np.load(os.path.join(save_folder, '{}_Y.npy'.format(version)))
        except Exception as err:
            # Not a bare ``except``: KeyboardInterrupt/SystemExit must propagate.
            raise ValueError("input should be specified") from err
    else:
        X, Y = data

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.30,
                                                        random_state=RANDOM_NUMBER)
    # Standard Scaler (statistics come from the training split only)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    print("num features before LASSO : ", X_train.shape[1])

    # Feature Selection
    if CLINIC and TEXTURE and DEEP:
        # texture + deep + clinic | version 5
        # Texture and deep blocks are selected independently; clinical
        # columns are always kept.
        clinical_features = X_train[:, -N_CLINICAL:]
        deep_features = X_train[:, -N_TAIL:-N_CLINICAL]
        texture_features = X_train[:, :-N_TAIL]

        deep_sfm = Lasso_feature_selection(deep_features, y_train, tr=0.06)
        texture_sfm = Lasso_feature_selection(texture_features, y_train, tr=0.04)

        X_train = np.hstack([texture_sfm.transform(texture_features),
                             deep_sfm.transform(deep_features),
                             clinical_features])

        clinical_features_test = X_test[:, -N_CLINICAL:]
        deep_features_test = X_test[:, -N_TAIL:-N_CLINICAL]
        texture_features_test = X_test[:, :-N_TAIL]

        X_test = np.hstack([texture_sfm.transform(texture_features_test),
                            deep_sfm.transform(deep_features_test),
                            clinical_features_test])

        sfm = [texture_sfm, deep_sfm]

    elif CLINIC:
        # texture + clinic | version 3, deep + clinic | version 4
        # Select among the non-clinical columns, then re-attach the clinical ones.
        sfm = Lasso_feature_selection(X_train[:, :-N_CLINICAL], y_train, tr=TR)
        X_train = np.hstack([sfm.transform(X_train[:, :-N_CLINICAL]), X_train[:, -N_CLINICAL:]])
        X_test = np.hstack([sfm.transform(X_test[:, :-N_CLINICAL]), X_test[:, -N_CLINICAL:]])

    else:
        # only texture | version 1, only deep | version 2
        sfm = Lasso_feature_selection(X_train, y_train, tr=TR)
        X_train = sfm.transform(X_train)
        X_test = sfm.transform(X_test)

    print("num features after LASSO : ", X_train.shape[1])

    # LASSO feature selection -> random oversampling (training split only).
    # Original note here: "Threshold > 0.1, 0.15, 0.2" -- its meaning is not
    # evident from this code; TODO confirm.
    X_train, y_train = random_oversampling(X_train, y_train, random_state=RANDOM_NUMBER)

    # model
    classifier = ensemble_voting(X_train, y_train, random_state=RANDOM_NUMBER, cv=5)

    # Training
    classifier.fit(X_train, y_train)

    # Testing
    test_result = Validation(classifier, X_test, y_test)

    # Save Result
    reporting_results[version].append(list(test_result[:4]))

    output = dict(
        model = classifier,
        version = version,
        scaler = scaler,
        sfm = sfm,
        random_number = RANDOM_NUMBER
    )
    return output
    
def sfm_transform(sfm, data):
    """Return the columns of ``data`` that the selector ``sfm`` marks as supported."""
    keep_mask = sfm.get_support() == True
    selected_columns = data[:, keep_mask]
    return selected_columns

In [20]:
# Load the full feature matrix, the binary labels and the subject ids.
whole_feature, whole_label, whole_subjects = ad_dlr_dataloader()
print("Data Loaded")

# Column selectors that carve each version's inputs out of the full matrix.
# Columns are laid out as [texture (851 * 2) | deep (256) | clinical (3)],
# so the last 259 columns are deep + clinical and the last 3 are clinical.
version_functions = {
    'v1': lambda x : x[:, :851 * 2],
    'v2': lambda x : x[:, -259:-3],
    'v3': lambda x : np.concatenate([x[:, :851 * 2], x[:, -3:]], axis=1),
    'v4': lambda x : x[:, -259:],
    'v5': lambda x : x[:, :],
}

results = {}
for vers_index in (1, 2, 3, 4, 5):
    version = 'v' + str(vers_index)
    version_function = version_functions[version]

    data = [version_function(whole_feature), whole_label]

    # Run every version once per LASSO threshold; DLR records the metrics of
    # each run in reporting_results.
    for tr in (0.01, 0.02):
        print('\n', tr)
        train_module = DLR(version, TR=tr, RANDOM_NUMBER=42, data=data[:2], test_split=True)


Data Loaded

 0.01
num features before LASSO :  1702
num features after LASSO :  4
Ensemble Voting Grid search running...
Random Forest Grid search running...
{'class_weight': None, 'criterion': 'gini', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 200}
XGBoost Grid search running...
{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 500}
Adaboost Grid search running...
{'learning_rate': 1.0, 'n_estimators': 200}
Estimators Ready
Voting Classifier Grid search running ... 
{'voting': 'soft'}
done
Validation
confusion :  [[24  1]
 [11  5]]
Accuracy : 0.707
roc-auc score : 0.802
f1 score : 0.455
precision : 0.833

 0.02
num features before LASSO :  1702
num features after LASSO :  4
Ensemble Voting Grid search running...
Random Forest Grid search running...
{'class_weight': None, 'criterion': 'gini', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 200}
XGBoost Grid search running...
{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 7, 'n_