In [21]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

import seaborn as sns
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from feature_classifier_utils import ensemble_voting, GridSVM, Lasso_feature_selection
from feature_performance_utils import load_texture_feature,\
                                        load_clinical_data,\
                                        save_confusion_matrix,\
                                        Validation, \
                                        normal_porosis,\
                                        save_roc_curve, \
                                        dummy_labelize_swk, \
                                        binarize_threshold_swk

from sklearn.preprocessing import StandardScaler

In [33]:
def save_prediction(feature_index, y_pred, y_pred_proba, y_test, filename, binary):
    
    if binary :

        data = {
            'true label' : y_test,
            'pred label' : y_pred,
            'confidence score for class 0' : y_pred_proba.T[0],
            'confidence score for class 1' : y_pred_proba.T[1]
        }
        
        pd.DataFrame(index=feature_index, data=data).to_excel(filename)
        return 
    else:
        
        data = {
            'true label' : y_test,
            'pred label' : y_pred,
            'confidence score for class 0' : y_pred_proba.T[0],
            'confidence score for class 1' : y_pred_proba.T[1],
            'confidence score for class 2' : y_pred_proba.T[2]
        }
        
        pd.DataFrame(index=feature_index, data=data).to_excel(filename)
        return

def binary_class_configuration(feature_data, feature_label, delete_label):
    
    resampled_feature_data = []
    resampled_feature_label = []
    
    for d, l in zip(feature_data, feature_label):
        
        if l == int(delete_label):
            continue
            
        else:
            resampled_feature_data.append(d)
            resampled_feature_label.append(l)
    
    return resampled_feature_data, resampled_feature_label

### Concatenate Clinical data to feature data

In [34]:
texture_features = ['../result/left_mask.csv', '../result/right_mask.csv']
label_file = '../data/label_dict.pickle'
clinical_file = '../data/clinical_data.pickle'

with open(label_file, 'rb') as file: label_dict = pickle.load(file)
with open(clinical_file, 'rb') as file: clinical_dict = pickle.load(file)
    
feature_names = list(pd.read_csv(texture_features[0]).columns)[1:]

whole_feature = []
whole_label = []
whole_subjects = []

for texture_feature in texture_features:
#     print(texture_feature)
    
    for key, value in pd.read_csv(texture_feature).iterrows():

        subject = '_'.join(list(value)[0].split('_')[:-1])
        label = label_dict[subject]
        clinic_data = clinical_dict[subject] # is male, is female, age

#         print(key, list(value)[0], len(list(value[1:])), label, clinic_data)
        
        whole_feature.append(list(value[1:]) + clinic_data)
        whole_label.append(label)
        whole_subjects.append(list(value)[0])

#         break
        
print('Whole feature :',np.array(whole_feature).shape) # (270 = 135 + 135, 854 = 851 + 3)
print('Whole labels :', np.array(whole_label).shape)
print('Whole subjects :', len(whole_subjects))
print('Number by labels :',Counter(whole_label))

Whole feature : (270, 854)
Whole labels : (270,)
Whole subjects : 270
Number by labels : Counter({1: 92, 2: 90, 0: 88})


### Feature Analysis with Lasso

- LASSO feature Extraction Method applied 
- Scaled data ( x - mean / std ) were used ( mean, std from train dataset were applied to test dataset )

In [46]:
num_try = 3
try_dict = dict()

# feature_data = feature_data_with_clinical
highest_acc = 0.0
binary = True

classification_result_save_dir = '../classification_result/cn_ad/'
target_names = ['CN', 'AD']
delete_label = 1

internal_feature_data = whole_feature
internal_feature_label = np.array(whole_label)

# if Bianary remove MCI
if binary:
    internal_feature_data, internal_feature_label = \
                        binary_class_configuration(internal_feature_data, internal_feature_label, delete_label)
    
    internal_feature_data = np.array(internal_feature_data)
    
    # 2 label ==> 1 label / 1 이 사라졌기 때문에..!
    internal_feature_data[internal_feature_data == 2] = 1

scaler = StandardScaler()

trs = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08]
#  first try : 23, 18, 13, 8, 8, 6, 4, 3

# trs = [0.07] # 4

for i in range(num_try):
    
    print("++++++++++")
    print(i+1 , "st try")
    print("++++++++++\n")
        
    # train test set divide
    X_train, X_test, y_train, y_test = train_test_split(internal_feature_data,
                                                        internal_feature_label,
                                                        test_size=0.20,
                                                        random_state=42+i,
                                                        stratify = internal_feature_label)
    y_test = np.array(y_test)
    y_test[y_test == 2] = 1
        
    X_train_resampled, Y_train_resampled = X_train, y_train

    # Scale dataset
    scaler.fit(X_train_resampled)
    X_train_scaled = scaler.transform(X_train_resampled)
    X_test_scaled = scaler.transform(X_test)

    for i, tr in enumerate(trs):

        # Texture Feature Selection
        sfm = Lasso_feature_selection(X_train_scaled[:, :-3], Y_train_resampled, tr=tr)
        num_selected_feature = len(np.where(sfm.get_support() == True)[0])
        print(i+1,'Number of Feature Selected with ', tr , ':', num_selected_feature)

        X_train_new = np.hstack([sfm.transform(X_train_scaled[:, :-3]), X_train_scaled[:, -3:]])
        X_test_tr = np.hstack([sfm.transform(X_test_scaled[:, :-3]), X_test_scaled[:, -3:]])

        print(num_selected_feature)

        clf = ensemble_voting(X_train_new, Y_train_resampled, random_state=42+i, cv=5)

#         train
        clf.fit(X_train_new, Y_train_resampled)

        # evaluation

        print('================================')
        print("Test with ", num_selected_feature, " features")

        internal_result = Validation(clf, X_test_tr, y_test)
        
        # multilabel 에 대한 confusion matrix 를 어떻게 구할까.. 3차원인가 ??
#         confusion = internal_result[-1]; save_confusion_matrix(confusion, conf_index, conf_cols, "Internal Validation Confusion matrix")

#         y_pred_proba = clf.predict_proba(X_test_tr); y_pred = clf.predict(X_test_tr)

#         save_prediction(np.arange(len(y_pred)), y_pred, y_pred_proba, y_test,\
#                          os.path.join(classification_result_save_dir , str(num_selected_feature)+'.xlsx'),
#                        binary=binary)
        
#         print(classification_report(y_test, y_pred, target_names = target_names))

        print('================================')

#             break

print('done')

++++++++++
1 st try
++++++++++

1 Number of Feature Selected with  0.01 : 30
30
Ensemble Voting Grid search running...
Random Forest Grid search running...
{'class_weight': None, 'criterion': 'gini', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 200}
XGBoost Grid search running...
{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 500}
Adaboost Grid search running...
{'learning_rate': 1.0, 'n_estimators': 200}
Estimators Ready
Voting Classifier Grid search running ... 
{'voting': 'soft'}
done
Test with  30  features
Validation
confusion :  [[17  1]
 [ 9  9]]
Accuracy : 0.722
roc-auc score : 0.917
f1 score : 0.643
precision : 0.900
2 Number of Feature Selected with  0.02 : 25
25
Ensemble Voting Grid search running...
Random Forest Grid search running...
{'class_weight': None, 'criterion': 'gini', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 400}
XGBoost Grid search running...
{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 7, 'n_est

4 Number of Feature Selected with  0.04 : 21
21
Ensemble Voting Grid search running...
Random Forest Grid search running...
{'class_weight': None, 'criterion': 'gini', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 500}
XGBoost Grid search running...
{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
Adaboost Grid search running...
{'learning_rate': 1.0, 'n_estimators': 100}
Estimators Ready
Voting Classifier Grid search running ... 
{'voting': 'soft'}
done
Test with  21  features
Validation
confusion :  [[15  3]
 [ 1 17]]
Accuracy : 0.889
roc-auc score : 0.926
f1 score : 0.895
precision : 0.850
5 Number of Feature Selected with  0.05 : 15
15
Ensemble Voting Grid search running...
Random Forest Grid search running...
{'class_weight': None, 'criterion': 'gini', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 400}
XGBoost Grid search running...
{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 500}
Adaboost Grid sea

{'class_weight': None, 'criterion': 'gini', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 400}
XGBoost Grid search running...
{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
Adaboost Grid search running...
{'learning_rate': 1.0, 'n_estimators': 200}
Estimators Ready
Voting Classifier Grid search running ... 
{'voting': 'soft'}
done
Test with  8  features
Validation
confusion :  [[16  2]
 [ 4 14]]
Accuracy : 0.833
roc-auc score : 0.914
f1 score : 0.824
precision : 0.875
8 Number of Feature Selected with  0.08 : 5
5
Ensemble Voting Grid search running...
Random Forest Grid search running...
{'class_weight': None, 'criterion': 'gini', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 400}
XGBoost Grid search running...
{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
Adaboost Grid search running...
{'learning_rate': 1.0, 'n_estimators': 200}
Estimators Ready
Voting Classifier Grid search running ... 
{'voting

In [26]:
Counter(Y_train_resampled)

# - test set 을 balancing 하여 평가하였다
#TODO 1: Confusion Matrix
#TODO 2: AUROC score with multiple labels
#TODO 3: 

Counter({1: 74, 2: 72, 0: 70})