In [None]:
# common
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from datetime import datetime
from itertools import product
from collections import OrderedDict

import shap

In [None]:
# transform
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, QuantileTransformer, PowerTransformer, Normalizer

In [None]:
# selection
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, RFECV, SelectFromModel
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [None]:
# sampling
from sklearn.manifold import TSNE
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, SMOTENC, ADASYN, RandomOverSampler
from imblearn.under_sampling import ClusterCentroids, NeighbourhoodCleaningRule, OneSidedSelection, RandomUnderSampler, NearMiss, EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, CondensedNearestNeighbour
from imblearn.combine import SMOTEENN, SMOTETomek

In [None]:
# evaluation
import sklearn
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, classification_report, roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report
from delong import auc_ci

In [None]:
# models
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import plot_importance
#from lightgbm import plot_importance
from xgboost import XGBClassifier

In [None]:
def get_clf_eval(y_test, pred = None, pred_proba = None):
    """Print binary-classification metrics for one set of predictions.

    Prints the confusion matrix, accuracy, precision, recall, F1, ROC-AUC,
    specificity and the raw TN / TP / FP / FN counts. Returns nothing.

    Parameters
    ----------
    y_test : array-like
        True labels (binary).
    pred : array-like
        Predicted labels. Effectively required: the ``None`` default is kept
        only so the signature stays unchanged.
    pred_proba : array-like, optional
        Scores handed to ``roc_auc_score``. When omitted, the AUC is reported
        as ``nan`` instead of failing inside ``roc_auc_score``.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    # ROC-AUC needs scores; fall back to nan when none were supplied.
    if pred_proba is None:
        roc_auc = float('nan')
    else:
        roc_auc = roc_auc_score(y_test, pred_proba)

    print('오차 행렬')
    print(confusion)

    # Row-major unpacking of the 2x2 matrix, same as get_clf_eval_dataframe.
    tn, fp, fn, tp = confusion.ravel()

    # Specificity = TN / (TN + FP); undefined (nan) when there are no negatives.
    negatives = tn + fp
    specificity = tn / negatives if negatives else float('nan')

    print('정확도: {:.4f}, 정밀도: {:.4f}, 재현율: {:.4f}, F1: {:.4f}, AUC: {:.4f}'.format(accuracy,precision,recall,f1, roc_auc))
    print('특이도(Specificity) : {:.4f}'.format(specificity))
    print('TN : {} / TP : {} / FP : {} / FN : {}'.format(tn, tp, fp, fn))

In [None]:
def get_clf_eval_dataframe(model, y_test, pred = None, pred_proba = None):
    """Print binary-classification metrics and return them as a one-row DataFrame.

    Parameters
    ----------
    model : hashable
        Label used as the index of the returned row (e.g. a model name).
    y_test : array-like
        True labels (binary). Lists, arrays and pandas Series are accepted.
    pred : array-like
        Predicted labels. Effectively required: the ``None`` default is kept
        only so the signature stays unchanged.
    pred_proba : array-like, optional
        Scores used for ROC-AUC and for the ``auc_ci`` confidence interval.
        When omitted, all four AUC columns are ``nan``.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``model`` with columns ROC-AUC, ROC-AUC(Delong),
        ROC-AUC(95CI_UPPER), ROC-AUC(95CI_LOWER), Accuracy, Precision, Recall,
        F1 Score, TN, FP, FN, TP.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    # Reuse the matrix computed above instead of calling confusion_matrix again.
    tn, fp, fn, tp = confusion.ravel()

    # Precision recomputed from raw counts as a cross-check; nan when nothing
    # was predicted positive.
    predicted_positive = tp + fp
    count_precision = tp / predicted_positive if predicted_positive else float('nan')

    # ROC-AUC needs scores; fall back to nan when none were supplied.
    has_scores = pred_proba is not None
    roc_auc = roc_auc_score(y_test, pred_proba) if has_scores else float('nan')

    print('오차 행렬')
    print(confusion)
    print('정확도: {:.4f}, 정밀도: {:.4f}, 재현율: {:.4f}, F1: {:.4f}, AUC: {:.4f}'.format(accuracy,precision,recall,f1, roc_auc))
    print('계산된 정밀도 : {:.4}'.format(count_precision))
    print('tn fp fn tp')
    print(tn, fp, fn, tp)

    if has_scores:
        # np.asarray(...).ravel() works for lists and arrays as well as Series,
        # and avoids the deprecated Series.ravel().
        delong_roc_auc, (auc_low, auc_high) = auc_ci(np.asarray(y_test).ravel(), pred_proba, return_ci=True)
    else:
        delong_roc_auc = auc_low = auc_high = float('nan')

    # Insertion order fixes the column order of the returned frame.
    model_dict = {
        'ROC-AUC': roc_auc,
        'ROC-AUC(Delong)': delong_roc_auc,
        'ROC-AUC(95CI_UPPER)': auc_high,
        'ROC-AUC(95CI_LOWER)': auc_low,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'TN': tn,
        'FP': fp,
        'FN': fn,
        'TP': tp,
    }

    model_df = pd.DataFrame(model_dict, index = [model])

    return model_df

In [None]:
# Load the pre-split train/test CSVs from `train_test_dir`.
# NOTE(review): `train_test_dir` is not defined in the visible cells; it must be
# set before this cell runs.
train, test = (
    pd.read_csv(os.path.join(train_test_dir, csv_name))
    for csv_name in (
        'Perfusion_ML_Train_Split-K12(190pts).csv',
        'Perfusion_ML_Test_Split-K12(190pts).csv',
    )
)

In [None]:
# Pull the identifier and label columns out of each split ...
train_index = train[index_column].reset_index(drop=True)
test_index = test[index_column].reset_index(drop=True)
train_label = train[label_column].reset_index(drop=True)
test_label = test[label_column].reset_index(drop=True)

# ... then keep only the remaining columns as features.
# NOTE(review): `train` / `test` are overwritten here, so re-running this cell
# without reloading the CSVs fails because the dropped columns are gone.
non_feature_columns = [index_column, label_column]
train = train.drop(columns=non_feature_columns).reset_index(drop=True)
test = test.drop(columns=non_feature_columns).reset_index(drop=True)

In [None]:
# Pipeline grid: every combination of scaler x selector x sampler x classifier.
# Each list currently holds a single option, so `cases` has one entry.
scalers = [MinMaxScaler()]
selectors = [f_classif]
#selectors = [f_classif, mutual_info_classif, RFECV, 'Without_selector']
samplers = [SMOTE]
classifiers =  [ExtraTreesClassifier]
cases = list(product(scalers, selectors, samplers, classifiers))
print(len(cases), cases)

# `num_features` is either a (low, high) tuple or a single count; clamp the
# requested feature counts to [1, number of available columns].
# NOTE(review): `num_features` is not defined in the visible cells.
if isinstance(num_features, tuple):
    min_feature_num = max(min(num_features), 1)
    max_feature_num = min(len(train.columns), max(num_features))
    # NOTE(review): (max - min) counts a half-open range; if both ends are meant
    # to be tried this is one short per case. Confirm against the consumer.
    total_length = len(cases) * (max_feature_num - min_feature_num)
else:
    min_feature_num = max(num_features, 1)
    max_feature_num = min(len(train.columns), num_features)
    total_length = len(cases)

In [None]:
# Start with the SEV data first (translated from the original note; "SEV" is not
# explained in the visible cells).
# Runs the first pipeline case through `preprocessing`, passing the must-include
# features and their count.
# NOTE(review): `preprocessing` and `must_include_features` are not defined in the
# visible cells; they must exist before this cell runs.
preprocess_dict = preprocessing(train, test, train_label, test_label, cases[0], must_include_features, len(must_include_features))

In [None]:
# Unpack the pieces of the preprocessing result used by the rest of the notebook.
# `test_label` is rebound to the value stored in the dict, and the
# 'selected_features' entry is exposed under the singular name `selected_feature`.
(
    sampled_train,
    sampled_train_label,
    selected_test,
    test_label,
    clf,
    selected_feature,
) = (
    preprocess_dict[key]
    for key in (
        'sampled_train',
        'sampled_train_label',
        'selected_test',
        'test_label',
        'clf',
        'selected_features',
    )
)

In [None]:
# Test cohort: fit the classifier on the resampled training data, then predict
# labels and scores for the selected test features.
clf.fit(sampled_train, sampled_train_label)
test_predict = clf.predict(selected_test)
# Column 1 of predict_proba is kept as the score (presumably the positive class;
# confirm against clf.classes_).
test_proba = clf.predict_proba(selected_test)[:, 1]

In [None]:
# Print the test-cohort metrics for the predicted labels and scores.
get_clf_eval(test_label, test_predict ,test_proba)

In [None]:
# Display the 'selected_features' entry returned by preprocessing.
selected_feature

# Shap 그리기

In [None]:
shap.initjs()
explainer = shap.TreeExplainer(clf)
# SHAP values for preprocess_dict['selected_train']; per the original note this is
# the training set before resampling, i.e. the original train data.
train_shap_values = explainer.shap_values(preprocess_dict['selected_train'])
# To explain the resampled training data instead, pass the resampled features
# (preprocess_dict['sampled_train']). The original note named
# 'sampled_train_label', which is the label vector.
test_shap_values = explainer.shap_values(selected_test)

In [None]:
# Display the training features the train SHAP values were computed on.
preprocess_dict['selected_train']

In [None]:
# Display the test features the test SHAP values were computed on.
selected_test

## Summary Plot - Bar

### Train

In [None]:
# Display the selected feature names again, for reference when writing the
# cleaned display names.
selected_feature

In [None]:
# Cleaned display names for the SHAP plots.
# Important: keep these in the same order as the feature columns of the DataFrame
# shown above.
train_feature_name_clean = [
    'KtransMap_firstorder_10Percentile',
    'KtransMap_glcm_Correlation',
    'KtransMap_glcm_MCC',
    'KtransMap_gldm_DependenceVariance',
]

In [None]:
# Shape of the SHAP values for a single class (index 0 of train_shap_values).
np.array(train_shap_values[0]).shape

In [None]:
# Before cleanup: bar summary over the full `train_shap_values` object, with the
# frame's own column names as labels.
shap.summary_plot(
    train_shap_values,
    preprocess_dict['selected_train'],
    plot_type='bar',
    auto_size_plot=None,
    plot_size=(13, 8),
    show=False,  # the title is added below, before plt.show()
)
plt.tight_layout()
plt.title('SHAP Summary Plot - Train(Model K12)', fontsize=17)
# plt.legend(loc='lower right', fontsize = 12)
plt.show()

In [None]:
# After cleanup: bar summary for a single class (index 0), labelled with the
# hand-cleaned feature names.
# The labels come from `train_feature_name_clean`, the list defined in this
# notebook; `feature_name_clean` is not defined in any visible cell and raised
# NameError on a fresh kernel. The list must follow the column order of the
# plotted frame.
shap.summary_plot(
    train_shap_values[0],
    preprocess_dict['selected_train'],
    feature_names=train_feature_name_clean,
    plot_type='bar',
    auto_size_plot=None,
    plot_size=(13, 8),
    show=False,  # the title is added below, before plt.show()
)
plt.tight_layout()
plt.title('SHAP Summary Plot - Train(Model KtransMap)', fontsize = 17)
# plt.legend(loc='lower right', fontsize = 12)
plt.show()

### Test

In [None]:
# Bar summary over the full `test_shap_values` object for the test cohort, with
# the frame's own column names as labels.
shap.summary_plot(
    test_shap_values,
    selected_test,
    plot_type='bar',
    auto_size_plot=None,
    plot_size=(13, 8),
    show=False,  # the title is added below, before plt.show()
)
plt.tight_layout()
plt.title('SHAP Summary Plot - Test(Model K12)', fontsize=17)
# plt.legend(loc='lower right', fontsize = 12)
plt.show()

In [None]:
# After cleanup: bar summary for a single class (index 0) on the test cohort,
# labelled with the hand-cleaned feature names.
# The labels come from `train_feature_name_clean`, the list defined in this
# notebook; `feature_name_clean` is not defined in any visible cell and raised
# NameError on a fresh kernel.
# NOTE(review): assumes `selected_test` has the same column order as the training
# frame the list was written against; confirm.
shap.summary_plot(
    test_shap_values[0],
    selected_test,
    feature_names=train_feature_name_clean,
    plot_type='bar',
    auto_size_plot=None,
    plot_size=(13, 8),
    show=False,  # the title is added below, before plt.show()
)
plt.tight_layout()
plt.title('SHAP Summary Plot - Test(Model KtransMap)', fontsize = 17)
# plt.legend(loc='lower right', fontsize = 12)
plt.show()

## Summary Plot - Dot

### Train

In [None]:
# Dot summary of the class-index-1 SHAP values on the training features.
shap.summary_plot(
    train_shap_values[1],
    preprocess_dict['selected_train'],
    plot_type='dot',
    auto_size_plot=None,
    plot_size=(13, 8),
    show=False,  # the title is added below, before plt.show()
)
plt.tight_layout()
plt.title('SHAP Summary Plot - Train(Model K12)', fontsize=17)
# plt.legend(loc='lower right', fontsize = 12)
plt.show()

### Test

In [None]:
# Dot summary of the class-index-1 SHAP values on the test features.
shap.summary_plot(
    test_shap_values[1],
    selected_test,
    plot_type='dot',
    auto_size_plot=None,
    plot_size=(13, 8),
    show=False,  # the title is added below, before plt.show()
)
plt.tight_layout()
plt.title('SHAP Summary Plot - Test(Model K12)', fontsize=17)
# plt.legend(loc='lower right', fontsize = 12)
plt.show()

## Decision Plot

In [None]:
# Display the explainer's expected value(s); element [1] is used as the base
# value in the decision plots below.
explainer.expected_value

### Train

In [None]:
# Decision plot for the positive class (index 1) on the training features.
plt.figure(figsize=(12, 8))
shap.decision_plot(
    explainer.expected_value[1],
    train_shap_values[1],
    preprocess_dict['selected_train'],
    auto_size_plot=None,
    show=False,  # the title is added below, before plt.show()
)
plt.tight_layout()
#plt.legend(loc='lower right', fontsize = 12)
plt.title('SHAP Decision Plot - Train(Model K12)', fontsize=15)
plt.show()

In [None]:
# Shape of the index-1 slice of the train SHAP values, as used in the plots.
np.array(train_shap_values)[1].shape

### Test

In [None]:
# Decision plot for the positive class (index 1) on the test features.
plt.figure(figsize=(12, 8))
shap.decision_plot(
    explainer.expected_value[1],
    test_shap_values[1],
    selected_test,
    auto_size_plot=None,
    show=False,  # the title is added below, before plt.show()
)
plt.tight_layout()
#plt.legend(loc='lower right', fontsize = 12)
plt.title('SHAP Decision Plot - Test(Model K12)', fontsize=15)
plt.show()

## Dependence Plot

### Train

In [None]:
shap.initjs()
# Rank features by influence: take the absolute SHAP values for class index 1 and
# sum them along axis 0 (presumably the sample axis), giving one total per feature.
# np.argsort sorts ascending, so the totals are negated to get the largest first.
top_inds = np.argsort(-np.sum(np.abs(train_shap_values[1]), 0))

# Dependence plots for up to four of the most influential features, strongest
# first. Slicing (rather than indexing 0..3) avoids an IndexError when fewer than
# four features were selected.
for feature_idx in top_inds[:4]:
    shap.dependence_plot(feature_idx, train_shap_values[1], preprocess_dict['selected_train'])

### Test

In [None]:
shap.initjs()
# Rank features by influence: take the absolute SHAP values for class index 1 and
# sum them along axis 0 (presumably the sample axis), giving one total per feature.
# np.argsort sorts ascending, so the totals are negated to get the largest first.
# NOTE: this rebinds `top_inds`, now ranked on the test cohort.
top_inds = np.argsort(-np.sum(np.abs(test_shap_values[1]), 0))

# Dependence plots for up to four of the most influential features, strongest
# first. Slicing (rather than indexing 0..3) avoids an IndexError when fewer than
# four features were selected.
for feature_idx in top_inds[:4]:
    shap.dependence_plot(feature_idx, test_shap_values[1], selected_test)