In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import (
    roc_curve,
    confusion_matrix,
    accuracy_score,
    matthews_corrcoef,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold
from keras.models import model_from_json
import pickle

from machine_learing.core.utils import fill_na_mean

In [2]:
def roc_cutoff(y_true, y_pred):
    """Choose the ROC threshold at which ``tpr - fpr`` is largest.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels, forwarded to ``roc_curve``.
    y_pred : array-like
        Predicted scores, forwarded to ``roc_curve``.

    Returns
    -------
    scalar
        The entry of the ``roc_curve`` threshold array located at the
        position where the true-positive rate exceeds the false-positive
        rate by the widest margin (first such position on ties).
    """
    false_pos_rate, true_pos_rate, candidates = roc_curve(y_true, y_pred)
    best_position = np.argmax(true_pos_rate - false_pos_rate)
    return candidates[best_position]


def evaluate_clf(y_true, y_pred, cutoff):
    """Score binary predictions at a given decision threshold.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred : array-like
        Predicted scores; values ``>= cutoff`` are labelled 1, others 0.
    cutoff : float
        Decision threshold applied to ``y_pred``.

    Returns
    -------
    dict
        Maps each of ``'auc'``, ``'acc'``, ``'sen'``, ``'spe'``, ``'bac'``,
        ``'mcc'`` and ``'cutoff'`` to a single-element list, so the result
        can be passed straight to ``pd.DataFrame`` as one row.
    """
    hard_labels = (y_pred >= cutoff) * 1

    # Confusion-matrix cells in sklearn's raveled order for binary labels.
    tn, fp, fn, tp = confusion_matrix(y_true, hard_labels).ravel()
    sen = tp / (tp + fn)
    spe = tn / (tn + fp)
    bac = (sen + spe) / 2

    acc = accuracy_score(y_true, hard_labels)
    mcc = matthews_corrcoef(y_true, hard_labels)
    # AUC is threshold-free, so it is computed from the raw scores.
    auc = roc_auc_score(y_true, y_pred)

    ordered_scores = (
        ('auc', auc),
        ('acc', acc),
        ('sen', sen),
        ('spe', spe),
        ('bac', bac),
        ('mcc', mcc),
        ('cutoff', cutoff),
    )
    return {key: [value] for key, value in ordered_scores}


def load_lgb(path):
    """Unpickle the object stored at ``path`` (intended for LGB model files).

    Parameters
    ----------
    path : str
        Pickle file location; must contain the substring ``'LGB'``.

    Returns
    -------
    object
        The unpickled object.

    Raises
    ------
    ValueError
        If ``path`` does not contain ``'LGB'``.
    """
    # Fail loudly on a mismatched path instead of falling through to an
    # unbound local variable.
    if 'LGB' not in path:
        raise ValueError(f"load_lgb expects a path containing 'LGB', got {path!r}")
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)


def load_rf(path):
    """Unpickle the object stored at ``path`` (intended for RandomForest model files).

    Parameters
    ----------
    path : str
        Pickle file location; must contain the substring ``'RandomForest'``.

    Returns
    -------
    object
        The unpickled object.

    Raises
    ------
    ValueError
        If ``path`` does not contain ``'RandomForest'``.
    """
    # Fail loudly on a mismatched path instead of falling through to an
    # unbound local variable.
    if 'RandomForest' not in path:
        raise ValueError(
            f"load_rf expects a path containing 'RandomForest', got {path!r}"
        )
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)


def load_svm(path):
    """Unpickle the object stored at ``path`` (intended for SVM model files).

    Parameters
    ----------
    path : str
        Pickle file location; must contain the substring ``'SVM'``.

    Returns
    -------
    object
        The unpickled object.

    Raises
    ------
    ValueError
        If ``path`` does not contain ``'SVM'``.
    """
    # Fail loudly on a mismatched path instead of falling through to an
    # unbound local variable.
    if 'SVM' not in path:
        raise ValueError(f"load_svm expects a path containing 'SVM', got {path!r}")
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)


def load_xgb(path):
    """Unpickle the object stored at ``path`` (intended for XGB model files).

    Parameters
    ----------
    path : str
        Pickle file location; must contain the substring ``'XGB'``.

    Returns
    -------
    object
        The unpickled object.

    Raises
    ------
    ValueError
        If ``path`` does not contain ``'XGB'``.
    """
    # Fail loudly on a mismatched path instead of falling through to an
    # unbound local variable.
    if 'XGB' not in path:
        raise ValueError(f"load_xgb expects a path containing 'XGB', got {path!r}")
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)


def load_nn(path_json, path_h5, path_transformer):
    """Rebuild a Keras model from its JSON architecture and weight file.

    Parameters
    ----------
    path_json : str
        Path to the architecture JSON; must contain the substring ``'NN'``.
    path_h5 : str
        Path to the weight file passed to ``model.load_weights``.
    path_transformer : str
        Path to an optional pickled transformer.

    Returns
    -------
    tuple
        ``(model, transformer)``. ``transformer`` is ``None`` when
        ``path_transformer`` does not exist, so callers must handle that case.

    Raises
    ------
    ValueError
        If ``path_json`` does not contain ``'NN'``.
    """
    # An implicit ``None`` return here would only surface later as an
    # unrelated unpacking error in the caller, so reject the path up front.
    if 'NN' not in path_json:
        raise ValueError(
            f"load_nn expects an architecture path containing 'NN', got {path_json!r}"
        )

    with open(path_json, 'r') as f:
        model = model_from_json(f.read())
    model.load_weights(path_h5)

    transformer = None
    if os.path.exists(path_transformer):
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(path_transformer, 'rb') as f:
            transformer = pickle.load(f)
    return model, transformer


def sort_preds(PREDS, IDXES):
    """Reorder fold-wise predictions by their sample indices.

    Parameters
    ----------
    PREDS : sequence of arrays
        Per-fold prediction arrays.
    IDXES : sequence of arrays
        Per-fold index arrays, aligned element-wise with ``PREDS``.

    Returns
    -------
    numpy.ndarray
        All predictions concatenated and arranged in ascending order of the
        concatenated indices.
    """
    # Permutation that sorts the pooled indices ascending.
    restore_order = np.argsort(np.concatenate(IDXES))
    pooled_preds = np.concatenate(PREDS)
    return pooled_preds[restore_order]

In [3]:
# Fold count and random seed handed to the StratifiedKFold splitters below.
n_splits_ncv, seed_ncv = 5, 1712

# Current working directory of the kernel, echoed so relative paths can be
# checked against it.
root = os.getcwd()
print(root)

/home/kurosaki/Document/Research/PJ0/Repository/Bioenv/src


In [4]:
f_names = [
    'DeepLocDescriptorAAindex.csv',
    'DeepLocDescriptorAutocorrelation.csv',
]
data_names = [
    'DeepLocAAindex',
    'DeepLocAutocorr',
]
model_names = [
    'LGBMClassifier',
    'RandomForestClassifier',
    'SVMClassifier',
    'NNClassifier',
]

# results[data_name][outer_fold][model_name] holds a one-row metrics DataFrame,
# or None while that combination has not been evaluated.
results = {
    data: {
        i: {name: None for name in model_names}
        for i in range(n_splits_ncv)
    }
    for data in data_names
}


def _nn_predict(model, transformer, X):
    """Return flattened NN predictions, applying ``transformer`` when one exists."""
    # load_nn returns ``None`` for the transformer when its pickle is missing.
    if transformer is not None:
        X = transformer.transform(X)
    # Flatten so roc_curve and the metrics receive 1-D scores rather than a column.
    return np.ravel(model.predict(X))


for f_name, data_name in zip(f_names, data_names):
    df = pd.read_csv(f'../data/DeepLoc/{f_name}')
    # Keep only rows whose third column is 'M' or 'S'; 'M' rows come first.
    M_idx = df.iloc[:, 2] == 'M'
    S_idx = df.iloc[:, 2] == 'S'
    df = pd.concat([df[M_idx], df[S_idx]], axis=0)
    X = np.array(df.iloc[:, 3:])
    # Labels follow the concatenation order above: 1 for 'M', 0 for 'S'.
    y = np.concatenate([
        np.ones(M_idx.sum(), dtype=int),
        np.zeros(S_idx.sum(), dtype=int),
    ])
    print(data_name)
    print('X_shape', X.shape)
    print('y_shape', y.shape)
    skf_outer = StratifiedKFold(n_splits=n_splits_ncv, random_state=seed_ncv, shuffle=True)
    outer_idxes = list(skf_outer.split(X, y))

    # evaluate pretrained models
    for i, (inner_idx, te_idx) in enumerate(outer_idxes):
        # Only the first outer fold is evaluated in this notebook.
        if i != 0:
            continue
        print('*'*100)
        print('fold', i)

        # The inner split depends only on the outer fold, so build it once and
        # share it across models. Use the dedicated inner splitter (same
        # settings as the outer one, so the folds themselves are unchanged).
        skf_inner = StratifiedKFold(n_splits=n_splits_ncv, random_state=seed_ncv, shuffle=True)
        inner_idxes = list(skf_inner.split(X[inner_idx], y[inner_idx]))

        for model_name in model_names:
            print('-'*100)
            print(model_name)
            model_dir = f'../results/models/{data_name}/{model_name}'
            te_preds, cutoffs = [], []
            for j, (tr_idx, va_idx) in enumerate(inner_idxes):
                X_valid = X[inner_idx][va_idx]
                X_test = X[te_idx]
                if 'LGB' in model_name:
                    model = load_lgb(f'{model_dir}/{model_name}_ij{i}{j}_trainedmodel.pkl')
                    # The documented keyword is ``num_iteration`` (singular).
                    va_pred = model.predict_proba(X_valid, num_iteration=model.best_iteration_)[:, 1]
                    te_pred = model.predict_proba(X_test, num_iteration=model.best_iteration_)[:, 1]
                elif 'RandomForest' in model_name or 'SVM' in model_name:
                    # Both model types share the same imputation and prediction steps.
                    loader = load_rf if 'RandomForest' in model_name else load_svm
                    model = loader(f'{model_dir}/{model_name}_ij{i}{j}_trainedmodel.pkl')
                    X_valid = fill_na_mean(X_valid)
                    X_test = fill_na_mean(X_test)
                    va_pred = model.predict_proba(X_valid)[:, 1]
                    te_pred = model.predict_proba(X_test)[:, 1]
                elif 'NN' in model_name:
                    # The architecture is stored per outer fold, weights per inner fold.
                    model, transformer = load_nn(
                        f'{model_dir}/{model_name}_i{i}_architecture.json',
                        f'{model_dir}/{model_name}_ij{i}{j}_trainedweight.h5',
                        f'{model_dir}/{model_name}_ij{i}{j}_transformer.pkl',
                    )
                    X_valid = fill_na_mean(X_valid)
                    X_test = fill_na_mean(X_test)
                    va_pred = _nn_predict(model, transformer, X_valid)
                    te_pred = _nn_predict(model, transformer, X_test)
                else:
                    # Never silently reuse predictions from a previous model.
                    raise ValueError(f'unsupported model name: {model_name!r}')
                # The threshold is tuned on the inner validation split only.
                cutoff = roc_cutoff(y[inner_idx][va_idx], va_pred)
                te_preds.append(te_pred)
                cutoffs.append(cutoff)
            # Ensemble the inner-fold models: average both test scores and cutoffs.
            te_pred_mean = np.mean(te_preds, axis=0)
            metrics = evaluate_clf(y[te_idx], te_pred_mean, np.mean(cutoffs))
            df_metrics = pd.DataFrame(metrics)
            print(df_metrics)
            results[data_name][i][model_name] = df_metrics
        print('*'*100)


DeepLocAAindex
X_shape (4832, 566)
y_shape (4832,)
****************************************************************************************************
fold 0
----------------------------------------------------------------------------------------------------
LGBMClassifier
       auc       acc       sen       spe      bac       mcc    cutoff
0  0.81733  0.754912  0.605598  0.857143  0.73137  0.483018  0.429081
----------------------------------------------------------------------------------------------------
RandomForestClassifier


[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.


        auc       acc       sen       spe       bac       mcc    cutoff
0  0.785683  0.749741  0.600509  0.851916  0.726213  0.471879  0.419848
----------------------------------------------------------------------------------------------------
SVMClassifier
        auc       acc       sen       spe       bac       mcc    cutoff
0  0.752795  0.722854  0.559796  0.834495  0.697146  0.413228  0.413129
----------------------------------------------------------------------------------------------------
NNClassifier


2022-07-20 11:33:22.116223: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-20 11:33:23.417567: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2022-07-20 11:33:23.420008: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2198655000 Hz


        auc       acc       sen       spe       bac       mcc    cutoff
0  0.818093  0.761117  0.585242  0.881533  0.733387  0.496522  0.480465
****************************************************************************************************
DeepLocAutocorr
X_shape (4832, 720)
y_shape (4832,)
****************************************************************************************************
fold 0
----------------------------------------------------------------------------------------------------
LGBMClassifier
        auc       acc       sen       spe       bac       mcc    cutoff
0  0.802143  0.752844  0.679389  0.803136  0.741263  0.485245  0.410101
----------------------------------------------------------------------------------------------------
RandomForestClassifier


[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.


        auc       acc       sen       spe       bac       mcc    cutoff
0  0.721729  0.690796  0.526718  0.803136  0.664927  0.344307  0.433009
----------------------------------------------------------------------------------------------------
SVMClassifier
        auc       acc  sen  spe  bac  mcc    cutoff
0  0.427408  0.593588  0.0  1.0  0.5  0.0  0.674078
----------------------------------------------------------------------------------------------------
NNClassifier
        auc       acc       sen       spe       bac       mcc    cutoff
0  0.767444  0.726991  0.582697  0.825784  0.704241  0.423463  0.471754
****************************************************************************************************


In [5]:
# Collect the outer-fold-0 metrics of every model into one table per dataset.
df_results = []
for data_name in data_names:
    result_dict = results[data_name][0]
    # Keep only models that were actually evaluated, so each row is labelled
    # with its own model name wherever the ``None`` placeholders sit.
    evaluated = {
        name: frame for name, frame in result_dict.items() if frame is not None
    }
    df_result = pd.concat(list(evaluated.values()))
    df_result['data'] = data_name
    # Each stored frame has exactly one row, so the names align one-to-one.
    df_result.index = list(evaluated.keys())
    df_results.append(df_result)

In [6]:
# Stack the per-dataset tables row-wise into one summary frame and display it.
df_summary = pd.concat(df_results, axis=0)
df_summary

Unnamed: 0,auc,acc,sen,spe,bac,mcc,cutoff,data
LGBMClassifier,0.81733,0.754912,0.605598,0.857143,0.73137,0.483018,0.429081,DeepLocAAindex
RandomForestClassifier,0.785683,0.749741,0.600509,0.851916,0.726213,0.471879,0.419848,DeepLocAAindex
SVMClassifier,0.752795,0.722854,0.559796,0.834495,0.697146,0.413228,0.413129,DeepLocAAindex
NNClassifier,0.818093,0.761117,0.585242,0.881533,0.733387,0.496522,0.480465,DeepLocAAindex
LGBMClassifier,0.802143,0.752844,0.679389,0.803136,0.741263,0.485245,0.410101,DeepLocAutocorr
RandomForestClassifier,0.721729,0.690796,0.526718,0.803136,0.664927,0.344307,0.433009,DeepLocAutocorr
SVMClassifier,0.427408,0.593588,0.0,1.0,0.5,0.0,0.674078,DeepLocAutocorr
NNClassifier,0.767444,0.726991,0.582697,0.825784,0.704241,0.423463,0.471754,DeepLocAutocorr


In [7]:
# Make sure the output directory exists so the export cannot fail at the very
# end of the run, then write the summary table.
os.makedirs('../results/summary', exist_ok=True)
df_summary.to_csv('../results/summary/DSCDeepLoc.csv')