In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import (
    roc_curve,
    confusion_matrix,
    accuracy_score,
    matthews_corrcoef,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold
from keras.models import model_from_json
import pickle

In [2]:
def roc_cutoff(y_true, y_pred):
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    cutoff = thresholds[np.argmax(tpr - fpr)]
    return cutoff


def evaluate_clf(y_true, y_pred, cutoff):
    pred_label = (y_pred >= cutoff) * 1
    tn, fp, fn, tp = confusion_matrix(y_true, pred_label).ravel()
    accuracy = accuracy_score(y_true, pred_label)
    balanced_accuracy = (tp / (tp + fn) + tn / (tn + fp)) / 2
    mcc = matthews_corrcoef(y_true, pred_label)
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    auc = roc_auc_score(y_true, y_pred)
    metrics = {
        'auc': [auc],
        'acc': [accuracy],
        'sen': [sensitivity],
        'spe': [specificity],
        'bac': [balanced_accuracy],
        'mcc': [mcc],
        'cutoff': [cutoff]
    }
    return metrics


def load_lgb(path):
    if 'LGB' in path:
        with open(path, 'rb') as f:
            model = pickle.load(f)
    return model


def load_rf(path):
    if 'RandomForest' in path:
        with open(path, 'rb') as f:
            model = pickle.load(f)
    return model


def load_svm(path):
    if 'SVM' in path:
        with open(path, 'rb') as f:
            model = pickle.load(f)
    return model


def load_xgb(path):
    if 'XGB' in path:
        with open(path, 'rb') as f:
            model = pickle.load(f)
    return model


def load_nn(path_json, path_h5, path_transformer):
    if 'NN' in path_json:
        with open(path_json, 'r') as f:
            json_string = f.read()
        model = model_from_json(json_string)
        model.load_weights(path_h5)
        if os.path.exists(path_transformer):
            with open(path_transformer, 'rb') as f:
                transformer = pickle.load(f)
        else:
            transformer = None
        return model, transformer


def sort_preds(PREDS, IDXES):
    va_idxes = np.concatenate(IDXES)
    order = np.argsort(va_idxes)
    va_preds = np.concatenate(PREDS)
    return va_preds[order]

In [3]:
root = os.getcwd()
print(root)

n_splits_ncv = 5
seed_ncv = 1712

/home/kurosaki/Document/Research/PJ0/Repository/Bioenv/src


In [4]:
f_names = [
        'DeepPPIEmbedd_BERT_BFD.csv',
        'DeepPPIEmbedd_Albert_BFD.csv',
        'DeepPPIEmbedd_T5_BFD.csv',
        'DeepPPIEmbedd_T5_FT.csv'
    ]
data_names = [
    'DeepPPIBERT',
        'DeepPPIAlbert',
        'DeepPPIT5',
        'DeepPPIT5FT'
    ]

results = {
    data:{
        i:{
        'LGBMClassifier':None,
        'RandomForestClassifier':None,
        'NNClassifier':None
        }
        for i in range(5)
        }
        for data in data_names
}

for f_name, data_name in zip(f_names, data_names):
     # load core data set for PPI classification
    df = pd.read_csv('../data/DeepPPI/DeepPPIAll.csv')
    protein_a = np.array(df['proteinA'])
    protein_b = np.array(df['proteinB'])
    y = np.array(df['interaction'])

    # load features
    df_feature = pd.read_csv(f'../data/DeepPPI/{f_name}')
    # pre-processing of protein features
    feature_dict = {
        Id: np.array(df_feature[df_feature.iloc[:,0]==Id].iloc[:,1:])
        for Id in df_feature.iloc[:,0].tolist()
    }
    feature_a, feature_b = [], []
    for a, b in zip(protein_a, protein_b):
        feature_a.append(feature_dict[a])
        feature_b.append(feature_dict[b])
    X_a, X_b = np.concatenate(feature_a), np.concatenate(feature_b)
    X = np.concatenate([X_a, X_b], axis=1)
    print(data_name)
    print('X_shape', X.shape)
    print('y_shape', y.shape)
    skf_outer = StratifiedKFold(n_splits=n_splits_ncv, random_state=seed_ncv, shuffle=True) 
    outer_idxes = list(skf_outer.split(X, y))

    # evaluate ptrained model
    for i, (inner_idx, te_idx) in enumerate(outer_idxes):
        if i == 0:
            print('*'*100)
            print('fold', i)
            for model_name in ['LGBMClassifier', 'RandomForestClassifier', 'NNClassifier']:
                skf_inner = StratifiedKFold(n_splits=n_splits_ncv, random_state=seed_ncv, shuffle=True)
                inner_idxes = list(skf_outer.split(X[inner_idx], y[inner_idx]))
                print('-'*100)
                print(model_name)
                te_preds, cutoffs = [], []
                for j, (tr_idx, va_idx) in enumerate(inner_idxes):
                    if 'LGB' in model_name:
                        model = load_lgb(f'../results/models/{data_name}/LGBMClassifier/LGBMClassifier_ij{i}{j}_trainedmodel.pkl')
                        va_pred = model.predict_proba(X[inner_idx][va_idx], num_iterations=model.best_iteration_)[:,1]
                        te_pred = model.predict_proba(X[te_idx], num_iterations=model.best_iteration_)[:,1]
                    elif 'RandomForest' in model_name:
                        model = load_rf(f'../results/models/{data_name}/RandomForestClassifier/RandomForestClassifier_ij{i}{j}_trainedmodel.pkl')
                        va_pred = model.predict_proba(X[inner_idx][va_idx])[:,1]
                        te_pred = model.predict_proba(X[te_idx])[:,1]
                    elif 'NN' in model_name:
                        model, transformer = load_nn(
                            f'../results/models/{data_name}/NNClassifier/NNClassifier_i{i}_architecture.json',
                            f'../results/models/{data_name}/NNClassifier/NNClassifier_ij{i}{j}_trainedweight.h5',
                            f'../results/models/{data_name}/NNClassifier/NNClassifier_ij{i}{j}_transformer.pkl'
                            )
                        va_pred = model.predict(transformer.transform(X[inner_idx][va_idx]))
                        te_pred = model.predict(transformer.transform(X[te_idx]))
                cutoff = roc_cutoff(y[inner_idx][va_idx], va_pred)
                te_preds.append(te_pred)
                cutoffs.append(cutoff)
                te_pred_mean = np.mean(te_preds, axis=0)
                metrics = evaluate_clf(y[te_idx], te_pred_mean, np.mean(cutoffs))
                print(pd.DataFrame(metrics))
                results[data_name][i][model_name] = pd.DataFrame(metrics)
            print('*'*100)


DeepPPIBERT
X_shape (65851, 2048)
y_shape (65851,)
****************************************************************************************************
fold 0
----------------------------------------------------------------------------------------------------
LGBMClassifier
       auc       acc       sen       spe       bac       mcc    cutoff
0  0.98096  0.942753  0.926419  0.948554  0.937487  0.856277  0.244995
----------------------------------------------------------------------------------------------------
RandomForestClassifier


[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.


        auc       acc       sen       spe       bac       mcc    cutoff
0  0.934172  0.896211  0.843279  0.915012  0.879146  0.739729  0.260835
----------------------------------------------------------------------------------------------------
NNClassifier


2022-06-15 11:37:12.727493: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-06-15 11:37:21.027225: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2022-06-15 11:37:21.027604: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2198655000 Hz


        auc       acc       sen       spe       bac       mcc    cutoff
0  0.948846  0.894769  0.875435  0.901636  0.888535  0.744296  0.229315
****************************************************************************************************
DeepPPIAlbert
X_shape (65851, 8192)
y_shape (65851,)
****************************************************************************************************
fold 0
----------------------------------------------------------------------------------------------------
LGBMClassifier
        auc       acc       sen       spe       bac       mcc    cutoff
0  0.980465  0.939564  0.929896  0.942998  0.936447  0.849684  0.236839
----------------------------------------------------------------------------------------------------
RandomForestClassifier


[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.


        auc      acc       sen       spe       bac       mcc    cutoff
0  0.890095  0.85339  0.784183  0.877971  0.831077  0.638169  0.272296
----------------------------------------------------------------------------------------------------
NNClassifier
        auc       acc      sen       spe       bac       mcc    cutoff
0  0.968254  0.921494  0.90759  0.926433  0.917011  0.806545  0.227431
****************************************************************************************************
DeepPPIT5
X_shape (65851, 2048)
y_shape (65851,)
****************************************************************************************************
fold 0
----------------------------------------------------------------------------------------------------
LGBMClassifier
        auc       acc       sen       spe       bac       mcc    cutoff
0  0.981526  0.944575  0.931344  0.949275  0.940309  0.861093  0.242883
------------------------------------------------------------------------------------

[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.


        auc       acc      sen       spe       bac       mcc    cutoff
0  0.930297  0.879052  0.86095  0.885482  0.873216  0.709666  0.186697
----------------------------------------------------------------------------------------------------
NNClassifier
        auc       acc       sen       spe       bac       mcc    cutoff
0  0.964751  0.914585  0.900348  0.919642  0.909995  0.790523  0.240993
****************************************************************************************************
DeepPPIT5FT
X_shape (65851, 2048)
y_shape (65851,)
****************************************************************************************************
fold 0
----------------------------------------------------------------------------------------------------
LGBMClassifier
        auc       acc       sen      spe       bac       mcc    cutoff
0  0.981817  0.940475  0.934241  0.94269  0.938465  0.852395  0.218906
----------------------------------------------------------------------------------

[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.


       auc       acc       sen      spe       bac       mcc    cutoff
0  0.93963  0.897806  0.852549  0.91388  0.883215  0.745103  0.237156
----------------------------------------------------------------------------------------------------
NNClassifier
        auc       acc       sen       spe     bac      mcc    cutoff
0  0.957855  0.897426  0.894554  0.898446  0.8965  0.75445  0.211178
****************************************************************************************************


In [5]:
df_results = []
for data_name in data_names:
    result_dict = results[data_name][0]
    df_result = pd.concat([v for v in result_dict.values()])
    df_result['data'] = data_name
    df_result.index = list(result_dict.keys())[-len(df_result):]
    df_results.append(df_result)

In [6]:
df_summary = pd.concat(df_results)
df_summary

Unnamed: 0,auc,acc,sen,spe,bac,mcc,cutoff,data
LGBMClassifier,0.98096,0.942753,0.926419,0.948554,0.937487,0.856277,0.244995,DeepPPIBERT
RandomForestClassifier,0.934172,0.896211,0.843279,0.915012,0.879146,0.739729,0.260835,DeepPPIBERT
NNClassifier,0.948846,0.894769,0.875435,0.901636,0.888535,0.744296,0.229315,DeepPPIBERT
LGBMClassifier,0.980465,0.939564,0.929896,0.942998,0.936447,0.849684,0.236839,DeepPPIAlbert
RandomForestClassifier,0.890095,0.85339,0.784183,0.877971,0.831077,0.638169,0.272296,DeepPPIAlbert
NNClassifier,0.968254,0.921494,0.90759,0.926433,0.917011,0.806545,0.227431,DeepPPIAlbert
LGBMClassifier,0.981526,0.944575,0.931344,0.949275,0.940309,0.861093,0.242883,DeepPPIT5
RandomForestClassifier,0.930297,0.879052,0.86095,0.885482,0.873216,0.709666,0.186697,DeepPPIT5
NNClassifier,0.964751,0.914585,0.900348,0.919642,0.909995,0.790523,0.240993,DeepPPIT5
LGBMClassifier,0.981817,0.940475,0.934241,0.94269,0.938465,0.852395,0.218906,DeepPPIT5FT


In [7]:
df_summary.to_csv('../results/summary/LMDeepPPI.csv')