In [4]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import (
    roc_curve,
    confusion_matrix,
    accuracy_score,
    matthews_corrcoef,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold
from keras.models import model_from_json
import pickle

In [1]:
def roc_cutoff(y_true, y_pred):
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    cutoff = thresholds[np.argmax(tpr - fpr)]
    return cutoff


def evaluate_clf(y_true, y_pred, cutoff):
    pred_label = (y_pred >= cutoff) * 1
    tn, fp, fn, tp = confusion_matrix(y_true, pred_label).ravel()
    accuracy = accuracy_score(y_true, pred_label)
    balanced_accuracy = (tp / (tp + fn) + tn / (tn + fp)) / 2
    mcc = matthews_corrcoef(y_true, pred_label)
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    auc = roc_auc_score(y_true, y_pred)
    metrics = {
        'auc': [auc],
        'acc': [accuracy],
        'sen': [sensitivity],
        'spe': [specificity],
        'bac': [balanced_accuracy],
        'mcc': [mcc],
        'cutoff': [cutoff]
    }
    return metrics


def load_lgb(path):
    if 'LGB' in path:
        with open(path, 'rb') as f:
            model = pickle.load(f)
    return model


def load_rf(path):
    if 'RandomForest' in path:
        with open(path, 'rb') as f:
            model = pickle.load(f)
    return model


def load_svm(path):
    if 'SVM' in path:
        with open(path, 'rb') as f:
            model = pickle.load(f)
    return model


def load_xgb(path):
    if 'XGB' in path:
        with open(path, 'rb') as f:
            model = pickle.load(f)
    return model


def load_nn(path_json, path_h5, path_transformer):
    if 'NN' in path_json:
        with open(path_json, 'r') as f:
            json_string = f.read()
        model = model_from_json(json_string)
        model.load_weights(path_h5)
        if os.path.exists(path_transformer):
            with open(path_transformer, 'rb') as f:
                transformer = pickle.load(f)
        else:
            transformer = None
        return model, transformer


def sort_preds(PREDS, IDXES):
    va_idxes = np.concatenate(IDXES)
    order = np.argsort(va_idxes)
    va_preds = np.concatenate(PREDS)
    return va_preds[order]

In [5]:
root = os.getcwd()
print(root)

n_splits_ncv = 5
seed_ncv = 1712

/home/kurosaki/Document/Research/PJ0/Repository/Bioenv/src


In [13]:
f_names = [
        'DeepLocEmbedd_BERT_BFD.csv',
        'DeepLocEmbedd_Albert_BFD.csv',
        'DeepLocEmbedd_T5_BFD.csv',
        'DeepLocEmbedd_T5_FT.csv'
    ]
data_names = [
    'DeepLocBERT',
    'DeepLocAlbert',
    'DeepLocT5',
    'DeepLocT5FT'
    ]

results = {
    data:{
        i:{
        'LGBMClassifier':None,
        'RandomForestClassifier':None,
        'SVMClassifier':None
        }
        for i in range(5)
        }
        for data in data_names
}

for f_name, data_name in zip(f_names, data_names):
    df = pd.read_csv(f'../data/DeepLoc/{f_name}')
    M_idx = df.iloc[:,1]=='M'
    S_idx = df.iloc[:,1]=='S'
    df = pd.concat([df[M_idx], df[S_idx]], axis=0)
    X = np.array(df.iloc[:,2:])
    y = np.concatenate([np.array([1]*sum(M_idx)), np.array([0]*sum(S_idx))]).flatten()
    print(data_name)
    print('X_shape', X.shape)
    print('y_shape', y.shape)
    skf_outer = StratifiedKFold(n_splits=n_splits_ncv, random_state=seed_ncv, shuffle=True) 
    outer_idxes = list(skf_outer.split(X, y))

    # evaluate ptrained model
    for i, (inner_idx, te_idx) in enumerate(outer_idxes):
        if i == 0:
            print('*'*100)
            print('fold', i)
            for model_name in ['LGBMClassifier', 'RandomForestClassifier', 'SVMClassifier', 'NNClassifier']:
                skf_inner = StratifiedKFold(n_splits=n_splits_ncv, random_state=seed_ncv, shuffle=True)
                inner_idxes = list(skf_outer.split(X[inner_idx], y[inner_idx]))
                print('-'*100)
                print(model_name)
                te_preds, cutoffs = [], []
                for j, (tr_idx, va_idx) in enumerate(inner_idxes):
                    if 'LGB' in model_name:
                        model = load_lgb(f'../results/models/{data_name}/LGBMClassifier/LGBMClassifier_ij{i}{j}_trainedmodel.pkl')
                        va_pred = model.predict_proba(X[inner_idx][va_idx], num_iterations=model.best_iteration_)[:,1]
                        te_pred = model.predict_proba(X[te_idx], num_iterations=model.best_iteration_)[:,1]
                    elif 'RandomForest' in model_name:
                        model = load_rf(f'../results/models/{data_name}/RandomForestClassifier/RandomForestClassifier_ij{i}{j}_trainedmodel.pkl')
                        va_pred = model.predict_proba(X[inner_idx][va_idx])[:,1]
                        te_pred = model.predict_proba(X[te_idx])[:,1]
                    elif 'SVM' in model_name:
                        model = load_svm(f'../results/models/{data_name}/SVMClassifier/SVMClassifier_ij{i}{j}_trainedmodel.pkl')
                        va_pred = model.predict_proba(X[inner_idx][va_idx])[:,1]
                        te_pred = model.predict_proba(X[te_idx])[:,1]
                    elif 'NN' in model_name:
                        model, transformer = load_nn(
                            f'../results/models/{data_name}/NNClassifier/NNClassifier_i{i}_architecture.json',
                            f'../results/models/{data_name}/NNClassifier/NNClassifier_ij{i}{j}_trainedweight.h5',
                            f'../results/models/{data_name}/NNClassifier/NNClassifier_ij{i}{j}_transformer.pkl'
                            )
                        va_pred = model.predict(transformer.transform(X[inner_idx][va_idx]))
                        te_pred = model.predict(transformer.transform(X[te_idx]))
                cutoff = roc_cutoff(y[inner_idx][va_idx], va_pred)
                te_preds.append(te_pred)
                cutoffs.append(cutoff)
                te_pred_mean = np.mean(te_preds, axis=0)
                metrics = evaluate_clf(y[te_idx], te_pred_mean, np.mean(cutoffs))
                print(pd.DataFrame(metrics))
                results[data_name][i][model_name] = pd.DataFrame(metrics)
            print('*'*100)


DeepLocBERT
X_shape (4832, 1024)
y_shape (4832,)
****************************************************************************************************
fold 0
----------------------------------------------------------------------------------------------------
LGBMClassifier
        auc       acc       sen       spe       bac       mcc    cutoff
0  0.916598  0.864529  0.821883  0.893728  0.857806  0.718339  0.317998
----------------------------------------------------------------------------------------------------
RandomForestClassifier
        auc       acc       sen       spe       bac       mcc    cutoff
0  0.885594  0.825233  0.659033  0.939024  0.799029  0.638578  0.472167
----------------------------------------------------------------------------------------------------
SVMClassifier
        auc       acc      sen       spe       bac      mcc    cutoff
0  0.839267  0.815926  0.78626  0.836237  0.811248  0.62015  0.393979
------------------------------------------------------------

In [17]:
df_results

[                             auc       acc       sen       spe       bac  \
 LGBMClassifier          0.916598  0.864529  0.821883  0.893728  0.857806   
 RandomForestClassifier  0.885594  0.825233  0.659033  0.939024  0.799029   
 SVMClassifier           0.839267  0.815926  0.786260  0.836237  0.811248   
 NNClassifier            0.915800  0.849018  0.821883  0.867596  0.844739   
 
                              mcc    cutoff         data  
 LGBMClassifier          0.718339  0.317998  DeepLocBERT  
 RandomForestClassifier  0.638578  0.472167  DeepLocBERT  
 SVMClassifier           0.620150  0.393979  DeepLocBERT  
 NNClassifier            0.687880  0.351950  DeepLocBERT  ,
                              auc       acc       sen       spe       bac  \
 LGBMClassifier          0.920991  0.844881  0.809160  0.869338  0.839249   
 RandomForestClassifier  0.859625  0.804550  0.648855  0.911150  0.780002   
 SVMClassifier           0.494856  0.593588  0.000000  1.000000  0.500000   
 NNClassi

In [14]:
df_results = []
for data_name in data_names:
    result_dict = results[data_name][0]
    df_result = pd.concat([v for v in result_dict.values()])
    df_result['data'] = data_name
    df_result.index = list(result_dict.keys())[-len(df_result):]
    df_results.append(df_result)

In [16]:
df_summary = pd.concat(df_results)
df_summary

Unnamed: 0,auc,acc,sen,spe,bac,mcc,cutoff,data
LGBMClassifier,0.916598,0.864529,0.821883,0.893728,0.857806,0.718339,0.317998,DeepLocBERT
RandomForestClassifier,0.885594,0.825233,0.659033,0.939024,0.799029,0.638578,0.472167,DeepLocBERT
SVMClassifier,0.839267,0.815926,0.78626,0.836237,0.811248,0.62015,0.393979,DeepLocBERT
NNClassifier,0.9158,0.849018,0.821883,0.867596,0.844739,0.68788,0.35195,DeepLocBERT
LGBMClassifier,0.920991,0.844881,0.80916,0.869338,0.839249,0.678498,0.291704,DeepLocAlbert
RandomForestClassifier,0.859625,0.80455,0.648855,0.91115,0.780002,0.591402,0.462963,DeepLocAlbert
SVMClassifier,0.494856,0.593588,0.0,1.0,0.5,0.0,1.406519,DeepLocAlbert
NNClassifier,0.925739,0.864529,0.832061,0.88676,0.85941,0.719111,0.318228,DeepLocAlbert
LGBMClassifier,0.936201,0.855222,0.862595,0.850174,0.856385,0.705354,0.276454,DeepLocT5
RandomForestClassifier,0.903219,0.807653,0.839695,0.785714,0.812704,0.615582,0.349314,DeepLocT5


In [7]:
df_summary.to_csv('../results/summary/LMDeepLoc.csv')