In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import (
    roc_curve,
    confusion_matrix,
    accuracy_score,
    matthews_corrcoef,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold
from keras.models import model_from_json
import pickle

In [2]:
def roc_cutoff(y_true, y_pred):
    """Pick the ROC threshold at which (TPR - FPR) is largest.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels, passed straight to ``roc_curve``.
    y_pred : array-like
        Continuous scores for the positive class, passed straight to
        ``roc_curve``.

    Returns
    -------
    The entry of the ``roc_curve`` thresholds array whose TPR - FPR is
    maximal (first such entry when there are ties, per ``np.argmax``).
    """
    false_pos_rate, true_pos_rate, candidate_thresholds = roc_curve(y_true, y_pred)
    # TPR - FPR equals sensitivity + specificity - 1 (Youden's J statistic).
    youden_j = true_pos_rate - false_pos_rate
    best_position = np.argmax(youden_j)
    return candidate_thresholds[best_position]


def evaluate_clf(y_true, y_pred, cutoff):
    """Compute binary-classification metrics for scores thresholded at ``cutoff``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; expected to be encoded as 0 (negative) and
        1 (positive), matching the 0/1 labels produced from ``y_pred``.
    y_pred : numpy.ndarray
        Continuous scores; a sample is labelled positive when its score is
        greater than or equal to ``cutoff``.
    cutoff : float
        Decision threshold applied to ``y_pred``.

    Returns
    -------
    dict
        Keys 'auc', 'acc', 'sen', 'spe', 'bac', 'mcc' and 'cutoff', each
        mapped to a one-element list so the dict can be passed directly to
        ``pd.DataFrame``.
    """
    pred_label = (y_pred >= cutoff) * 1
    # labels=[0, 1] pins the matrix to 2x2 in (negative, positive) order, so the
    # four-way unpacking below cannot fail when only one class shows up.
    tn, fp, fn, tp = confusion_matrix(y_true, pred_label, labels=[0, 1]).ravel()
    accuracy = accuracy_score(y_true, pred_label)
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    # Balanced accuracy is the mean of sensitivity and specificity.
    balanced_accuracy = (sensitivity + specificity) / 2
    mcc = matthews_corrcoef(y_true, pred_label)
    # AUC is threshold-free, so it is computed on the raw scores.
    auc = roc_auc_score(y_true, y_pred)
    metrics = {
        'auc': [auc],
        'acc': [accuracy],
        'sen': [sensitivity],
        'spe': [specificity],
        'bac': [balanced_accuracy],
        'mcc': [mcc],
        'cutoff': [cutoff]
    }
    return metrics


def load_lgb(path):
    """Load a pickled model from a path that contains 'LGB'.

    Parameters
    ----------
    path : str
        Location of the pickle file; must contain the substring 'LGB'.

    Returns
    -------
    object
        The object stored in the pickle file.

    Raises
    ------
    ValueError
        If 'LGB' does not occur in ``path``. Without this check the function
        would reach ``return`` with no model bound and fail with an
        UnboundLocalError.
    """
    if 'LGB' not in path:
        raise ValueError(f"load_lgb expects a path containing 'LGB', got {path!r}")
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)


def load_rf(path):
    """Load a pickled model from a path that contains 'RandomForest'.

    Parameters
    ----------
    path : str
        Location of the pickle file; must contain the substring
        'RandomForest'.

    Returns
    -------
    object
        The object stored in the pickle file.

    Raises
    ------
    ValueError
        If 'RandomForest' does not occur in ``path``. Without this check the
        function would reach ``return`` with no model bound and fail with an
        UnboundLocalError.
    """
    if 'RandomForest' not in path:
        raise ValueError(f"load_rf expects a path containing 'RandomForest', got {path!r}")
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)


def load_svm(path):
    """Load a pickled model from a path that contains 'SVM'.

    Parameters
    ----------
    path : str
        Location of the pickle file; must contain the substring 'SVM'.

    Returns
    -------
    object
        The object stored in the pickle file.

    Raises
    ------
    ValueError
        If 'SVM' does not occur in ``path``. Without this check the function
        would reach ``return`` with no model bound and fail with an
        UnboundLocalError.
    """
    if 'SVM' not in path:
        raise ValueError(f"load_svm expects a path containing 'SVM', got {path!r}")
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)


def load_xgb(path):
    """Load a pickled model from a path that contains 'XGB'.

    Parameters
    ----------
    path : str
        Location of the pickle file; must contain the substring 'XGB'.

    Returns
    -------
    object
        The object stored in the pickle file.

    Raises
    ------
    ValueError
        If 'XGB' does not occur in ``path``. Without this check the function
        would reach ``return`` with no model bound and fail with an
        UnboundLocalError.
    """
    if 'XGB' not in path:
        raise ValueError(f"load_xgb expects a path containing 'XGB', got {path!r}")
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)


def load_nn(path_json, path_h5, path_transformer):
    """Rebuild a Keras model from its JSON architecture and weight file.

    Parameters
    ----------
    path_json : str
        File holding the model architecture as JSON; must contain the
        substring 'NN'.
    path_h5 : str
        Weight file handed to ``model.load_weights``.
    path_transformer : str
        Optional pickle file with a fitted input transformer.

    Returns
    -------
    tuple
        ``(model, transformer)``; ``transformer`` is None when
        ``path_transformer`` does not exist on disk.

    Raises
    ------
    ValueError
        If 'NN' does not occur in ``path_json``. Without this check the
        function would silently return None and the caller's tuple
        unpacking would fail with an unrelated TypeError.
    """
    if 'NN' not in path_json:
        raise ValueError(f"load_nn expects path_json to contain 'NN', got {path_json!r}")
    with open(path_json, 'r') as f:
        json_string = f.read()
    model = model_from_json(json_string)
    model.load_weights(path_h5)
    transformer = None
    if os.path.exists(path_transformer):
        # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
        with open(path_transformer, 'rb') as f:
            transformer = pickle.load(f)
    return model, transformer


def sort_preds(PREDS, IDXES):
    """Concatenate per-fold predictions and reorder them by their indices.

    Parameters
    ----------
    PREDS : sequence of numpy.ndarray
        Prediction arrays, one per fold.
    IDXES : sequence of numpy.ndarray
        Index arrays, one per fold, aligned element-wise with ``PREDS``.

    Returns
    -------
    numpy.ndarray
        All predictions concatenated, arranged so that their matching
        indices are in ascending order.
    """
    stacked_preds = np.concatenate(PREDS)
    stacked_idxes = np.concatenate(IDXES)
    # argsort yields the permutation that puts the indices in ascending order.
    return stacked_preds[np.argsort(stacked_idxes)]

In [3]:
# Nested cross-validation settings: number of folds and the shuffling seed.
seed_ncv = 1712
n_splits_ncv = 5

# Current working directory, printed for reference.
root = os.getcwd()
print(root)

/home/kurosaki/Document/Research/PJ0/Repository/Bioenv/src


In [8]:
# Embedding tables to evaluate and the short dataset name used for each one
# in model paths and in the results dictionary (the two lists are zipped).
f_names = [
    'DeepLocEmbedd_BERT_BFD.csv',
    'DeepLocEmbedd_Albert_BFD.csv',
    'DeepLocEmbedd_T5_BFD.csv',
    'DeepLocEmbedd_T5_FT.csv'
]
data_names = [
    'DeepLocBERT',
    'DeepLocAlbert',
    'DeepLocT5',
    'DeepLocT5FT'
]

# results[data_name][outer_fold][model_name] holds a one-row metrics DataFrame,
# or None while that combination has not been evaluated.
# The fold keys follow n_splits_ncv so they stay in sync with the CV splitters.
results = {
    data: {
        i: {
            'LGBMClassifier': None,
            'RandomForestClassifier': None,
            'SVMClassifier': None,
            'NNClassifier': None
        }
        for i in range(n_splits_ncv)
    }
    for data in data_names
}

for f_name, data_name in zip(f_names, data_names):
    # Load the embedding table and keep only rows whose second column is 'M' or 'S'.
    df = pd.read_csv(f'../data/DeepLoc/{f_name}')
    M_idx = df.iloc[:, 1] == 'M'
    S_idx = df.iloc[:, 1] == 'S'
    df = pd.concat([df[M_idx], df[S_idx]], axis=0)
    X = np.array(df.iloc[:, 2:])
    # Rows are stacked M-first, so the labels are a run of 1s ('M') followed by 0s ('S').
    y = np.concatenate([
        np.ones(int(M_idx.sum()), dtype=int),
        np.zeros(int(S_idx.sum()), dtype=int),
    ])
    print(data_name)
    print('X_shape', X.shape)
    print('y_shape', y.shape)
    skf_outer = StratifiedKFold(n_splits=n_splits_ncv, random_state=seed_ncv, shuffle=True)
    outer_idxes = list(skf_outer.split(X, y))

    # evaluate pretrained model
    for i, (inner_idx, te_idx) in enumerate(outer_idxes):
        # Only the first outer fold is evaluated in this cell.
        if i != 0:
            continue
        print('*'*100)
        print('fold', i)

        # Slice the outer-train / outer-test partitions once instead of on every use.
        X_inner, y_inner = X[inner_idx], y[inner_idx]
        X_te, y_te = X[te_idx], y[te_idx]

        # The inner split depends only on the outer fold, so build it once and
        # take it from the inner splitter (the original called skf_outer here).
        skf_inner = StratifiedKFold(n_splits=n_splits_ncv, random_state=seed_ncv, shuffle=True)
        inner_idxes = list(skf_inner.split(X_inner, y_inner))

        for model_name in [
            # 'LGBMClassifier',
            # 'RandomForestClassifier',
            'SVMClassifier',
            # 'NNClassifier'
        ]:
            print('-'*100)
            print(model_name)
            va_preds, va_idxes, te_preds, cutoffs = [], [], [], []
            for j, (tr_idx, va_idx) in enumerate(inner_idxes):
                X_va, y_va = X_inner[va_idx], y_inner[va_idx]
                if 'LGB' in model_name:
                    model = load_lgb(f'../results/models/{data_name}/LGBMClassifier/LGBMClassifier_ij{i}{j}_trainedmodel.pkl')
                    va_pred = model.predict_proba(X_va, num_iterations=model.best_iteration_)[:, 1]
                    te_pred = model.predict_proba(X_te, num_iterations=model.best_iteration_)[:, 1]
                elif 'RandomForest' in model_name:
                    model = load_rf(f'../results/models/{data_name}/RandomForestClassifier/RandomForestClassifier_ij{i}{j}_trainedmodel.pkl')
                    va_pred = model.predict_proba(X_va)[:, 1]
                    te_pred = model.predict_proba(X_te)[:, 1]
                elif 'SVM' in model_name:
                    model = load_svm(f'../results/models/{data_name}/SVMClassifier/SVMClassifier_ij{i}{j}_trainedmodel.pkl')
                    # NOTE(review): predict_score is taken to be a method of the pickled
                    # project class; it is not part of scikit-learn's SVC API.
                    va_pred = model.predict_score(X_va)
                    te_pred = model.predict_score(X_te)
                elif 'NN' in model_name:
                    model, transformer = load_nn(
                        f'../results/models/{data_name}/NNClassifier/NNClassifier_i{i}_architecture.json',
                        f'../results/models/{data_name}/NNClassifier/NNClassifier_ij{i}{j}_trainedweight.h5',
                        f'../results/models/{data_name}/NNClassifier/NNClassifier_ij{i}{j}_transformer.pkl'
                    )
                    # load_nn returns transformer=None when no transformer file exists;
                    # in that case feed the raw features to the network.
                    if transformer is not None:
                        X_va_nn = transformer.transform(X_va)
                        X_te_nn = transformer.transform(X_te)
                    else:
                        X_va_nn, X_te_nn = X_va, X_te
                    # Flatten so NN scores are 1-D like the other models' scores
                    # (assumes a single output unit -- TODO confirm).
                    va_pred = np.ravel(model.predict(X_va_nn))
                    te_pred = np.ravel(model.predict(X_te_nn))
                else:
                    raise ValueError(f'Unknown model name: {model_name!r}')
                # The cutoff is chosen on the inner validation fold only.
                cutoff = roc_cutoff(y_va, va_pred)
                va_preds.append(va_pred)
                va_idxes.append(va_idx)
                te_preds.append(te_pred)
                cutoffs.append(cutoff)
            # Ensemble the inner-fold models: average their test scores and their cutoffs.
            te_pred_mean = np.mean(te_preds, axis=0)
            # Out-of-fold validation scores restored to inner-set order (not used below).
            va_pred_sprted = sort_preds(va_preds, va_idxes)
            metrics = evaluate_clf(y_te, te_pred_mean, np.mean(cutoffs))
            print(pd.DataFrame(metrics))
            results[data_name][i][model_name] = pd.DataFrame(metrics)
        print('*'*100)


DeepLocBERT
X_shape (4832, 1024)
y_shape (4832,)
****************************************************************************************************
fold 0
----------------------------------------------------------------------------------------------------
SVMClassifier
        auc       acc       sen       spe       bac       mcc    cutoff
0  0.858655  0.829369  0.814249  0.839721  0.826985  0.649554 -0.202649
****************************************************************************************************
DeepLocAlbert
X_shape (4832, 4096)
y_shape (4832,)
****************************************************************************************************
fold 0
----------------------------------------------------------------------------------------------------
SVMClassifier
        auc       acc       sen       spe       bac       mcc    cutoff
0  0.809433  0.778697  0.600509  0.900697  0.750603  0.535679 -0.650804
*****************************************************************

In [30]:
# Collect, per dataset, the metrics of every model evaluated on outer fold 0.
df_results = []
for data_name in data_names:
    result_dict = results[data_name][0]
    # Models that were not evaluated are still None; keep only the real tables.
    evaluated = {name: frame for name, frame in result_dict.items() if frame is not None}
    df_result = pd.concat(list(evaluated.values()))
    df_result['data'] = data_name
    # Each stored table holds one row, so the model names line up one-to-one with
    # the concatenated rows. A fixed ['SVMClassifier'] index would break as soon
    # as a second model is evaluated.
    df_result.index = list(evaluated.keys())
    df_results.append(df_result)

In [32]:
# Stack the per-dataset tables row-wise into one summary and display it.
df_summary = pd.concat(df_results, axis=0)
df_summary

Unnamed: 0,auc,acc,sen,spe,bac,mcc,cutoff,data
SVMClassifier,0.858655,0.829369,0.814249,0.839721,0.826985,0.649554,-0.202649,DeepLocBERT
SVMClassifier,0.809433,0.778697,0.600509,0.900697,0.750603,0.535679,-0.650804,DeepLocAlbert
SVMClassifier,0.850999,0.825233,0.806616,0.837979,0.822297,0.640648,-0.189951,DeepLocT5
SVMClassifier,0.854104,0.825233,0.80916,0.836237,0.822699,0.641039,-0.188526,DeepLocT5FT


In [33]:
# Create the output directory first so the export also works on a fresh checkout;
# to_csv does not create missing parent directories.
os.makedirs('../results/summary', exist_ok=True)
df_summary.to_csv('../results/summary/LMDeepLoc_SVM.csv')