In [15]:
import pandas as pd
import os

# Collect every per-protein prediction CSV found in `folder` into one long
# dataframe. Each file is expected to hold exactly two columns (site,
# probability); the file name up to its first '.' is stored as protein_id.
folder = 'proba/'
dfs = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the generated proba.csv reproducible between runs and machines.
for file in sorted(os.listdir(folder)):
    if file.endswith('.csv'):
        # os.path.join works whether or not `folder` ends with a separator.
        df = pd.read_csv(os.path.join(folder, file))
        df.columns = ['site', 'probability']
        df['protein_id'] = file.split('.')[0]
        dfs.append(df)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the folder holds no predictions.
if not dfs:
    raise ValueError(f'No .csv files found in {folder!r}')
df = pd.concat(dfs, ignore_index=True)

test = pd.read_csv('../../Embeddings/Prot_t5/test_t5.csv')

# Turn site identifiers such as 'site_12' into the integer 12 so they can be
# joined on `site` (assumes test's `site` column is integer-typed - TODO confirm).
# regex=False makes the literal replacement explicit, since the default of
# `regex` differs between pandas versions.
df['site'] = (
    df['site']
    .astype(str)
    .str.replace('site_', '', regex=False)
    .astype(int)
)

# Attach the columns of `test` to each prediction row by (protein_id, site).
df = df.merge(test, on=['protein_id', 'site'], how='left')

# Drop the columns that are not needed in the exported file.
df = df.drop(columns=['embedding', 'sequence'])

# Predictions with no matching row in `test` have no label; discard them.
df = df.dropna(subset=['label'])

print(len(df))

df.to_csv('proba.csv', index=False)

3085


In [4]:
from sklearn.metrics import accuracy_score, matthews_corrcoef, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix, average_precision_score
import numpy as np
def evaluate_model(y_pred_probs, y_true, print_metrics=True, threshold=0.5):
    """Score binary predictions against ground-truth labels.

    Parameters
    ----------
    y_pred_probs : array-like
        Predicted scores. Either positive-class scores of shape (n,) or
        (n, 1), or per-class scores of shape (n, 2), in which case column 1
        is taken as the positive-class score.
    y_true : array-like
        Ground-truth labels (0/1). A matrix with more than one column is
        treated as one-hot and reduced with argmax.
    print_metrics : bool, default True
        When true, print every metric with four decimals.
    threshold : float, default 0.5
        A sample is predicted positive when its score is strictly greater
        than this value.

    Returns
    -------
    tuple
        (accuracy, mcc, auc, auprc, precision, recall, specificity, f1).
        Specificity is NaN when there are no negative samples.

    Raises
    ------
    ValueError
        If `y_pred_probs` has more than one column but is not of shape (n, 2).
    """
    # Accept lists / pandas objects as well as numpy arrays.
    y_score = np.asarray(y_pred_probs)
    y_true = np.asarray(y_true)

    # Reduce one-hot labels to class indices; flatten a single-column matrix.
    if y_true.ndim > 1:
        if y_true.shape[1] > 1:
            y_true = np.argmax(y_true, axis=1)
        else:
            y_true = y_true.ravel()

    # The ranking metrics (AUC, AUPRC) need a 1-D positive-class score, so
    # per-class scores are reduced to their positive column before use.
    if y_score.ndim > 1 and y_score.shape[1] > 1:
        if y_score.ndim != 2 or y_score.shape[1] != 2:
            raise ValueError(
                'y_pred_probs must be 1-D, (n, 1) or (n, 2); '
                f'got shape {y_score.shape}'
            )
        y_score = y_score[:, 1]
    else:
        y_score = y_score.ravel()

    # Convert scores to hard 0/1 predictions.
    y_pred = (y_score > threshold).astype(int)

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)
    auc = roc_auc_score(y_true, y_score)
    auprc = average_precision_score(y_true, y_score)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # Specificity = TN / (TN + FP). labels=[0, 1] pins the matrix to 2x2 so
    # the four-way unpacking holds even if a class is absent.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    negatives = tn + fp
    specificity = tn / negatives if negatives else float('nan')

    # Print the results
    if print_metrics:
        print(f'Accuracy: {accuracy:.4f}')
        print(f'MCC: {mcc:.4f}')
        print(f'AUC: {auc:.4f}')
        print(f'AUPRC: {auprc:.4f}')
        print(f'Precision: {precision:.4f}')
        print(f'Recall: {recall:.4f}')
        print(f'Specificity: {specificity:.4f}')
        print(f'F1 Score: {f1:.4f}')

    return accuracy, mcc, auc, auprc, precision, recall, specificity, f1

In [5]:
import pandas as pd
# Reload the merged predictions written to proba.csv and pull out the score
# and label columns as arrays.
pSuc_df = pd.read_csv('proba.csv')
y_prob_pSuc, y_true_pSuc = (
    pSuc_df[column].values for column in ('probability', 'label')
)

# Print the metrics; as the cell's last expression the returned tuple is
# also shown as the cell output.
evaluate_model(y_prob_pSuc, y_true_pSuc)

Accuracy: 0.7446
MCC: 0.3088
AUC: 0.8357
AUPRC: 0.3077
Precision: 0.2041
Recall: 0.7875
Specificity: 0.7409
F1 Score: 0.3242


(0.7445705024311183,
 0.3088289817877136,
 0.8357322788517867,
 0.30769136881825615,
 0.2041036717062635,
 0.7875,
 0.7409490333919156,
 0.3241852487135506)