In [2]:
import sys

import os
import json
import time
import argparse

import pandas as pd
import numpy as np

from Bio import SeqIO
import esm

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset

from torchinfo import summary

from sklearn.metrics import f1_score, roc_auc_score, roc_curve, auc, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import wandb

sys.path.append('./../../../src/')

from utils import *
from utils_torch import * 
from MHCCBM import *
from TAPPredictor import *
from tqdm import tqdm

In [5]:
# Load the peptide/label tables of the DeepTAP train/test split.
train_df = pd.read_csv('./../../../data/TAP/DeepTAP_train_test_split/train.csv')
test_df = pd.read_csv('./../../../data/TAP/DeepTAP_train_test_split/test.csv')

# Load the pickled lookup that is indexed by peptide string below
# (the file name suggests ESM-1b embeddings — TODO confirm).
# NOTE(review): `pickle` is not imported explicitly in this notebook; presumably it
# is re-exported by one of the star imports (utils / utils_torch / ...) — confirm,
# otherwise this cell raises NameError on a fresh kernel.
# SECURITY: pickle.load can execute arbitrary code; only load files you trust.
# The `with` block closes the file on exit, so no explicit f.close() is needed.
with open('./../../../data/TAP/classification_peptides_esm1b.pkl', 'rb') as f:
    peptide_embedding_dict = pickle.load(f)

# Look up the entry for every peptide (in CSV row order) and concatenate them
# with torch.cat (default dim=0). A peptide missing from the dict raises KeyError.
# assumes each dict value is a tensor with a leading per-peptide dimension — TODO confirm
train_sequences = torch.cat([peptide_embedding_dict[p] for p in train_df['peptide'].to_list()])
train_labels = train_df['label'].to_numpy()
test_sequences = torch.cat([peptide_embedding_dict[p] for p in test_df['peptide'].to_list()])
test_labels = test_df['label'].to_numpy()


## Performance of DeepTAP on its own data split

In [2]:
# Evaluate DeepTAP's classification predictions on the test set of its own split.
# Ground-truth labels come from test.csv; predictions come from the DeepTAP result file.
deeptap_split_df = pd.read_csv('./../../../data/TAP/DeepTAP_train_test_split/test.csv')
deeptap_split_result_df = pd.read_csv('./../../../data/TAP/DeepTAP_train_test_split/deeptap_results/test_peptides_deeptap_DeepTAP_cla_predresult.csv')

# Inner join on the peptide string attaches the true label to each prediction row.
deeptap_merged_df = deeptap_split_result_df.merge(deeptap_split_df,on='peptide')

# An inner merge silently drops peptides that are missing from either table and
# multiplies rows for repeated peptides, so the metrics below could be computed on
# something other than the test set. Report (but do not abort on) a size mismatch.
if len(deeptap_merged_df) != len(deeptap_split_df):
    print(f"WARNING: merge produced {len(deeptap_merged_df)} rows for "
          f"{len(deeptap_split_df)} test peptides; check for missing or duplicated peptides.")

labels_ls = deeptap_merged_df['label']        # ground-truth labels
proba_ls = deeptap_merged_df['pred_score']    # continuous score used for the ROC / PR curves
preds_ls = deeptap_merged_df['pred_label']    # hard predictions used for F1

# Area under the ROC curve.
fpr, tpr, _ = roc_curve(labels_ls, proba_ls)
print("AUROC: ", auc(fpr, tpr))

# Support-weighted average of the per-class F1 scores.
f1 = f1_score(labels_ls, preds_ls, average='weighted')
print("f1: ",f1)

# Area under the precision-recall curve, integrated with sklearn's `auc`.
precision, recall, _ = precision_recall_curve(labels_ls, proba_ls)
auprc_score = auc(recall, precision)
print("AUPRC: ", auprc_score)

AUROC:  0.91330110712615
f1:  0.870425521932911
AUPRC:  0.9571925284874283


# DeepTAP on splits with seeds

In [11]:
# Evaluate DeepTAP on the test portion of a seeded, stratified 80/20 split of DS868.
TAP_df = pd.read_csv('./../../../data/TAP/classification_DS868.csv',sep='\t')
seed = 42

# NOTE(review): this rebinds train_sequences / test_sequences / train_labels /
# test_labels, which an earlier cell assigns from the embedding lookup. Here they are
# pandas Series of peptide strings and labels, so results depend on cell run order.
train_sequences, test_sequences, train_labels, test_labels = train_test_split(TAP_df['peptide'], TAP_df['label'], 
                                                                              test_size=0.2, random_state=seed, stratify=TAP_df['label'])

# Kept for provenance: presumably this wrote the peptide list that was fed to DeepTAP
# to produce the result file read below — TODO confirm the output location.
# pd.DataFrame({'peptide':test_sequences.to_list()}).to_csv('./../../../data/TAP/test_peptides_splits'+str(seed)+'.csv', 
#                                                           index=False)

# Test peptides with their true labels; both Series share the split's index.
split_df = pd.DataFrame({'peptide':test_sequences,'label':test_labels}).reset_index(drop=True)

# DeepTAP predictions for this seed's test peptides.
split_result_df = pd.read_csv('./../../../data/TAP/DeepTAP_train_test_split/deeptap_results/test_peptides_splits'+str(seed)+'_DeepTAP_cla_predresult.csv')

# Inner join on the peptide string attaches the true label to each prediction row.
split_merged_df = split_result_df.merge(split_df,on='peptide')

# An inner merge silently drops peptides that are missing from either table and
# multiplies rows for repeated peptides, so the metrics below could be computed on
# something other than the test split. Report (but do not abort on) a size mismatch.
if len(split_merged_df) != len(split_df):
    print(f"WARNING: merge produced {len(split_merged_df)} rows for "
          f"{len(split_df)} test peptides; check for missing or duplicated peptides.")

labels_ls = split_merged_df['label']        # ground-truth labels
proba_ls = split_merged_df['pred_score']    # continuous score used for the ROC / PR curves
preds_ls = split_merged_df['pred_label']    # hard predictions used for F1

# Area under the ROC curve.
fpr, tpr, _ = roc_curve(labels_ls, proba_ls)
print("AUROC: ", auc(fpr, tpr))

# Support-weighted average of the per-class F1 scores.
f1 = f1_score(labels_ls, preds_ls, average='weighted')
print("f1: ",f1)

# Area under the precision-recall curve, integrated with sklearn's `auc`.
precision, recall, _ = precision_recall_curve(labels_ls, proba_ls)
auprc_score = auc(recall, precision)
print("AUPRC: ", auprc_score)

AUROC:  0.8987654320987655
f1:  0.8194984431054703
AUPRC:  0.9028300578979659
