In [1]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "/home/qfchen/CPPCGM/model/ProtBert_BFD_CPPSet1/"  
tokenizer = AutoTokenizer.from_pretrained('../Rostlab/prot_bert_bfd', do_lower_case=False)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

model.eval()

file = '/home/qfchen/CPPCGM/CPPGenerator/results/mask_generated_peptides_1.csv'
sequences =  [item for item in pd.read_csv(file)['Peptide'].values]

predictions = []
with torch.no_grad():  
    for seq in sequences:
        inputs = tokenizer(seq, return_tensors="pt", padding=True, truncation=True, max_length=80)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.append(torch.argmax(logits, dim=1).item()) 

bert_bfd_results = []
for seq, pred in zip(sequences, predictions):
    #print(f"Sequence: {seq}, Prediction: {pred}")
    bert_bfd_results.append([seq, pred])

In [2]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "/home/qfchen/CPPCGM/model/ProtBert_CPPSet1"  
tokenizer = AutoTokenizer.from_pretrained('../Rostlab/prot_bert', do_lower_case=False)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

model.eval()

file = '/home/qfchen/CPPCGM/CPPGenerator/results/mask_generated_peptides_1.csv'
sequences =  [item for item in pd.read_csv(file)['Peptide'].values]

predictions = []
with torch.no_grad():  
    for seq in sequences:
        inputs = tokenizer(seq, return_tensors="pt", padding=True, truncation=True, max_length=80)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.append(torch.argmax(logits, dim=1).item()) 

bert_results = []
for seq, pred in zip(sequences, predictions):
    # print(f"Sequence: {seq}, Prediction: {pred}")
    bert_results.append([seq, pred])

In [3]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "/home/qfchen/CPPCGM/model/ProtElectra_CPPSet1/"  
tokenizer = AutoTokenizer.from_pretrained('../Rostlab/prot_electra_discriminator_bfd', do_lower_case=False)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

model.eval()

file = '/home/qfchen/CPPCGM/CPPGenerator/results/mask_generated_peptides_1.csv'
sequences =  [item for item in pd.read_csv(file)['Peptide'].values]

predictions = []
with torch.no_grad():  
    for seq in sequences:
        inputs = tokenizer(seq, return_tensors="pt", padding=True, truncation=True, max_length=80)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.append(torch.argmax(logits, dim=1).item())

electra_results = []
for seq, pred in zip(sequences, predictions):
    electra_results.append([seq, pred])
    #print(f"Sequence: {seq}, Prediction: {pred}")

In [4]:
df1 = pd.DataFrame(bert_results, columns=["Sequence", 'Prediction']) 
df2 = pd.DataFrame(bert_bfd_results, columns=["Sequence", 'Prediction']) 
df3 = pd.DataFrame(electra_results, columns=["Sequence", 'Prediction']) 

merged_df = pd.merge(df1, df2, on='Sequence', how='inner')
merged_df = pd.merge(merged_df, df3, on='Sequence', how='inner')
merged_df.columns = ['Sequence', 'ProtBert', 'ProtBert_BFD','ProtElectra']
merged_df.to_csv('mask_peptides.csv', index=False)
merged_df

Unnamed: 0,Sequence,ProtBert,ProtBert_BFD,ProtElectra
0,XKWMKWKK,1,0,1
1,XKWMKWKKW,1,0,1
2,XKWMKWKKWK,1,0,1
3,XKWMKWKKWKK,1,0,1
4,XKWMKWKKWKKK,1,0,1
...,...,...,...,...
310,AMAAYRDLLSALLRLLAALRRLLRRLARLRAAYRRLLRLLMAAYRR...,1,0,1
311,AMAAYRDLLSALLRLLAALRRLLRRLARLRAAYRRLLRLLMAAYRR...,1,0,1
312,AMAAYRDLLSALLRLLAALRRLLRRLARLRAAYRRLLRLLMAAYRR...,1,0,1
313,AMAAYRDLLSALLRLLAALRRLLRRLARLRAAYRRLLRLLMAAYRR...,1,0,1
