In [1]:
# The examples in this notebook use a set of nine benchmarks described in our publication.
# These benchmarks can be downloaded via FTP from: ftp.cs.huji.ac.il/users/nadavb/protein_bert/protein_benchmarks
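# Download the benchmarks into a directory on your machine. (The original instructions say to set a variable to the
# path of that directory, but no such variable is defined in this notebook; the cells below read from hard-coded relative paths.)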

import os

import pandas as pd
from IPython.display import display

from tensorflow import keras

from sklearn.model_selection import train_test_split

from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune, evaluate_by_len
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs


2023-06-18 15:59:18.195111: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [2]:
import pickle


def _load_pickled(path):
    """Open `path` in binary mode and return the object unpickled from it."""
    # NOTE(review): unpickling can execute arbitrary code, so these files
    # must come from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Restore the three pickled artifacts that the evaluation cell passes to evaluate_by_len.
model_generator = _load_pickled('./DBP_model_param/model_attention.pt')
input_encoder = _load_pickled('./DBP_model_param/model_input_encoder_attention.pt')
OUTPUT_SPEC = _load_pickled('./DBP_model_param/model_OUTPUT_SPEC_attention.pt')


In [3]:
# Set CUDA_VISIBLE_DEVICES to '1'; CUDA-based libraries read this variable to
# decide which GPU indices the process may see.
# NOTE(review): this runs after tensorflow has been imported — confirm it is
# still executed before the first GPU initialisation.
os.environ.update({'CUDA_VISIBLE_DEVICES': '1'})

In [4]:
# Read the test split, then discard rows with missing values and exact duplicate rows.
test_set = (
    pd.read_csv('./datasets/DBP_Predict_PDB.test.csv')
    .dropna()
    .drop_duplicates()
)

# evaluate_by_len returns five values; the first three are indexed per bucket
# by the export cell further down.
evaluation = evaluate_by_len(
    model_generator, input_encoder, OUTPUT_SPEC,
    test_set['protein_ID'], test_set['seq'], test_set['label'],
    start_seq_len = 512, start_batch_size = 32,
)
X_test, y_preds, y_trues, results, confusion_matrix = evaluation

# Print each heading followed by the rich display of its table.
for heading, table in (
    ('Test-set performance:', results),
    ('Confusion matrix:', confusion_matrix),
):
    print(heading)
    display(table)



2023-06-18 15:59:46.131401: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2023-06-18 15:59:50.940231: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-18 15:59:50.941979: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:00:08.0 name: Tesla P100-PCIE-12GB computeCapability: 6.0
coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 511.41GiB/s
2023-06-18 15:59:50.942058: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2023-06-18 15:59:52.016725: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2023-06-18 15:59:52.016877: I tensorflow/stream_executor/plat

Test-set performance:


Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,562,0.844612
1024,29,1.0
2048,7,0.8
All,598,0.850885


Confusion matrix:


Unnamed: 0,0,1
0,267,32
1,93,206


In [5]:
def _bucket_frame(bucket):
    """Build one table from entry `bucket` of X_test, y_preds and y_trues."""
    frame = pd.DataFrame()
    for column, per_bucket in (
        ('protein_ID', X_test),
        ('y_preds', y_preds),
        ('y_trues', y_trues),
    ):
        frame[column] = per_bucket[bucket]
    return frame


# NOTE(review): only entries 0, 1 and 2 are exported. Presumably they correspond
# to the 512 / 1024 / 2048 model sequence lengths — confirm against
# evaluate_by_len, and check whether a test set could produce more (or fewer)
# entries than three.
dataframe_512, dataframe_1024, dataframe_2048 = (
    _bucket_frame(bucket) for bucket in range(3)
)

# concat is called without ignore_index, so every frame keeps its own row
# labels, and to_csv writes those labels as the first column of the file.
bucket_frames = [dataframe_512, dataframe_1024, dataframe_2048]
dataframe = pd.concat(bucket_frames)
dataframe.to_csv('./PLM-DBPPred_predict_result.csv')