In [1]:
# The examples in this notebook use a set of nine benchmarks described in our publication.
# These benchmarks can be downloaded via FTP from: ftp.cs.huji.ac.il/users/nadavb/protein_bert/protein_benchmarks
# Download the benchmarks into a directory on your machine and set the following variable to the path of that directory.
# The dataset-loading cell joins this directory (via os.path.join) with '<BENCHMARK_NAME>.train.csv' and
# 'DBP_Predict_PDB.test.csv', so both CSV files must be present in it.
BENCHMARKS_DIR = './datasets/'
import os

import pandas as pd
from IPython.display import display

from tensorflow import keras

from sklearn.model_selection import train_test_split

from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune, evaluate_by_len
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs


2023-06-14 07:43:18.583909: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [2]:

BENCHMARK_NAME = 'DBP_Predict_uniprot_0.5'

# Task definition: a binary output over the labels 0 and 1.
# NOTE(review): the first OutputType argument is False; the previous comment described this as
# "a local (non-global) binary output" - confirm against the proteinbert API what that flag selects.
UNIQUE_LABELS = [0, 1]
OUTPUT_TYPE = OutputType(False, 'binary')
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)


# ---- Datasets ----

def _read_clean_csv(csv_path):
    """Read a CSV file and return it without rows containing missing values and without duplicated rows."""
    table = pd.read_csv(csv_path)
    return table.dropna().drop_duplicates()


# The training file is named after the benchmark; 4% of it is held out for validation,
# stratified on the 'label' column with a fixed random state so the split is repeatable.
train_set_file_path = os.path.join(BENCHMARKS_DIR, '%s.train.csv' % BENCHMARK_NAME)
train_set = _read_clean_csv(train_set_file_path)
train_set, valid_set = train_test_split(
    train_set,
    test_size = 0.04,
    stratify = train_set['label'],
    random_state = 0,
)

# The test file name is fixed and does not depend on BENCHMARK_NAME.
test_set_file_path = os.path.join(BENCHMARKS_DIR, 'DBP_Predict_PDB.test.csv')
test_set = _read_clean_csv(test_set_file_path)

print(f'{len(train_set)} training set records, {len(valid_set)} validation set records, {len(test_set)} test set records.')


# ---- Pre-trained model and fine-tuning ----

pretrained_model_generator, input_encoder = load_pretrained_model()

# get_model_with_hidden_layers_as_outputs gives the model output access to the hidden layers (on top of the output)
model_generator = FinetuningModelGenerator(
    pretrained_model_generator,
    OUTPUT_SPEC,
    pretraining_model_manipulation_function = get_model_with_hidden_layers_as_outputs,
    dropout_rate = 0.5,
)

# Learning-rate reduction (patience 1) is listed before early stopping (patience 2, best weights restored).
reduce_lr_on_plateau = keras.callbacks.ReduceLROnPlateau(patience = 1, factor = 0.25, min_lr = 1e-05, verbose = 1)
early_stopping = keras.callbacks.EarlyStopping(patience = 2, restore_best_weights = True)
training_callbacks = [reduce_lr_on_plateau, early_stopping]

# All fine-tuning hyper-parameters in one place; passed to finetune() as keyword arguments.
finetune_options = dict(
    seq_len = 512,
    batch_size = 32,
    max_epochs_per_stage = 10,
    lr = 1e-04,
    begin_with_frozen_pretrained_layers = True,
    lr_with_frozen_pretrained_layers = 1e-02,
    n_final_epochs = 1,
    final_seq_len = 1024,
    final_lr = 1e-05,
    callbacks = training_callbacks,
)

finetune(
    model_generator, input_encoder, OUTPUT_SPEC,
    train_set['seq'], train_set['label'],
    valid_set['seq'], valid_set['label'],
    **finetune_options
)


# The performance on the test-set is evaluated in the next cell.



25541 training set records, 1065 validation set records, 598 test set records.
[2023_06_14-07:43:22] Training set: Filtered out 8195 of 25541 (32.1%) records of lengths exceeding 510.
[2023_06_14-07:43:23] Validation set: Filtered out 333 of 1065 (31.3%) records of lengths exceeding 510.
[2023_06_14-07:43:23] Training with frozen pretrained layers...


2023-06-14 07:43:23.487540: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2023-06-14 07:43:26.671572: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-14 07:43:26.679998: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:00:07.0 name: Tesla P100-PCIE-12GB computeCapability: 6.0
coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 511.41GiB/s
2023-06-14 07:43:26.680149: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-14 07:43:26.681641: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 1 with properties: 
pciBusID: 000

last_hidden_layer (None, 15599)
last_hidden_layer (None, 1, 15599)
attention_layer  (None, 15599)


2023-06-14 07:43:37.287156: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2023-06-14 07:43:37.287685: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2200110000 Hz


Epoch 1/10


2023-06-14 07:43:45.981222: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2023-06-14 07:43:46.555255: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
2023-06-14 07:43:46.555564: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8
2023-06-14 07:43:47.081746: I tensorflow/stream_executor/cuda/cuda_dnn.cc:359] Loaded cuDNN version 8201
2023-06-14 07:43:47.574637: E tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-06-14 07:43:47.574701: W tensorflow/stream_executor/gpu/asm_compiler.cc:56] Couldn't invoke ptxas --version
2023-06-14 07:43:47.575714: E tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-06-14 07:43:47.575809: W tensorflow/stream_executor/gpu/redzo

Epoch 2/10

Epoch 00002: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 3/10

Epoch 00003: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
[2023_06_14-07:45:40] Training the entire fine-tuned model...
last_hidden_layer (None, 15599)
last_hidden_layer (None, 1, 15599)
attention_layer  (None, 15599)
[2023_06_14-07:45:53] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/10
Epoch 2/10

Epoch 00002: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 3/10

Epoch 00003: ReduceLROnPlateau reducing learning rate to 1e-05.
[2023_06_14-07:50:21] Training on final epochs of sequence length 1024...
[2023_06_14-07:50:21] Training set: Filtered out 2196 of 25541 (8.6%) records of lengths exceeding 1022.
[2023_06_14-07:50:24] Validation set: Filtered out 83 of 1065 (7.8%) records of lengths exceeding 1022.
last_hidden_layer (None, 15599)
last_hidden_layer (None, 1, 15599)
attention_layer  (None, 15599)


In [None]:

# Evaluate the fine-tuned model on the test set. This evaluate_by_len variant is given the protein
# IDs in addition to the sequences and labels, and its five return values are unpacked here.
# X_test, y_preds and y_trues are consumed bucket-by-bucket below.
# NOTE(review): buckets presumably correspond to increasing sequence lengths starting at
# start_seq_len (512, 1024, 2048, ...) - confirm against the evaluate_by_len implementation.
X_test, y_preds, y_trues, results, confusion_matrix = evaluate_by_len(
    model_generator, input_encoder, OUTPUT_SPEC,
    test_set['protein_ID'], test_set['seq'], test_set['label'],
    start_seq_len = 512, start_batch_size = 32)

print('Test-set performance:')
display(results)

print('Confusion matrix:')
display(confusion_matrix)


def _bucket_predictions_frame(protein_ids, preds, trues):
    """Build the per-bucket table with the columns 'protein_ID', 'y_preds' and 'y_trues'."""
    frame = pd.DataFrame()
    frame['protein_ID'] = protein_ids
    frame['y_preds'] = preds
    frame['y_trues'] = trues
    return frame


# Collect every returned bucket instead of hard-coding indices 0, 1 and 2: with fewer than three
# buckets the fixed indexing raised IndexError, and with more than three the extra records were
# silently left out of `dataframe` (and therefore out of the metrics computed from it).
assert len(X_test) == len(y_preds) == len(y_trues), \
        'evaluate_by_len returned a different number of buckets for IDs, predictions and labels'

dataframe = pd.concat([
    _bucket_predictions_frame(protein_ids, preds, trues)
    for protein_ids, preds, trues in zip(X_test, y_preds, y_trues)
])


In [4]:
from scipy.stats import spearmanr
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
# Imported under an alias so that the name `confusion_matrix` can hold the result table below
# without rebinding (and thereby hiding) the scikit-learn function inside the same cell.
from sklearn.metrics import confusion_matrix as compute_confusion_matrix
import numpy as np

# Predicted scores and true labels collected over all evaluation buckets.
y_pred = np.array(dataframe['y_preds'])
y_true = dataframe['y_trues'].values

# Threshold at 0.5 and cast to int so that labels and predictions share one dtype (0/1).
y_pred_classes = (y_pred >= 0.5).astype(int)
y_true_classes = (y_true >= 0.5).astype(int)

# Computed once. labels=[0, 1] keeps the matrix 2x2 even if one class is absent, so the
# four-way unpacking below cannot fail on a one-class result.
confusion_counts = compute_confusion_matrix(y_true_classes, y_pred_classes, labels = [0, 1])
tn, fp, fn, tp = confusion_counts.ravel()

# One-row summary table; the column names and their order are unchanged.
# 'sensitivity' is tp / (tp + fn), i.e. the same quantity as 'recall_score'.
results = pd.DataFrame({
    'AUC': [roc_auc_score(y_true_classes, y_pred)],
    'Accuracy': [accuracy_score(y_true_classes, y_pred_classes)],
    'precision': [precision_score(y_true_classes, y_pred_classes)],
    'recall_score': [recall_score(y_true_classes, y_pred_classes)],
    'f1_score': [f1_score(y_true_classes, y_pred_classes)],
    'sensitivity': [tp / (tp + fn)],
    'specificity': [tn / (tn + fp)],
})

confusion_matrix = pd.DataFrame(confusion_counts)

# Only the last expression of a cell is rendered automatically, so the confusion matrix is
# displayed explicitly (display comes from the IPython.display import in the first cell).
display(confusion_matrix)
results


Unnamed: 0,AUC,Accuracy,precision,recall_score,f1_score,sensitivity,specificity
0,0.850885,0.79097,0.865546,0.688963,0.767225,0.688963,0.892977


In [6]:
import pickle

# Persist the fine-tuned model generator, the output specification and the input encoder.
# The files are Python pickles despite the '.pt' extension; only unpickle them from a trusted
# source, since pickle.load can execute arbitrary code.
MODEL_PARAM_DIR = './DBP_model_param'

# Create the target directory up front so that saving cannot fail with FileNotFoundError
# after the (long) fine-tuning run has already completed.
os.makedirs(MODEL_PARAM_DIR, exist_ok = True)

artifacts_to_save = {
    'model_attention.pt': model_generator,
    'model_OUTPUT_SPEC_attention.pt': OUTPUT_SPEC,
    'model_input_encoder_attention.pt': input_encoder,
}

for file_name, artifact in artifacts_to_save.items():
    with open(os.path.join(MODEL_PARAM_DIR, file_name), 'wb') as f:
        pickle.dump(artifact, f)