This Colab notebook utilizes the ProteinBERT model to classify protein sequences from the SignalP dataset as signal proteins or non-signal proteins. The workflow includes loading and preprocessing the data, generating embeddings using ProteinBERT and then performing classification. The model's performance is evaluated using AUC, allowing for a comparison with results obtained from the ProtT5 model.



In [None]:
!pip install git+https://github.com/nadavbra/protein_bert.git

import os
import pandas as pd
from IPython.display import display
from tensorflow import keras
from sklearn.model_selection import train_test_split
from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune, evaluate_by_len, log
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

BENCHMARK_NAME = 'signalP_binary'
OUTPUT_TYPE = OutputType(False, 'binary')
UNIQUE_LABELS = [0, 1]

settings = {
    'max_dataset_size': None,
    'max_epochs_per_stage': 40,  # Monitor this to prevent overfitting
    'seq_len': 512,
    'batch_size': 32,
    'final_epoch_seq_len': 1024,
    'initial_lr_with_frozen_pretrained_layers': 1e-02,
    'initial_lr_with_all_layers': 1e-04,
    'final_epoch_lr': 1e-05,
    'dropout_rate': 0.5,
    'training_callbacks': [
        keras.callbacks.ReduceLROnPlateau(patience=1, factor=0.25, min_lr=1e-05, verbose=1),
        keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True),
    ],
}

# Load dataset function
def load_benchmark_dataset():
    train_set_url = 'https://raw.githubusercontent.com/nadavbra/protein_bert/master/protein_benchmarks/signalP_binary.train.csv'
    test_set_url = 'https://raw.githubusercontent.com/nadavbra/protein_bert/master/protein_benchmarks/signalP_binary.test.csv'

    train_set = pd.read_csv(train_set_url).dropna().drop_duplicates()
    test_set = pd.read_csv(test_set_url).dropna().drop_duplicates()

    train_set, valid_set = train_test_split(train_set, stratify=train_set['label'], test_size=0.1, random_state=0)

    return train_set, valid_set, test_set

train_set, valid_set, test_set = load_benchmark_dataset()

def run_benchmark(benchmark_name, pretraining_model_generator, input_encoder, pretraining_model_manipulation_function=None):
    log('========== %s ==========' % benchmark_name)

    output_type = OUTPUT_TYPE
    log('Output type: %s' % output_type)

    train_set, valid_set, test_set = load_benchmark_dataset()
    log(f'{len(train_set)} training set records, {len(valid_set)} validation set records, {len(test_set)} test set records.')

    if settings['max_epochs_per_stage'] is not None:
        log('Limiting the training, validation and test sets to %d records each.' % settings['max_epochs_per_stage'])
        train_set = train_set.sample(min(settings['max_epochs_per_stage'], len(train_set)), random_state=0)
        valid_set = valid_set.sample(min(settings['max_epochs_per_stage'], len(valid_set)), random_state=0)
        test_set = test_set.sample(min(settings['max_epochs_per_stage'], len(test_set)), random_state=0)

    if output_type.is_seq or output_type.is_categorical:
        train_set['label'] = train_set['label'].astype(str)
        valid_set['label'] = valid_set['label'].astype(str)
        test_set['label'] = test_set['label'].astype(str)
    else:
        train_set['label'] = train_set['label'].astype(float)
        valid_set['label'] = valid_set['label'].astype(float)
        test_set['label'] = test_set['label'].astype(float)

    if output_type.is_categorical:
        if output_type.is_seq:
            unique_labels = sorted(set.union(*train_set['label'].apply(set)) | set.union(*valid_set['label'].apply(set)) | set.union(*test_set['label'].apply(set)))
        else:
            unique_labels = sorted(set(train_set['label'].unique()) | set(valid_set['label'].unique()) | set(test_set['label'].unique()))
        log('%d unique labels.' % len(unique_labels))
    elif output_type.is_binary:
        unique_labels = [0, 1]
    else:
        unique_labels = None

    output_spec = OutputSpec(output_type, unique_labels)
    model_generator = FinetuningModelGenerator(pretraining_model_generator, output_spec, pretraining_model_manipulation_function=pretraining_model_manipulation_function, dropout_rate=settings['dropout_rate'])

    finetune(model_generator, input_encoder, output_spec, train_set['seq'], train_set['label'], valid_set['seq'], valid_set['label'],
             seq_len=settings['seq_len'], batch_size=settings['batch_size'], max_epochs_per_stage=settings['max_epochs_per_stage'],
             lr=settings['initial_lr_with_all_layers'], begin_with_frozen_pretrained_layers=True,
             lr_with_frozen_pretrained_layers=settings['initial_lr_with_frozen_pretrained_layers'], n_final_epochs=1,
             final_seq_len=settings['final_epoch_seq_len'], final_lr=settings['final_epoch_lr'], callbacks=settings['training_callbacks'])

    for dataset_name, dataset in [('Training-set', train_set), ('Validation-set', valid_set), ('Test-set', test_set)]:
        log('*** %s performance: ***' % dataset_name)
        results, confusion_matrix = evaluate_by_len(model_generator, input_encoder, output_spec, dataset['seq'], dataset['label'],
                                                   start_seq_len=settings['seq_len'], start_batch_size=settings['batch_size'])

        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            display(results)

        if confusion_matrix is not None:
            with pd.option_context('display.max_rows', 16, 'display.max_columns', 10):
                log('Confusion matrix:')
                display(confusion_matrix)

    return model_generator

pretrained_model_generator, input_encoder = load_pretrained_model()

run_benchmark('signalP_binary', pretrained_model_generator, input_encoder, pretraining_model_manipulation_function=get_model_with_hidden_layers_as_outputs)

log('Done.')


Collecting git+https://github.com/nadavbra/protein_bert.git
  Cloning https://github.com/nadavbra/protein_bert.git to /tmp/pip-req-build-hoxuqnpo
  Running command git clone --filter=blob:none --quiet https://github.com/nadavbra/protein_bert.git /tmp/pip-req-build-hoxuqnpo
  Resolved https://github.com/nadavbra/protein_bert.git to commit 168a4db5aac281ff14165d00e50f862d780a8966
  Running command git submodule update --init --recursive -q
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorflow_addons (from protein-bert==1.0.1)
  Downloading tensorflow_addons-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (611 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyfaidx (from protein-bert==1.0.1)
  Downloading pyfaidx-0.8.1.1-py3-none-any.whl (28 kB)
Collecting typeguard<3.0.0,>=2.7 (from tensorflow_addons->protein-bert==1.0.1)
  Downloading typeguard-2.13.3-py3-none-



Epoch 2/40




Epoch 2: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 3/40



Epoch 4/40



Epoch 5/40



Epoch 6/40



Epoch 7/40



Epoch 8/40



Epoch 9/40



Epoch 10/40




Epoch 10: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 11/40




Epoch 11: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
[2024_07_08-09:35:12] Training the entire fine-tuned model...
[2024_07_08-09:36:35] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40



Epoch 2/40



Epoch 3/40




Epoch 3: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 4/40




Epoch 4: ReduceLROnPlateau reducing learning rate to 1e-05.
[2024_07_08-09:37:48] Training on final epochs of sequence length 1024...
[2024_07_08-09:37:48] Training set: Filtered out 0 of 40 (0.0%) records of lengths exceeding 1022.
[2024_07_08-09:37:48] Validation set: Filtered out 0 of 40 (0.0%) records of lengths exceeding 1022.



[2024_07_08-09:38:59] *** Training-set performance: ***


Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,40,1.0
All,40,1.0


[2024_07_08-09:39:09] Confusion matrix:


Unnamed: 0,0,1
0,33,0
1,0,7


[2024_07_08-09:39:09] *** Validation-set performance: ***


Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,40,0.77
All,40,0.77


[2024_07_08-09:39:17] Confusion matrix:


Unnamed: 0,0,1
0,29,1
1,6,4


[2024_07_08-09:39:17] *** Test-set performance: ***


Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,40,0.982079
All,40,0.982079


[2024_07_08-09:39:25] Confusion matrix:


Unnamed: 0,0,1
0,31,0
1,4,5


[2024_07_08-09:39:25] Done.


Achieved an accuracy of 98%, demonstrating a clear separation between signal proteins and non-signal proteins. Comparisons with the ProtT5 model yielded similarly high accuracy, highlighting the effectiveness of both models in this classification task.