In [1]:
import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification
from torch.utils.data import Dataset
import os
import pandas as pd
import requests
from tqdm.auto import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef
from imblearn.metrics import specificity_score
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import warnings
import os
# Set WANDB_MODE to "disabled" before any training code runs.
# NOTE(review): presumably this keeps the Trainer from reporting to Weights & Biases — confirm.
os.environ["WANDB_MODE"] = "disabled"

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")

In [2]:
# Load the full CPPSet1 table and carve out an 80/20 train/test split,
# stratified on the 'Label' column. random_state is fixed so that re-running
# this cell reproduces the same partition.
df = pd.read_csv('../dataset/CPPSet1.csv')
train_df, test_df = train_test_split(df, test_size=0.2, random_state=1024, stratify=df['Label'])

# Persist both splits (without the index); DeepLocDataset reads them back
# from these exact paths.
train_df.to_csv('../dataset/CPPSet1-train.csv', index=False)
test_df.to_csv('../dataset/CPPSet1-test.csv', index=False)

print("Done!")


Done!


In [3]:
# Checkpoint location shared by the tokenizer (DeepLocDataset) and the classifier (model_init).
# NOTE(review): this is a relative local path — presumably a local copy of the
# Rostlab/prot_bert_bfd checkpoint; confirm it exists before running.
model_name = '../Rostlab/prot_bert_bfd'

In [4]:
class DeepLocDataset(Dataset):
    """Map-style dataset yielding tokenized sequences and labels from the CPPSet1 splits.

    Despite the class name, the files read here are ``CPPSet1-train.csv`` and
    ``CPPSet1-test.csv`` under ``../dataset/``.

    Args:
        split: ``"train"`` selects the training CSV; any other value selects
            the test CSV.
        tokenizer_name: Checkpoint name or path handed to
            ``AutoTokenizer.from_pretrained`` (casing is preserved).
        max_length: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, split="train", tokenizer_name=model_name, max_length=80):
        self.datasetFolderPath = '../dataset/'
        self.trainFilePath = os.path.join(self.datasetFolderPath, 'CPPSet1-train.csv')
        self.testFilePath = os.path.join(self.datasetFolderPath, 'CPPSet1-test.csv')
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)
        # Everything that is not exactly "train" falls through to the test file.
        chosen_path = self.trainFilePath if split == "train" else self.testFilePath
        self.seqs, self.labels = self.load_dataset(chosen_path)
        self.max_length = max_length

    def load_dataset(self, path):
        """Read a split CSV and return ``(sequences, labels)`` as parallel lists.

        The header row is skipped and the columns are renamed to ``input`` and
        ``labels``. Each sequence string is returned with a single space
        inserted between consecutive characters.
        """
        frame = pd.read_csv(path, names=['input', 'labels'], skiprows=1)
        spaced_seqs = [' '.join(raw) for raw in frame['input']]
        targets = list(frame['labels'])
        assert len(spaced_seqs) == len(targets)
        return spaced_seqs, targets

    def __len__(self):
        """Number of examples in the selected split."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize one example and return it as a dict of tensors.

        The returned dict holds one tensor per tokenizer output field plus a
        ``labels`` tensor for the example's target.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Strip all whitespace, then re-insert exactly one space per character
        # so the tokenizer receives one token per residue letter.
        residues = "".join(self.seqs[idx].split())
        # The letters U, Z, O and B are replaced with X before tokenization.
        spaced = re.sub(r"[UZOB]", "X", " ".join(residues))

        encoded = self.tokenizer(
            spaced,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
        )

        sample = {}
        for field, values in encoded.items():
            sample[field] = torch.tensor(values)
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [5]:
# Build the two splits with the same tokenizer checkpoint and the same
# padded/truncated length (80 tokens) so train and eval inputs are comparable.
train_dataset = DeepLocDataset(split="train", tokenizer_name=model_name, max_length=80)
test_dataset = DeepLocDataset(split="test", tokenizer_name=model_name, max_length=80)

In [6]:
def compute_metrics(pred):
    """Turn a Trainer prediction object into binary-classification metrics.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores); the predicted class is the argmax over the
            last axis.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``,
        ``specificity``, ``f1`` and ``mcc`` (in that order).
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # Tuple layout is (precision, recall, f1, support); support is not reported.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')

    scores = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': prf[0],
        'recall': prf[1],
        'specificity': specificity_score(y_true, y_pred),
        'f1': prf[2],
        'mcc': matthews_corrcoef(y_true, y_pred),
    }
    return scores

In [7]:
def model_init():
    """Build a fresh sequence-classification model from the ``model_name`` checkpoint.

    Passed to ``Trainer`` as ``model_init`` so the trainer constructs the
    model itself instead of receiving a ready-made instance.
    """
    return AutoModelForSequenceClassification.from_pretrained(model_name)

In [8]:
# Fine-tuning configuration.
# With per_device_train_batch_size=8 and gradient_accumulation_steps=30, each
# optimizer step accumulates 8 * 30 = 240 examples per device.
training_args = TrainingArguments(
    learning_rate=1e-4,
    output_dir='../results',
    num_train_epochs=20,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir='../logs',
    logging_steps=2,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",  # run compute_metrics on the eval set once per epoch
    gradient_accumulation_steps=30,
    fp16=True,
    # Apex optimisation levels are spelled with the letter O ('O0'..'O3').
    # The previous value "02" (digit zero) is not a valid level and would be
    # rejected if the Apex backend were selected.
    fp16_opt_level="O2",
    run_name="ProBert-BFD-MS",
    seed=3407,
)

# model_init (rather than model=) lets the Trainer build the model itself.
# NOTE(review): the held-out test split is also used as eval_dataset during
# training, so the per-epoch metrics are measured on the test data.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../Rostlab/prot_bert_bfd and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../Rostlab/prot_bert_bfd and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Specificity,F1,Mcc
0,0.6932,0.689535,0.613043,0.563725,1.0,0.226087,0.721003,0.357003
1,0.6702,0.634541,0.808696,0.736667,0.96087,0.656522,0.833962,0.648138
2,0.5396,0.415334,0.906522,0.894515,0.921739,0.891304,0.907923,0.81342
3,0.3756,0.335995,0.897826,0.886076,0.913043,0.882609,0.899358,0.796021
4,0.3225,0.473542,0.832609,0.937143,0.713043,0.952174,0.809877,0.685094
6,0.3309,0.348285,0.88913,0.901345,0.873913,0.904348,0.887417,0.778622
7,0.2685,0.311047,0.904348,0.926606,0.878261,0.930435,0.901786,0.809799
8,0.2921,0.331826,0.897826,0.921659,0.869565,0.926087,0.894855,0.796926
9,0.2364,0.294353,0.913043,0.920354,0.904348,0.921739,0.912281,0.826212
10,0.2806,0.330522,0.893478,0.878661,0.913043,0.873913,0.895522,0.78756


TrainOutput(global_step=60, training_loss=0.34816635052363076, metrics={'train_runtime': 688.2649, 'train_samples_per_second': 53.468, 'train_steps_per_second': 0.087, 'total_flos': 5238472338432000.0, 'train_loss': 0.34816635052363076, 'epoch': 15.652173913043478})

In [9]:
# Final evaluation on the test split; the returned metrics dict is displayed
# as the cell output (keys are prefixed with 'eval_').
trainer.evaluate(test_dataset)

{'eval_loss': 0.2681718170642853,
 'eval_accuracy': 0.9173913043478261,
 'eval_precision': 0.9285714285714286,
 'eval_recall': 0.9043478260869565,
 'eval_specificity': 0.9304347826086956,
 'eval_f1': 0.9162995594713658,
 'eval_mcc': 0.8350668007670652,
 'eval_runtime': 4.6293,
 'eval_samples_per_second': 99.367,
 'eval_steps_per_second': 6.264,
 'epoch': 15.652173913043478}

In [10]:
# Persist the fine-tuned model.
# NOTE(review): Trainer.save_model is documented to take an output directory,
# so this '.pt' path is presumably created as a folder rather than a single
# file — confirm before loading it elsewhere.
trainer.save_model('../model/ProtBert_BFD_CPPSet1.pt')

In [11]:
# Run inference on the test split once and reuse the output for both the
# predicted classes and the reference labels (previously predict() was called
# twice, doubling the inference cost for identical results).
prediction_output = trainer.predict(test_dataset)

# Predicted class index per example: argmax over the last axis of the scores.
result = torch.argmax(torch.tensor(prediction_output.predictions), -1)

# Ground-truth labels in the same order as `result`.
golden = prediction_output.label_ids

In [12]:
# Serialize the predicted class indices (a torch tensor) for later analysis.
torch.save(result, '../model/CPPSet1_result2.pt')