In [1]:
import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification
from torch.utils.data import Dataset
import os
import pandas as pd
import requests
from tqdm.auto import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef
import re
import warnings

# Silence every warning category for the rest of the session; this applies to
# all of the libraries imported above, not just this notebook's own code.
warnings.filterwarnings("ignore")

In [3]:
# Path of the pretrained checkpoint; later cells pass it to both the tokenizer
# loader (tokenizer_name=model_name) and the model loader (model_init).
model_name = './prot_electra_discriminator_bfd'

In [12]:
class DeepLocDataset(Dataset):
    """Map-style dataset serving tokenized sequences and labels from one CSV split.

    Each CSV under ``dataset/`` is read with its first row skipped and its two
    columns interpreted as ``input`` (raw sequence string) and ``labels``.
    Sequences are stored with one space between characters and are tokenized
    lazily in ``__getitem__``.

    Args:
        split: Which file to load: ``"train"``, ``"test"`` or ``"valid"``.
        tokenizer_name: Identifier or path handed to ``AutoTokenizer.from_pretrained``.
        max_length: Length every example is truncated / padded to.

    Raises:
        ValueError: If ``split`` is not one of the supported names.
    """

    # Supported split names -> CSV file name inside ``datasetFolderPath``.
    _SPLIT_FILES = {"train": "train.csv", "test": "test.csv", "valid": "valid.csv"}

    def __init__(self, split="train", tokenizer_name='Rostlab/prot_bert', max_length=1024):
        # Reject unknown splits up front. Previously an unrecognised value left
        # seqs/labels unset and only failed later with an AttributeError.
        if split not in self._SPLIT_FILES:
            raise ValueError(
                f"Unknown split {split!r}; expected one of {sorted(self._SPLIT_FILES)}"
            )

        self.datasetFolderPath = 'dataset/'
        # The per-split path attributes are kept for any code that reads them.
        self.trainFilePath = os.path.join(self.datasetFolderPath, 'train.csv')
        self.testFilePath = os.path.join(self.datasetFolderPath, 'test.csv')
        self.validFilePath = os.path.join(self.datasetFolderPath, 'valid.csv')
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)

        split_path = os.path.join(self.datasetFolderPath, self._SPLIT_FILES[split])
        self.seqs, self.labels = self.load_dataset(split_path)

    def load_dataset(self, path):
        """Read one split CSV and return ``(sequences, labels)`` as parallel lists.

        The first row of the file is skipped and the two columns are named
        ``input`` and ``labels``. Every sequence is returned with a single
        space inserted between consecutive characters.
        """
        df = pd.read_csv(path, names=['input', 'labels'], skiprows=1)
        seq = [' '.join(raw) for raw in df['input']]
        label = list(df['labels'])
        assert len(seq) == len(label)
        return seq, label

    def __len__(self):
        """Number of examples in the loaded split."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return a dict of tensors plus ``labels``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # Collapse whatever whitespace is stored to exactly one space per character.
        seq = " ".join("".join(self.seqs[idx].split()))
        # Replace the letters U, Z, O and B with X before tokenizing.
        seq = re.sub(r"[UZOB]", "X", seq)
        seq_ids = self.tokenizer(seq, truncation=True, padding='max_length', max_length=self.max_length)
        sample = {key: torch.tensor(val) for key, val in seq_ids.items()}
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [14]:
# Build the three splits in train / test / valid order, all sharing the same
# tokenizer source and an 80-token sequence length.
train_dataset, test_dataset, valid_dataset = (
    DeepLocDataset(split=split_name, tokenizer_name=model_name, max_length=80)
    for split_name in ("train", "test", "valid")
)

In [6]:
def compute_metrics(pred):
    """Score one evaluation pass for the Trainer.

    ``pred`` must expose ``label_ids`` and ``predictions``; the predicted class
    is the argmax over the last axis of ``predictions``. Precision, recall and
    F1 are computed with ``average='binary'``.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision``, ``recall`` and ``mcc``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # Tuple layout: (precision, recall, f1, support); support is not reported.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = prf[2]
    metrics['precision'] = prf[0]
    metrics['recall'] = prf[1]
    metrics['mcc'] = matthews_corrcoef(y_true, y_pred)
    return metrics

In [7]:
def model_init():
    """Build a fresh sequence-classification model from the checkpoint at ``model_name``."""
    # No explicit device placement happens here (the `.cuda()` call was left disabled).
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return model

In [7]:
# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    learning_rate=1e-4,
    output_dir='./results',
    num_train_epochs=26,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=2,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    # 10 samples per device x 64 accumulation steps = 640 samples per optimizer step.
    gradient_accumulation_steps=64,
    fp16=True,
    # Apex AMP levels are spelled with the letter "O" (O0..O3); the previous
    # value "02" used the digit zero and is not a valid level.
    fp16_opt_level="O2",
    run_name="ProBert-BFD-MS",
    seed=3401
)

# 26

# Fine-tuning run: the model is created by model_init and scored with compute_metrics.
trainer = Trainer(
    args=training_args,
    model_init=model_init,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # the "test" split is what gets evaluated during training
)

# Kept as the last expression of the cell so the returned TrainOutput is displayed.
trainer.train()

Some weights of the model checkpoint at ./prot_electra_discriminator_bfd were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at ./prot_electra_discriminator_bfd and are newly initialized: ['classifier.den

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Mcc,Runtime,Samples Per Second
0,0.6885,0.667402,0.786957,0.820513,0.756757,0.896,0.57524,2.6721,172.152
1,0.9447,0.596062,0.85,0.855346,0.898678,0.816,0.70386,2.6683,172.392
2,0.8126,0.479581,0.863043,0.87574,0.863813,0.888,0.723604,2.6764,171.871
3,0.6362,0.35818,0.882609,0.894942,0.871212,0.92,0.763613,2.644,173.98
4,0.4875,0.276343,0.88913,0.905028,0.84669,0.972,0.784047,2.6713,172.198
5,0.398,0.301503,0.886957,0.904762,0.834459,0.988,0.784815,2.6754,171.936
6,0.3826,0.205989,0.917391,0.926357,0.898496,0.956,0.834581,2.6733,172.074
7,0.2526,0.211644,0.921739,0.92623,0.94958,0.904,0.84416,2.723,168.928
8,0.2553,0.203769,0.915217,0.923379,0.907336,0.94,0.829204,2.6861,171.25
9,0.168,0.212536,0.921739,0.928854,0.917969,0.94,0.842217,2.6655,172.573


TrainOutput(global_step=52, training_loss=0.22577038751198694, metrics={'train_runtime': 920.2907, 'train_samples_per_second': 0.057, 'total_flos': 15473995323676800, 'epoch': 25.7})

In [9]:
# Point at the fine-tuned weights (the same path trainer.save_model(...) writes to).
# NOTE(review): this rebinds the global `model_name`, so any cell run afterwards
# with tokenizer_name=model_name will look for tokenizer files in this
# directory — confirm they exist there.
model_name = './model/MLCPPele-947.pt'


def model_init():
    """Load the fine-tuned sequence-classification model from ``model_name``.

    This redefinition shadows the ``model_init`` used for the training run.
    """
    # Device placement is left to Trainer instead of a hard-coded `.cuda()`,
    # which fails on machines without a GPU and matches the first model_init.
    return AutoModelForSequenceClassification.from_pretrained(model_name)


# No TrainingArguments are passed here, so Trainer uses its defaults.
trainer = Trainer(
    model_init=model_init,
    #args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics = compute_metrics,
)

In [17]:
# Score the trainer's current model on the "valid" split; the returned metrics
# dict is the cell's displayed output.
trainer.evaluate(valid_dataset)

{'eval_loss': 4.826432228088379,
 'eval_accuracy': 0.16666666666666666,
 'eval_f1': 0.0,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_mcc': 0.0,
 'eval_runtime': 1.6469,
 'eval_samples_per_second': 54.65}

In [9]:
# Persist the trainer's current model; this is the same path that `model_name`
# is set to in the reload cell.
trainer.save_model('./model/MLCPPele-947.pt')

In [18]:
# Run inference once: the returned object carries both the raw scores and the
# gold labels, so a second pass over the dataset is unnecessary.
valid_output = trainer.predict(valid_dataset)
golden = valid_output.label_ids
# Predicted class per example = argmax over the last axis of the scores.
result = torch.argmax(torch.tensor(valid_output.predictions), -1)

In [19]:
# Display the predicted class index for every example of the valid split.
result

tensor([0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [50]:
# Rebuild the "valid" split under a separate name (spelled `vaild_dataset`,
# distinct from the earlier `valid_dataset`).
# NOTE(review): tokenizer_name is whatever `model_name` holds when this cell
# runs — confirm that is the intended tokenizer source.
vaild_dataset = DeepLocDataset(split="valid", tokenizer_name=model_name, max_length=80)

In [51]:
# A single prediction pass yields both the scores and the gold labels.
vaild_output = trainer.predict(vaild_dataset)
goldenv = vaild_output.label_ids
# Predicted class per example = argmax over the last axis of the scores.
resultv = torch.argmax(torch.tensor(vaild_output.predictions), -1)
resultv

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [52]:
# Display the gold labels collected for the rebuilt valid split.
goldenv

array([0, 0, 0, 0, 1, 1, 1, 1], dtype=int64)

In [53]:
# Re-run prediction on the rebuilt valid split and display the raw scores.
# NOTE(review): the recorded output is a CUDA device-side assert; root cause is
# not visible here — rerun with CUDA_LAUNCH_BLOCKING=1 (as the message suggests) to locate it.
trainer.predict(vaild_dataset).predictions

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.