In [1]:
import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification
from torch.utils.data import Dataset
import os
import pandas as pd
import requests
from tqdm.auto import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef
import re
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it may also hide deprecation or
# metric-related warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
# Local checkpoint directory. The same path is passed both as the tokenizer
# name (when building the datasets) and as the model path (in model_init).
model_name = './model/bertbfd937.pt/'

In [3]:
class DeepLocDataset(Dataset):
    """Torch dataset that serves tokenized sequences together with their labels.

    One split is read from a CSV file under ``dataset/`` (``train.csv``,
    ``test.csv`` or ``valid.csv``). Samples are tokenized lazily, one at a
    time, in ``__getitem__``.

    Args:
        split: Which split to load: ``"train"``, ``"test"`` or ``"valid"``.
        tokenizer_name: Name or path handed to ``AutoTokenizer.from_pretrained``.
        max_length: Length every tokenized sample is truncated/padded to.

    Raises:
        ValueError: If ``split`` is not one of the supported split names.
    """

    def __init__(self, split="train", tokenizer_name='Rostlab/prot_bert', max_length=1024):
        self.datasetFolderPath = 'dataset/'
        self.trainFilePath = os.path.join(self.datasetFolderPath, 'train.csv')
        self.testFilePath = os.path.join(self.datasetFolderPath, 'test.csv')
        self.validFilePath = os.path.join(self.datasetFolderPath, 'valid.csv')

        split_paths = {
            "train": self.trainFilePath,
            "test": self.testFilePath,
            "valid": self.validFilePath,
        }
        # Validate before the (expensive) tokenizer load so a typo in `split`
        # fails immediately instead of leaving `seqs`/`labels` undefined.
        if split not in split_paths:
            raise ValueError(
                f"Unknown split {split!r}; expected one of {sorted(split_paths)}"
            )

        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)
        self.seqs, self.labels = self.load_dataset(split_paths[split])

    def load_dataset(self, path):
        """Read one split from ``path``.

        The first row of the file is skipped and the two columns are read as
        ``input`` and ``labels``.

        Returns:
            A ``(seqs, labels)`` pair of equal-length lists. Each sequence has
            its characters separated by single spaces.
        """
        df = pd.read_csv(path, names=['input', 'labels'], skiprows=1)
        seq = [' '.join(raw) for raw in df['input']]
        label = list(df['labels'])
        assert len(seq) == len(label)
        return seq, label

    def __len__(self):
        """Number of samples in the loaded split."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it as a dict of tensors.

        The returned dict holds every field produced by the tokenizer plus a
        ``labels`` entry with the sample's label.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # Drop all whitespace, then re-insert exactly one space between
        # characters so the tokenizer sees one character per token.
        seq = " ".join("".join(self.seqs[idx].split()))
        # Collapse the characters U, Z, O and B into the placeholder X.
        seq = re.sub(r"[UZOB]", "X", seq)
        seq_ids = self.tokenizer(seq, truncation=True, padding='max_length', max_length=self.max_length)
        sample = {key: torch.tensor(val) for key, val in seq_ids.items()}
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [4]:
# Build the train and test splits (in that order) with the checkpoint's own
# tokenizer and a fixed sequence length of 80 tokens.
train_dataset, test_dataset = (
    DeepLocDataset(split=split_name, tokenizer_name=model_name, max_length=80)
    for split_name in ("train", "test")
)

In [5]:
def model_init():
    """Load a fresh sequence-classification model from ``model_name``.

    Passed to ``Trainer`` as a factory, so the trainer instantiates the model
    itself instead of receiving an already-built one.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(model_name)
    return classifier

In [6]:
def compute_metrics(pred):
    """Turn a prediction object into a dict of evaluation scores.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision``, ``recall`` and ``mcc``.
        Precision, recall and F1 use ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth element returned by sklearn (support) is not needed.
    precision, recall, f1 = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )[:3]
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f1,
        precision=precision,
        recall=recall,
        mcc=matthews_corrcoef(y_true, y_pred),
    )

In [7]:
# Hyper-parameters for the Hugging Face Trainer.
training_args = TrainingArguments(
    learning_rate=1e-4,
    output_dir='./results',
    num_train_epochs=26,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=2,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    # Gradients from 64 batches of 10 are accumulated per optimizer step,
    # i.e. an effective batch of 640 samples per device.
    gradient_accumulation_steps=64,
    fp16=True,
    # Apex AMP levels are spelled with the letter "O" (O0..O3); the previous
    # value "02" used the digit zero and is not a valid level.
    fp16_opt_level="O2",
    run_name="ProBert-BFD-MS",
    seed=3401
)

# The trainer builds its model through model_init and evaluates on the test
# split after every epoch (see evaluation_strategy above).
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [8]:
# NOTE: the variable name keeps its original spelling ("vaild") so that any
# other cell referring to it continues to work.
vaild_dataset = DeepLocDataset(split="valid", tokenizer_name=model_name, max_length=80)

# Run inference a single time and read both fields from the same result;
# calling trainer.predict twice would repeat the whole forward pass.
valid_output = trainer.predict(vaild_dataset)
goldenv = valid_output.label_ids
# Predicted class = index of the highest score along the last axis.
resultv = torch.argmax(torch.tensor(valid_output.predictions), -1)
resultv

tensor([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [9]:
# Display the reference labels returned by trainer.predict for the validation
# split, for comparison with the predicted classes in resultv.
goldenv

array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], dtype=int64)