In [2]:
import os
import json
from sklearn.model_selection import train_test_split
import tokenizers
from transformers import BertTokenizerFast, LineByLineTextDataset
from transformers import BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# prepare input data
# FASTA file that the data-preparation cell reads peptides from.
# NOTE(review): these are absolute, machine-specific paths — consider deriving
# them from a configurable data directory.
input_path = "/lclhome/mtari008/data/peps/human/human-peptidome-new.fasta"
# Text files that receive the train / test split, one peptide per line.
train_file = "/lclhome/mtari008/data/peplm/peps/human/train-peps.txt"
test_file = "/lclhome/mtari008/data/peplm/peps/human/test-peps.txt"

In [4]:
def _read_fasta_peptides(fasta_path):
    """Collect the distinct sequence lines of a FASTA file, space-separated.

    Header lines (starting with ">") and blank lines are skipped. Every other
    line is stripped and a single space is inserted between its characters,
    e.g. "ACD" -> "A C D".

    NOTE(review): each non-header line is treated as one complete peptide; a
    sequence wrapped over several lines would be split — confirm the input
    keeps one sequence per line.

    Args:
        fasta_path: path of the FASTA file to read.

    Returns:
        A set of space-separated peptide strings.
    """
    unique_peps = set()
    with open(fasta_path, "r") as fasta:
        for raw_line in fasta:
            seq = raw_line.strip()
            if not seq or seq.startswith(">"):
                continue
            # add a space between characters
            # replace is faster than " ".join()
            unique_peps.add(seq.replace("", " ")[1:-1])
    return unique_peps


def _write_lines(path, items):
    """Write every item of `items` to `path`, one per line.

    The parent directory is created first when it does not exist yet.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w") as out:
        out.writelines(item + "\n" for item in items)


peps = _read_fasta_peptides(input_path)

# Sort before splitting: iteration order of a set of strings can differ from
# one interpreter run to the next, which would make the random_state=42 split
# irreproducible.
peps_list = sorted(peps)

train_peps, test_peps = train_test_split(peps_list, test_size=0.2, random_state=42)

_write_lines(train_file, train_peps)
_write_lines(test_file, test_peps)

In [9]:
# NOTE(review): no cell visible here passes `special_tokens` on to the
# tokenizer trainer — confirm whether "<S>" and "<T>" are meant to end up in
# the vocabulary.
special_tokens = [
  "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"
]
# Both split files are handed to the tokenizer trainer.
files = [train_file, test_file]
# Used for tokenizer training and for the BERT config; 30_522 is the
# alternative value left in the trailing comment.
vocab_size = 30  # 30_522
# Padding/truncation length for encoding and the model's max_position_embeddings.
max_length = 60

In [5]:
# Directory the tokenizer vocabulary is saved to and later loaded from
# (relative to the notebook's working directory).
model_path = "../models/tokenizers/"

In [38]:
# No need to run again once the vocabulary has been written to `model_path`.

# Make sure the output directory exists before saving.
# NOTE(review): save_model() appears not to create the directory itself — confirm.
os.makedirs(model_path, exist_ok=True)

# Train a WordPiece vocabulary from scratch on the peptide split files.
# `special_tokens` is not passed here, so the trainer's own defaults apply.
bwpt = tokenizers.BertWordPieceTokenizer(vocab=None)

bwpt.train(
    files=files,
    vocab_size=vocab_size,
    min_frequency=3,
    limit_alphabet=1000,
)

# Kept as the last expression so the cell displays the value returned by save_model().
bwpt.save_model(model_path)






['../models/tokenizers/vocab.txt']

In [6]:
# Load the vocabulary stored under `model_path` as a fast BERT tokenizer.
tokenizer = BertTokenizerFast.from_pretrained(model_path)

In [7]:
def encode_with_tokenizer(tokenizer, data, max_length):
    """Batch-encode `data` into fixed-length PyTorch tensors.

    Args:
        tokenizer: object exposing `batch_encode_plus`.
        data: batch of texts, forwarded unchanged to `batch_encode_plus`.
        max_length: length every sequence is padded or truncated to.

    Returns:
        The value returned by `tokenizer.batch_encode_plus`; attention masks
        are requested, token type ids are not.
    """
    encode_options = dict(
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors="pt",
    )
    return tokenizer.batch_encode_plus(data, **encode_options)

In [10]:
# Sanity check: encode a single peptide written as space-separated residues.
residues = "SINGTFQADFPLGPATHGGTYRCFGSFRDAPYEWSNSSDPLLVSVTGNPR"
pep = " ".join(residues)
encoded = encode_with_tokenizer(tokenizer, [pep], max_length)
print(encoded)

{'input_ids': tensor([[ 2, 20, 12, 16, 10, 21,  9, 18,  5,  7,  9, 17, 14, 10, 17,  5, 21, 11,
         10, 10, 21, 25, 19,  6,  9, 10, 20,  9, 19,  7,  5, 17, 25,  8, 24, 20,
         16, 20, 20,  7, 17, 14, 14, 23, 20, 23, 21, 10, 16, 17, 19,  3,  0,  0,
          0,  0,  0,  0,  0,  0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [11]:
# Show the token strings the tokenizer produces for the example peptide.
tokenizer.tokenize(pep)

['s',
 'i',
 'n',
 'g',
 't',
 'f',
 'q',
 'a',
 'd',
 'f',
 'p',
 'l',
 'g',
 'p',
 'a',
 't',
 'h',
 'g',
 'g',
 't',
 'y',
 'r',
 'c',
 'f',
 'g',
 's',
 'f',
 'r',
 'd',
 'a',
 'p',
 'y',
 'e',
 'w',
 's',
 'n',
 's',
 's',
 'd',
 'p',
 'l',
 'l',
 'v',
 's',
 'v',
 't',
 'g',
 'n',
 'p',
 'r']

In [12]:
def _load_line_dataset(file_path, block_size=max_length):
    """Build a line-by-line dataset from a text file of peptides.

    Args:
        file_path: text file with one space-separated peptide per line.
        block_size: maximum sequence length. Defaults to `max_length`, the
            same value the model config uses for `max_position_embeddings`;
            the previous hard-coded 128 would let through sequences longer
            than the model's position table.

    Returns:
        A `LineByLineTextDataset` tokenized with the notebook's `tokenizer`.
    """
    return LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )


train_dataset = _load_line_dataset(train_file)
print('No. of lines: ', len(train_dataset))  # number of lines in the training set

test_dataset = _load_line_dataset(test_file)
print('No. of lines: ', len(test_dataset))  # number of lines in the test set



No. of lines:  2188566
No. of lines:  547142


In [None]:
# train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
# test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [45]:
# BERT configuration sized to the peptide vocabulary and sequence length.
model_config = BertConfig(
    vocab_size=vocab_size,
    max_position_embeddings=max_length,
    hidden_size=768,
    num_hidden_layers=6,
    num_attention_heads=12,
)

# Freshly initialised masked-language model (no pretrained weights loaded).
model = BertForMaskedLM(config=model_config)
print('No of parameters: ', model.num_parameters())

# Collator configured for masked-LM with a masking probability of 0.15.
# TODO: improve masking
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

No of parameters:  43191582


In [1]:
# Output directory for checkpoints and the final model.
bert_models = "/lclhome/mtari008/PepLM/models/berts"

training_args = TrainingArguments(
    output_dir=bert_models,
    overwrite_output_dir=True,
    # optimisation
    learning_rate=1e-4,
    num_train_epochs=30,
    per_device_train_batch_size=64,
    # checkpointing: save every 10k steps, keep at most two checkpoints
    save_steps=10_000,
    save_total_limit=2,
)

# NOTE(review): an eval dataset is supplied but no evaluation schedule is
# configured in `training_args` — confirm whether evaluation is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

NameError: name 'TrainingArguments' is not defined

In [69]:
%%time
# Run training, then save the model into `bert_models`.
# The cell magic above must stay on the first line of the cell.
trainer.train()
trainer.save_model(bert_models)

***** Running training *****
  Num examples = 2188566
  Num Epochs = 30
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 512970
  Number of trainable parameters = 43191582


Step,Training Loss
500,2.8983
1000,2.898
1500,2.8975
2000,2.8983
2500,2.8995
3000,2.8978
3500,2.8988
4000,2.8976
4500,2.8985
5000,2.8962


Saving model checkpoint to /lclhome/mtari008/PepLM/models/berts/checkpoint-10000
Configuration saved in /lclhome/mtari008/PepLM/models/berts/checkpoint-10000/config.json
Model weights saved in /lclhome/mtari008/PepLM/models/berts/checkpoint-10000/pytorch_model.bin
Deleting older checkpoint [/lclhome/mtari008/PepLM/models/berts/checkpoint-500000] due to args.save_total_limit
Saving model checkpoint to /lclhome/mtari008/PepLM/models/berts/checkpoint-20000
Configuration saved in /lclhome/mtari008/PepLM/models/berts/checkpoint-20000/config.json
Model weights saved in /lclhome/mtari008/PepLM/models/berts/checkpoint-20000/pytorch_model.bin
Deleting older checkpoint [/lclhome/mtari008/PepLM/models/berts/checkpoint-510000] due to args.save_total_limit


KeyboardInterrupt: 

In [64]:
# Reload the model saved under `bert_models` and wrap it, together with the
# peptide tokenizer, in a fill-mask pipeline.
model = BertForMaskedLM.from_pretrained(bert_models)
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

loading configuration file /lclhome/mtari008/PepLM/models/berts/config.json
Model config BertConfig {
  "_name_or_path": "/lclhome/mtari008/PepLM/models/berts",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 60,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30
}

loading weights file /lclhome/mtari008/PepLM/models/berts/pytorch_model.bin
All model checkpoint weights were used when initializing BertForMaskedLM.

All the weights of BertForMaskedLM were initialized from the model checkpoint at /lclhome/mtari008/PepLM/

In [65]:
# Print the first candidate the pipeline returns for each masked peptide.
masked_peps = (
    "K N P [MASK] V W S N P S D L L E I L V P G V S R K P S L L I P Q G S V V A R",
    "T V V Y A E I [MASK] R K",
    "V T Y T E F [MASK] Q G R",
)
for masked_pep in masked_peps:
    print(fill_mask(masked_pep)[0])

{'score': 0.6534807085990906, 'token': 25, 'token_str': 'y', 'sequence': 'k n p y v w s n p s d l l e i l v p g v s r k p s l l i p q g s v v a r'}
{'score': 0.13793198764324188, 'token': 14, 'token_str': 'l', 'sequence': 't v v y a e i l r k'}
{'score': 0.18205782771110535, 'token': 14, 'token_str': 'l', 'sequence': 'v t y t e f l q g r'}


In [59]:
# `peps` holds peptides with a single space between residues (see the data
# preparation cell), so a candidate must be put in that form before testing
# membership; an unspaced string can never match.
out_pep = "knpshvwsnpsdlleilvpgvsrkpsllipqgsvvar"
print(" ".join(out_pep.upper()) in peps)

# Already in the stored, space-separated form.
out_pep = "V T Y T E F C Q G R"
print(out_pep in peps)

False
True
