# Masked Language Modeling

This notebook describes how to pre-train your own AntiBERTa model using the HuggingFace framework. As a demo, we've included the tokenizer we used, along with 1% of the sequences from the training, validation, and test sets used in the paper.

## Setup of all the things we need

In [1]:
# Some imports 
from transformers import (
    RobertaConfig,
    RobertaTokenizer,
    RobertaForMaskedLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset
import os
import torch

# Load the tokenizer included with this demo.
tokenizer = RobertaTokenizer.from_pretrained("antibody-tokenizer")

# Build the masked-language-modelling collator that assembles batches;
# mlm_probability is the probability with which tokens are selected for masking.
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

  from .autonotebook import tqdm as notebook_tqdm


## Text Data preprocessing

In [2]:
# Each split points at a slice of the original dataset.
text_datasets = dict(
    train=["assets/train-slice.txt"],
    eval=["assets/val-slice.txt"],
    test=["assets/test-slice.txt"],
)


def _tokenize_batch(batch):
    """Tokenize a batch of raw sequences, padding/truncating each to 150 tokens.

    The special-tokens mask is returned as well (``return_special_tokens_mask=True``).
    """
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=150,
        return_special_tokens_mask=True,
    )


# Load the plain-text files for all three splits with the "text" loader.
dataset = load_dataset("text", data_files=text_datasets)

# Tokenize every split in batches and drop the raw "text" column afterwards.
tokenized_dataset = dataset.map(
    _tokenize_batch,
    batched=True,
    num_proc=1,
    remove_columns=["text"],
)

Using custom data configuration default-842c374d83e54763
Reusing dataset text (C:\Users\XTM23\.cache\huggingface\datasets\text\default-842c374d83e54763\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)
100%|██████████████████████████████████████████████████████████████████████████████| 1151/1151 [08:10<00:00,  2.35ba/s]
100%|████████████████████████████████████████████████████████████████████████████████| 135/135 [00:53<00:00,  2.51ba/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 72/72 [00:31<00:00,  2.32ba/s]


In [3]:
# Sanity check: display the shape of the tokenized validation ("eval") split.
tokenized_dataset["eval"].shape

(134778, 3)

## Model configuration

In [7]:
# Hyper-parameters for this run. The model-shape entries feed RobertaConfig and
# the optimisation entries feed TrainingArguments below, so every value that is
# meant to be tuned lives in this one dictionary.
antiberta_config = {
    "num_hidden_layers": 12,
    "num_attention_heads": 12,
    "hidden_size": 768,
    "d_ff": 3072,  # feed-forward (intermediate) dimension
    "vocab_size": 25,  # 20 aa + 5 symbols, including masked, start-end
    "max_len": 150,
    # max_len + 2: RoBERTa numbers positions starting at padding_idx + 1, so
    # max_len tokens need two extra position embeddings (assumes the default
    # padding_idx of 1 -- TODO confirm against the tokenizer's pad token id).
    "max_position_embeddings": 152,
    "batch_size": 96,  # params to explore
    "max_steps": 100000,  # params to explore (225000 was noted as an alternative)
    "weight_decay": 0.01,  # params to explore
    "peak_learning_rate": 0.0001,  # params to explore
    "gradient_accumulation_steps": 1,
}

# Initialise the model. Keys are indexed directly (no .get fallbacks) so that a
# missing or misspelled key fails loudly instead of silently using a default.
model_config = RobertaConfig(
    vocab_size=antiberta_config["vocab_size"],
    hidden_size=antiberta_config["hidden_size"],
    intermediate_size=antiberta_config["d_ff"],
    max_position_embeddings=antiberta_config["max_position_embeddings"],
    num_hidden_layers=antiberta_config["num_hidden_layers"],
    num_attention_heads=antiberta_config["num_attention_heads"],
    type_vocab_size=1,
)
model = RobertaForMaskedLM(model_config)

# Shared interval for saving, evaluating, and logging. A larger interval means
# fewer checkpoints/evaluations and therefore faster training (2500 was noted
# as an alternative save interval).
steps = 50

# Construct the training arguments.
# HuggingFace's default seed is 42; this run deliberately overrides it with 40.
args = TrainingArguments(
    # Relative path so the notebook runs on any machine, not just the author's.
    output_dir=os.path.join("model", "checkpoints"),
    overwrite_output_dir=True,
    per_device_train_batch_size=antiberta_config["batch_size"],
    per_device_eval_batch_size=antiberta_config["batch_size"],
    max_steps=antiberta_config["max_steps"],
    save_steps=steps,
    eval_steps=steps,
    logging_steps=steps,  # params to explore
    adam_beta2=0.98,  # params to explore
    adam_epsilon=1e-6,  # params to explore
    weight_decay=antiberta_config["weight_decay"],
    warmup_steps=2,  # params to explore (10000 was noted as an alternative)
    learning_rate=antiberta_config["peak_learning_rate"],  # params to explore
    save_total_limit=3,
    # Training is forced onto the CPU here. fp16=True / bf16=True are only
    # applicable when running on CUDA, so they are left unset.
    no_cuda=True,
    gradient_accumulation_steps=antiberta_config["gradient_accumulation_steps"],
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    seed=40,
)

# With eval_steps larger than max_steps, training would finish before the first
# evaluation is ever run.
if args.eval_steps > args.max_steps:
    print('Please change eval steps')

## Setup of the HuggingFace Trainer

In [None]:
# Wire the model, training arguments, collator, and train/eval splits together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"],
    data_collator=collator,
)

# Run pre-training.
trainer.train()

# Name the saved model "big-<layers>-<heads>-<max_steps>" and write it under ./model/.
n_layers = antiberta_config["num_hidden_layers"]
n_heads = antiberta_config["num_attention_heads"]
model_name = f"big-{n_layers}-{n_heads}-{args.max_steps}"
trainer.save_model(f"./model/{model_name}")