In [1]:
from transformers import (
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    BertForMaskedLM,
    get_scheduler,
    TrainingArguments,
    Trainer,
    TrainerCallback
)
import io
from datasets import load_dataset
import numpy as np
import torch
from torch import nn
from torch.optim import Adam
import wandb

2024-04-03 19:22:19.634440: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from MinioHandler import MinioHandler

minio = MinioHandler()

In [3]:
wandb.login()

wandb.init(
    project='pretrain-bert',
    entity='grammar-bert'
)

[34m[1mwandb[0m: Currently logged in as: [33mxenomirant[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Currently logged in as: [33mxenomirant[0m ([33mgrammar-bert[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [5]:
TRAIN_PATH = 'data/train_dataset.csv'
TEST_PATH = 'data/test_dataset.csv'
MODEL_NAME = 'DeepPavlov/rubert-base-cased'
SEQ_LEN = 64
BATCH_SIZE = 16
MLM_PROB = 0.15

In [6]:
def collate_func(batch):
    batch = [data_collator.torch_call(item) for item in zip(*batch)]
    return batch

In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True

tokenizer.pad_token = '[SEP]'
tokenizer.eos_token = '[SEP]'
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=MLM_PROB)

In [8]:
dt = load_dataset("csv", 
                  data_files={"train": "data/train_dataset.csv",
                                "test": "data/test_dataset.csv"},)

In [9]:
def tokenize_function(examples):
    return tokenizer(examples["base"])

In [10]:
tokenized_dt = dt.map(tokenize_function, batched=True, remove_columns=["Unnamed: 0", "polypers", "was_changed"])

In [11]:
model = BertForMaskedLM.from_pretrained(MODEL_NAME)
model.to(device)
pass

In [12]:
class SaveCallback(TrainerCallback):


    def on_train_begin(self, args, state, control, **kwargs):
        '''
        A callback that prints a message at the beginning of training
        '''
        print("Starting training")

    def on_epoch_end(self, args, state, control, **kwargs):
        '''
        Saves to S3 at the end of epoch
        '''
        print("Saving model checkpoint...")
        buffer = io.BytesIO()
        torch.save({
                    'epoch': state.epoch,
                    'model_state_dict': kwargs["model"].state_dict(),
                    'optimizer_state_dict': kwargs["optimizer"].state_dict(),
                    }, 
                   f=buffer)
                # TODO -- add custom hash to model instead of value
        minio.put_object(buffer.getvalue(), 
                             save_name=f"ckpt/pretrained_bert_epoch_{state.epoch}.pt")

In [13]:
import sys, os
from transformers.trainer_callback import ProgressCallback
# Disable
def blockPrint():
    sys.stdout = open(os.devnull, 'w')

# Restore
def enablePrint():
    sys.stdout = sys.__stdout__


def on_log(self, args, state, control, logs=None, **kwargs):
    if state.is_local_process_zero and self.training_bar is not None:
        _ = logs.pop("total_flos", None)
ProgressCallback.on_log = on_log

In [19]:
training_args = TrainingArguments(
    output_dir="ckpt/pretrained_bert",
    dataloader_drop_last=True,
    dataloader_num_workers=6, 
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.001, 
    per_device_train_batch_size=6,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': True}, 
    adafactor=True,
    optim="adafactor",
    warmup_steps=1000,
    report_to="wandb", 
    logging_steps=2000,
    save_steps=5000,
    save_total_limit=10,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dt["train"],
    eval_dataset=tokenized_dt["test"],
    data_collator=data_collator,
    callbacks=[SaveCallback, ProgressCallback]
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [15]:
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

#### Infinite tries to disable logging to stdout

In [16]:
blockPrint()

ModularLM/ckpt/pretrained_bert_epoch_9.999976796259556.pt: |####################| 681.16 MB/681.16 MB 100% [elapsed: 00:08 left: 00:00, 83.88 MB/sec] 

In [20]:
trainer.train(resume_from_checkpoint=True);

There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].


  0%|          | 0/646440 [00:00<?, ?it/s]

Step,Training Loss
536000,1.7781
538000,1.7553
540000,1.7635
542000,1.7525
544000,1.7577
546000,1.7664
548000,1.7586
550000,1.7536
552000,1.7493
554000,1.7549
