In [2]:
import random
from itertools import islice

import torch
from transformers import BertTokenizer

In [3]:
# Shared tokenizer instance loaded from the "bert-base-uncased" checkpoint.
# The helper functions defined in this cell read it as a module-level global
# (both to tokenize text and to look up the special-token ids).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Step 1: Tokenization
def tokenize_texts(texts, max_length=128):
    """Tokenize each text on its own into a fixed-length encoding.

    Args:
        texts: Iterable of strings to tokenize.
        max_length: Length every sequence is truncated or padded to.
            Defaults to 128, the value this notebook was written with.

    Returns:
        A list holding one tokenizer output per input text, in input order.
        Each output is requested with ``return_tensors="pt"``; an empty
        ``texts`` yields an empty list.
    """
    # Uses the module-level `tokenizer`. Texts are encoded one at a time, so the
    # result is a list of per-text encodings rather than one batched encoding.
    return [
        tokenizer(
            text,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        for text in texts
    ]

# Step 2: Create Masked Language Modeling (MLM) labels
def create_mlm_labels(input_ids, mask_prob=0.15):
    """Randomly replace tokens with the mask token and return the training pair.

    Each position is selected independently with probability ``mask_prob``;
    positions holding the pad, CLS or SEP token id are never selected. Selected
    positions are set to ``tokenizer.mask_token_id``.

    The caller's ``input_ids`` tensor is left untouched: masking is applied to
    a copy, so calling this function repeatedly on the same tensor always
    starts from the unmasked ids.

    Args:
        input_ids: Integer tensor of token ids (any shape).
        mask_prob: Per-token probability of being masked. Defaults to 0.15.

    Returns:
        A ``(masked_ids, labels)`` tuple of tensors shaped like ``input_ids``.
        ``labels`` is an unmodified copy of ``input_ids`` at every position
        (no ignore-index is written into the unmasked positions).
    """
    labels = input_ids.clone()
    # Work on a copy so the tensor passed in by the caller is never overwritten.
    masked_ids = input_ids.clone()

    # Positions that hold real tokens, i.e. not padding / [CLS] / [SEP].
    maskable = torch.ones_like(input_ids, dtype=torch.bool)
    for special_id in (tokenizer.pad_token_id, tokenizer.cls_token_id, tokenizer.sep_token_id):
        maskable &= input_ids != special_id

    # One uniform draw in [0, 1) per position, created on the input's device so
    # the comparison below also works for non-CPU tensors.
    rand = torch.rand(input_ids.shape, device=input_ids.device)
    mask_arr = (rand < mask_prob) & maskable

    masked_ids[mask_arr] = tokenizer.mask_token_id
    return masked_ids, labels

In [4]:
# Read only the first MAX_LINES lines of the corpus.
# `islice` stops after MAX_LINES lines instead of loading the whole file into
# memory first, and the `with` block guarantees the file handle is closed.
MAX_LINES = 400000
with open("books_large_p1.txt", "r") as corpus_file:
    data = list(islice(corpus_file, MAX_LINES))
print(len(data))

400000


In [5]:
# Tokenize every loaded corpus line. `text` is a list with one tokenizer output
# per line (tokenize_texts encodes each line separately).
text = tokenize_texts(data)

In [17]:
# Split the per-line encodings into three parallel lists, one per field.
dt, token_id, mask = [], [], []
for encoding in text:
    dt.append(encoding['input_ids'])
    token_id.append(encoding['token_type_ids'])
    mask.append(encoding['attention_mask'])
print(len(dt))

400000


In [18]:
# Join the per-line tensors along the first dimension into one tensor per field.
# Presumably each per-line tensor is (1, 128), giving (num_lines, 128) — TODO confirm.
torch_id = torch.cat(dt, dim=0)
torch_token_id = torch.cat(token_id, dim=0)
torch_mask = torch.cat(mask, dim=0)

In [19]:
# Build the MLM training pairs: one (masked ids, labels) pair per tokenized line.

# Masking is random; fix the seed so the saved dataset can be regenerated exactly.
torch.manual_seed(42)

inputs = []
labels = []

for ids in dt:
    # Pass a copy so the tokenized originals in `dt` are never overwritten by
    # masking, which keeps this cell safe to re-run.
    masked_ids, label = create_mlm_labels(ids.clone())
    inputs.append(masked_ids)
    labels.append(label)

In [20]:
# Persist the prepared tensors in a single file so later cells can reload them
# without repeating tokenization and masking.
torch.save(
    dict(
        inputs=torch.cat(inputs),
        token_id=torch_token_id,
        attention_mask=torch_mask,
        targets=torch.cat(labels),
    ),
    "masklm_input.pt",
)

In [21]:
# Reload the prepared dataset written by the save cell.
# The file holds only a dict of tensors, so restrict unpickling with
# weights_only=True: this avoids arbitrary-code execution on load and does not
# rely on torch.load's implicit default for that argument.
ds = torch.load("masklm_input.pt", weights_only=True)
inputs = ds["inputs"]
token_id = ds["token_id"]
attention_mask = ds["attention_mask"]
targets = ds["targets"]

  ds = torch.load("masklm_input.pt")


In [None]:
from bert.bertDataset import MLMBERTDataset
from torch.utils.data import DataLoader
from bert.bert import PretrainingBERT
from bert.bert import BERT
from transformers import BertTokenizer
from utils.trainer import Trainer, CheckPointArgs, TrainArgs

# 80/20 train/validation split over the loaded examples, in file order.
# The bounds are derived from the tensor length instead of being hard-coded, so
# the split stays valid if the corpus yields a different number of lines
# (for 400000 examples this is exactly 0..320000 / 320000..400000).
num_examples = len(inputs)
num_train = num_examples * 4 // 5  # integer arithmetic avoids float rounding

train_indices = range(0, num_train)
valid_indices = range(num_train, num_examples)

train_dataset = MLMBERTDataset(inputs, token_id, attention_mask, targets, train_indices)
valid_dataset = MLMBERTDataset(inputs, token_id, attention_mask, targets, valid_indices)


In [27]:
# --- Run identification (passed to CheckPointArgs in the training cell) ---
model_name = "pretrain_bert"
experiment_name = "bookcropus"

# --- Training ---
BATCH_SIZE = 16

# --- Model hyperparameters ---
vocab_size = 30522
type_vocab_size = 2
max_position_embeddings = 256
embed_size = 256
num_layers = 4
num_heads = 4
forward_expansion = 4
dropout = 0.1

# The values are passed positionally, so the argument order below must be kept.
bert_model = BERT(
    vocab_size,
    embed_size,
    num_heads,
    num_layers,
    max_position_embeddings,
    forward_expansion,
    dropout,
    type_vocab_size,
)
pretraining_model = PretrainingBERT(bert_model, vocab_size)

In [28]:
# Bundle the optimisation settings and the checkpoint naming, then start training.
training_args = TrainArgs(
    num_epochs=100,
    batch_size=BATCH_SIZE,
    learning_rate=1e-4,
)
checkpoint_args = CheckPointArgs(model_name, experiment_name)

trainer = Trainer(
    pretraining_model,
    train_dataset,
    valid_dataset,
    checkpoint_args,
    training_args,
)
trainer.train()

INFO:root:Epoch 1: Start at 2024-11-30 20:19:48.714158


: 