In [None]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Create tokenizer

In [None]:
# Collect every *.train file under ./data/raw (searched recursively) as strings.
# The list is sorted because glob order depends on the filesystem and a later
# cell picks paths[0]; sorting makes that choice reproducible across runs.
paths = sorted(str(x) for x in Path("./data/raw").glob("**/*.train"))
paths

In [None]:
# Special tokens handed to the trainer, in this exact order.
SPECIAL_TOKENS = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

# Train a byte-level BPE tokenizer on the collected files with a 5000-entry
# vocabulary target and a minimum frequency of 2.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=paths,
    vocab_size=5000,
    min_frequency=2,
    special_tokens=SPECIAL_TOKENS,
)

In [None]:
# save_model writes into an existing directory, so make sure ./artifacts is
# there before saving; otherwise this cell fails on a fresh checkout.
Path("./artifacts").mkdir(parents=True, exist_ok=True)
tokenizer.save_model("./artifacts", "babylm")

# Load tokenizer

In [None]:
# Rebuild the tokenizer from the vocabulary and merges files saved under ./artifacts.
vocab_file = "./artifacts/babylm-vocab.json"
merges_file = "./artifacts/babylm-merges.txt"
tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)

# Attach a BertProcessing post-processor built from the (token, id) pairs for
# "</s>" and "<s>", passed in that order.
sep_pair = ("</s>", tokenizer.token_to_id("</s>"))
cls_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)

In [None]:
# Show the ids produced for a sample with a leading space and a doubled space.
sample_encoding = tokenizer.encode(" a  a")
sample_encoding.ids

In [None]:
# Show the token strings produced for the same sample text.
sample_encoding = tokenizer.encode(" a  a")
sample_encoding.tokens

In [None]:
from transformers import RobertaTokenizerFast

# The tokenizer was saved with the "babylm" prefix, so ./artifacts holds
# babylm-vocab.json / babylm-merges.txt rather than the unprefixed file names
# that from_pretrained("./artifacts") looks for. Point at the saved files
# explicitly so this cell loads exactly what the notebook wrote.
tokenizer = RobertaTokenizerFast(
    vocab_file="./artifacts/babylm-vocab.json",
    merges_file="./artifacts/babylm-merges.txt",
)

In [None]:
# Quick check: show the input ids the tokenizer returns for a short sentence.
hello_encoding = tokenizer("Hello world")
hello_encoding["input_ids"]

In [None]:
from transformers import LineByLineTextDataset

# Wrap the first collected training file in a LineByLineTextDataset, using the
# current tokenizer and a block size of 128.
train_file = paths[0]
dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=train_file, block_size=128)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator configured for masked language modelling with a masking
# probability of 0.15.
collator_options = dict(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
data_collator = DataCollatorForLanguageModeling(**collator_options)

from transformers import Trainer, TrainingArguments

# Training configuration: one epoch, checkpoints written to ./BabyLM every
# 10,000 steps with at most two kept on disk.
# per_device_train_batch_size replaces the deprecated per_gpu_train_batch_size;
# the batch size of 64 is unchanged.
training_args = TrainingArguments(
    output_dir="./BabyLM",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

from transformers import RobertaConfig, RobertaForMaskedLM

# `model` is not created anywhere earlier in the notebook, so on a fresh kernel
# the Trainer call would raise NameError. Build a new, randomly initialised
# RoBERTa masked-LM whose vocabulary size and special-token ids come from the
# tokenizer.
# NOTE(review): the depth, head count and position limit below are starting
# values, not tuned settings -- adjust them to the intended model size.
model_config = RobertaConfig(
    vocab_size=len(tokenizer),
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
model = RobertaForMaskedLM(config=model_config)

# Wire the model, training arguments, collator and dataset into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)