In [15]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Create tokenizer

In [16]:
paths = sorted(str(x) for x in Path("./data/raw").glob("**/*.train"))
paths

['data/raw/aochildes.train',
 'data/raw/bnc_spoken.train',
 'data/raw/cbt.train',
 'data/raw/children_stories.train',
 'data/raw/gutenberg.train',
 'data/raw/open_subtitles.train',
 'data/raw/qed.train',
 'data/raw/simple_wikipedia.train',
 'data/raw/switchboard.train',
 'data/raw/wikipedia.train']

In [17]:
vocab_size=5000

tokenizer = ByteLevelBPETokenizer() # Byte-level byte-pair tokenizer.
tokenizer.train(
    files=paths,
    vocab_size=5000,
    min_frequency=3,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

In [18]:
# tokenizer.save_model("./artifacts", "babylm")
tokenizer.save_model("./artifacts")

['./artifacts/vocab.json', './artifacts/merges.txt']

# Load tokenizer

In [19]:
tokenizer = ByteLevelBPETokenizer(
    "./artifacts/vocab.json",
    "./artifacts/merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)

In [20]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("./artifacts")

In [21]:
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=paths[8],
    block_size=128,
)



In [22]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [23]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=vocab_size,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [24]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [25]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./BabyLM",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


In [26]:
# trainer.train()

## TODOS:
- [x] Baselines
- [ ] Integrate evaluation pipeline with baselines
- [ ] Find way to put any torch model into the eval pipeline
- [ ] Gather all possible hyperparameters from the pipeline
- [ ] Integrate with Ray Tune