# DelphBERT Tokenizer

In [2]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordPieceTrainer

from pathlib import Path

from datasets import load_from_disk

In [19]:
PATH_RAW_FILES = "/home/leonardovida/data/volume_1/data-histaware/merged_articles/1970s"
PATH_TOKENIZER_DIR = "/home/leonardovida/data/volume_1/data-histaware/tokenizer"
PATH_DATASET_DIR = "/home/leonardovida/data/volume_1/data-histaware/dataset"

dataset = load_from_disk(PATH_DATASET_DIR)
#!mkdir PATH_MODEL_DIR

## Train WordPiece

### From text

In [27]:
paths = [str(x) for x in Path("/home/leonardovida/data/volume_1/data-histaware/dataset/").glob("**/*.txt")]

In [28]:
bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

bert_tokenizer.pre_tokenizer = Whitespace()

bert_tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 1),
        ("[SEP]", 2),
    ],
)

trainer = WordPieceTrainer(
    vocab_size=52000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    min_frequency=3, 
)

bert_tokenizer.train(files=paths, trainer=trainer)

In [None]:
bert_tokenizer.save(f"{PATH_TOKENIZER_DIR}/1970")

## Train BertWordPieceTokenizer

In [4]:
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer(
    lowercase=False,
    strip_accents=False,
    clean_text=True
)

tokenizer.train(
    files=[f"{PATH_DATASET_DIR}/data.1970.txt"],
    vocab_size=52000,
    min_frequency=2,
    show_progress=True,
    special_tokens=[
        "[PAD]",
        "[UNK]",
        "[CLS]",
        "[SEP]",
        "[MASK]",
    ]
)

['/home/leonardovida/data/volume_1/data-histaware/tokenizer/1970_new/bert-wordpiece-vocab.txt']

In [18]:
tokenizer.save(f"{PATH_TOKENIZER_DIR}/bert-wordpiece.json")
# create a BERT tokenizer with trained vocab
tokenizer.save_vocabulary(f"{PATH_TOKENIZER_DIR}/bert-wordpiece-vocab.txt")

AttributeError: 'BertWordPieceTokenizer' object has no attribute 'save_vocabulary'

In [24]:
tokenizer.max_length(max_length = 512)

AttributeError: 'BertWordPieceTokenizer' object has no attribute 'max_length'

## Train BPE Tokenizer

### From dataset

In [22]:
# Build a tokenizer
import datasets
from tokenizers import normalizers, pre_tokenizers, Tokenizer, models, trainers

# Build a tokenizer
bpe_tokenizer = Tokenizer(models.BPE())
bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
bpe_tokenizer.normalizer = normalizers.Lowercase()

# Build an iterator over this dataset
def batch_iterator():
    batch_length = 1000
    for i in range(0, len(dataset["train"]), batch_length):
        yield dataset["train"][i : i + batch_length]["p_clean"]


# And finally train
bpe_tokenizer.train_from_iterator(batch_iterator(), length=len(dataset["train"]))

In [None]:
tokenizer.save_model(f"{PATH_TOKENIZER_DIR}/1970/")

### From text

In [None]:
from tokenizers import ByteLevelBPETokenizer

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=3, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

## Pre trained tokenizers

In [25]:
tokenizer = BertTokenizer.from_pretrained('GroNLP/bert-base-dutch-cased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=241441.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=254.0, style=ProgressStyle(description_…




In [26]:
tokenizer

PreTrainedTokenizer(name_or_path='GroNLP/bert-base-dutch-cased', vocab_size=30000, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})